* [wip-PATCH] rfi: PAGE_CACHE_SIZE support
From: Ben LaHaise @ 2001-07-05 5:06 UTC
To: torvalds; +Cc: linux-mm, linux-kernel, Alexander Viro
Hello Linus, Al et al,
I've been attacking PAGE_CACHE_SIZE support in the kernel for the last few
days in an attempt to get multipage PAGE_CACHE_SIZE support working, and
below is what I've come up with. It currently boots to single user read
only, and doesn't quite have write support fixed properly yet, but it's
going pretty well. The reason for sending this out now is the question of
what to do about kmap() support. In going through the ext2
directories-in-pagecache code, I had to fix the broken kmap() usage there.
Once that was done, adding support for multipage page cache pages resulted
in loops and other oddities all over the code that might be better hidden
from the filesystem. So the question is: do we want to make
kmap() support > order 0 mappings? I'm looking forward to any input
received, as well as feedback on the patch. Thanks.
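To make the question concrete, the pattern that otherwise ends up
repeated in filesystem code looks roughly like this (compare the
ext2_find_entry hunk below; illustration only, not lifted verbatim
from the patch):

	/* scan an order-N page cache page one PAGE_SIZE piece at a time */
	for (i = 0; i < PAGE_CACHE_PAGES; i++) {
		char *kaddr = kmap(page + i);
		/* ... work on one PAGE_SIZE chunk at kaddr ... */
		kunmap(page + i);
	}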
Oh, Al, I'll extract the ext2 fixes for highmem from this patch for you
tomorrow, but have a look over them and see if you can spot anything that
I've missed. Cheers,
-ben
.... ~/patches/v2.4.6-pre8-pgc-A0.diff ....
diff -ur /md0/kernels/2.4/v2.4.6-pre8/arch/i386/boot/install.sh pgc-2.4.6-pre8/arch/i386/boot/install.sh
--- /md0/kernels/2.4/v2.4.6-pre8/arch/i386/boot/install.sh Tue Jan 3 06:57:26 1995
+++ pgc-2.4.6-pre8/arch/i386/boot/install.sh Wed Jul 4 16:42:32 2001
@@ -21,6 +21,7 @@
# User may have a custom install script
+if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi
if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi
# Default install - same as make zlilo
diff -ur /md0/kernels/2.4/v2.4.6-pre8/arch/i386/config.in pgc-2.4.6-pre8/arch/i386/config.in
--- /md0/kernels/2.4/v2.4.6-pre8/arch/i386/config.in Sun Jul 1 21:45:04 2001
+++ pgc-2.4.6-pre8/arch/i386/config.in Sun Jul 1 21:49:20 2001
@@ -180,6 +180,8 @@
if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
define_bool CONFIG_HAVE_DEC_LOCK y
fi
+
+int 'Page cache shift' CONFIG_PAGE_CACHE_SHIFT 0
endmenu
mainmenu_option next_comment
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/buffer.c pgc-2.4.6-pre8/fs/buffer.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/buffer.c Sat Jun 30 14:04:27 2001
+++ pgc-2.4.6-pre8/fs/buffer.c Thu Jul 5 00:49:54 2001
@@ -774,6 +774,7 @@
/* This is a temporary buffer used for page I/O. */
page = bh->b_page;
+ page = page_cache_page(page);
if (!uptodate)
SetPageError(page);
@@ -1252,8 +1253,10 @@
void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
{
+ page += offset >> PAGE_SHIFT;
+ offset &= PAGE_SIZE - 1;
bh->b_page = page;
- if (offset >= PAGE_SIZE)
+ if (offset >= PAGE_CACHE_SIZE)
BUG();
if (PageHighMem(page))
/*
@@ -1280,7 +1283,9 @@
try_again:
head = NULL;
- offset = PAGE_SIZE;
+ if (!PageCachePage(page))
+ BUG();
+ offset = PAGE_CACHE_SIZE;
while ((offset -= size) >= 0) {
bh = get_unused_buffer_head(async);
if (!bh)
@@ -1664,6 +1669,8 @@
unsigned int blocksize, blocks;
int nr, i;
+ if (!PageCachePage(page))
+ BUG();
if (!PageLocked(page))
PAGE_BUG(page);
blocksize = inode->i_sb->s_blocksize;
@@ -1688,9 +1695,13 @@
continue;
}
if (!buffer_mapped(bh)) {
- memset(kmap(page) + i*blocksize, 0, blocksize);
- flush_dcache_page(page);
- kunmap(page);
+ struct page *map = page;
+ unsigned offset = i * blocksize;
+ map += offset >> PAGE_SHIFT;
+ offset &= PAGE_SIZE - 1;
+ memset(kmap(map) + offset, 0, blocksize);
+ flush_dcache_page(map);
+ kunmap(map);
set_bit(BH_Uptodate, &bh->b_state);
continue;
}
@@ -2228,7 +2239,7 @@
return 0;
}
- page = alloc_page(GFP_NOFS);
+ page = __page_cache_alloc(GFP_NOFS);
if (!page)
goto out;
LockPage(page);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/ext2/dir.c pgc-2.4.6-pre8/fs/ext2/dir.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/ext2/dir.c Sat Jun 30 14:04:27 2001
+++ pgc-2.4.6-pre8/fs/ext2/dir.c Thu Jul 5 00:32:56 2001
@@ -38,7 +38,6 @@
static inline void ext2_put_page(struct page *page)
{
- kunmap(page);
page_cache_release(page);
}
@@ -58,29 +57,38 @@
return err;
}
-static void ext2_check_page(struct page *page)
+static inline void __ext2_check_page(struct page *base, struct page *page)
{
- struct inode *dir = page->mapping->host;
+ struct inode *dir = base->mapping->host;
struct super_block *sb = dir->i_sb;
unsigned chunk_size = ext2_chunk_size(dir);
- char *kaddr = page_address(page);
+ char *kaddr = NULL;
u32 max_inumber = le32_to_cpu(sb->u.ext2_sb.s_es->s_inodes_count);
unsigned offs, rec_len;
- unsigned limit = PAGE_CACHE_SIZE;
+ unsigned limit = PAGE_SIZE;
ext2_dirent *p;
char *error;
- if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_CACHE_MASK;
+ kaddr = kmap(page);
+ if ((dir->i_size >> PAGE_CACHE_SHIFT) == base->index) {
+ limit = dir->i_size & (PAGE_CACHE_SIZE-1);
+ if (limit <= ((page - base) << PAGE_SHIFT))
+ goto out;
+ limit -= (page - base) << PAGE_SHIFT;
+ if (!limit)
+ goto out;
+ if (limit > PAGE_SIZE)
+ limit = PAGE_SIZE;
if (limit & (chunk_size - 1))
goto Ebadsize;
- for (offs = limit; offs<PAGE_CACHE_SIZE; offs += chunk_size) {
- ext2_dirent *p = (ext2_dirent*)(kaddr + offs);
+ for (offs = limit; offs<PAGE_SIZE; offs += chunk_size) {
+ ext2_dirent *p;
+ p = (ext2_dirent*)(kaddr + offs);
p->rec_len = cpu_to_le16(chunk_size);
}
- if (!limit)
- goto out;
}
+
+printk("limit=%u idx=%d size=%Ld\n", limit, page - base, dir->i_size);
for (offs = 0; offs <= limit - EXT2_DIR_REC_LEN(1); offs += rec_len) {
p = (ext2_dirent *)(kaddr + offs);
rec_len = le16_to_cpu(p->rec_len);
@@ -99,7 +107,7 @@
if (offs != limit)
goto Eend;
out:
- SetPageChecked(page);
+ kunmap(page);
return;
/* Too bad, we had an error */
@@ -127,7 +135,7 @@
bad_entry:
ext2_error (sb, "ext2_check_page", "bad entry in directory #%lu: %s - "
"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, error, (base->index<<PAGE_CACHE_SHIFT)+offs,
(unsigned long) le32_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
@@ -136,11 +144,19 @@
ext2_error (sb, "ext2_check_page",
"entry in directory #%lu spans the page boundary"
"offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, (base->index<<PAGE_CACHE_SHIFT)+offs,
(unsigned long) le32_to_cpu(p->inode));
fail:
+ kunmap(page);
+ SetPageError(base);
+}
+
+static void ext2_check_page(struct page *page)
+{
+ unsigned i;
+ for (i=0; i<PAGE_CACHE_PAGES; i++)
+ __ext2_check_page(page, page+i);
SetPageChecked(page);
- SetPageError(page);
}
static struct page * ext2_get_page(struct inode *dir, unsigned long n)
@@ -150,7 +166,6 @@
(filler_t*)mapping->a_ops->readpage, NULL);
if (!IS_ERR(page)) {
wait_on_page(page);
- kmap(page);
if (!Page_Uptodate(page))
goto fail;
if (!PageChecked(page))
@@ -248,20 +263,24 @@
if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
types = ext2_filetype_table;
+ npages <<= PAGE_CACHE_ORDER;
+
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
ext2_dirent *de;
- struct page *page = ext2_get_page(inode, n);
+ struct page *page, *map;
+ page = ext2_get_page(inode, n >> PAGE_CACHE_ORDER);
if (IS_ERR(page))
continue;
- kaddr = page_address(page);
+ map = page + (n & PAGE_CACHE_PMASK);
+ kaddr = kmap(map);
if (need_revalidate) {
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
need_revalidate = 0;
}
de = (ext2_dirent *)(kaddr+offset);
- limit = kaddr + PAGE_CACHE_SIZE - EXT2_DIR_REC_LEN(1);
+ limit = kaddr + PAGE_SIZE - EXT2_DIR_REC_LEN(1);
for ( ;(char*)de <= limit; de = ext2_next_entry(de))
if (de->inode) {
int over;
@@ -272,18 +291,20 @@
offset = (char *)de - kaddr;
over = filldir(dirent, de->name, de->name_len,
- (n<<PAGE_CACHE_SHIFT) | offset,
+ (n << PAGE_SHIFT) | offset,
le32_to_cpu(de->inode), d_type);
if (over) {
+ kunmap(map);
ext2_put_page(page);
goto done;
}
}
+ kunmap(map);
ext2_put_page(page);
}
done:
- filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
+ filp->f_pos = (n << PAGE_SHIFT) | offset;
filp->f_version = inode->i_version;
UPDATE_ATIME(inode);
return 0;
@@ -313,23 +334,26 @@
for (n = 0; n < npages; n++) {
char *kaddr;
+ unsigned i;
page = ext2_get_page(dir, n);
if (IS_ERR(page))
continue;
- kaddr = page_address(page);
- de = (ext2_dirent *) kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
- for ( ; (char *) de <= kaddr ; de = ext2_next_entry(de))
- if (ext2_match (namelen, name, de))
- goto found;
+ for (i=0; i<PAGE_CACHE_PAGES; i++) {
+ struct page *map = page + i;
+ kaddr = kmap(map);
+ de = (ext2_dirent *) kaddr;
+ kaddr += PAGE_SIZE - reclen;
+ for ( ; (char *) de <= kaddr ; de = ext2_next_entry(de))
+ if (ext2_match (namelen, name, de)) {
+ *res_page = map;
+ return de;
+ }
+ kunmap(map);
+ }
ext2_put_page(page);
}
return NULL;
-
-found:
- *res_page = page;
- return de;
}
struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
@@ -338,7 +362,7 @@
ext2_dirent *de = NULL;
if (!IS_ERR(page)) {
- de = ext2_next_entry((ext2_dirent *) page_address(page));
+ de = ext2_next_entry((ext2_dirent *) kmap(page));
*p = page;
}
return de;
@@ -361,10 +385,11 @@
/* Releases the page */
void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
- struct page *page, struct inode *inode)
+ struct page *map, struct inode *inode)
{
- unsigned from = (char *)de-(char*)page_address(page);
+ unsigned from = (char *)de-(char*)page_address(map);
unsigned to = from + le16_to_cpu(de->rec_len);
+ struct page *page = page_cache_page(map);
int err;
lock_page(page);
@@ -375,7 +400,7 @@
ext2_set_de_type (de, inode);
err = ext2_commit_chunk(page, from, to);
UnlockPage(page);
- ext2_put_page(page);
+ ext2_put_page(map);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
mark_inode_dirty(dir);
}
@@ -390,7 +415,7 @@
int namelen = dentry->d_name.len;
unsigned reclen = EXT2_DIR_REC_LEN(namelen);
unsigned short rec_len, name_len;
- struct page *page = NULL;
+ struct page *page = NULL, *map;
ext2_dirent * de;
unsigned long npages = dir_pages(dir);
unsigned long n;
@@ -400,24 +425,28 @@
/* We take care of directory expansion in the same loop */
for (n = 0; n <= npages; n++) {
+ unsigned i;
page = ext2_get_page(dir, n);
err = PTR_ERR(page);
if (IS_ERR(page))
goto out;
- kaddr = page_address(page);
- de = (ext2_dirent *)kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
- while ((char *)de <= kaddr) {
- err = -EEXIST;
- if (ext2_match (namelen, name, de))
- goto out_page;
- name_len = EXT2_DIR_REC_LEN(de->name_len);
- rec_len = le16_to_cpu(de->rec_len);
- if (!de->inode && rec_len >= reclen)
- goto got_it;
- if (rec_len >= name_len + reclen)
- goto got_it;
- de = (ext2_dirent *) ((char *) de + rec_len);
+ for (i=0; i<PAGE_CACHE_PAGES; i++) {
+ map = page + i;
+ kaddr = kmap(map);
+ de = (ext2_dirent *)kaddr;
+ kaddr += PAGE_SIZE - reclen;
+ while ((char *)de <= kaddr) {
+ err = -EEXIST;
+ if (ext2_match (namelen, name, de))
+ goto out_page;
+ name_len = EXT2_DIR_REC_LEN(de->name_len);
+ rec_len = le16_to_cpu(de->rec_len);
+ if (!de->inode && rec_len >= reclen)
+ goto got_it;
+ if (rec_len >= name_len + reclen)
+ goto got_it;
+ de = (ext2_dirent *) ((char *) de + rec_len);
+ }
}
ext2_put_page(page);
}
@@ -425,7 +454,7 @@
return -EINVAL;
got_it:
- from = (char*)de - (char*)page_address(page);
+ from = (char*)de - (kaddr - (PAGE_SIZE - reclen));
to = from + rec_len;
lock_page(page);
err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
@@ -448,6 +477,7 @@
out_unlock:
UnlockPage(page);
out_page:
+ kunmap(map);
ext2_put_page(page);
out:
return err;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/mm.h pgc-2.4.6-pre8/include/linux/mm.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/mm.h Tue Jul 3 22:00:04 2001
+++ pgc-2.4.6-pre8/include/linux/mm.h Wed Jul 4 01:57:05 2001
@@ -282,6 +282,7 @@
#define PG_inactive_clean 11
#define PG_highmem 12
#define PG_checked 13 /* kill me in 2.5.<early>. */
+#define PG_pagecache 14
/* bits 21-29 unused */
#define PG_arch_1 30
#define PG_reserved 31
@@ -298,6 +299,9 @@
#define TryLockPage(page) test_and_set_bit(PG_locked, &(page)->flags)
#define PageChecked(page) test_bit(PG_checked, &(page)->flags)
#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags)
+#define PageCachePage(page) test_bit(PG_pagecache, &(page)->flags)
+#define SetPageCache(page) set_bit(PG_pagecache, &(page)->flags)
+#define ClearPageCache(page) clear_bit(PG_pagecache, &(page)->flags)
extern void __set_page_dirty(struct page *);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/pagemap.h pgc-2.4.6-pre8/include/linux/pagemap.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/pagemap.h Tue Jul 3 22:00:04 2001
+++ pgc-2.4.6-pre8/include/linux/pagemap.h Thu Jul 5 00:50:10 2001
@@ -22,19 +22,53 @@
* space in smaller chunks for same flexibility).
*
* Or rather, it _will_ be done in larger chunks.
+ *
+ * It is now configurable. -ben 20010702
*/
-#define PAGE_CACHE_SHIFT PAGE_SHIFT
-#define PAGE_CACHE_SIZE PAGE_SIZE
-#define PAGE_CACHE_MASK PAGE_MASK
+#define PAGE_CACHE_ORDER (CONFIG_PAGE_CACHE_SHIFT)
+#define PAGE_CACHE_PAGES (1UL << CONFIG_PAGE_CACHE_SHIFT)
+#define PAGE_CACHE_PMASK (PAGE_CACHE_PAGES - 1)
+#define PAGE_CACHE_SHIFT (PAGE_SHIFT + CONFIG_PAGE_CACHE_SHIFT)
+#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)
+#define PAGE_CACHE_MASK (~(PAGE_CACHE_SIZE - 1))
#define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK)
+#define __page_cache_page(page) (page - ((page - mem_map) & PAGE_CACHE_PMASK))
+
+static inline struct page *page_cache_page(struct page *page)
+{
+ if (PageCachePage(page))
+ page = __page_cache_page(page);
+ return page;
+}
+
#define page_cache_get(x) get_page(x)
-#define page_cache_free(x) __free_page(x)
-#define page_cache_release(x) __free_page(x)
+#define __page_cache_free(x) __free_pages(x, PAGE_CACHE_ORDER)
+#define page_cache_free(x) page_cache_release(x)
+
+static inline void page_cache_release(struct page *page)
+{
+ if (PageCachePage(page))
+ __page_cache_free(__page_cache_page(page));
+ else
+ __free_page(page);
+}
+
+static inline struct page *__page_cache_alloc(int gfp)
+{
+ struct page *page;
+ page = alloc_pages(gfp, PAGE_CACHE_ORDER);
+ if (page) {
+ unsigned i;
+ for (i=0; i<PAGE_CACHE_PAGES; i++)
+ SetPageCache(page+i);
+ }
+ return page;
+}
static inline struct page *page_cache_alloc(struct address_space *x)
{
- return alloc_pages(x->gfp_mask, 0);
+ return __page_cache_alloc(x->gfp_mask);
}
/*
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/filemap.c pgc-2.4.6-pre8/mm/filemap.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/filemap.c Sat Jun 30 14:04:28 2001
+++ pgc-2.4.6-pre8/mm/filemap.c Thu Jul 5 00:19:39 2001
@@ -236,13 +236,12 @@
if ((offset >= start) || (*partial && (offset + 1) == start)) {
list_del(head);
list_add(head, curr);
+ page_cache_get(page);
if (TryLockPage(page)) {
- page_cache_get(page);
spin_unlock(&pagecache_lock);
wait_on_page(page);
goto out_restart;
}
- page_cache_get(page);
spin_unlock(&pagecache_lock);
if (*partial && (offset + 1) == start) {
@@ -1274,9 +1273,28 @@
if (size > count)
size = count;
- kaddr = kmap(page);
- left = __copy_to_user(desc->buf, kaddr + offset, size);
- kunmap(page);
+ left = size;
+ page += offset >> PAGE_SHIFT;
+ offset &= PAGE_SIZE - 1;
+
+ do {
+ unsigned this = PAGE_SIZE - offset;
+
+ if (left < this)
+ this = left;
+
+ left -= this;
+ kaddr = kmap(page);
+ this = __copy_to_user(desc->buf, kaddr + offset, this);
+ kunmap(page);
+ offset = 0;
+
+ if (this) {
+ left += this;
+ break;
+ }
+ page++;
+ } while (left);
if (left) {
size -= left;
@@ -1499,8 +1517,11 @@
struct address_space *mapping = inode->i_mapping;
struct page *page, **hash, *old_page;
unsigned long size, pgoff;
+ unsigned long offset;
- pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ pgoff = ((address - area->vm_start) >> PAGE_SHIFT) + area->vm_pgoff;
+ offset = pgoff & PAGE_CACHE_PMASK;
+ pgoff >>= PAGE_CACHE_ORDER;
retry_all:
/*
@@ -1538,7 +1559,7 @@
* Found the page and have a reference on it, need to check sharing
* and possibly copy it over to another page..
*/
- old_page = page;
+ old_page = page + offset;
if (no_share) {
struct page *new_page = alloc_page(GFP_HIGHUSER);
@@ -1652,6 +1673,7 @@
if (pte_present(pte) && ptep_test_and_clear_dirty(ptep)) {
struct page *page = pte_page(pte);
flush_tlb_page(vma, address);
+ page = page_cache_page(page);
set_page_dirty(page);
}
return 0;
@@ -1682,7 +1704,7 @@
do {
error |= filemap_sync_pte(pte, vma, address + offset, flags);
address += PAGE_SIZE;
- pte++;
+ pte ++;
} while (address && (address < end));
return error;
}
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/memory.c pgc-2.4.6-pre8/mm/memory.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/memory.c Sat Jun 30 14:04:28 2001
+++ pgc-2.4.6-pre8/mm/memory.c Thu Jul 5 00:49:38 2001
@@ -233,6 +233,7 @@
if (vma->vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
+ ptepage = page_cache_page(ptepage);
get_page(ptepage);
cont_copy_pte_range: set_pte(dst_pte, pte);
@@ -268,6 +269,7 @@
struct page *page = pte_page(pte);
if ((!VALID_PAGE(page)) || PageReserved(page))
return 0;
+ page = page_cache_page(page);
/*
* free_page() used to be able to clear swap cache
* entries. We may now have to do it manually.
@@ -508,7 +510,7 @@
map = get_page_map(map);
if (map) {
flush_dcache_page(map);
- atomic_inc(&map->count);
+ get_page(page_cache_page(map));
} else
printk (KERN_INFO "Mapped page missing [%d]\n", i);
spin_unlock(&mm->page_table_lock);
@@ -551,7 +553,7 @@
while (remaining > 0 && index < iobuf->nr_pages) {
page = iobuf->maplist[index];
-
+ page = page_cache_page(page);
if (!PageReserved(page))
SetPageDirty(page);
@@ -574,6 +576,7 @@
for (i = 0; i < iobuf->nr_pages; i++) {
map = iobuf->maplist[i];
if (map) {
+ map = page_cache_page(map);
if (iobuf->locked)
UnlockPage(map);
__free_page(map);
@@ -616,7 +619,7 @@
page = *ppage;
if (!page)
continue;
-
+ page = page_cache_page(page);
if (TryLockPage(page)) {
while (j--) {
page = *(--ppage);
@@ -687,6 +690,7 @@
page = *ppage;
if (!page)
continue;
+ page = page_cache_page(page);
UnlockPage(page);
}
}
@@ -894,12 +898,14 @@
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
unsigned long address, pte_t *page_table, pte_t pte)
{
- struct page *old_page, *new_page;
+ struct page *old_page, *__old_page, *new_page;
+
+ __old_page = pte_page(pte);
+ old_page = page_cache_page(__old_page);
- old_page = pte_page(pte);
if (!VALID_PAGE(old_page))
goto bad_wp_page;
-
+
/*
* We can avoid the copy if:
* - we're the only user (count == 1)
@@ -949,7 +955,7 @@
if (pte_same(*page_table, pte)) {
if (PageReserved(old_page))
++mm->rss;
- break_cow(vma, old_page, new_page, address, page_table);
+ break_cow(vma, __old_page, new_page, address, page_table);
/* Free the old page.. */
new_page = old_page;
@@ -1016,7 +1022,7 @@
if (!mapping->i_mmap && !mapping->i_mmap_shared)
goto out_unlock;
- pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ pgoff = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (mapping->i_mmap != NULL)
vmtruncate_list(mapping->i_mmap, pgoff);
if (mapping->i_mmap_shared != NULL)
@@ -1201,25 +1207,30 @@
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long address, int write_access, pte_t *page_table)
{
- struct page * new_page;
+ struct page *new_page, *ppage;
pte_t entry;
+ int no_share, offset, i;
+ unsigned long addr_min, addr_max;
if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table, write_access, address);
spin_unlock(&mm->page_table_lock);
+ mm, vma, address, write_access, page_table);
/*
* The third argument is "no_share", which tells the low-level code
* to copy, not share the page even if sharing is possible. It's
* essentially an early COW detection.
*/
- new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
+ no_share = (vma->vm_flags & VM_SHARED) ? 0 : write_access;
+ new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, no_share);
spin_lock(&mm->page_table_lock);
if (new_page == NULL) /* no page was available -- SIGBUS */
return 0;
if (new_page == NOPAGE_OOM)
return -1;
+ ppage = page_cache_page(new_page);
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
@@ -1231,25 +1242,70 @@
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
- if (pte_none(*page_table)) {
- ++mm->rss;
+ if (!pte_none(*page_table)) {
+ /* One of our sibling threads was faster, back out. */
+ page_cache_release(ppage);
+ return 1;
+ }
+
+ addr_min = address & PMD_MASK;
+ addr_max = address | (PMD_SIZE - 1);
+
+ if (vma->vm_start > addr_min)
+ addr_min = vma->vm_start;
+ if (vma->vm_end < addr_max)
+ addr_max = vma->vm_end;
+
+ /* The following implements PAGE_CACHE_SIZE prefilling of
+ * page tables. The technique is essentially the same as
+ * a cache burst using
+ */
+ offset = address >> PAGE_SHIFT;
+ offset &= PAGE_CACHE_PMASK;
+ i = 0;
+ do {
+ if (!pte_none(*page_table))
+ goto next_page;
+
+ if ((address < addr_min) || (address > addr_max))
+ goto next_page;
+
+ if (!i)
+ page_cache_get(ppage);
+
+ mm->rss++;
flush_page_to_ram(new_page);
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
- if (write_access) {
+ if (write_access && !i)
entry = pte_mkwrite(pte_mkdirty(entry));
- } else if (page_count(new_page) > 1 &&
+ else if (page_count(ppage) > 1 &&
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
+ if (i)
+ entry = pte_mkold(entry);
set_pte(page_table, entry);
- } else {
- /* One of our sibling threads was faster, back out. */
- page_cache_release(new_page);
- return 1;
- }
- /* no need to invalidate: a not-present page shouldn't be cached */
- update_mmu_cache(vma, address, entry);
+ /* no need to invalidate: a not-present page shouldn't be cached */
+ update_mmu_cache(vma, address, entry);
+
+next_page:
+ if (!PageCachePage(ppage))
+ break;
+ if ((ppage + offset) != new_page)
+ break;
+
+ /* Implement wrap around for the address, page and ptep. */
+ address -= offset << PAGE_SHIFT;
+ page_table -= offset;
+ new_page -= offset;
+
+ offset = (offset + 1) & PAGE_CACHE_PMASK;
+
+ address += offset << PAGE_SHIFT;
+ page_table += offset;
+ new_page += offset;
+ } while (++i < PAGE_CACHE_PAGES) ;
return 2; /* Major fault */
}
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/page_alloc.c pgc-2.4.6-pre8/mm/page_alloc.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/page_alloc.c Sat Jun 30 14:04:28 2001
+++ pgc-2.4.6-pre8/mm/page_alloc.c Wed Jul 4 02:46:12 2001
@@ -87,6 +87,13 @@
BUG();
if (PageInactiveClean(page))
BUG();
+ if (PageCachePage(page) && (order != PAGE_CACHE_ORDER)) {
+ printk("PageCachePage and order == %lu\n", order);
+ BUG();
+ }
+
+ for (index=0; index < (1<<order); index++)
+ ClearPageCache(page+index);
page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
page->age = PAGE_AGE_START;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/vmscan.c pgc-2.4.6-pre8/mm/vmscan.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/vmscan.c Sat Jun 30 14:04:28 2001
+++ pgc-2.4.6-pre8/mm/vmscan.c Mon Jul 2 17:08:34 2001
@@ -38,8 +38,11 @@
/* mm->page_table_lock is held. mmap_sem is not held */
static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
{
- pte_t pte;
swp_entry_t entry;
+ pte_t pte;
+
+ if (PageCachePage(page))
+ page = page_cache_page(page);
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
* Re: [wip-PATCH] rfi: PAGE_CACHE_SIZE support
From: Linus Torvalds @ 2001-07-05 5:55 UTC
To: Ben LaHaise; +Cc: linux-mm, linux-kernel, Alexander Viro
On Thu, 5 Jul 2001, Ben LaHaise wrote:
>
> I've been attacking PAGE_CACHE_SIZE support in the kernel for the last few
> days in an attempt to get multipage PAGE_CACHE_SIZE support working, and
> below is what I've come up with. It currently boots to single user read
> only, and doesn't quite have write support fixed properly yet, but it's
> going pretty well. The reason for sending this out now is the question of
> what to do about kmap() support.
I suggest making kmap _always_ map the "biggest" chunk of physical memory
that the kernel ever touches at a time.
So I would _strongly_ suggest that you make the kmap granularity be at
_least_ PAGE_CACHE_SIZE. For debugging reasons I would suggest you have a
separate "PAGE_KMAP_SIZE" thing, so that you can get the kmap code working
independently of the PAGE_CACHE_SIZE thing.
Once you have the guarantee that "kmap(page)" will actually end up mapping
the (power-of-two-aligned) power-of-two-sized PAGE_KMAP_SIZE around the
page, the loops should all go away, and you should be able to use kmap()
the same way you've always used it (whether the user actually cares about
just one page or not ends up being a non-issue).
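Roughly like this (PAGE_KMAP_PAGES and __kmap_group() are made-up names
for the sake of the sketch, and it assumes the low-level code learns to
map the whole aligned group at once):

	/* sketch only: map the aligned PAGE_KMAP_SIZE group containing
	 * "page", and hand back the address of "page" within it */
	static inline void *kmap(struct page *page)
	{
		unsigned long sub = (page - mem_map) & (PAGE_KMAP_PAGES - 1);
		char *vaddr = __kmap_group(page - sub);	/* maps the group */

		return vaddr + (sub << PAGE_SHIFT);
	}

A caller that only cares about one page just uses the returned pointer,
and a caller that wants the whole chunk knows it's the aligned
PAGE_KMAP_SIZE region around that pointer.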
> - filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
> + filp->f_pos = (n << PAGE_SHIFT) | offset;
You're definitely doing something wrong here.
You should _never_ care about PAGE_SHIFT, except in the case of a mmap()
where you obviously end up mapping in "partial" page-cache pages. I
suspect you're doing all this exactly because of the kmap issue, but you
really shouldn't need to do it.
The whole point with having a bigger page-cache-size is to be able to
process bigger chunks at a time.
Now, one thing you might actually want to look into is to make the dirty
bit be a "dirty bitmap", so that you have the option of marking things
dirty at a finer granularity. But that, I feel, is after you've gotten the
basic stuff working with a PAGE_CACHE_SIZE dirty granularity.
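(For concreteness, the bitmap version could be as simple as something
like this - entirely hypothetical, and hand-waving the locking:

	/* one dirty bit per PAGE_SIZE piece of a page cache page */
	struct page {
		/* ... existing 2.4 fields ... */
		unsigned long dirty_map;
	};

	#define SetSubpageDirty(page, k)	set_bit((k), &(page)->dirty_map)
	#define TestSubpageDirty(page, k)	test_bit((k), &(page)->dirty_map)

but as said, that's for after the basic stuff works.)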
Linus
* Large PAGE_SIZE
From: Hugh Dickins @ 2001-07-05 16:45 UTC
To: Linus Torvalds; +Cc: Ben LaHaise
Linus,
Ben's mail on multipage PAGE_CACHE_SIZE support prompts me to let you
know now what I've been doing, and ask your opinion on this direction.
Congratulations to Ben for working out multipage PAGE_CACHE_SIZE.
I couldn't see where it was headed, and PAGE_CACHE_SIZE has been
PAGE_SIZE for so long that I assumed everyone had given up on it.
I'm interested in larger pages, but wary of multipage PAGE_CACHE_SIZE:
partly because it relies on non-0-order page allocations, partly because
it seems a shame then to break I/O into smaller units below the cache.
So instead I'm using a larger PAGE_SIZE throughout the kernel: here's an
extract from include/asm-i386/page.h (currently edited, not configured):
/*
* One subpage is represented by one Page Table Entry at the MMU level,
* and corresponds to one page at the user process level: its size is
* the same as param.h EXEC_PAGESIZE (for getpagesize(2) and mmap(2)).
*/
#define SUBPAGE_SHIFT 12
#define SUBPAGE_SIZE (1UL << SUBPAGE_SHIFT)
#define SUBPAGE_MASK (~(SUBPAGE_SIZE-1))
/*
* 2**N adjacent subpages may be clustered to make up one kernel page.
* Reasonable and tested values for PAGE_SUBSHIFT are 0 (4k page),
* 1 (8k page), 2 (16k page), 3 (32k page). Higher values will not
* work without further changes e.g. to unsigned short b_size.
*/
#define PAGE_SUBSHIFT 0
#define PAGE_SUBCOUNT (1UL << PAGE_SUBSHIFT)
/*
* One kernel page is represented by one struct page (see mm.h),
* and is the kernel's principal unit of memory allocation.
*/
#define PAGE_SHIFT (PAGE_SUBSHIFT + SUBPAGE_SHIFT)
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
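(So with PAGE_SUBSHIFT 3, for example, SUBPAGE_SIZE stays 4096 - which is
what getpagesize(2) and mmap(2) continue to see - while PAGE_SHIFT becomes
15 and PAGE_SIZE 32768, the unit the kernel allocates in.)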
The kernel patch which applies these definitions is, of course, much
larger than Ben's multipage PAGE_CACHE_SIZE patch. Currently against
2.4.4 (I'm rebasing to 2.4.6 in the next week) plus some other patches
we're using in-house, it's about 350KB touching 160 files. Not quite
complete yet (trivial macros still to be added to non-i386 arches; md
readahead size not yet resolved; num_physpages in tuning to be checked;
vmscan algorithms probably mis-scaled) and certainly undertested, but
both a 2GB SMP machine and a 256MB laptop run stably with 32k pages
(though 4k pages are better on the laptop, to keep the kernel source
tree in cache).
Most of the patch is simple and straightforward, replacing PAGE_SIZE
by SUBPAGE_SIZE where appropriate (in drivers that's usually only when
handling vm_pgoff). Though I'm happy with the "SUB" naming, others may
not be, and a more vivid naming might make driver maintenance easier.
Some of the patch is rather tangential: seemed right to implement proper
flush_tlb_range() and flush_tlb_range_k() for flushing subpages together;
hard to resist tidyups like changing zap_page_range() arg from size to
end when it's always sandwiched between start,end functions. Unless
PAGE_CACHE_SIZE definition were to be removed too, no change at all
to most filesystems (cramfs, ncpfs, proc being exceptions).
Kernel physical and virtual address space mostly in PAGE_SIZE units:
__get_free_page(), vmalloc(), ioremap(), kmap_atomic(), kmap() pages;
but early alloc_bootmem_pages() and fixmap.h slots in SUBPAGE_SIZE.
User address space has to be in SUBPAGE_SIZE units (unless I want to
rebuild all my userspace): so the difficult part of the patch is the
mm/memory.c fault handlers, and preventing the anonymous SUBPAGE_SIZE
pieces from degenerating into needing a PAGE_SIZE physical page each,
and how to translate exclusive_swap_page().
These page fault handlers now prepare and operate upon a
pte_t *folio[PAGE_SUBCOUNT], different parts of the same large page
expected at respective virtual offsets (yes, mremap() can spoil that,
but it's exceptional). Anon mappings may have non-0 vm_pgoff, to share
a page with adjacent private mappings, e.g. bss shares a large page with data,
so KIO across data-bss boundary works (KIO page granularity troublesome,
but would have been a shame to revert to the easier SUBPAGE_SIZE there).
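To give the flavour, a simplified sketch of gathering such a folio (not
the patch itself: it assumes the whole folio lives within one page table,
and ignores the mremap() misalignment case mentioned above):

	/* gather the PAGE_SUBCOUNT pte slots backing one kernel page,
	 * so that subpage k of the large page sits at folio[k] */
	static void fill_folio(pte_t *folio[PAGE_SUBCOUNT],
			       pte_t *ptep, unsigned long address)
	{
		unsigned long sub =
			(address >> SUBPAGE_SHIFT) & (PAGE_SUBCOUNT - 1);
		int i;

		for (i = 0; i < PAGE_SUBCOUNT; i++)
			folio[i] = ptep - sub + i;
	}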
Hard to get the macros right, to melt away to efficient code in the
PAGE_SUBSHIFT 0 case: I've done the best I can for now,
you'll probably find them clunky and suggest better.
Performance? Not yet determined, we're just getting around to that.
Unless it performs significantly better than multipage PAGE_CACHE_SIZE,
it should be forgotten: no point in extensive change for no gain.
I've said enough for now: either you're already disgusted, and will
reply "Never!", or you'll sometime want to cast an eye over the patch
itself (or nominate someone else to do so), to get the measure of it.
If the latter, please give me a few days to put it together against
2.4.6, minus our other inhouse pieces, then I can put the result on
an ftp site for you.
I would have preferred to wait a little longer before unveiling this,
but it's appropriate to consider it with multipage PAGE_CACHE_SIZE.
Thanks for your time!
Hugh
* Re: Large PAGE_SIZE
From: Linus Torvalds @ 2001-07-05 17:13 UTC
To: Hugh Dickins; +Cc: Ben LaHaise
On Thu, 5 Jul 2001, Hugh Dickins wrote:
>
> I'm interested in larger pages, but wary of multipage PAGE_CACHE_SIZE:
> partly because it relies on non-0-order page allocations, partly because
> it seems a shame then to break I/O into smaller units below the cache.
Note that once PAGE_CACHE_SIZE is of a higher order, those pages effectively
become the same as the current order-0 pages - it's just that the buddy
system can always allocate "fractional" pages too.
We shouldn't get the same fragmentation issues, as the new order-N
allocation should be the common one, and the sub-order-N fragments should
clump nicely together.
Also note that the I/O _would_ happen in PAGE_CACHE_SIZE - you'd never
break it into smaller chunks. That's the whole point of having a bigger
PAGE_CACHE_SIZE.
Now, I actually think your approach basically does the very same thing,
and I don't think there are necessarily any real differences between the
two. It's more of a perception issue: which "direction" do you look at it
from.
You take the approach that pages are bigger, but that you can map partial
pages into VM spaces. That is 100% equivalent to saying that the caching
fragment size is a order-N page, I think.
Obviously your world-view ends up very much impacting how you actually
implement it, so in that sense perception certainly does matter.
> * One subpage is represented by one Page Table Entry at the MMU level,
> * and corresponds to one page at the user process level: its size is
> * the same as param.h EXEC_PAGESIZE (for getpagesize(2) and mmap(2)).
> */
> #define SUBPAGE_SHIFT 12
> #define SUBPAGE_SIZE (1UL << SUBPAGE_SHIFT)
> #define SUBPAGE_MASK (~(SUBPAGE_SIZE-1))
I would _really_ prefer to make it clear that "SUBPAGE" is a VM mapping
issue and nothing more (which is your approach), and would much prefer
that to be made very explicit. So I'd not call them "SUBPAGES", but
something like
#define VM_PAGE_SHIFT 12
#define VM_PAGE_SIZE ..
However, once you do this, who cares about "PAGE_SIZE" at all? In the end,
PAGE_SIZE has no meaning except for the internal VM memory management:
it's nothing but the smallest fragment-size that the buddy system works
with.
What does that matter? It makes a huge difference for page accounting.
That's really the only thing that should care about PAGE_SIZE, and the
difference here between the two approaches isn't all that big:
- in your approach, PAGE_SIZE equals PAGE_CACHE_SIZE, so a PAGE_CACHE
page only has one page count associated with it. That's good, because
it simplifies "page_cache_release()" and friends.
- going the other way, each VM "dirty" entity has a "struct page *"
associated with it. That makes page count handling a bit nastier, but
on the other hand it makes VM attributes much easier to handle, notably
things like "dirty" bits.
Which is the right one? Frankly, don't know. It may be quite acceptable to
have just a single dirty bit for bigger regions. That would simplify
things, for sure.
On the other hand, maybe we will eventually have a per-mapping "page
size". That would be pretty much impossible with your approach, while the
"page size is the smallest VM granularity, PAGE_CACHE_SIZE is something
else" approach lends itself to that extension (just add a "size_shift" to
"struct address_space", and make the #defines use that instead. "Small
matter of programming").
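I.e. something like this - sketch only, names invented on the spot:

	struct address_space {
		/* ... existing 2.4 fields ... */
		unsigned size_shift;	/* log2 of this mapping's cache unit */
	};

	#define mapping_cache_size(m)	(1UL << (m)->size_shift)
	#define mapping_cache_mask(m)	(~(mapping_cache_size(m) - 1))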
> I've said enough for now: either you're already disgusted, and will
> reply "Never!", or you'll sometime want to cast an eye over the patch
> itself (or nominate someone else to do so), to get the measure of it.
I'd really like both of you to think about both of the approaches as the
same thing, but with different mindsets. Maybe there is something that
clearly makes one mindset better. And maybe there is some way to just make
the two be completely equivalent..
Linus
* Re: Large PAGE_SIZE
From: Hugh Dickins @ 2001-07-05 18:38 UTC
To: Linus Torvalds; +Cc: Ben LaHaise
On Thu, 5 Jul 2001, Linus Torvalds wrote:
>
> Also note that the I/O _would_ happen in PAGE_CACHE_SIZE - you'd never
> break it into smaller chunks. That's the whole point of having a bigger
> PAGE_CACHE_SIZE.
Aha, are you saying that a part of the multipage PAGE_CACHE_SIZE project
is to go through the block layer and driver layer, changing appropriate
"PAGE_SIZE"s to "PAGE_CACHE_SIZE"s (whereas at present PAGE_CACHE_SIZE
is pretty much confined to the FS layer), so that the I/O isn't split?
If so, then yes indeed, the two approaches seem two sides of same coin:
I'd be changing one set of PAGE_SIZEs to VM_PAGE_SIZEs, while Ben would
be changing many of the others to PAGE_CACHE_SIZEs! We'd differ at the
the user space level, but it might not amount to much (already we're both
filling multiple ptes on one fault). I couldn't see what was going to
happen to the swap cache, if the anon pages were small but the cache size
large; but maybe swap readahead would dissolve our differences there too.
If not, please clarify.
> I'd really like both of you to think about both of the approaches as the
> same thing, but with different mindsets. Maybe there is something that
> clearly makes one mindset better. And maybe there is some way to just make
> the two be completely equivalent..
Yes, certainly I went about it in the only way I safely could, coming
from a VM background; someone with greater FS or I/O experience might
approach it differently.
It may come down to Ben having 2**N more struct pages than I do:
greater flexibility, but significant waste of kernel virtual.
I want to ponder the points in your mail: I'm a slow thinker and this
isn't intended as a reply, but I wanted to clarify PAGE_CACHE_SIZE I/O.
Hugh
* Re: Large PAGE_SIZE
From: Linus Torvalds @ 2001-07-05 18:53 UTC
To: Hugh Dickins; +Cc: Ben LaHaise
On Thu, 5 Jul 2001, Hugh Dickins wrote:
> On Thu, 5 Jul 2001, Linus Torvalds wrote:
> >
> > Also note that the I/O _would_ happen in PAGE_CACHE_SIZE - you'd never
> > break it into smaller chunks. That's the whole point of having a bigger
> > PAGE_CACHE_SIZE.
>
> Aha, are you saying that a part of the multipage PAGE_CACHE_SIZE project
> is to go through the block layer and driver layer, changing appropriate
> "PAGE_SIZE"s to "PAGE_CACHE_SIZE"s (whereas at present PAGE_CACHE_SIZE
> is pretty much confined to the FS layer), so that the I/O isn't split?
Any block devices that do that are already broken. Block drivers always
get physical addresses, they shouldn't care. The one exception is the kmap
case, where the programmed-IO thing needs the virtual re-mapping, but as I
already stated earlier I think kmap should always map the biggest chunk so
that nobody ever tries to loop over multiple pages if they don't have to.
Of course, the people playing with direct-IO from user space will always
be limited by the mapping size.
So in general, the block layer should not care AT ALL, and just use the
physical addresses passed in to it. For things like bounce buffers, YES,
we should make sure that the bounce buffers are at least the size of
PAGE_CACHE_SIZE.
> It may come down to Ben having 2**N more struct pages than I do:
> greater flexibility, but significant waste of kernel virtual.
The waste of kernel virtual memory space is actually a good point. Already
on big x86 machines the "struct page[]" array is a big memory-user. That
may indeed be the biggest argument for increasing PAGE_SIZE.
Linus
* Re: Large PAGE_SIZE
From: Ben LaHaise @ 2001-07-05 20:41 UTC
To: Linus Torvalds; +Cc: Hugh Dickins
On Thu, 5 Jul 2001, Linus Torvalds wrote:
> > It may come down to Ben having 2**N more struct pages than I do:
> > greater flexibility, but significant waste of kernel virtual.
>
> The waste of kernel virtual memory space is actually a good point. Already
> on big x86 machines the "struct page[]" array is a big memory-user. That
> may indeed be the biggest argument for increasing PAGE_SIZE.
I think the two patches will be complementary as they have different
effects. Basically, we want to limit the degree to which PAGE_SIZE
increases, as increasing it too much can result in increased memory usage
and COW overhead. PAGE_CACHE_SIZE probably wants to be increased further,
simply to improve I/O efficiency.
On the topic of struct page size, yes, it is too large. There are a few
things we can do here to make things more efficient, like separating the
notions of struct page and the page cache, but we have to be careful not
to split things up too much, as 64 bytes is ideal for processors like the
Athlon, whereas the P4 really wants 128 bytes to avoid false cache line
sharing on SMP. I've got a few ideas on the page cache front to explore
in the next month or two that could result in another 12 bytes of savings
per page, plus we can look into other things like reducing the overhead of
the wait queue and the other contents of struct page.
-ben
ps, would you mind if I forward the messages in this thread to linux-mm so
that other people can see the discussion?
* Re: Large PAGE_SIZE
From: Hugh Dickins @ 2001-07-05 20:59 UTC
To: Ben LaHaise; +Cc: Linus Torvalds
On Thu, 5 Jul 2001, Ben LaHaise wrote:
>
> ps, would you mind if I forward the messages in this thread to linux-mm so
> that other people can see the discussion?
I've no real objection, go ahead if you think fit. I intentionally
left the lists out of it, being not yet quite ready to publish a patch,
and reluctant to talk without showing; but I don't mean to suppress
free discussion. Okay with you, Linus?
Hugh
* Re: Large PAGE_SIZE
From: Linus Torvalds @ 2001-07-06 5:11 UTC
To: Ben LaHaise; +Cc: Hugh Dickins
On Thu, 5 Jul 2001, Ben LaHaise wrote:
>
> ps, would you mind if I forward the messages in this thread to linux-mm so
> that other people can see the discussion?
Go ahead..
Btw, I wouldn't worry too much about the false sharing on a 128-byte
cache-line. Let's face it, we're unlikely to see many P4+ class machines
with less than 128MB of memory, at which point it starts to get unlikely
that we'll see all that many horrible ping-pong scenarios between CPUs -
touching alternate physical pages simply isn't all that likely.
Linus
* [wip-PATCH] Re: Large PAGE_SIZE
From: Ben LaHaise @ 2001-07-09 3:04 UTC
To: Linus Torvalds; +Cc: Hugh Dickins, linux-mm
On Thu, 5 Jul 2001, Linus Torvalds wrote:
> So in general, the block layer should not care AT ALL, and just use the
> physical addresses passed in to it. For things like bounce buffers, YES,
> we should make sure that the bounce buffers are at least the size of
> PAGE_CACHE_SIZE.
Hmmm, interesting. At present page cache sizes from PAGE_SIZE to
8*PAGE_SIZE are working here. Setting the shift to 4 or a 64KB page size
results in the SCSI driver blowing up on I/O completion. See the patch
below. This version works and seems to be stable in normal usage,
provided you run without swap. Properly fixing swapping probably means
using O_DIRECT... ;-)
> > It may come down to Ben having 2**N more struct pages than I do:
> > greater flexibility, but significant waste of kernel virtual.
>
> The waste of kernel virtual memory space is actually a good point. Already
> on big x86 machines the "struct page[]" array is a big memory-user. That
> may indeed be the biggest argument for increasing PAGE_SIZE.
Well, here are a few lmbench runs with larger PAGE_CACHE_SIZES. Except
for 2.4.2-2, the kernels are all based on 2.4.6-pre8, with -b and -c being
the 2 and 3 shift page cache kernels. As expected, exec and sh latencies
are reduced. Mmap latency appears to be adversely affected in the 16KB
page cache case while other latencies are reduced. My best guess here is
that either a change in layout is causing cache collisions, or the changes
in do_no_page are having an adverse impact on page fault timing. Ideally
the loop would be unrolled, however...
The way I changed do_no_page to speculatively pre-fill ptes is suboptimal:
it still has to obtain a ref count for each pte that touches the page
cache page. One idea here is to treat ptes within a given page cache page
as sharing a single reference count, but this may have no impact on
performance and simply add to code complexity and as such probably isn't
worth the added hassle.
There is a noteworthy increase in file re-read bandwidth from 212MB/s in
the base kernel to 230 and 237 MB/s for kernels with 16 and 32KB pages.
I also tried a few kernel compiles against all three, and the larger page
cache sizes resulted in a 2m20s cache warm compile compared to 2m21s; a
change well below the margin of error, but at least not negative. I
didn't try the cold cache scenario, which on reflection is probably more
interesting.
The next step is to try out Hugh's approach and see what differences there
are and how the patches work together. I also suspect that these changes
will have a larger impact on performance with ia64 where we can use a
single TLB entry to map all the page cache pages at the same time. Hmmm,
perhaps I should try making anonymous pages use the larger allocations
where possible...
-ben
cd results && make summary percent 2>/dev/null | more
make[1]: Entering directory `/tmp/LMbench/results'
L M B E N C H 2 . 0 S U M M A R Y
------------------------------------
(Alpha software, do not distribute)
Basic system parameters
----------------------------------------------------
Host OS Description Mhz
--------- ------------- ----------------------- ----
toolbox.t Linux 2.4.2-2 i686-pc-linux-gnu 550
toolbox.t Linux 2.4.6-p i686-pc-linux-gnu 550
toolbox.t Linux 2.4.6-p i686-pc-linux-gnu 550
toolbox.t Linux 2.4.6-p i686-pc-linux-gnu 550
toolbox.t Linux 2.4.6-b i686-pc-linux-gnu 550
toolbox.t Linux 2.4.6-b i686-pc-linux-gnu 550
toolbox.t Linux 2.4.6-b i686-pc-linux-gnu 550
toolbox.t Linux 2.4.6-c i686-pc-linux-gnu 550
toolbox.t Linux 2.4.6-c i686-pc-linux-gnu 550
Processor, Processes - times in microseconds - smaller is better
----------------------------------------------------------------
Host OS Mhz null null open selct sig sig fork exec sh
call I/O stat clos TCP inst hndl proc proc proc
--------- ------------- ---- ---- ---- ---- ---- ----- ---- ---- ---- ---- ----
toolbox.t Linux 2.4.2-2 550 0.60 0.97 3.60 5.63 44 1.46 4.79 547 1948 7115
toolbox.t Linux 2.4.6-p 550 0.63 1.02 3.61 5.47 46 1.47 4.91 553 1932 6923
toolbox.t Linux 2.4.6-p 550 0.63 1.01 3.61 5.50 44 1.50 4.92 563 1927 7072
toolbox.t Linux 2.4.6-p 550 0.63 1.00 3.64 5.50 43 1.50 4.91 563 1917 6961
toolbox.t Linux 2.4.6-b 550 0.63 1.02 3.54 5.35 43 1.50 4.84 547 1878 6933
toolbox.t Linux 2.4.6-b 550 0.63 1.02 3.55 5.38 49 1.50 4.90 551 1889 6951
toolbox.t Linux 2.4.6-b 550 0.63 1.02 3.54 5.37 47 1.50 4.84 550 1887 6927
toolbox.t Linux 2.4.6-c 550 0.63 1.00 3.60 5.40 44 1.51 4.90 543 1882 6854
toolbox.t Linux 2.4.6-c 550 0.63 1.02 3.54 5.46 47 1.47 4.90 545 1875 6872
Context switching - times in microseconds - smaller is better
-------------------------------------------------------------
Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K
ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw
--------- ------------- ----- ------ ------ ------ ------ ------- -------
toolbox.t Linux 2.4.2-2 4.280 12 40 19 43 19 52
toolbox.t Linux 2.4.6-p 4.360 11 39 19 43 18 43
toolbox.t Linux 2.4.6-p 4.530 11 39 18 42 18 43
toolbox.t Linux 2.4.6-p 4.600 12 39 19 43 19 43
toolbox.t Linux 2.4.6-b 4.470 11 39 18 43 19 43
toolbox.t Linux 2.4.6-b 4.560 11 39 18 42 18 44
toolbox.t Linux 2.4.6-b 4.700 11 39 18 43 18 44
toolbox.t Linux 2.4.6-c 4.430 11 39 18 43 19 60
toolbox.t Linux 2.4.6-c 4.630 11 39 19 42 18 48
*Local* Communication latencies in microseconds - smaller is better
-------------------------------------------------------------------
Host OS 2p/0K Pipe AF UDP RPC/ TCP RPC/ TCP
ctxsw UNIX UDP TCP conn
--------- ------------- ----- ----- ---- ----- ----- ----- ----- ----
toolbox.t Linux 2.4.2-2 4.280 15 35 44 82 58 109 110
toolbox.t Linux 2.4.6-p 4.360 15 32 46 82 57 104 111
toolbox.t Linux 2.4.6-p 4.530 15 31 46 82 56 103 110
toolbox.t Linux 2.4.6-p 4.600 15 32 45 82 58 104 111
toolbox.t Linux 2.4.6-b 4.470 15 33 45 81 56 103 109
toolbox.t Linux 2.4.6-b 4.560 15 35 45 81 56 104 109
toolbox.t Linux 2.4.6-b 4.700 15 35 45 82 56 104 110
toolbox.t Linux 2.4.6-c 4.430 15 34 45 82 56 104 110
toolbox.t Linux 2.4.6-c 4.630 15 35 45 82 56 103 110
File & VM system latencies in microseconds - smaller is better
--------------------------------------------------------------
Host OS 0K File 10K File Mmap Prot Page
Create Delete Create Delete Latency Fault Fault
--------- ------------- ------ ------ ------ ------ ------- ----- -----
toolbox.t Linux 2.4.2-2 113 15 214 36 424 1.204 5.00000
toolbox.t Linux 2.4.6-p 59 11 157 31 496 1.199 4.00000
toolbox.t Linux 2.4.6-p 60 12 158 31 506 1.270 4.00000
toolbox.t Linux 2.4.6-p 60 12 157 31 508 1.221 4.00000
toolbox.t Linux 2.4.6-b 59 11 152 28 737 1.169 5.00000
toolbox.t Linux 2.4.6-b 59 11 152 27 736 1.225 5.00000
toolbox.t Linux 2.4.6-b 59 11 152 28 746 1.152 5.00000
toolbox.t Linux 2.4.6-c 60 11 157 32 516 1.223 4.00000
toolbox.t Linux 2.4.6-c 60 11 157 32 541 1.270 4.00000
*Local* Communication bandwidths in MB/s - bigger is better
-----------------------------------------------------------
Host OS Pipe AF TCP File Mmap Bcopy Bcopy Mem Mem
UNIX reread reread (libc) (hand) read write
--------- ------------- ---- ---- ---- ------ ------ ------ ------ ---- -----
toolbox.t Linux 2.4.2-2 219 160 114 211 274 197 160 274 210
toolbox.t Linux 2.4.6-p 221 160 117 212 274 197 160 274 210
toolbox.t Linux 2.4.6-p 220 160 115 212 273 197 160 274 210
toolbox.t Linux 2.4.6-p 221 159 117 212 273 197 160 274 210
toolbox.t Linux 2.4.6-b 220 160 114 231 274 197 160 274 210
toolbox.t Linux 2.4.6-b 221 159 116 230 274 197 160 274 210
toolbox.t Linux 2.4.6-b 222 158 116 230 274 197 160 274 210
toolbox.t Linux 2.4.6-c 218 159 122 237 274 192 159 274 210
toolbox.t Linux 2.4.6-c 220 159 116 238 274 193 160 274 210
Memory latencies in nanoseconds - smaller is better
(WARNING - may not be correct, check graphs)
---------------------------------------------------
Host OS Mhz L1 $ L2 $ Main mem Guesses
--------- ------------- ---- ----- ------ -------- -------
toolbox.t Linux 2.4.2-2 550 5.457 32 222
toolbox.t Linux 2.4.6-p 550 5.455 32 222
toolbox.t Linux 2.4.6-p 550 5.455 32 222
toolbox.t Linux 2.4.6-p 550 5.455 32 222
toolbox.t Linux 2.4.6-b 550 5.454 32 222
toolbox.t Linux 2.4.6-b 550 5.455 32 222
toolbox.t Linux 2.4.6-b 550 5.455 32 222
toolbox.t Linux 2.4.6-c 550 5.455 32 222
toolbox.t Linux 2.4.6-c 550 5.455 32 222
make[1]: Leaving directory `/tmp/LMbench/results'
.... ~/patches/v2.4.6-pre8-pgc-B0.diff ....
diff -ur /md0/kernels/2.4/v2.4.6-pre8/Makefile pgc-2.4.6-pre8/Makefile
--- /md0/kernels/2.4/v2.4.6-pre8/Makefile Sat Jun 30 14:04:26 2001
+++ pgc-2.4.6-pre8/Makefile Sun Jul 8 02:32:00 2001
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 4
SUBLEVEL = 6
-EXTRAVERSION =-pre8
+EXTRAVERSION =-pre8-pgc-B0
KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
diff -ur /md0/kernels/2.4/v2.4.6-pre8/arch/i386/boot/install.sh pgc-2.4.6-pre8/arch/i386/boot/install.sh
--- /md0/kernels/2.4/v2.4.6-pre8/arch/i386/boot/install.sh Tue Jan 3 06:57:26 1995
+++ pgc-2.4.6-pre8/arch/i386/boot/install.sh Wed Jul 4 16:42:32 2001
@@ -21,6 +21,7 @@
# User may have a custom install script
+if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi
if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi
# Default install - same as make zlilo
diff -ur /md0/kernels/2.4/v2.4.6-pre8/arch/i386/config.in pgc-2.4.6-pre8/arch/i386/config.in
--- /md0/kernels/2.4/v2.4.6-pre8/arch/i386/config.in Sun Jul 1 21:45:04 2001
+++ pgc-2.4.6-pre8/arch/i386/config.in Sun Jul 1 21:49:20 2001
@@ -180,6 +180,8 @@
if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
define_bool CONFIG_HAVE_DEC_LOCK y
fi
+
+int 'Page cache shift' CONFIG_PAGE_CACHE_SHIFT 0
endmenu
mainmenu_option next_comment
diff -ur /md0/kernels/2.4/v2.4.6-pre8/arch/i386/mm/init.c pgc-2.4.6-pre8/arch/i386/mm/init.c
--- /md0/kernels/2.4/v2.4.6-pre8/arch/i386/mm/init.c Thu May 3 11:22:07 2001
+++ pgc-2.4.6-pre8/arch/i386/mm/init.c Fri Jul 6 01:11:23 2001
@@ -156,6 +156,7 @@
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
unsigned long address = __fix_to_virt(idx);
+ unsigned i;
if (idx >= __end_of_fixed_addresses) {
printk("Invalid __set_fixmap\n");
@@ -282,7 +283,7 @@
* Permanent kmaps:
*/
vaddr = PKMAP_BASE;
- fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+ fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP*PKMAP_PAGES, pgd_base);
pgd = swapper_pg_dir + __pgd_offset(vaddr);
pmd = pmd_offset(pgd, vaddr);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/buffer.c pgc-2.4.6-pre8/fs/buffer.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/buffer.c Sat Jun 30 14:04:27 2001
+++ pgc-2.4.6-pre8/fs/buffer.c Thu Jul 5 04:41:19 2001
@@ -774,6 +774,7 @@
/* This is a temporary buffer used for page I/O. */
page = bh->b_page;
+ page = page_cache_page(page);
if (!uptodate)
SetPageError(page);
@@ -1252,8 +1253,10 @@
void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
{
+ page += offset >> PAGE_SHIFT;
+ offset &= PAGE_SIZE - 1;
bh->b_page = page;
- if (offset >= PAGE_SIZE)
+ if (offset >= PAGE_CACHE_SIZE)
BUG();
if (PageHighMem(page))
/*
@@ -1280,7 +1283,9 @@
try_again:
head = NULL;
- offset = PAGE_SIZE;
+ if (!PageCachePage(page))
+ BUG();
+ offset = PAGE_CACHE_SIZE;
while ((offset -= size) >= 0) {
bh = get_unused_buffer_head(async);
if (!bh)
@@ -1664,6 +1669,8 @@
unsigned int blocksize, blocks;
int nr, i;
+ if (!PageCachePage(page))
+ BUG();
if (!PageLocked(page))
PAGE_BUG(page);
blocksize = inode->i_sb->s_blocksize;
@@ -2228,7 +2235,7 @@
return 0;
}
- page = alloc_page(GFP_NOFS);
+ page = __page_cache_alloc(GFP_NOFS);
if (!page)
goto out;
LockPage(page);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/fs/ext2/dir.c pgc-2.4.6-pre8/fs/ext2/dir.c
--- /md0/kernels/2.4/v2.4.6-pre8/fs/ext2/dir.c Sat Jun 30 14:04:27 2001
+++ pgc-2.4.6-pre8/fs/ext2/dir.c Thu Jul 5 21:38:16 2001
@@ -321,15 +321,13 @@
de = (ext2_dirent *) kaddr;
kaddr += PAGE_CACHE_SIZE - reclen;
for ( ; (char *) de <= kaddr ; de = ext2_next_entry(de))
- if (ext2_match (namelen, name, de))
- goto found;
+ if (ext2_match (namelen, name, de)) {
+ *res_page = page;
+ return de;
+ }
ext2_put_page(page);
}
return NULL;
-
-found:
- *res_page = page;
- return de;
}
struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
@@ -353,8 +351,7 @@
de = ext2_find_entry (dir, dentry, &page);
if (de) {
res = le32_to_cpu(de->inode);
- kunmap(page);
- page_cache_release(page);
+ ext2_put_page(page);
}
return res;
}
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/asm-i386/fixmap.h pgc-2.4.6-pre8/include/asm-i386/fixmap.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/asm-i386/fixmap.h Sun Jul 8 02:18:42 2001
+++ pgc-2.4.6-pre8/include/asm-i386/fixmap.h Sun Jul 8 02:36:31 2001
@@ -40,6 +40,8 @@
* TLB entries of such buffers will not be flushed across
* task switches.
*/
+#define KM_ORDER (CONFIG_PAGE_CACHE_SHIFT)
+#define KM_PAGES (1UL << KM_ORDER)
/*
* on UP currently we will have no trace of the fixmap mechanizm,
@@ -63,7 +65,7 @@
#endif
#ifdef CONFIG_HIGHMEM
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_PAGES*KM_TYPE_NR*NR_CPUS)-1,
#endif
__end_of_fixed_addresses
};
@@ -86,7 +88,7 @@
* at the top of mem..
*/
#define FIXADDR_TOP (0xffffe000UL)
-#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_SIZE (__end_of_fixed_addresses << (PAGE_SHIFT + KM_ORDER))
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/asm-i386/highmem.h pgc-2.4.6-pre8/include/asm-i386/highmem.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/asm-i386/highmem.h Sun Jul 8 04:50:02 2001
+++ pgc-2.4.6-pre8/include/asm-i386/highmem.h Sun Jul 8 02:36:31 2001
@@ -43,15 +43,19 @@
* easily, subsequent pte tables have to be allocated in one physical
* chunk of RAM.
*/
-#define PKMAP_BASE (0xfe000000UL)
+#define PKMAP_ORDER (CONFIG_PAGE_CACHE_SHIFT) /* Fix mm dependencies if changed */
+#define PKMAP_PAGES (1UL << PKMAP_ORDER)
+#define PKMAP_SIZE 4096
#ifdef CONFIG_X86_PAE
-#define LAST_PKMAP 512
+#define LAST_PKMAP ((PKMAP_SIZE / 8) >> PKMAP_ORDER)
#else
-#define LAST_PKMAP 1024
+#define LAST_PKMAP ((PKMAP_SIZE / 4) >> PKMAP_ORDER)
#endif
#define LAST_PKMAP_MASK (LAST_PKMAP-1)
-#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
-#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+#define PKMAP_BASE (0xfe000000UL)
+#define PKMAP_SHIFT (PAGE_SHIFT + PKMAP_ORDER)
+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PKMAP_SHIFT)
+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PKMAP_SHIFT))
extern void * FASTCALL(kmap_high(struct page *page));
extern void FASTCALL(kunmap_high(struct page *page));
@@ -84,18 +88,22 @@
{
enum fixed_addresses idx;
unsigned long vaddr;
+ unsigned i;
if (page < highmem_start_page)
return page_address(page);
idx = type + KM_TYPE_NR*smp_processor_id();
+ idx <<= PKMAP_ORDER;
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
#if HIGHMEM_DEBUG
if (!pte_none(*(kmap_pte-idx)))
BUG();
#endif
- set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
- __flush_tlb_one(vaddr);
+ for (i=0; i<PKMAP_PAGES; i++)
+ set_pte(kmap_pte-idx+i, mk_pte(page+i, kmap_prot));
+ for (i=0; i<PKMAP_PAGES; i++)
+ __flush_tlb_one(vaddr + (i << PAGE_SHIFT));
return (void*) vaddr;
}
@@ -105,10 +113,12 @@
#if HIGHMEM_DEBUG
unsigned long vaddr = (unsigned long) kvaddr;
enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
+ unsigned i;
if (vaddr < FIXADDR_START) // FIXME
return;
+ idx <<= PKMAP_ORDER;
if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
BUG();
@@ -116,8 +126,10 @@
* force other mappings to Oops if they'll try to access
* this pte without first remap it
*/
- pte_clear(kmap_pte-idx);
- __flush_tlb_one(vaddr);
+ for (i=0; i<PKMAP_PAGES; i++)
+ pte_clear(kmap_pte-idx+i);
+ for (i=0; i<PKMAP_PAGES; i++, vaddr += PAGE_SIZE)
+ __flush_tlb_one(vaddr);
#endif
}
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/highmem.h pgc-2.4.6-pre8/include/linux/highmem.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/highmem.h Sun Jul 8 04:50:02 2001
+++ pgc-2.4.6-pre8/include/linux/highmem.h Sun Jul 8 02:36:31 2001
@@ -59,7 +59,7 @@
{
char *kaddr;
- if (offset + size > PAGE_SIZE)
+ if (offset + size > (PAGE_SIZE * PKMAP_PAGES))
BUG();
kaddr = kmap(page);
memset(kaddr + offset, 0, size);
@@ -73,7 +73,7 @@
{
char *kaddr;
- if (offset + size > PAGE_SIZE)
+ if (offset + size > (PAGE_SIZE * PKMAP_PAGES))
BUG();
kaddr = kmap(page);
memset(kaddr + offset, 0, size);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/mm.h pgc-2.4.6-pre8/include/linux/mm.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/mm.h Sun Jul 8 04:50:02 2001
+++ pgc-2.4.6-pre8/include/linux/mm.h Sun Jul 8 02:36:32 2001
@@ -282,6 +282,7 @@
#define PG_inactive_clean 11
#define PG_highmem 12
#define PG_checked 13 /* kill me in 2.5.<early>. */
+#define PG_pagecache 14
/* bits 21-29 unused */
#define PG_arch_1 30
#define PG_reserved 31
@@ -298,6 +299,9 @@
#define TryLockPage(page) test_and_set_bit(PG_locked, &(page)->flags)
#define PageChecked(page) test_bit(PG_checked, &(page)->flags)
#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags)
+#define PageCachePage(page) test_bit(PG_pagecache, &(page)->flags)
+#define SetPageCache(page) set_bit(PG_pagecache, &(page)->flags)
+#define ClearPageCache(page) clear_bit(PG_pagecache, &(page)->flags)
extern void __set_page_dirty(struct page *);
diff -ur /md0/kernels/2.4/v2.4.6-pre8/include/linux/pagemap.h pgc-2.4.6-pre8/include/linux/pagemap.h
--- /md0/kernels/2.4/v2.4.6-pre8/include/linux/pagemap.h Sun Jul 8 04:50:02 2001
+++ pgc-2.4.6-pre8/include/linux/pagemap.h Sun Jul 8 20:25:14 2001
@@ -22,19 +22,53 @@
* space in smaller chunks for same flexibility).
*
* Or rather, it _will_ be done in larger chunks.
+ *
+ * It's now configurable. -ben 20010702
*/
-#define PAGE_CACHE_SHIFT PAGE_SHIFT
-#define PAGE_CACHE_SIZE PAGE_SIZE
-#define PAGE_CACHE_MASK PAGE_MASK
+#define PAGE_CACHE_ORDER (CONFIG_PAGE_CACHE_SHIFT)
+#define PAGE_CACHE_PAGES (1UL << CONFIG_PAGE_CACHE_SHIFT)
+#define PAGE_CACHE_PMASK (PAGE_CACHE_PAGES - 1)
+#define PAGE_CACHE_SHIFT (PAGE_SHIFT + CONFIG_PAGE_CACHE_SHIFT)
+#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)
+#define PAGE_CACHE_MASK (~(PAGE_CACHE_SIZE - 1))
#define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK)
+#define __page_cache_page(page) (page - ((page - mem_map) & PAGE_CACHE_PMASK))
+
+static inline struct page *page_cache_page(struct page *page)
+{
+ if (PageCachePage(page))
+ page = __page_cache_page(page);
+ return page;
+}
+
#define page_cache_get(x) get_page(x)
-#define page_cache_free(x) __free_page(x)
-#define page_cache_release(x) __free_page(x)
+#define __page_cache_free(x) __free_pages(x, PAGE_CACHE_ORDER)
+#define page_cache_free(x) page_cache_release(x)
+
+static inline void page_cache_release(struct page *page)
+{
+ if (PageCachePage(page))
+ __page_cache_free(__page_cache_page(page));
+ else
+ __free_page(page);
+}
+
+static inline struct page *__page_cache_alloc(int gfp)
+{
+ struct page *page;
+ page = alloc_pages(gfp, PAGE_CACHE_ORDER);
+ if (page) {
+ unsigned i;
+ for (i=0; i<PAGE_CACHE_PAGES; i++)
+ SetPageCache(page+i);
+ }
+ return page;
+}
static inline struct page *page_cache_alloc(struct address_space *x)
{
- return alloc_pages(x->gfp_mask, 0);
+ return __page_cache_alloc(x->gfp_mask);
}
/*
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/filemap.c pgc-2.4.6-pre8/mm/filemap.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/filemap.c Sat Jun 30 14:04:28 2001
+++ pgc-2.4.6-pre8/mm/filemap.c Thu Jul 5 19:54:16 2001
@@ -236,13 +236,12 @@
if ((offset >= start) || (*partial && (offset + 1) == start)) {
list_del(head);
list_add(head, curr);
+ page_cache_get(page);
if (TryLockPage(page)) {
- page_cache_get(page);
spin_unlock(&pagecache_lock);
wait_on_page(page);
goto out_restart;
}
- page_cache_get(page);
spin_unlock(&pagecache_lock);
if (*partial && (offset + 1) == start) {
@@ -1499,8 +1498,11 @@
struct address_space *mapping = inode->i_mapping;
struct page *page, **hash, *old_page;
unsigned long size, pgoff;
+ unsigned long offset;
- pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ pgoff = ((address - area->vm_start) >> PAGE_SHIFT) + area->vm_pgoff;
+ offset = pgoff & PAGE_CACHE_PMASK;
+ pgoff >>= PAGE_CACHE_ORDER;
retry_all:
/*
@@ -1538,7 +1540,7 @@
* Found the page and have a reference on it, need to check sharing
* and possibly copy it over to another page..
*/
- old_page = page;
+ old_page = page + offset;
if (no_share) {
struct page *new_page = alloc_page(GFP_HIGHUSER);
@@ -1652,6 +1654,7 @@
if (pte_present(pte) && ptep_test_and_clear_dirty(ptep)) {
struct page *page = pte_page(pte);
flush_tlb_page(vma, address);
+ page = page_cache_page(page);
set_page_dirty(page);
}
return 0;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/highmem.c pgc-2.4.6-pre8/mm/highmem.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/highmem.c Sat Jun 30 14:04:28 2001
+++ pgc-2.4.6-pre8/mm/highmem.c Sun Jul 8 00:30:12 2001
@@ -46,6 +46,7 @@
for (i = 0; i < LAST_PKMAP; i++) {
struct page *page;
+ unsigned j;
pte_t pte;
/*
* zero means we don't have anything to do,
@@ -56,9 +57,11 @@
if (pkmap_count[i] != 1)
continue;
pkmap_count[i] = 0;
- pte = ptep_get_and_clear(pkmap_page_table+i);
- if (pte_none(pte))
- BUG();
+ for (j=PKMAP_PAGES; j>0; ) {
+ pte = ptep_get_and_clear(pkmap_page_table+(i*PKMAP_PAGES)+ --j);
+ if (pte_none(pte))
+ BUG();
+ }
page = pte_page(pte);
page->virtual = NULL;
}
@@ -68,6 +71,7 @@
static inline unsigned long map_new_virtual(struct page *page)
{
unsigned long vaddr;
+ unsigned i;
int count;
start:
@@ -105,10 +109,12 @@
goto start;
}
}
+ pkmap_count[last_pkmap_nr] = 1;
vaddr = PKMAP_ADDR(last_pkmap_nr);
- set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+ last_pkmap_nr <<= PKMAP_ORDER;
+ for (i=0; i<PKMAP_PAGES; i++)
+ set_pte(&(pkmap_page_table[last_pkmap_nr+i]), mk_pte(page+i, kmap_prot));
- pkmap_count[last_pkmap_nr] = 1;
page->virtual = (void *) vaddr;
return vaddr;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/memory.c pgc-2.4.6-pre8/mm/memory.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/memory.c Sat Jun 30 14:04:28 2001
+++ pgc-2.4.6-pre8/mm/memory.c Sun Jul 8 02:36:20 2001
@@ -233,6 +233,7 @@
if (vma->vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
+ ptepage = page_cache_page(ptepage);
get_page(ptepage);
cont_copy_pte_range: set_pte(dst_pte, pte);
@@ -268,6 +269,7 @@
struct page *page = pte_page(pte);
if ((!VALID_PAGE(page)) || PageReserved(page))
return 0;
+ page = page_cache_page(page);
/*
* free_page() used to be able to clear swap cache
* entries. We may now have to do it manually.
@@ -508,7 +510,7 @@
map = get_page_map(map);
if (map) {
flush_dcache_page(map);
- atomic_inc(&map->count);
+ get_page(page_cache_page(map));
} else
printk (KERN_INFO "Mapped page missing [%d]\n", i);
spin_unlock(&mm->page_table_lock);
@@ -551,7 +553,7 @@
while (remaining > 0 && index < iobuf->nr_pages) {
page = iobuf->maplist[index];
-
+ page = page_cache_page(page);
if (!PageReserved(page))
SetPageDirty(page);
@@ -574,6 +576,7 @@
for (i = 0; i < iobuf->nr_pages; i++) {
map = iobuf->maplist[i];
if (map) {
+ map = page_cache_page(map);
if (iobuf->locked)
UnlockPage(map);
__free_page(map);
@@ -616,7 +619,7 @@
page = *ppage;
if (!page)
continue;
-
+ page = page_cache_page(page);
if (TryLockPage(page)) {
while (j--) {
page = *(--ppage);
@@ -687,6 +690,7 @@
page = *ppage;
if (!page)
continue;
+ page = page_cache_page(page);
UnlockPage(page);
}
}
@@ -894,12 +898,14 @@
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
unsigned long address, pte_t *page_table, pte_t pte)
{
- struct page *old_page, *new_page;
+ struct page *old_page, *__old_page, *new_page;
+
+ __old_page = pte_page(pte);
+ old_page = page_cache_page(__old_page);
- old_page = pte_page(pte);
if (!VALID_PAGE(old_page))
goto bad_wp_page;
-
+
/*
* We can avoid the copy if:
* - we're the only user (count == 1)
@@ -949,7 +955,7 @@
if (pte_same(*page_table, pte)) {
if (PageReserved(old_page))
++mm->rss;
- break_cow(vma, old_page, new_page, address, page_table);
+ break_cow(vma, __old_page, new_page, address, page_table);
/* Free the old page.. */
new_page = old_page;
@@ -1016,7 +1022,7 @@
if (!mapping->i_mmap && !mapping->i_mmap_shared)
goto out_unlock;
- pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ pgoff = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (mapping->i_mmap != NULL)
vmtruncate_list(mapping->i_mmap, pgoff);
if (mapping->i_mmap_shared != NULL)
@@ -1201,8 +1207,11 @@
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long address, int write_access, pte_t *page_table)
{
- struct page * new_page;
+ struct page *new_page, *ppage;
pte_t entry;
+ int no_share, offset, i;
+ unsigned long addr_min, addr_max;
+ int put;
if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table, write_access, address);
@@ -1213,13 +1222,15 @@
* to copy, not share the page even if sharing is possible. It's
* essentially an early COW detection.
*/
- new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
+ no_share = (vma->vm_flags & VM_SHARED) ? 0 : write_access;
+ new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, no_share);
spin_lock(&mm->page_table_lock);
if (new_page == NULL) /* no page was available -- SIGBUS */
return 0;
if (new_page == NOPAGE_OOM)
return -1;
+ ppage = page_cache_page(new_page);
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
@@ -1231,25 +1242,73 @@
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
- if (pte_none(*page_table)) {
- ++mm->rss;
+ if (!pte_none(*page_table)) {
+ /* One of our sibling threads was faster, back out. */
+ page_cache_release(ppage);
+ return 1;
+ }
+
+ addr_min = address & PMD_MASK;
+ addr_max = address | (PMD_SIZE - 1);
+
+ addr_min = vma->vm_start;
+ addr_max = vma->vm_end;
+
+ /* The following implements PAGE_CACHE_SIZE prefilling of
+ * page tables. The technique is essentially the same as
+ * a cache burst using
+ */
+ offset = address >> PAGE_SHIFT;
+ offset &= PAGE_CACHE_PMASK;
+ i = 0;
+ put = 1;
+ do {
+ if (!pte_none(*page_table))
+ goto next_page;
+
+ if ((address < addr_min) || (address >= addr_max))
+ goto next_page;
+
+ if (put)
+ put = 0;
+ else
+ page_cache_get(ppage);
+
+ mm->rss++;
flush_page_to_ram(new_page);
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
- if (write_access) {
+ if (write_access && !i)
entry = pte_mkwrite(pte_mkdirty(entry));
- } else if (page_count(new_page) > 1 &&
+ else if (page_count(ppage) > 1 &&
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
+ if (i)
+ entry = pte_mkold(entry);
set_pte(page_table, entry);
- } else {
- /* One of our sibling threads was faster, back out. */
- page_cache_release(new_page);
- return 1;
- }
- /* no need to invalidate: a not-present page shouldn't be cached */
- update_mmu_cache(vma, address, entry);
+ /* no need to invalidate: a not-present page shouldn't be cached */
+ update_mmu_cache(vma, address, entry);
+
+next_page:
+ if (!PageCachePage(ppage))
+ break;
+ if ((ppage + offset) != new_page)
+ break;
+
+ /* Implement wrap around for the address, page and ptep. */
+ address -= offset << PAGE_SHIFT;
+ page_table -= offset;
+ new_page -= offset;
+
+ offset = (offset + 1) & PAGE_CACHE_PMASK;
+
+ address += offset << PAGE_SHIFT;
+ page_table += offset;
+ new_page += offset;
+ } while (++i < PAGE_CACHE_PAGES) ;
+ if (put)
+ page_cache_release(ppage);
return 2; /* Major fault */
}
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/page_alloc.c pgc-2.4.6-pre8/mm/page_alloc.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/page_alloc.c Sat Jun 30 14:04:28 2001
+++ pgc-2.4.6-pre8/mm/page_alloc.c Wed Jul 4 02:46:12 2001
@@ -87,6 +87,13 @@
BUG();
if (PageInactiveClean(page))
BUG();
+ if (PageCachePage(page) && (order != PAGE_CACHE_ORDER)) {
+ printk("PageCachePage and order == %lu\n", order);
+ BUG();
+ }
+
+ for (index=0; index < (1<<order); index++)
+ ClearPageCache(page+index);
page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
page->age = PAGE_AGE_START;
diff -ur /md0/kernels/2.4/v2.4.6-pre8/mm/vmscan.c pgc-2.4.6-pre8/mm/vmscan.c
--- /md0/kernels/2.4/v2.4.6-pre8/mm/vmscan.c Sat Jun 30 14:04:28 2001
+++ pgc-2.4.6-pre8/mm/vmscan.c Mon Jul 2 17:08:34 2001
@@ -38,8 +38,11 @@
/* mm->page_table_lock is held. mmap_sem is not held */
static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
{
- pte_t pte;
swp_entry_t entry;
+ pte_t pte;
+
+ if (PageCachePage(page))
+ page = page_cache_page(page);
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
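A worked example of the pkmap arithmetic the patch introduces, as a
standalone userspace C snippet (values assumed: CONFIG_PAGE_CACHE_SHIFT
set to 2 on non-PAE i386; the macro bodies are copied from the
highmem.h hunk above):

#include <stdio.h>

#define PAGE_SHIFT	12			/* i386 */
#define PKMAP_ORDER	2			/* assumed CONFIG_PAGE_CACHE_SHIFT */
#define PKMAP_PAGES	(1UL << PKMAP_ORDER)
#define PKMAP_SIZE	4096			/* one pte page, as in the patch */
#define LAST_PKMAP	((PKMAP_SIZE / 4) >> PKMAP_ORDER)	/* non-PAE */
#define PKMAP_BASE	(0xfe000000UL)
#define PKMAP_SHIFT	(PAGE_SHIFT + PKMAP_ORDER)
#define PKMAP_ADDR(nr)	(PKMAP_BASE + ((unsigned long)(nr) << PKMAP_SHIFT))

int main(void)
{
	/* 256 slots of 16KB each: the pkmap window stays 4MB, it just
	 * holds fewer, larger entries. */
	printf("%d slots, %lu KB per slot, window %lu MB, last at %#lx\n",
	       LAST_PKMAP,
	       (PKMAP_PAGES << PAGE_SHIFT) >> 10,
	       ((unsigned long)LAST_PKMAP << PKMAP_SHIFT) >> 20,
	       PKMAP_ADDR(LAST_PKMAP - 1));
	return 0;
}

Note that LAST_PKMAP shrinks by a factor of PKMAP_PAGES, so the window
size is unchanged; that is why the arch/i386/mm/init.c hunk multiplies
the fixrange_init() length by PKMAP_PAGES to get back to the same 4MB.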
* Re: [wip-PATCH] Re: Large PAGE_SIZE
2001-07-09 3:04 ` [wip-PATCH] " Ben LaHaise
@ 2001-07-09 11:18 ` Hugh Dickins
2001-07-09 13:13 ` Jeff Garzik
2001-07-09 17:21 ` Hugh Dickins
1 sibling, 1 reply; 20+ messages in thread
From: Hugh Dickins @ 2001-07-09 11:18 UTC (permalink / raw)
To: Ben LaHaise; +Cc: Linus Torvalds, linux-mm
On Sun, 8 Jul 2001, Ben LaHaise wrote:
>
> Hmmm, interesting. At present page cache sizes from PAGE_SIZE to
> 8*PAGE_SIZE are working here. Setting the shift to 4 or a 64KB page size
> results in the SCSI driver blowing up on io completion.
I hit that limit too: I believe it comes from unsigned short b_size.
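In 2.4 that field is 16 bits wide (struct buffer_head declares
"unsigned short b_size"), so a shift-4 page cache page overflows it
exactly: 16 * 4096 is 65536, which wraps to 0, while the 8*PAGE_SIZE
case still fits at 32768. A standalone illustration (userspace C,
written here for clarity, not kernel code):

#include <stdio.h>

int main(void)
{
	/* the transfer size as 2.4's 16-bit buffer_head field sees it */
	unsigned short b_size = (unsigned short)(16 * 4096);

	printf("b_size = %u\n", b_size);	/* prints 0: 65536 wraps */
	return 0;
}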
Hugh
* Re: [wip-PATCH] Re: Large PAGE_SIZE
2001-07-09 11:18 ` Hugh Dickins
@ 2001-07-09 13:13 ` Jeff Garzik
2001-07-09 14:18 ` Hugh Dickins
0 siblings, 1 reply; 20+ messages in thread
From: Jeff Garzik @ 2001-07-09 13:13 UTC (permalink / raw)
To: Hugh Dickins; +Cc: Ben LaHaise, Linus Torvalds, linux-mm
Hugh Dickins wrote:
>
> On Sun, 8 Jul 2001, Ben LaHaise wrote:
> >
> > Hmmm, interesting. At present page cache sizes from PAGE_SIZE to
> > 8*PAGE_SIZE are working here. Setting the shift to 4 or a 64KB page size
> > results in the SCSI driver blowing up on io completion.
>
> I hit that limit too: I believe it comes from unsigned short b_size.
That limit's not a big deal.. the limits in the lower-level disk drivers
are what you start hitting...
--
Jeff Garzik | A recent study has shown that too much soup
Building 1024 | can cause malaise in laboratory mice.
MandrakeSoft |
* Re: [wip-PATCH] Re: Large PAGE_SIZE
2001-07-09 13:13 ` Jeff Garzik
@ 2001-07-09 14:18 ` Hugh Dickins
2001-07-09 14:33 ` Jeff Garzik
0 siblings, 1 reply; 20+ messages in thread
From: Hugh Dickins @ 2001-07-09 14:18 UTC (permalink / raw)
To: Jeff Garzik; +Cc: Ben LaHaise, Linus Torvalds, linux-mm
On Mon, 9 Jul 2001, Jeff Garzik wrote:
> Hugh Dickins wrote:
> > On Sun, 8 Jul 2001, Ben LaHaise wrote:
> > >
> > > Hmmm, interesting. At present page cache sizes from PAGE_SIZE to
> > > 8*PAGE_SIZE are working here. Setting the shift to 4 or a 64KB page size
> > > results in the SCSI driver blowing up on io completion.
> >
> > I hit that limit too: I believe it comes from unsigned short b_size.
>
> That limit's not a big deal.. the limits in the lower-level disk drivers
> are what you start hitting...
Examples?
Limits below 64kB with some drivers we happen not to be using,
or limits >= 64kB we'd soon hit if we chose to do something about
unsigned short b_size (e.g. short sizes in the drivers' own code)?
Limits in the disk drivers or limits in their firmware? If the limits
are in the drivers, then they're probably PAGE_SIZE limits which raising
PAGE_SIZE deals with automatically, but raising PAGE_CACHE_SIZE needs
more edits to get working.
(Whereas raising PAGE_SIZE needs edits wherever it's the vm_pgoff-unit
(MMAP_? MMU_? PTE_? VM_? SUB?) PAGE_SIZE that's needed.)
Linus believes it would be no more than a few buggy drivers which would
impose such limits; I don't know, I took little notice of the instances
I didn't need to change in raising PAGE_SIZE.
Hugh
* Re: [wip-PATCH] Re: Large PAGE_SIZE
2001-07-09 14:18 ` Hugh Dickins
@ 2001-07-09 14:33 ` Jeff Garzik
0 siblings, 0 replies; 20+ messages in thread
From: Jeff Garzik @ 2001-07-09 14:33 UTC (permalink / raw)
To: Hugh Dickins; +Cc: Ben LaHaise, Linus Torvalds, linux-mm
Hugh Dickins wrote:
> On Mon, 9 Jul 2001, Jeff Garzik wrote:
> > Hugh Dickins wrote:
> > > On Sun, 8 Jul 2001, Ben LaHaise wrote:
> > > >
> > > > Hmmm, interesting. At present page cache sizes from PAGE_SIZE to
> > > > 8*PAGE_SIZE are working here. Setting the shift to 4 or a 64KB page size
> > > > results in the SCSI driver blowing up on io completion.
> > >
> > > I hit that limit too: I believe it comes from unsigned short b_size.
> >
> > That limit's not a big deal.. the limits in the lower-level disk drivers
> > are what you start hitting...
>
> Examples?
16-bit size values in places like the SCSI mid layer.
> Linus believes it would be no more than a few buggy drivers which would
> impose such limits;
I cannot say "few" or "many", Ben knows better than I, but Linus is
correct... there is no hard 64K limit, it is just bugs we must flush
out of drivers and layers.
--
Jeff Garzik | A recent study has shown that too much soup
Building 1024 | can cause malaise in laboratory mice.
MandrakeSoft |
* Re: [wip-PATCH] Re: Large PAGE_SIZE
2001-07-09 3:04 ` [wip-PATCH] " Ben LaHaise
2001-07-09 11:18 ` Hugh Dickins
@ 2001-07-09 17:21 ` Hugh Dickins
2001-07-10 5:53 ` Ben LaHaise
1 sibling, 1 reply; 20+ messages in thread
From: Hugh Dickins @ 2001-07-09 17:21 UTC (permalink / raw)
To: Ben LaHaise; +Cc: Linus Torvalds, linux-mm
On Sun, 8 Jul 2001, Ben LaHaise wrote:
>
> Well, here are a few lmbench runs with larger PAGE_CACHE_SIZES. Except
> for 2.4.2-2, the kernels are all based on 2.4.6-pre8, with -b and -c being
> the 2 and 3 shift page cache kernels. As expected, exec and sh latencies
> are reduced. Mmap latency appears to be adversely affected in the 16KB
> page cache case while other latencies are reduced. My best guess here is
> that either a change in layout is causing cache collisions, or the changes
> in do_no_page are having an adverse impact on page fault timing. Ideally
> the loop would be unrolled, however...
I doubt loop unrolling will make much difference. Mark Hemment tells me
that lmbench makes very widely spaced accesses in its mmap() tests, so is
liable to show up the latency from the larger reads.
> The way I changed do_no_page to speculatively pre-fill ptes is suboptimal:
> it still has to obtain a ref count for each pte that touches the page
> cache page. One idea here is to treat ptes within a given page cache page
> as sharing a single reference count, but this may have no impact on
> performance and simply add to code complexity and as such probably isn't
> worth the added hassle.
I'm sure not worth the added hassle - it means that all the unmappers
have to be made more complicated, to look ahead and behind for nearby
ptes which are sharing the ref count. But you can add (N - 1) to the
ref count in one go once you know what N is.
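A sketch of that bulk adjustment against 2.4's atomic page count
("filled" is a name invented for this sketch, counting the ptes
actually prefilled):

	/* Sketch only: take the extra references in one step instead
	 * of one page_cache_get() per prefilled pte; one reference is
	 * already held from the nopage() lookup. */
	if (filled > 1)
		atomic_add(filled - 1, &page->count);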
(In looking at your do_no_page() code briefly then, I notice addr_min
and addr_max are first set up with page-table-limits, then immediately
overwritten with vma-limits - I think you meant to take max and min.)
> The next step is to try out Hugh's approach and see what differences there
> are and how the patches work together. I also suspect that these changes
> will have a larger impact on performance with ia64 where we can use a
> single tlb entry to map all the page cache pages at the same time. Hmmm,
> perhaps I should try making anonymous pages use the larger allocations
> where possible...
I'm interested you're having trouble with the anonymous->swap pages,
they're one of the reasons I went the large PAGE_SIZE instead of the
large PAGE_CACHE_SIZE route. I think there's a lot in my mm/memory.c
mods which you could apply in yours, so even anonymous pages could use
PAGE_CACHE_SIZE pages efficiently.
I'll proceed with porting mine forward to 2.4.6 and make that available
to you a.s.a.p. - or else decide it'll take me too long, and make the
2.4.4 available instead - you're going much faster than I can manage.
I agree that our approaches are complementary, with a large overlap.
Shall we aim towards one patch combining configurable PAGE_CACHE_SIZE
and configurable PAGE_SIZE? and later discard one or the other if
it proves redundant.
Hugh
* Re: [wip-PATCH] Re: Large PAGE_SIZE
2001-07-09 17:21 ` Hugh Dickins
@ 2001-07-10 5:53 ` Ben LaHaise
2001-07-10 16:42 ` Hugh Dickins
0 siblings, 1 reply; 20+ messages in thread
From: Ben LaHaise @ 2001-07-10 5:53 UTC (permalink / raw)
To: Hugh Dickins; +Cc: Linus Torvalds, linux-mm
On Mon, 9 Jul 2001, Hugh Dickins wrote:
> I doubt loop unrolling will make much difference. Mark Hemment tells me
> that lmbench makes very widely spaced accesses in its mmap() tests, so is
> liable to show up the latency from the larger reads.
Err, the difference is that unrolling those loops should allow them to run
with decreased latency as the current code will suffer from a number of
mispredictions.
> (In looking at your do_no_page() code briefly then, I notice addr_min
> and addr_max are first set up with page-table-limits, then immediately
> overwritten with vma-limits - I think you meant to take max and min.)
Not quite -- I just forgot to remove the first two as they're not needed
since everything operates on powers of two.
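(So the illustrative cleanup is simply to delete the two PMD-derived
assignments; what remains is only

	addr_min = vma->vm_start;
	addr_max = vma->vm_end;

with the alignment handled by the power-of-two masks.)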
> I'm interested you're having trouble with the anonymous->swap pages,
> they're one of the reasons I went the large PAGE_SIZE instead of the
> large PAGE_CACHE_SIZE route. I think there's a lot in my mm/memory.c
> mods which you could apply in yours, so even anonymous pages could use
> PAGE_CACHE_SIZE pages efficiently.
I'm not having trouble with it, I'm just uninterested in implementing it
since it has no effect on the performance measurements. Namely, if there
is no change in performance, then there is little reason to waste time on
fixing swapping.
> I agree that our approaches are complementary, with a large overlap.
> Shall we aim towards one patch combining configurable PAGE_CACHE_SIZE
> and configurable PAGE_SIZE? and later discard one or the other if
> it proves redundant.
Sure. It doesn't look like much work to add in large page support, so let
me know one way or the other.
-ben
* Re: [wip-PATCH] Re: Large PAGE_SIZE
2001-07-10 5:53 ` Ben LaHaise
@ 2001-07-10 16:42 ` Hugh Dickins
0 siblings, 0 replies; 20+ messages in thread
From: Hugh Dickins @ 2001-07-10 16:42 UTC (permalink / raw)
To: Ben LaHaise; +Cc: Linus Torvalds, linux-mm
On Tue, 10 Jul 2001, Ben LaHaise wrote:
> On Mon, 9 Jul 2001, Hugh Dickins wrote:
>
> > I doubt loop unrolling will make much difference. Mark Hemment tells me
> > that lmbench makes very widely spaced accesses in its mmap() tests, so is
> > liable to show up the latency from the larger reads.
>
> Err, the difference is that unrolling those loops should allow them to run
> with decreased latency as the current code will suffer from a number of
> mispredictions.
Yes, but I wouldn't expect that to be significant compared with
e.g. the avoidance of repeated faults by prefilling ptes.
(I think the latency you're talking about is in CPU execution,
whereas the latency I was talking about was in I/O? Which are
many magnitudes different? Am I misunderstanding you completely?)
If the prefilling of ptes turns out to be a significant win, then it can
be implemented very simply in do_no_page(), independent of more complex
patches for PAGE_CACHE_SIZE or PAGE_SIZE enlargement. Our patches give
it a better chance of succeeding first time around, that's all.
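For concreteness, a rough sketch of such a standalone prefill against
2.4 idioms (prefill_ptes and NR_PREFILL are invented names; locking,
uptodate checks and architecture-specific flushes are omitted):

/* After do_no_page() installs the faulting pte, opportunistically map
 * neighbouring file pages already in the page cache.  Ptes are mapped
 * read-only and old, so a later write still takes a normal fault, and
 * uncached pages are simply skipped.  The reference returned by
 * __find_get_page() is kept by the new mapping. */
#define NR_PREFILL 4	/* invented tunable */

static void prefill_ptes(struct vm_area_struct *vma, unsigned long address,
			 pte_t *page_table, unsigned long pgoff)
{
	struct address_space *mapping =
		vma->vm_file->f_dentry->d_inode->i_mapping;
	int i;

	for (i = 1; i < NR_PREFILL; i++) {
		unsigned long addr = address + (i << PAGE_SHIFT);
		struct page *page;

		if (addr >= vma->vm_end)
			break;			/* stay inside the vma */
		if ((addr & PMD_MASK) != (address & PMD_MASK))
			break;			/* don't walk off this page table */
		if (!pte_none(page_table[i]))
			continue;		/* slot already populated */
		page = __find_get_page(mapping, pgoff + i,
				       page_hash(mapping, pgoff + i));
		if (!page)
			continue;		/* not cached: leave for a real fault */
		flush_page_to_ram(page);
		set_pte(page_table + i,
			pte_mkold(pte_wrprotect(mk_pte(page, vma->vm_page_prot))));
		vma->vm_mm->rss++;
	}
}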
> > (In looking at your do_no_page() code briefly then, I notice addr_min
> > and addr_max are first set up with page-table-limits, then immediately
> > overwritten with vma-limits - I think you meant to take max and min.)
>
> Not quite -- I just forgot to remove the first two as they're not needed
> since everything operates on powers of two.
Right. Took me a while to understand your code there, different mindset.
Your folio of ptes is aligned in address space, does not have to worry
about page table changeover, may involve more than one page group. My
folio of ptes is aligned in file offset, has to worry about page table
changeover, only involves one large page. Your approach much simpler
than mine; mine finds more ptes to fill in first time around,
no difference once the cache is primed.
> > I'm interested you're having trouble with the anonymous->swap pages,
> > they're one of the reasons I went the large PAGE_SIZE instead of the
> > large PAGE_CACHE_SIZE route. I think there's a lot in my mm/memory.c
> > mods which you could apply in yours, so even anonymous pages could use
> > PAGE_CACHE_SIZE pages efficiently.
>
> I'm not having trouble with it, I'm just uninterested in implementing it
> since it has no effect on the performance measurements. Namely, if there
> is no change in performance, then there is little reason to waste time on
> fixing swapping.
Fair enough.
> > I agree that our approaches are complementary, with a large overlap.
> > Shall we aim towards one patch combining configurable PAGE_CACHE_SIZE
> > and configurable PAGE_SIZE? and later discard one or the other if
> > it proves redundant.
>
> Sure. It doesn't look like much work to add in large page support, so let
> me know one way or the other.
Umm, well, my patch is 350KB touching 160 files. A lot of that trivial,
a fair amount tangential. You might take a fresh look at it and find a
lot could be thrown out or simplified. But don't let me hold you up:
you're steaming ahead, and your patch is much the smaller, I think it's
up to me to merge your work into mine once I've rebased and made it
available to you.
Back to work...
Hugh
* Re: Large PAGE_SIZE
2001-07-05 16:45 ` Large PAGE_SIZE Hugh Dickins
2001-07-05 17:13 ` Linus Torvalds
@ 2001-07-18 0:02 ` Hugh Dickins
2001-07-18 18:48 ` Hugh Dickins
1 sibling, 1 reply; 20+ messages in thread
From: Hugh Dickins @ 2001-07-18 0:02 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Ben LaHaise, linux-mm
ftp://ftp.veritas.com/linux/larpage-2.4.6.patch.bz2
is the promised Large PAGE_SIZE patch against 2.4.6. If you'd like
to try these large pages, you'll have to edit include/asm-i386/page.h
PAGE_MMUSHIFT from 0 to 1 or 2 or 3: no configuration yet. There's
a sense in which the patch is now complete, but I'll probably be
ashamed of that claim tomorrow (several of the drivers haven't even
got compiled yet, much more remains untested). I'll update to 2.4.7
once it appears, but probably have to skip the -pres.
My original mail repeated below, to give a little explanation of what
you'll find; but I've changed it to match the current patch, saying
"MMU" where originally it said "SUB". You did suggest VM_PAGE_SIZE
to match vm_pgoff, but I soon found even that too ambiguous.
I've not merged Ben's multipage PAGE_CACHE_SIZE into this version:
I couldn't think coolly enough to decide how page_cluster readahead
should behave as PAGE_SIZE and PAGE_CACHE_SIZE vary; and there are
some other issues I'll need to settle with Ben first.
Hugh
On Thu, 5 Jul 2001, Hugh Dickins wrote:
>
> Linus,
>
> Ben's mail on multipage PAGE_CACHE_SIZE support prompts me to let you
> know now what I've been doing, and ask your opinion on this direction.
>
> Congratulations to Ben for working out multipage PAGE_CACHE_SIZE.
> I couldn't see where it was headed, and PAGE_CACHE_SIZE has been
> PAGE_SIZE for so long that I assumed everyone had given up on it.
>
> I'm interested in larger pages, but wary of multipage PAGE_CACHE_SIZE:
> partly because it relies on non-0-order page allocations, partly because
> it seems a shame then to break I/O into smaller units below the cache.
>
> So instead I'm using a larger PAGE_SIZE throughout the kernel: here's an
> extract from include/asm-i386/page.h (currently edited not configured):
>
> /*
> * One mmupage is represented by one Page Table Entry at the MMU level,
> * and corresponds to one page at the user process level: its size is
> * the same as param.h EXEC_PAGESIZE (for getpagesize(2) and mmap(2)).
> */
> #define MMUPAGE_SHIFT 12
> #define MMUPAGE_SIZE (1UL << MMUPAGE_SHIFT)
> #define MMUPAGE_MASK (~(MMUPAGE_SIZE-1))
>
> /*
> * 2**N adjacent mmupages may be clustered to make up one kernel page.
> * Reasonable and tested values for PAGE_MMUSHIFT are 0 (4k page),
> * 1 (8k page), 2 (16k page), 3 (32k page). Higher values will not
> * work without further changes e.g. to unsigned short b_size.
> */
> #define PAGE_MMUSHIFT 0
> #define PAGE_MMUCOUNT (1UL << PAGE_MMUSHIFT)
>
> /*
> * One kernel page is represented by one struct page (see mm.h),
> * and is the kernel's principal unit of memory allocation.
> */
> #define PAGE_SHIFT (PAGE_MMUSHIFT + MMUPAGE_SHIFT)
> #define PAGE_SIZE (1UL << PAGE_SHIFT)
> #define PAGE_MASK (~(PAGE_SIZE-1))
>
> The kernel patch which applies these definitions is, of course, much
> larger than Ben's multipage PAGE_CACHE_SIZE patch. Currently against
> 2.4.4 (I'm rebasing to 2.4.6 in the next week) plus some other patches
> we're using inhouse, it's about 350KB touching 160 files. Not quite
> complete yet (trivial macros still to be added to non-i386 arches; md
> readahead size not yet resolved; num_physpages in tuning to be checked;
> vmscan algorithms probably misscaled) and certainly undertested, but
> both 2GB SMP machine and 256MB laptop run stably with 32k pages (though
> 4k pages are better on the laptop, to keep kernel source tree in cache).
>
> Most of the patch is simple and straightforward, replacing PAGE_SIZE
> by MMUPAGE_SIZE where appropriate (in drivers that's usually only when
> handling vm_pgoff).
>
> Some of the patch is rather tangential: seemed right to implement proper
> flush_tlb_range() and flush_tlb_range_k() for flushing mmupages together;
> hard to resist tidyups like changing zap_page_range() arg from size to
> end when it's always sandwiched between start,end functions. Unless
> PAGE_CACHE_SIZE definition were to be removed too, no change at all
> to most filesystems (cramfs, ncpfs, proc being exceptions).
>
> Kernel physical and virtual address space mostly in PAGE_SIZE units:
> __get_free_page(), vmalloc(), ioremap(), kmap_atomic(), kmap() pages;
> but early alloc_bootmem_pages() and fixmap.h slots in MMUPAGE_SIZE.
>
> User address space has to be in MMUPAGE_SIZE units (unless I want to
> rebuild all my userspace): so the difficult part of the patch is the
> mm/memory.c fault handlers, and preventing the anonymous MMUPAGE_SIZE
> pieces from degenerating into needing a PAGE_SIZE physical page each,
> and how to translate exclusive_swap_page().
>
> These page fault handlers now prepare and operate upon a
> pte_t *folio[PAGE_MMUCOUNT], different parts of the same large page
> expected at respective virtual offsets (yes, mremap() can spoil that,
> but it's exceptional). Anon mappings may have non-0 vm_pgoff, to share
> page with adjacent private mappings e.g. bss share large page with data,
> so KIO across data-bss boundary works (KIO page granularity troublesome,
> but would have been a shame to revert to the easier MMUPAGE_SIZE there).
> Hard to get the macros right, to melt away to efficient code in the
> PAGE_MMUSHIFT 0 case: I've done the best I can for now,
> you'll probably find them clunky and suggest better.
>
> Performance? Not yet determined, we're just getting around to that.
> Unless it performs significantly better than multipage PAGE_CACHE_SIZE,
> it should be forgotten: no point in extensive change for no gain.
>
> I've said enough for now: either you're already disgusted, and will
> reply "Never!", or you'll sometime want to cast an eye over the patch
> itself (or nominate someone else to do so), to get the measure of it.
> If the latter, please give me a few days to put it together against
> 2.4.6, minus our other inhouse pieces, then I can put the result on
> an ftp site for you.
>
> I would have preferred to wait a little longer before unveiling this,
> but it's appropriate to consider it with multipage PAGE_CACHE_SIZE.
>
> Thanks for your time!
> Hugh
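To put numbers on the macros quoted above (values assumed here:
PAGE_MMUSHIFT set to 2 rather than the patch's default 0):

	MMUPAGE_SHIFT = 12  ->  MMUPAGE_SIZE  = 4096   (pte unit, getpagesize())
	PAGE_MMUSHIFT = 2   ->  PAGE_MMUCOUNT = 4      (mmupages per kernel page)
	PAGE_SHIFT    = 14  ->  PAGE_SIZE     = 16384  (kernel allocation unit)

i.e. one struct page then covers four ptes: user mappings still move in
4KB steps, and only the kernel's unit of allocation grows.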
* Re: Large PAGE_SIZE
2001-07-18 0:02 ` Hugh Dickins
@ 2001-07-18 18:48 ` Hugh Dickins
2001-07-22 23:08 ` Hugh Dickins
0 siblings, 1 reply; 20+ messages in thread
From: Hugh Dickins @ 2001-07-18 18:48 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Ben LaHaise, linux-mm
On Wed, 18 Jul 2001, Hugh Dickins wrote:
>
> ftp://ftp.veritas.com/linux/larpage-2.4.6.patch.bz2
>
> is the promised Large PAGE_SIZE patch against 2.4.6. If you'd like
> to try these large pages, you'll have to edit include/asm-i386/page.h
> PAGE_MMUSHIFT from 0 to 1 or 2 or 3: no configuration yet. There's
> a sense in which the patch is now complete, but I'll probably be
> ashamed of that claim tomorrow (several of the drivers haven't even
> got compiled yet, much more remains untested). I'll update to 2.4.7
> once it appears, but probably have to skip the -pres.
Sorry for the noise, but somewhere between send and receive,
the all-important first line of yesterday's mail moved itself from
mail body to mail header. I guess it's a bad idea to start off with
an ftp path (or "token:"?), so let's try it this way instead.
ftp://ftp.veritas.com/linux/larpage-2.4.6.patch.bz2
Hugh
* Re: Large PAGE_SIZE
2001-07-18 18:48 ` Hugh Dickins
@ 2001-07-22 23:08 ` Hugh Dickins
0 siblings, 0 replies; 20+ messages in thread
From: Hugh Dickins @ 2001-07-22 23:08 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Ben LaHaise, linux-mm
Large i386 PAGE_SIZE patch is now updated to 2.4.7:
ftp://ftp.veritas.com/linux/larpage-2.4.7.patch.bz2
To try these large pages, edit include/asm-i386/page.h
PAGE_MMUSHIFT from 0 to 1 or 2 or 3: no configuration yet.
Hugh
Thread overview: 20+ messages
2001-07-05 5:06 [wip-PATCH] rfi: PAGE_CACHE_SIZE suppoort Ben LaHaise
2001-07-05 5:55 ` Linus Torvalds
2001-07-05 16:45 ` Large PAGE_SIZE Hugh Dickins
2001-07-05 17:13 ` Linus Torvalds
2001-07-05 18:38 ` Hugh Dickins
2001-07-05 18:53 ` Linus Torvalds
2001-07-05 20:41 ` Ben LaHaise
2001-07-05 20:59 ` Hugh Dickins
2001-07-06 5:11 ` Linus Torvalds
2001-07-09 3:04 ` [wip-PATCH] " Ben LaHaise
2001-07-09 11:18 ` Hugh Dickins
2001-07-09 13:13 ` Jeff Garzik
2001-07-09 14:18 ` Hugh Dickins
2001-07-09 14:33 ` Jeff Garzik
2001-07-09 17:21 ` Hugh Dickins
2001-07-10 5:53 ` Ben LaHaise
2001-07-10 16:42 ` Hugh Dickins
2001-07-18 0:02 ` Hugh Dickins
2001-07-18 18:48 ` Hugh Dickins
2001-07-22 23:08 ` Hugh Dickins