linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* hugetlb page patch for 2.5.48-bug fixes
@ 2002-11-21 22:05 Rohit Seth
  2002-11-21 23:51 ` William Lee Irwin III
  0 siblings, 1 reply; 9+ messages in thread
From: Rohit Seth @ 2002-11-21 22:05 UTC (permalink / raw)
  To: linux-mm, linux-kernel, akpm, torvalds, rohit.seth

[-- Attachment #1: Type: text/plain, Size: 350 bytes --]

Linus, Andrew,

Attached is the hugetlbpage patch for 2.5.48 containing following main 
changes:

1) Bug fixes (mainly in the unsuccessful attempts of hugepages).
2) Removal of Radix Tree field in key structure (as it is not needed).
3) Include the IPC_LOCK for permission to use hugepages.
4) Increment the key_counts during forks.

thanks,
rohit



[-- Attachment #2: patch2548.1121 --]
[-- Type: text/plain, Size: 7728 bytes --]

--- linux-2.5.48/include/linux/hugetlb.h	Sun Nov 17 20:29:45 2002
+++ linux-2.5.48.work//include/linux/hugetlb.h	Thu Nov 21 11:49:57 2002
@@ -4,7 +4,17 @@
 #ifdef CONFIG_HUGETLB_PAGE
 
 struct ctl_table;
-struct hugetlb_key;
+struct hugetlb_key {
+	struct page *root;
+	loff_t size;
+	atomic_t count;
+	spinlock_t lock;
+	int key;
+	int busy;
+	uid_t uid;
+	gid_t gid;
+	umode_t mode;
+};
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
--- linux-2.5.48/arch/i386/mm/hugetlbpage.c	Sun Nov 17 20:29:55 2002
+++ linux-2.5.48.work/arch/i386/mm/hugetlbpage.c	Thu Nov 21 12:12:18 2002
@@ -19,6 +19,8 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
+#include <linux/sysctl.h>
+
 static long    htlbpagemem;
 int     htlbpage_max;
 static long    htlbzone_pages;
@@ -29,18 +31,6 @@
 
 #define MAX_ID 	32
 
-struct hugetlb_key {
-	struct radix_tree_root tree;
-	atomic_t count;
-	spinlock_t lock;
-	int key;
-	int busy;
-	uid_t uid;
-	gid_t gid;
-	umode_t mode;
-	loff_t size;
-};
-
 static struct hugetlb_key htlbpagek[MAX_ID];
 
 static void mark_key_busy(struct hugetlb_key *hugetlb_key)
@@ -81,7 +71,7 @@
 		spin_lock(&htlbpage_lock);
 		hugetlb_key = find_key(key);
 		if (!hugetlb_key) {
-			if (!capable(CAP_SYS_ADMIN) || !in_group_p(0))
+			if (!capable(CAP_SYS_ADMIN) && !capable(CAP_IPC_LOCK) && !in_group_p(0))
 				hugetlb_key = ERR_PTR(-EPERM);
 			else if (!(flag & IPC_CREAT))
 				hugetlb_key = ERR_PTR(-ENOENT);
@@ -96,7 +86,7 @@
 					hugetlb_key = &htlbpagek[i];
 					mark_key_busy(hugetlb_key);
 					hugetlb_key->key = key;
-					INIT_RADIX_TREE(&hugetlb_key->tree, GFP_ATOMIC);
+					hugetlb_key->root = NULL;
 					hugetlb_key->uid = current->fsuid;
 					hugetlb_key->gid = current->fsgid;
 					hugetlb_key->mode = prot;
@@ -107,7 +97,6 @@
 			hugetlb_key = ERR_PTR(-EAGAIN);
 			spin_unlock(&htlbpage_lock);
 		} else if (check_size_prot(hugetlb_key, len, prot, flag) < 0) {
-			hugetlb_key->key = 0;
 			hugetlb_key = ERR_PTR(-EINVAL);
 		} 
 	} while (hugetlb_key == ERR_PTR(-EAGAIN));
@@ -120,7 +109,10 @@
 {
 	unsigned long index;
 	unsigned long max_idx;
+	struct page *page, *prev;
 
+	if (key == NULL)
+		return;
 	if (!atomic_dec_and_test(&key->count)) {
 		spin_lock(&htlbpage_lock);
 		clear_key_busy(key);
@@ -129,16 +121,19 @@
 	}
 
 	max_idx = (key->size >> HPAGE_SHIFT);
+	page = key->root;
 	for (index = 0; index < max_idx; ++index) {
-		struct page *page = radix_tree_lookup(&key->tree, index);
 		if (!page)
 			continue;
-		huge_page_release(page);
+		prev = page;
+		page = (struct page *)page->private;
+		prev->private = 0UL;
+		huge_page_release(prev);
 	}
 	spin_lock(&htlbpage_lock);
 	key->key = 0;
 	clear_key_busy(key);
-	INIT_RADIX_TREE(&key->tree, GFP_ATOMIC);
+	key->root = NULL;
 	spin_unlock(&htlbpage_lock);
 }
 
@@ -247,7 +242,7 @@
 		vma->vm_end = end;
 	}
 	spin_unlock(&mm->page_table_lock);
-      out_error1:
+out_error1:
 	return -1;
 }
 
@@ -259,7 +254,10 @@
 	struct page *ptepage;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
+	struct hugetlb_key *key = vma->vm_private_data;
 
+	if ( key )
+		atomic_inc(&key->count);
 	while (addr < end) {
 		dst_pte = huge_pte_alloc(dst, addr);
 		if (!dst_pte)
@@ -352,6 +350,8 @@
 	spin_unlock(&htlbpage_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		pte = huge_pte_offset(mm, address);
+		if (pte_none(*pte))
+			continue;
 		page = pte_page(*pte);
 		huge_page_release(page);
 		pte_clear(pte);
@@ -381,25 +381,10 @@
 	return 0;
 }
 
-struct page *key_find_page(struct hugetlb_key *key, unsigned long index)
-{
-	struct page *page = radix_tree_lookup(&key->tree, index);
-	if (page)
-		get_page(page);
-	return page;
-}
-
-int key_add_page(struct page *page, struct hugetlb_key *key, unsigned long index)
-{
-	int error = radix_tree_insert(&key->tree, index, page);
-	if (!error)
-		get_page(page);
-	return error;
-}
-
-static int prefault_key(struct hugetlb_key *key, struct vm_area_struct *vma)
+static int prefault_key(struct hugetlb_key *key, struct vm_area_struct *vma, unsigned long *temp)
 {
 	struct mm_struct *mm = current->mm;
+	struct page *page, *prev;
 	unsigned long addr;
 	int ret = 0;
 
@@ -408,21 +393,18 @@
 
 	spin_lock(&mm->page_table_lock);
 	spin_lock(&key->lock);
+	prev = page = key->root;
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
 		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
 
 		if (!pte) {
+			spin_unlock(&key->lock);
 			ret = -ENOMEM;
 			goto out;
 		}
 		if (!pte_none(*pte))
 			continue;
 
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = key_find_page(key, idx);
 		if (!page) {
 			page = alloc_hugetlb_page();
 			if (!page) {
@@ -430,13 +412,20 @@
 				ret = -ENOMEM;
 				goto out;
 			}
-			key_add_page(page, key, idx);
+			if (key->root == NULL)
+				key->root = page;
+			else
+				prev->private = (unsigned long)page;
 		}
+		get_page(page);
 		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+		prev = page;
+		page = (struct page *)page->private;
 	}
 	spin_unlock(&key->lock);
 out:
 	spin_unlock(&mm->page_table_lock);
+	*temp = addr;
 	return ret;
 }
 
@@ -446,6 +435,7 @@
 	struct vm_area_struct *vma;
 	struct hugetlb_key *hugetlb_key;
 	int retval = -ENOMEM;
+	unsigned long temp;
 
 	hugetlb_key = alloc_key(key, len, prot, flag );
 	spin_unlock(&htlbpage_lock);
@@ -455,17 +445,18 @@
 	addr = do_mmap_pgoff(NULL, addr, len, (unsigned long) prot,
 			MAP_NORESERVE|MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, 0);
 	if (IS_ERR((void *) addr))
-		goto out_release;
+		goto out;
 
 	vma = find_vma(mm, addr);
 	if (!vma) {
 		retval = -EINVAL;
-		goto out_release;
+		goto out;
 	}
 
-	retval = prefault_key(hugetlb_key, vma);
+	retval = prefault_key(hugetlb_key, vma, &temp);
+	addr = temp;
 	if (retval)
-		goto out;
+		goto out_release;
 
 	vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
 	vma->vm_ops = &hugetlb_vm_ops;
@@ -474,7 +465,7 @@
 	clear_key_busy(hugetlb_key);
 	spin_unlock(&htlbpage_lock);
 	return retval;
-out:
+out_release:
 	if (addr > vma->vm_start) {
 		unsigned long raddr;
 		raddr = vma->vm_end;
@@ -482,10 +473,8 @@
 		zap_hugepage_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
 		vma->vm_end = raddr;
 	}
-	spin_lock(&mm->page_table_lock);
 	do_munmap(mm, vma->vm_start, len);
-	spin_unlock(&mm->page_table_lock);
-out_release:
+out:
 	hugetlb_release_key(hugetlb_key);
 	return retval;
 }
@@ -533,10 +522,8 @@
 
 static int alloc_private_hugetlb_pages(int key, unsigned long addr, unsigned long len, int prot, int flag)
 {
-	if (!capable(CAP_SYS_ADMIN)) {
-		if (!in_group_p(0))
-			return -EPERM;
-	}
+	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_IPC_LOCK) && !in_group_p(0))
+		return -EPERM;
 	addr = do_mmap_pgoff(NULL, addr, len, prot,
 			MAP_NORESERVE|MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, 0);
 	if (IS_ERR((void *) addr))
--- linux-2.5.48/arch/i386/kernel/sys_i386.c	Sun Nov 17 20:29:56 2002
+++ linux-2.5.48.work/arch/i386/kernel/sys_i386.c	Thu Nov 21 12:01:08 2002
@@ -294,17 +294,17 @@
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
-	struct hugetlb_key *key;
 	int retval;
 
-	vma = find_vma(current->mm, addr);
-	if (!vma || !(vma->vm_flags & VM_HUGETLB) || vma->vm_start != addr)
-		return -EINVAL;
 	down_write(&mm->mmap_sem);
-	key = (struct hugetlb_key *)vma->vm_private_data;
+	vma = find_vma(current->mm, addr);
+	if (!vma || !(vma->vm_flags & VM_HUGETLB) || vma->vm_start != addr) {
+		retval =  -EINVAL;
+		goto out;
+	}
 	retval = do_munmap(vma->vm_mm, addr, vma->vm_end - addr);
+out:
 	up_write(&mm->mmap_sem);
-	hugetlb_release_key(key);
 	return retval;
 }
 #else

^ permalink raw reply	[flat|nested] 9+ messages in thread
* RE: hugetlb page patch for 2.5.48-bug fixes
@ 2002-11-22  1:54 Seth, Rohit
  2002-11-22  2:04 ` William Lee Irwin III
  0 siblings, 1 reply; 9+ messages in thread
From: Seth, Rohit @ 2002-11-22  1:54 UTC (permalink / raw)
  To: 'William Lee Irwin III', Seth, Rohit
  Cc: linux-mm, linux-kernel, akpm, torvalds

Thanks Bill for your comments. 

> Okay, first off why are you using a list linked through page->private?
> page->list is fully available for such tasks.

Don't really need a list_head kind of thing for an always in-order complete
traversal. list_head (slightly) adds fat in data structures as well as in
insertion/removal. Please let me know if anything prohibits the use of the
page->private field for internal use.
> 
> Second, the if (key == NULL) check in hugetlb_release_key() 
> is bogus; someone is forgetting to check for NULL, probably 
> in alloc_shared_hugetlb_pages().
> 
This if condition will be removed.  It does not serve any purpose, as there
is no way to call release_key without a valid key.

> Third, the hugetlb_release_key() in unmap_hugepage_range() is 
> the one that should be removed [along with its corresponding 
> mark_key_busy()], not the one in sys_free_hugepages(). 
> unmap_hugepage_range() is doing neither setup nor teardown of 
> the key itself, only the pages and PTE's. I would say 
> key-level refcounting belongs to sys_free_hugepages().
> 
> Bill
> 
It is not mandatory that the user app calls free_pages.  And even in the case of
an app abort, this call will not be made.  The internal structures are always
released during exit (with the last ref count) along with the freeing of the
underlying physical pages.

rohit
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 9+ messages in thread
* RE: hugetlb page patch for 2.5.48-bug fixes
@ 2002-11-22  3:23 Seth, Rohit
  2002-11-24 14:44 ` Ed Tomlinson
  0 siblings, 1 reply; 9+ messages in thread
From: Seth, Rohit @ 2002-11-22  3:23 UTC (permalink / raw)
  To: 'William Lee Irwin III', Seth, Rohit
  Cc: linux-mm, linux-kernel, akpm, torvalds

> At some point in the past, I wrote:
> >> Okay, first off why are you using a list linked through 
> >> page->private?
> >> page->list is fully available for such tasks.
> 
> On Thu, Nov 21, 2002 at 05:54:22PM -0800, Seth, Rohit wrote:
> > Don't really need a list_head kind of thing for always inorder 
> > complete traversal. list_head (slightly) adds fat in data 
> structures 
> > as well as insertaion/removal. Please le me know if anything that 
> > prohibits the use of page_private field for internal use.
> 
> page->private is also available for internal use. The objection here
> was about not using the standardized list macros. I'm not 
> convinced about the fat since the keyspace is tightly bounded 
> and the back pointers are in struct page regardless. (And we 
> also just happen to know page->lru is also available though 
> I'd not suggest using it.)
> 

Either way. That is fine. I will make the change.

> 
> At some point in the past, I wrote:
> >> Third, the hugetlb_release_key() in unmap_hugepage_range() is
> >> the one that should be removed [along with its corresponding 
> >> mark_key_busy()], not the one in sys_free_hugepages(). 
> >> unmap_hugepage_range() is doing neither setup nor teardown of 
> >> the key itself, only the pages and PTE's. I would say 
> >> key-level refcounting belongs to sys_free_hugepages().
> 
> On Thu, Nov 21, 2002 at 05:54:22PM -0800, Seth, Rohit wrote:
> > It is not mandatory that user app calls free_pages.  Or 
> even in case 
> > of app aborts this call will not be made.  The internal 
> structures are 
> > always released during the exit (with last ref count) along 
> with free 
> > of underlying physical pages.
> 
> Hmm, I can understand caution wrt. touching core. I suspect 
> vma->close() should do hugetlb_key_release() instead of 
> sys_free_hugepages()?
> 
That is a good option for 2.5. I will update the patch tomorrow.

rohit
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2002-11-24 15:01 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-11-21 22:05 hugetlb page patch for 2.5.48-bug fixes Rohit Seth
2002-11-21 23:51 ` William Lee Irwin III
2002-11-22  1:54 Seth, Rohit
2002-11-22  2:04 ` William Lee Irwin III
2002-11-22  3:23 Seth, Rohit
2002-11-24 14:44 ` Ed Tomlinson
2002-11-24 14:49   ` William Lee Irwin III
2002-11-24 15:01     ` Ed Tomlinson
2002-11-24 15:00       ` William Lee Irwin III

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox