linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH] VM: per-user overcommit policy
@ 2007-05-07 18:56 Andrea Righi
  2007-05-07 19:16 ` William Lee Irwin III
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Andrea Righi @ 2007-05-07 18:56 UTC (permalink / raw)
  To: LKML, linux-mm

Allow per-UID virtual memory overcommit handling to be defined. The
configuration is stored in a hash list in kernel space, reachable through
/proc/overcommit_uid (surely there are better ways to do it, e.g. via configfs).

Hash elements are defined using a triple:

uid:overcommit_memory:overcommit_ratio

The overcommit_* values have the same semantics as their respective sysctl
variables.

If a user is not present in the hash, the default system policy will be used
(defined by /proc/sys/vm/overcommit_memory and /proc/sys/vm/overcommit_ratio).

Example:

- Enable "always overcommit" policy for admin:
root@host # echo 0:1:0 > /proc/overcommit_uid

- processes belonging to sshd (uid=100) and ntp (uid=102) users can be quite
  critical, so use a classic heuristic overcommit:
root@host # echo 100:0:50 > /proc/overcommit_uid
root@host # echo 102:0:50 > /proc/overcommit_uid

- allow uid=1001 and uid=1002 (common users) to allocate memory only if the
  total committed space is below 50% of the physical RAM plus the size of
  swap:
root@host # echo 1001:2:50 > /proc/overcommit_uid
root@host # echo 1002:2:50 > /proc/overcommit_uid

- Deny VM allocation to others:
root@host # echo 2 > /proc/sys/vm/overcommit_memory && echo 0 > /proc/sys/vm/overcommit_ratio

TODO:
- GID overcommit policy,
- per-user/group VM accounting,
- VM quotas,
- a lot of improvements,
- more testing...

Signed-off-by: Andrea Righi <a.righi@cineca.it>
---

diff -urpN linux-2.6.21/include/linux/mman.h linux-2.6.21-vm-acct-user/include/linux/mman.h
--- linux-2.6.21/include/linux/mman.h	2007-05-07 20:20:24.000000000 +0200
+++ linux-2.6.21-vm-acct-user/include/linux/mman.h	2007-05-07 20:20:42.000000000 +0200
@@ -18,6 +18,14 @@
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern atomic_t vm_committed_space;
+#ifdef CONFIG_VM_ACCT_USER
+struct vm_acct_values
+{
+	int overcommit_memory;
+	int overcommit_ratio;
+};
+extern int vm_acct_get_config(struct vm_acct_values *v, uid_t uid);
+#endif
 
 #ifdef CONFIG_SMP
 extern void vm_acct_memory(long pages);
diff -urpN linux-2.6.21/ipc/shm.c linux-2.6.21-vm-acct-user/ipc/shm.c
--- linux-2.6.21/ipc/shm.c	2007-05-07 20:20:24.000000000 +0200
+++ linux-2.6.21-vm-acct-user/ipc/shm.c	2007-05-07 20:20:42.000000000 +0200
@@ -370,12 +370,24 @@ static int newseg (struct ipc_namespace 
 		shp->mlock_user = current->user;
 	} else {
 		int acctflag = VM_ACCOUNT;
+#ifdef CONFIG_VM_ACCT_USER
+		int overcommit_memory;
+		struct vm_acct_values v;
+	
+		if (!vm_acct_get_config(&v, current->uid)) {
+			overcommit_memory = v.overcommit_memory;
+		} else {
+			overcommit_memory = sysctl_overcommit_memory;
+		}
+#else 
+#define overcommit_memory sysctl_overcommit_memory
+#endif
 		/*
 		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
 	 	 * if it's asked for.
 		 */
 		if  ((shmflg & SHM_NORESERVE) &&
-				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+				overcommit_memory != OVERCOMMIT_NEVER)
 			acctflag = 0;
 		sprintf (name, "SYSV%08x", key);
 		file = shmem_file_setup(name, size, acctflag);
diff -urpN linux-2.6.21/mm/Kconfig linux-2.6.21-vm-acct-user/mm/Kconfig
--- linux-2.6.21/mm/Kconfig	2007-05-07 20:20:24.000000000 +0200
+++ linux-2.6.21-vm-acct-user/mm/Kconfig	2007-05-07 20:21:21.000000000 +0200
@@ -163,3 +163,11 @@ config ZONE_DMA_FLAG
 	default "0" if !ZONE_DMA
 	default "1"
 
+config VM_ACCT_USER
+	bool "Per-user VM overcommit policy (EXPERIMENTAL)" 
+	depends on PROC_FS && EXPERIMENTAL
+	def_bool n
+	help
+	  Say Y here to enable per-user virtual memory overcommit handling.
+	  Overcommit configuration will be available via /proc/overcommit_uid.
+
diff -urpN linux-2.6.21/mm/mmap.c linux-2.6.21-vm-acct-user/mm/mmap.c
--- linux-2.6.21/mm/mmap.c	2007-05-07 20:20:24.000000000 +0200
+++ linux-2.6.21-vm-acct-user/mm/mmap.c	2007-05-07 20:20:42.000000000 +0200
@@ -95,16 +95,30 @@ atomic_t vm_committed_space = ATOMIC_INI
 int __vm_enough_memory(long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed;
-
+#ifdef CONFIG_VM_ACCT_USER
+	int overcommit_memory, overcommit_ratio;
+	struct vm_acct_values v;
+	
+	if (!vm_acct_get_config(&v, current->uid)) {
+		overcommit_memory = v.overcommit_memory;
+		overcommit_ratio = v.overcommit_ratio;
+	} else {
+		overcommit_memory = sysctl_overcommit_memory;
+		overcommit_ratio = sysctl_overcommit_ratio;
+	}
+#else 
+#define overcommit_memory sysctl_overcommit_memory
+#define overcommit_ratio sysctl_overcommit_ratio
+#endif
 	vm_acct_memory(pages);
 
 	/*
 	 * Sometimes we want to use more memory than we have
 	 */
-	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+	if (overcommit_memory == OVERCOMMIT_ALWAYS)
 		return 0;
 
-	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+	if (overcommit_memory == OVERCOMMIT_GUESS) {
 		unsigned long n;
 
 		free = global_page_state(NR_FILE_PAGES);
@@ -155,7 +169,7 @@ int __vm_enough_memory(long pages, int c
 	}
 
 	allowed = (totalram_pages - hugetlb_total_pages())
-	       	* sysctl_overcommit_ratio / 100;
+	       	* overcommit_ratio / 100;
 	/*
 	 * Leave the last 3% for root
 	 */
@@ -901,6 +915,10 @@ unsigned long do_mmap_pgoff(struct file 
 	struct rb_node ** rb_link, * rb_parent;
 	int accountable = 1;
 	unsigned long charged = 0, reqprot = prot;
+#ifdef CONFIG_VM_ACCT_USER
+	int overcommit_memory;
+	struct vm_acct_values v;
+#endif
 
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1040,8 +1058,15 @@ munmap_back:
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
+#ifdef CONFIG_VM_ACCT_USER
+	if (!vm_acct_get_config(&v, current->uid)) {
+		overcommit_memory = v.overcommit_memory;
+	} else {
+		overcommit_memory = sysctl_overcommit_memory;
+	}
+#endif
 	if (accountable && (!(flags & MAP_NORESERVE) ||
-			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
+			    overcommit_memory == OVERCOMMIT_NEVER)) {
 		if (vm_flags & VM_SHARED) {
 			/* Check memory availability in shmem_file_setup? */
 			vm_flags |= VM_ACCOUNT;
diff -urpN linux-2.6.21/mm/nommu.c linux-2.6.21-vm-acct-user/mm/nommu.c
--- linux-2.6.21/mm/nommu.c	2007-05-07 20:20:24.000000000 +0200
+++ linux-2.6.21-vm-acct-user/mm/nommu.c	2007-05-07 20:20:42.000000000 +0200
@@ -1240,16 +1240,31 @@ EXPORT_SYMBOL(get_unmapped_area);
 int __vm_enough_memory(long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed;
+#ifdef CONFIG_VM_ACCT_USER
+	int overcommit_memory, overcommit_ratio;
+	struct vm_acct_values v;
+
+	if (!vm_acct_get_config(&v, current->uid)) {
+		overcommit_memory = v.overcommit_memory;
+		overcommit_ratio = v.overcommit_ratio;
+	} else {
+		overcommit_memory = sysctl_overcommit_memory;
+		overcommit_ratio = sysctl_overcommit_ratio;
+	}
+#else
+#define overcommit_memory sysctl_overcommit_memory
+#define overcommit_ratio sysctl_overcommit_ratio
+#endif
 
 	vm_acct_memory(pages);
 
 	/*
 	 * Sometimes we want to use more memory than we have
 	 */
-	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+	if (overcommit_memory == OVERCOMMIT_ALWAYS)
 		return 0;
 
-	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+	if (overcommit_memory == OVERCOMMIT_GUESS) {
 		unsigned long n;
 
 		free = global_page_state(NR_FILE_PAGES);
@@ -1299,7 +1314,7 @@ int __vm_enough_memory(long pages, int c
 		goto error;
 	}
 
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed = totalram_pages * overcommit_ratio / 100;
 	/*
 	 * Leave the last 3% for root
 	 */
diff -urpN linux-2.6.21/mm/swap.c linux-2.6.21-vm-acct-user/mm/swap.c
--- linux-2.6.21/mm/swap.c	2007-05-07 20:20:24.000000000 +0200
+++ linux-2.6.21-vm-acct-user/mm/swap.c	2007-05-07 20:20:42.000000000 +0200
@@ -30,6 +30,10 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -455,6 +459,196 @@ unsigned pagevec_lookup_tag(struct pagev
 
 EXPORT_SYMBOL(pagevec_lookup_tag);
 
+#ifdef CONFIG_VM_ACCT_USER
+
+#define VM_ACCT_HASH_SHIFT	10
+#define VM_ACCT_HASH_SIZE	(1UL << VM_ACCT_HASH_SHIFT)
+#define vm_acct_hashfn(uid) hash_long((unsigned long)uid, VM_ACCT_HASH_SHIFT)
+
+/* User VM overcommit configuration */
+typedef struct vm_acct_hash_struct
+{
+	uid_t uid;
+	struct vm_acct_values val;
+	struct hlist_node vm_acct_chain;
+} vm_acct_hash_t;
+
+/* Hash list used to store per-user VM overcommit configurations */
+static struct hlist_head *vm_acct_hash;
+
+/* VM overcommit hash table spinlock */
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(vm_acct_lock);
+
+/*
+ * Get user VM configuration from the hash list.
+ */
+int vm_acct_get_config(struct vm_acct_values *v, uid_t uid)
+{
+	struct hlist_node *elem;
+	vm_acct_hash_t *p;
+
+	spin_lock_irq(&vm_acct_lock);
+	hlist_for_each_entry(p, elem, &vm_acct_hash[vm_acct_hashfn(uid)],
+			     vm_acct_chain) {
+		if (p->uid == uid) {
+			v->overcommit_memory = p->val.overcommit_memory;
+			v->overcommit_ratio = p->val.overcommit_ratio;
+			spin_unlock_irq(&vm_acct_lock);
+			return 0;
+		}
+	}
+	spin_unlock_irq(&vm_acct_lock);
+
+	return -ENOENT;
+}
+
+/*
+ * Create a new element in the VM configuration hash list.
+ */
+static int __vm_acct_set_element(uid_t uid,
+			int overcommit_memory, int overcommit_ratio)
+{
+	struct hlist_node *elem;
+	vm_acct_hash_t *p;
+	int ret = 0;
+
+	spin_lock_irq(&vm_acct_lock);
+	hlist_for_each_entry(p, elem, &vm_acct_hash[vm_acct_hashfn(uid)],
+			     vm_acct_chain) {
+		if (p->uid == uid) {
+			p->val.overcommit_memory = overcommit_memory;
+			p->val.overcommit_ratio = overcommit_ratio;
+			goto out;
+		}
+	}
+	spin_unlock_irq(&vm_acct_lock);
+
+	/* Allocate new element */
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (unlikely(!p)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	p->uid = uid;
+	p->val.overcommit_memory = overcommit_memory;
+	p->val.overcommit_ratio = overcommit_ratio;
+
+	spin_lock_irq(&vm_acct_lock);
+	hlist_add_head(&p->vm_acct_chain, &vm_acct_hash[vm_acct_hashfn(uid)]);
+out:
+	spin_unlock_irq(&vm_acct_lock);
+	return ret;
+}
+
+/*
+ * Set VM user parameters via /proc/overcommit_uid.
+ */
+static int vm_acct_set(struct file *filp, const char __user *buffer,
+		       size_t count, loff_t *data)
+{
+	char buf[128];
+	char *om, *or;
+	int __ret;
+
+	/*
+	 * Parse ':'-separated arguments
+	 *     uid:overcommit_memory:overcommit_ratio
+	 */
+	if (count > sizeof(buf) - 1)
+		return -EFAULT;
+
+	if (copy_from_user(buf, buffer, count))
+		return -EFAULT;
+
+	buf[sizeof(buf) - 1] = '\0';
+
+	om = strstr(buf, ":");
+	if ((om == NULL) || (*++om == '\0')) {
+		return -EINVAL;
+	}
+
+	or = strstr(om, ":");
+	if ((or == NULL) || (*++or == '\0')) {
+		return -EINVAL;
+	}
+
+	/* Set VM configuration */
+	__ret = __vm_acct_set_element((uid_t)simple_strtoul(buf, NULL, 10),
+			    (int)simple_strtol(om, NULL, 10),
+			    (int)simple_strtol(or, NULL, 10));
+	if (__ret)
+		return __ret;
+
+	return count;
+}
+
+/*
+ * Print VM overcommit configurations.
+ */
+static int vm_acct_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *elem;
+	vm_acct_hash_t *p;
+	int i;
+
+	spin_lock_irq(&vm_acct_lock);
+	for (i = 0; i < VM_ACCT_HASH_SIZE; i++) {
+		if (!&vm_acct_hash[i])
+			continue;
+		hlist_for_each_entry(p, elem, &vm_acct_hash[i],
+				vm_acct_chain) {
+			seq_printf(m, "%i:%i:%i\n",
+				   p->uid, p->val.overcommit_memory,
+				   p->val.overcommit_ratio);
+		}
+	}
+	spin_unlock_irq(&vm_acct_lock);
+
+	return 0;
+}
+
+static int vm_acct_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, vm_acct_show, NULL);
+}
+
+static struct file_operations vm_acct_ops = {
+	.open		= vm_acct_open,
+	.read		= seq_read,
+	.write		= vm_acct_set,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init init_vm_acct(void)
+{
+	struct proc_dir_entry *pe;
+	int i;
+
+	vm_acct_hash = kmalloc(VM_ACCT_HASH_SIZE * sizeof(*(vm_acct_hash)),
+			       GFP_KERNEL);
+	if (!vm_acct_hash)
+		return -ENOMEM;
+
+	printk(KERN_INFO "vm_acct_uid hash table entries: %lu\n",
+	       VM_ACCT_HASH_SIZE / sizeof(*(vm_acct_hash)));
+
+	spin_lock_irq(&vm_acct_lock);
+	for (i = 0; i < VM_ACCT_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&vm_acct_hash[i]);
+	spin_unlock_irq(&vm_acct_lock);
+
+	pe = create_proc_entry("overcommit_uid", 0600, NULL);
+	if (!pe)
+		return -ENOMEM;
+	pe->proc_fops = &vm_acct_ops;
+
+	return 0;
+}
+__initcall(init_vm_acct);
+
+#endif /* CONFIG_VM_ACCT_USER */
+
 #ifdef CONFIG_SMP
 /*
  * We tolerate a little inaccuracy to avoid ping-ponging the counter between

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] VM: per-user overcommit policy
  2007-05-07 18:56 [RFC][PATCH] VM: per-user overcommit policy Andrea Righi
@ 2007-05-07 19:16 ` William Lee Irwin III
  2007-05-07 19:49   ` William Lee Irwin III
  2007-05-07 19:31 ` Luca Tettamanti
  2007-05-07 20:23 ` Alan Cox
  2 siblings, 1 reply; 8+ messages in thread
From: William Lee Irwin III @ 2007-05-07 19:16 UTC (permalink / raw)
  To: Andrea Righi; +Cc: LKML, linux-mm

On Mon, May 07, 2007 at 08:56:39PM +0200, Andrea Righi wrote:
> Allow to define per-UID virtual memory overcommit handling. Configuration is
> stored in a hash list in kernel space reachable through /proc/overcommit_uid
> (surely there're better ways to do it, i.e. via configfs).
> Hash elements are defined using a triple:
> uid:overcommit_memory:overcommit_ratio
> The overcommit_* values have the same semantic of their respective sysctl
> variables.
> If a user is not present in the hash, the default system policy will be used
> (defined by /proc/sys/vm/overcommit_memory and /proc/sys/vm/overcommit_ratio).

While I think it's a step in the right direction, I'm not convinced of
the soundness of the approach. I expect one might be better served by
per-user limits on committed memory, perhaps even proportional limits.

The basic idea is that committed memory is a relatively global resource.
You can apportion it and limit the global pool, but it's difficult to
arrange for overall overcommitment policy on a per-anything basis
without some sort of OOM-isolated domains for users and processes to run
within. Those are particularly interesting as they relate to kernel
memory allocations.

The /proc/ interface is probably going to raise a few eyebrows. I'm
unaware of what sorts of interfaces would be recommended for all this.

The following stanza occurs often:
+       if (!vm_acct_get_config(&v, current->uid)) {
+               overcommit_memory = v.overcommit_memory;
+               overcommit_ratio = v.overcommit_ratio;
+       } else {
+               overcommit_memory = sysctl_overcommit_memory;
+               overcommit_ratio = sysctl_overcommit_ratio;
+       }

suggesting that vm_acct_get_config() isn't the proper abstraction.

Instead of
	int vm_acct_get_config(struct vm_acct_values *, uid_t);
you could just have
	int vm_acct_get_config(struct vm_acct_values *);
which conditionally uses current->uid, and then unconditionally use
v.overcommit_memory and v.overcommit_ratio vs. sysctl_overcommit_memory
and sysctl_overcommit_ratio in the sequel.


-- wli

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] VM: per-user overcommit policy
  2007-05-07 18:56 [RFC][PATCH] VM: per-user overcommit policy Andrea Righi
  2007-05-07 19:16 ` William Lee Irwin III
@ 2007-05-07 19:31 ` Luca Tettamanti
  2007-05-07 20:23 ` Alan Cox
  2 siblings, 0 replies; 8+ messages in thread
From: Luca Tettamanti @ 2007-05-07 19:31 UTC (permalink / raw)
  To: Andrea Righi; +Cc: linux-kernel, linux-mm

Hi,
a few comments on the patch:

Andrea Righi <righiandr@users.sourceforge.net> ha scritto:
> diff -urpN linux-2.6.21/include/linux/mman.h linux-2.6.21-vm-acct-user/include/linux/mman.h
> --- linux-2.6.21/include/linux/mman.h   2007-05-07 20:20:24.000000000 +0200
> +++ linux-2.6.21-vm-acct-user/include/linux/mman.h      2007-05-07 20:20:42.000000000 +0200
> @@ -18,6 +18,14 @@
> extern int sysctl_overcommit_memory;
> extern int sysctl_overcommit_ratio;
> extern atomic_t vm_committed_space;
> +#ifdef CONFIG_VM_ACCT_USER
> +struct vm_acct_values
> +{
> +       int overcommit_memory;
> +       int overcommit_ratio;
> +};
> +extern int vm_acct_get_config(struct vm_acct_values *v, uid_t uid);
> +#endif
> 
> #ifdef CONFIG_SMP
> extern void vm_acct_memory(long pages);
> diff -urpN linux-2.6.21/ipc/shm.c linux-2.6.21-vm-acct-user/ipc/shm.c
> --- linux-2.6.21/ipc/shm.c      2007-05-07 20:20:24.000000000 +0200
> +++ linux-2.6.21-vm-acct-user/ipc/shm.c 2007-05-07 20:20:42.000000000 +0200
> @@ -370,12 +370,24 @@ static int newseg (struct ipc_namespace 
>                shp->mlock_user = current->user;
>        } else {
>                int acctflag = VM_ACCOUNT;
> +#ifdef CONFIG_VM_ACCT_USER
> +               int overcommit_memory;
> +               struct vm_acct_values v;
> +       
> +               if (!vm_acct_get_config(&v, current->uid)) {
> +                       overcommit_memory = v.overcommit_memory;
> +               } else {
> +                       overcommit_memory = sysctl_overcommit_memory;
> +               }
> +#else 
> +#define overcommit_memory sysctl_overcommit_memory
> +#endif
>                /*
>                 * Do not allow no accounting for OVERCOMMIT_NEVER, even
>                 * if it's asked for.
>                 */
>                if  ((shmflg & SHM_NORESERVE) &&
> -                               sysctl_overcommit_memory != OVERCOMMIT_NEVER)
> +                               overcommit_memory != OVERCOMMIT_NEVER)

Ugly... very ugly ;) 

Don't hide 'overcommit_memory' inside the ifdef block. The compiler
should be smart enough to optimize away the extra var.

There's also a problem with the #ifdefs scattered all over the code. You
need a static inline 'vm_acct_get_config' for the !CONFIG_VM_ACCT_USER
case:

static inline int vm_acct_get_config(struct vm_acct_values *v,
        uid_t uid)
{
        return 0;
}

In this way you can remove the #ifdefs. Furthermore, I'd also move the
branch with the fallback to sysctl values inside vm_acct_get_config.
So, for !CONFIG_VM_ACCT_USER:

static inline int vm_acct_get_config(struct vm_acct_values *v,
        uid_t uid)
{
       v->overcommit_memory = sysctl_overcommit_memory;
       v->overcommit_ratio = sysctl_overcommit_ratio;
}

(Yes, gcc will optimize it)

and for CONFIG_VM_ACCT_USER:

int vm_acct_get_config(struct vm_acct_values *v, uid_t uid)
{
        if (found uid) {
                v->overcommit_memory = foo;
                v->overcommit_ratio = bar;
        } else {
                v->overcommit_memory = sysctl_overcommit_memory;
                v->overcommit_ratio = sysctl_overcommit_ratio;
        }
}


> diff -urpN linux-2.6.21/mm/swap.c linux-2.6.21-vm-acct-user/mm/swap.c
> --- linux-2.6.21/mm/swap.c      2007-05-07 20:20:24.000000000 +0200
> +++ linux-2.6.21-vm-acct-user/mm/swap.c 2007-05-07 20:20:42.000000000 +0200
> @@ -30,6 +30,10 @@
> #include <linux/cpu.h>
> #include <linux/notifier.h>
> #include <linux/init.h>
> +#include <linux/hash.h>
> +#include <linux/seq_file.h>
> +#include <linux/kernel.h>
> +#include <linux/proc_fs.h>
> 
> /* How many pages do we try to swap or page in/out together? */
> int page_cluster;
> @@ -455,6 +459,196 @@ unsigned pagevec_lookup_tag(struct pagev
> 
> EXPORT_SYMBOL(pagevec_lookup_tag);
> 
> +#ifdef CONFIG_VM_ACCT_USER
> +
> +#define VM_ACCT_HASH_SHIFT     10
> +#define VM_ACCT_HASH_SIZE      (1UL << VM_ACCT_HASH_SHIFT)
> +#define vm_acct_hashfn(uid) hash_long((unsigned long)uid, VM_ACCT_HASH_SHIFT)
> +
> +/* User VM overcommit configuration */
> +typedef struct vm_acct_hash_struct
> +{
> +       uid_t uid;
> +       struct vm_acct_values val;
> +       struct hlist_node vm_acct_chain;
> +} vm_acct_hash_t;
> +
> +/* Hash list used to store per-user VM overcommit configurations */
> +static struct hlist_head *vm_acct_hash;
> +
> +/* VM overcommit hash table spinlock */
> +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(vm_acct_lock);
> +
> +/*
> + * Get user VM configuration from the hash list.
> + */
> +int vm_acct_get_config(struct vm_acct_values *v, uid_t uid)
> +{
> +       struct hlist_node *elem;
> +       vm_acct_hash_t *p;
> +
> +       spin_lock_irq(&vm_acct_lock);
> +       hlist_for_each_entry(p, elem, &vm_acct_hash[vm_acct_hashfn(uid)],
> +                            vm_acct_chain) {
> +               if (p->uid == uid) {
> +                       v->overcommit_memory = p->val.overcommit_memory;
> +                       v->overcommit_ratio = p->val.overcommit_ratio;
> +                       spin_unlock_irq(&vm_acct_lock);
> +                       return 0;
> +               }
> +       }
> +       spin_unlock_irq(&vm_acct_lock);
> +
> +       return -ENOENT;
> +}
> +
> +/*
> + * Create a new element in the VM configuration hash list.
> + */
> +static int __vm_acct_set_element(uid_t uid,
> +                       int overcommit_memory, int overcommit_ratio)
> +{
> +       struct hlist_node *elem;
> +       vm_acct_hash_t *p;
> +       int ret = 0;
> +
> +       spin_lock_irq(&vm_acct_lock);
> +       hlist_for_each_entry(p, elem, &vm_acct_hash[vm_acct_hashfn(uid)],
> +                            vm_acct_chain) {
> +               if (p->uid == uid) {
> +                       p->val.overcommit_memory = overcommit_memory;
> +                       p->val.overcommit_ratio = overcommit_ratio;
> +                       goto out;
> +               }
> +       }
> +       spin_unlock_irq(&vm_acct_lock);
> +
> +       /* Allocate new element */
> +       p = kzalloc(sizeof(*p), GFP_KERNEL);
> +       if (unlikely(!p)) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +       p->uid = uid;
> +       p->val.overcommit_memory = overcommit_memory;
> +       p->val.overcommit_ratio = overcommit_ratio;
> +
> +       spin_lock_irq(&vm_acct_lock);
> +       hlist_add_head(&p->vm_acct_chain, &vm_acct_hash[vm_acct_hashfn(uid)]);
> +out:

In the error path (kzalloc failure) you release vm_acct_lock which is not
held.

> +       spin_unlock_irq(&vm_acct_lock);
> +       return ret;
> +}
> +
> +/*
> + * Set VM user parameters via /proc/overcommit_uid.
> + */
> +static int vm_acct_set(struct file *filp, const char __user *buffer,
> +                      size_t count, loff_t *data)
> +{
> +       char buf[128];
> +       char *om, *or;
> +       int __ret;
              ^^ uh?

> +
> +       /*
> +        * Parse ':'-separated arguments
> +        *     uid:overcommit_memory:overcommit_ratio
> +        */
> +       if (count > sizeof(buf) - 1)
> +               return -EFAULT;
> +
> +       if (copy_from_user(buf, buffer, count))
> +               return -EFAULT;
> +
> +       buf[sizeof(buf) - 1] = '\0';
> +
> +       om = strstr(buf, ":");
> +       if ((om == NULL) || (*++om == '\0')) {
> +               return -EINVAL;
> +       }
> +
> +       or = strstr(om, ":");
> +       if ((or == NULL) || (*++or == '\0')) {
> +               return -EINVAL;
> +       }
> +
> +       /* Set VM configuration */
> +       __ret = __vm_acct_set_element((uid_t)simple_strtoul(buf, NULL, 10),
> +                           (int)simple_strtol(om, NULL, 10),
> +                           (int)simple_strtol(or, NULL, 10));
> +       if (__ret)
> +               return __ret;
> +
> +       return count;
> +}
> +
> +/*
> + * Print VM overcommit configurations.
> + */
> +static int vm_acct_show(struct seq_file *m, void *v)
> +{
> +       struct hlist_node *elem;
> +       vm_acct_hash_t *p;
> +       int i;
> +
> +       spin_lock_irq(&vm_acct_lock);
> +       for (i = 0; i < VM_ACCT_HASH_SIZE; i++) {
> +               if (!&vm_acct_hash[i])
> +                       continue;
> +               hlist_for_each_entry(p, elem, &vm_acct_hash[i],
> +                               vm_acct_chain) {
> +                       seq_printf(m, "%i:%i:%i\n",
> +                                  p->uid, p->val.overcommit_memory,
> +                                  p->val.overcommit_ratio);
> +               }
> +       }
> +       spin_unlock_irq(&vm_acct_lock);
> +
> +       return 0;
> +}
> +
> +static int vm_acct_open(struct inode *inode, struct file *filp)
> +{
> +       return single_open(filp, vm_acct_show, NULL);
> +}
> +
> +static struct file_operations vm_acct_ops = {
> +       .open           = vm_acct_open,
> +       .read           = seq_read,
> +       .write          = vm_acct_set,
> +       .llseek         = seq_lseek,
> +       .release        = seq_release,
> +};
> +
> +static int __init init_vm_acct(void)
> +{
> +       struct proc_dir_entry *pe;
> +       int i;
> +
> +       vm_acct_hash = kmalloc(VM_ACCT_HASH_SIZE * sizeof(*(vm_acct_hash)),
> +                              GFP_KERNEL);
> +       if (!vm_acct_hash)
> +               return -ENOMEM;
> +
> +       printk(KERN_INFO "vm_acct_uid hash table entries: %lu\n",
> +              VM_ACCT_HASH_SIZE / sizeof(*(vm_acct_hash)));
> +
> +       spin_lock_irq(&vm_acct_lock);
> +       for (i = 0; i < VM_ACCT_HASH_SIZE; i++)
> +               INIT_HLIST_HEAD(&vm_acct_hash[i]);
> +       spin_unlock_irq(&vm_acct_lock);
> +
> +       pe = create_proc_entry("overcommit_uid", 0600, NULL);
> +       if (!pe)
> +               return -ENOMEM;
> +       pe->proc_fops = &vm_acct_ops;
> +
> +       return 0;
> +}
> +__initcall(init_vm_acct);
> +
> +#endif /* CONFIG_VM_ACCT_USER */
> +
> #ifdef CONFIG_SMP
> /*
>  * We tolerate a little inaccuracy to avoid ping-ponging the counter between
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


Luca
-- 
Porc i' mond che cio' sott i piedi!
V. Catozzo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] VM: per-user overcommit policy
  2007-05-07 19:16 ` William Lee Irwin III
@ 2007-05-07 19:49   ` William Lee Irwin III
  2007-05-07 22:48     ` Andrea Righi
  0 siblings, 1 reply; 8+ messages in thread
From: William Lee Irwin III @ 2007-05-07 19:49 UTC (permalink / raw)
  To: Andrea Righi; +Cc: LKML, linux-mm

On Mon, May 07, 2007 at 12:16:58PM -0700, William Lee Irwin III wrote:
> The following stanza occurs often:
> +       if (!vm_acct_get_config(&v, current->uid)) {
> +               overcommit_memory = v.overcommit_memory;
> +               overcommit_ratio = v.overcommit_ratio;
> +       } else {
> +               overcommit_memory = sysctl_overcommit_memory;
> +               overcommit_ratio = sysctl_overcommit_ratio;
> +       }
> 
> suggesting that vm_acct_get_config() isn't the proper abstraction.
> Instead of
> 	int vm_acct_get_config(struct vm_acct_values *, uid_t);
> you could just have
> 	int vm_acct_get_config(struct vm_acct_values *);
> which conditionally uses current->uid, and then unconditionally use
> v.overcommit_memory and v.overcommit_ratio vs. sysctl_overcommit_memory
> and sysctl_overcommit_ratio in the sequel.

Something like this (untested/uncompiled) may do.


Index: righi/include/linux/mman.h
===================================================================
--- righi.orig/include/linux/mman.h	2007-05-07 12:36:05.897386369 -0700
+++ righi/include/linux/mman.h	2007-05-07 12:42:29.803263919 -0700
@@ -18,14 +18,13 @@
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern atomic_t vm_committed_space;
-#ifdef CONFIG_VM_ACCT_USER
+
 struct vm_acct_values
 {
 	int overcommit_memory;
 	int overcommit_ratio;
 };
-extern int vm_acct_get_config(struct vm_acct_values *v, uid_t uid);
-#endif
+void vm_acct_get_config(struct vm_acct_values *v);
 
 #ifdef CONFIG_SMP
 extern void vm_acct_memory(long pages);
Index: righi/mm/swap.c
===================================================================
--- righi.orig/mm/swap.c	2007-05-07 12:37:13.965265337 -0700
+++ righi/mm/swap.c	2007-05-07 12:42:14.914415451 -0700
@@ -482,10 +482,11 @@
 /*
  * Get user VM configuration from the hash list.
  */
-int vm_acct_get_config(struct vm_acct_values *v, uid_t uid)
+void vm_acct_get_config(struct vm_acct_values *v)
 {
 	struct hlist_node *elem;
 	vm_acct_hash_t *p;
+	uid_t uid = current->uid;
 
 	spin_lock_irq(&vm_acct_lock);
 	hlist_for_each_entry(p, elem, &vm_acct_hash[vm_acct_hashfn(uid)],
@@ -494,12 +495,12 @@
 			v->overcommit_memory = p->val.overcommit_memory;
 			v->overcommit_ratio = p->val.overcommit_ratio;
 			spin_unlock_irq(&vm_acct_lock);
-			return 0;
+			return;
 		}
 	}
 	spin_unlock_irq(&vm_acct_lock);
-
-	return -ENOENT;
+	v->overcommit_memory = sysctl_overcommit_memory;
+	v->overcommit_ratio = sysctl_overcommit_ratio;
 }
 
 /*
@@ -646,8 +647,13 @@
 	return 0;
 }
 __initcall(init_vm_acct);
-
-#endif /* CONFIG_VM_ACCT_USER */
+#else /* !CONFIG_VM_ACCT_USER */
+void vm_acct_get_config(struct vm_acct_values *v)
+{
+	v->overcommit_memory = sysctl_overcommit_memory;
+	v->overcommit_ratio = sysctl_overcommit_ratio;
+}
+#endif /* !CONFIG_VM_ACCT_USER */
 
 #ifdef CONFIG_SMP
 /*
Index: righi/ipc/shm.c
===================================================================
--- righi.orig/ipc/shm.c	2007-05-07 12:40:35.576754521 -0700
+++ righi/ipc/shm.c	2007-05-07 12:43:32.714849046 -0700
@@ -370,24 +370,15 @@
 		shp->mlock_user = current->user;
 	} else {
 		int acctflag = VM_ACCOUNT;
-#ifdef CONFIG_VM_ACCT_USER
-		int overcommit_memory;
 		struct vm_acct_values v;
 
-		if (!vm_acct_get_config(&v, current->uid)) {
-			overcommit_memory = v.overcommit_memory;
-		} else {
-			overcommit_memory = sysctl_overcommit_memory;
-		}
-#else
-#define overcommit_memory sysctl_overcommit_memory
-#endif
+		vm_acct_get_config(&v);
 		/*
 		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
 	 	 * if it's asked for.
 		 */
 		if  ((shmflg & SHM_NORESERVE) &&
-				overcommit_memory != OVERCOMMIT_NEVER)
+				v.overcommit_memory != OVERCOMMIT_NEVER)
 			acctflag = 0;
 		sprintf (name, "SYSV%08x", key);
 		file = shmem_file_setup(name, size, acctflag);
Index: righi/mm/mmap.c
===================================================================
--- righi.orig/mm/mmap.c	2007-05-07 12:43:48.143728287 -0700
+++ righi/mm/mmap.c	2007-05-07 12:46:02.775400509 -0700
@@ -96,30 +96,18 @@
 int __vm_enough_memory(long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed;
-#ifdef CONFIG_VM_ACCT_USER
-	int overcommit_memory, overcommit_ratio;
 	struct vm_acct_values v;
 
-	if (!vm_acct_get_config(&v, current->uid)) {
-		overcommit_memory = v.overcommit_memory;
-		overcommit_ratio = v.overcommit_ratio;
-	} else {
-		overcommit_memory = sysctl_overcommit_memory;
-		overcommit_ratio = sysctl_overcommit_ratio;
-	}
-#else
-#define overcommit_memory sysctl_overcommit_memory
-#define overcommit_ratio sysctl_overcommit_ratio
-#endif
+	vm_acct_get_config(&v);
 	vm_acct_memory(pages);
 
 	/*
 	 * Sometimes we want to use more memory than we have
 	 */
-	if (overcommit_memory == OVERCOMMIT_ALWAYS)
+	if (v.overcommit_memory == OVERCOMMIT_ALWAYS)
 		return 0;
 
-	if (overcommit_memory == OVERCOMMIT_GUESS) {
+	if (v.overcommit_memory == OVERCOMMIT_GUESS) {
 		unsigned long n;
 
 		free = global_page_state(NR_FILE_PAGES);
@@ -170,7 +158,7 @@
 	}
 
 	allowed = (totalram_pages - hugetlb_total_pages())
-	       	* overcommit_ratio / 100;
+	       	* v.overcommit_ratio / 100;
 	/*
 	 * Leave the last 3% for root
 	 */
@@ -916,10 +904,7 @@
 	struct rb_node ** rb_link, * rb_parent;
 	int accountable = 1;
 	unsigned long charged = 0, reqprot = prot;
-#ifdef CONFIG_VM_ACCT_USER
-	int overcommit_memory;
 	struct vm_acct_values v;
-#endif
 
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1059,15 +1044,9 @@
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
-#ifdef CONFIG_VM_ACCT_USER
-	if (!vm_acct_get_config(&v, current->uid)) {
-		overcommit_memory = v.overcommit_memory;
-	} else {
-		overcommit_memory = sysctl_overcommit_memory;
-	}
-#endif
+	vm_acct_get_config(&v);
 	if (accountable && (!(flags & MAP_NORESERVE) ||
-			    overcommit_memory == OVERCOMMIT_NEVER)) {
+			    v.overcommit_memory == OVERCOMMIT_NEVER)) {
 		if (vm_flags & VM_SHARED) {
 			/* Check memory availability in shmem_file_setup? */
 			vm_flags |= VM_ACCOUNT;
Index: righi/mm/nommu.c
===================================================================
--- righi.orig/mm/nommu.c	2007-05-07 12:46:09.667793284 -0700
+++ righi/mm/nommu.c	2007-05-07 12:46:52.490233596 -0700
@@ -1240,31 +1240,18 @@
 int __vm_enough_memory(long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed;
-#ifdef CONFIG_VM_ACCT_USER
-	int overcommit_memory, overcommit_ratio;
 	struct vm_acct_values v;
 
-	if (!vm_acct_get_config(&v, current->uid)) {
-		overcommit_memory = v.overcommit_memory;
-		overcommit_ratio = v.overcommit_ratio;
-	} else {
-		overcommit_memory = sysctl_overcommit_memory;
-		overcommit_ratio = sysctl_overcommit_ratio;
-	}
-#else
-#define overcommit_memory sysctl_overcommit_memory
-#define overcommit_ratio sysctl_overcommit_ratio
-#endif
-
+	vm_acct_get_config(&v);
 	vm_acct_memory(pages);
 
 	/*
 	 * Sometimes we want to use more memory than we have
 	 */
-	if (overcommit_memory == OVERCOMMIT_ALWAYS)
+	if (v.overcommit_memory == OVERCOMMIT_ALWAYS)
 		return 0;
 
-	if (overcommit_memory == OVERCOMMIT_GUESS) {
+	if (v.overcommit_memory == OVERCOMMIT_GUESS) {
 		unsigned long n;
 
 		free = global_page_state(NR_FILE_PAGES);
@@ -1314,7 +1301,7 @@
 		goto error;
 	}
 
-	allowed = totalram_pages * overcommit_ratio / 100;
+	allowed = totalram_pages * v.overcommit_ratio / 100;
 	/*
 	 * Leave the last 3% for root
 	 */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] VM: per-user overcommit policy
  2007-05-07 18:56 [RFC][PATCH] VM: per-user overcommit policy Andrea Righi
  2007-05-07 19:16 ` William Lee Irwin III
  2007-05-07 19:31 ` Luca Tettamanti
@ 2007-05-07 20:23 ` Alan Cox
  2007-05-07 22:49   ` Andrea Righi
  2 siblings, 1 reply; 8+ messages in thread
From: Alan Cox @ 2007-05-07 20:23 UTC (permalink / raw)
  To: righiandr; +Cc: LKML, linux-mm

> - allow uid=1001 and uid=1002 (common users) to allocate memory only if the
>   total committed space is below the 50% of the physical RAM + the size of
>   swap:
> root@host # echo 1001:2:50 > /proc/overcommit_uid
> root@host # echo 1002:2:50 > /proc/overcommit_uid

There are some fundamental problems with this model - the moment you mix
strict overcommit with anything else it ceases to be a strict overcommit
and you might as well use existing overcommit rules for most stuff

The other thing you are sort of faking is per user resource management -
which is a subset of per group of users resource management which is
useful - eg "students can't hog the machine"

I don't see that this is the right approach compared with the container
work and openvz work that is currently active and far more flexible.

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] VM: per-user overcommit policy
  2007-05-07 19:49   ` William Lee Irwin III
@ 2007-05-07 22:48     ` Andrea Righi
  0 siblings, 0 replies; 8+ messages in thread
From: Andrea Righi @ 2007-05-07 22:48 UTC (permalink / raw)
  To: William Lee Irwin III, Luca Tettamanti; +Cc: LKML, linux-mm

William Lee Irwin III wrote:
> On Mon, May 07, 2007 at 12:16:58PM -0700, William Lee Irwin III wrote:
>> The following stanza occurs often:
>> +       if (!vm_acct_get_config(&v, current->uid)) {
>> +               overcommit_memory = v.overcommit_memory;
>> +               overcommit_ratio = v.overcommit_ratio;
>> +       } else {
>> +               overcommit_memory = sysctl_overcommit_memory;
>> +               overcommit_ratio = sysctl_overcommit_ratio;
>> +       }
>>
>> suggesting that vm_acct_get_config() isn't the proper abstraction.
>> Instead of
>> 	int vm_acct_get_config(struct vm_acct_values *, uid_t);
>> you could just have
>> 	int vm_acct_get_config(struct vm_acct_values *);
>> which conditionally uses current->uid, and then unconditionally use
>> v.overcommit_memory and v.overcommit_ratio vs. sysctl_overcommit_memory
>> and sysctl_overcommit_ratio in the sequel.
> 
> Something like this (untested/uncompiled) may do.

[snip]

I agree with everything; I applied all the changes and fixed the bug reported by
Luca (see below). It seems to compile and work without problems. Thanks!

Signed-off-by: Andrea Righi <a.righi@cineca.it>
---

diff -urpN linux-2.6.21/include/linux/mman.h linux-2.6.21-vm-acct-user/include/linux/mman.h
--- linux-2.6.21/include/linux/mman.h	2007-05-07 20:44:50.000000000 +0200
+++ linux-2.6.21-vm-acct-user/include/linux/mman.h	2007-05-07 23:33:16.000000000 +0200
@@ -18,6 +18,20 @@
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern atomic_t vm_committed_space;
+/* Effective overcommit policy; same semantics as the vm.overcommit_* sysctls */
+struct vm_acct_values {
+	int overcommit_memory;
+	int overcommit_ratio;
+};
+#ifdef CONFIG_VM_ACCT_USER
+extern void vm_acct_get_config(struct vm_acct_values *v);
+#else /* !CONFIG_VM_ACCT_USER: no per-uid table, report the global sysctls */
+static inline void vm_acct_get_config(struct vm_acct_values *v)
+{
+	v->overcommit_memory = sysctl_overcommit_memory;
+	v->overcommit_ratio = sysctl_overcommit_ratio;
+}
+#endif /* CONFIG_VM_ACCT_USER */
 
 #ifdef CONFIG_SMP
 extern void vm_acct_memory(long pages);
diff -urpN linux-2.6.21/ipc/shm.c linux-2.6.21-vm-acct-user/ipc/shm.c
--- linux-2.6.21/ipc/shm.c	2007-05-07 20:44:50.000000000 +0200
+++ linux-2.6.21-vm-acct-user/ipc/shm.c	2007-05-07 23:24:04.000000000 +0200
@@ -370,12 +370,15 @@ static int newseg (struct ipc_namespace 
 		shp->mlock_user = current->user;
 	} else {
 		int acctflag = VM_ACCOUNT;
+		struct vm_acct_values v;
+	
+		vm_acct_get_config(&v);
 		/*
 		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
 	 	 * if it's asked for.
 		 */
 		if  ((shmflg & SHM_NORESERVE) &&
-				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+				v.overcommit_memory != OVERCOMMIT_NEVER)
 			acctflag = 0;
 		sprintf (name, "SYSV%08x", key);
 		file = shmem_file_setup(name, size, acctflag);
diff -urpN linux-2.6.21/mm/Kconfig linux-2.6.21-vm-acct-user/mm/Kconfig
--- linux-2.6.21/mm/Kconfig	2007-05-07 20:44:50.000000000 +0200
+++ linux-2.6.21-vm-acct-user/mm/Kconfig	2007-05-07 23:15:51.000000000 +0200
@@ -163,3 +163,11 @@ config ZONE_DMA_FLAG
 	default "0" if !ZONE_DMA
 	default "1"
 
+config VM_ACCT_USER
+	bool "Per-user VM overcommit policy (EXPERIMENTAL)"
+	depends on PROC_FS && EXPERIMENTAL
+	default n
+	help
+	  Say Y here to enable per-user virtual memory overcommit handling.
+	  Overcommit configuration will be available via /proc/overcommit_uid.
+
diff -urpN linux-2.6.21/mm/mmap.c linux-2.6.21-vm-acct-user/mm/mmap.c
--- linux-2.6.21/mm/mmap.c	2007-05-07 20:44:50.000000000 +0200
+++ linux-2.6.21-vm-acct-user/mm/mmap.c	2007-05-07 23:25:56.000000000 +0200
@@ -95,16 +95,18 @@ atomic_t vm_committed_space = ATOMIC_INI
 int __vm_enough_memory(long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed;
-
+	struct vm_acct_values v;
+	
+	vm_acct_get_config(&v);
 	vm_acct_memory(pages);
 
 	/*
 	 * Sometimes we want to use more memory than we have
 	 */
-	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+	if (v.overcommit_memory == OVERCOMMIT_ALWAYS)
 		return 0;
 
-	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+	if (v.overcommit_memory == OVERCOMMIT_GUESS) {
 		unsigned long n;
 
 		free = global_page_state(NR_FILE_PAGES);
@@ -155,7 +157,7 @@ int __vm_enough_memory(long pages, int c
 	}
 
 	allowed = (totalram_pages - hugetlb_total_pages())
-	       	* sysctl_overcommit_ratio / 100;
+	       	* v.overcommit_ratio / 100;
 	/*
 	 * Leave the last 3% for root
 	 */
@@ -901,6 +903,7 @@ unsigned long do_mmap_pgoff(struct file 
 	struct rb_node ** rb_link, * rb_parent;
 	int accountable = 1;
 	unsigned long charged = 0, reqprot = prot;
+	struct vm_acct_values v;
 
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1040,8 +1043,9 @@ munmap_back:
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
+	vm_acct_get_config(&v);
 	if (accountable && (!(flags & MAP_NORESERVE) ||
-			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
+			    v.overcommit_memory == OVERCOMMIT_NEVER)) {
 		if (vm_flags & VM_SHARED) {
 			/* Check memory availability in shmem_file_setup? */
 			vm_flags |= VM_ACCOUNT;
diff -urpN linux-2.6.21/mm/nommu.c linux-2.6.21-vm-acct-user/mm/nommu.c
--- linux-2.6.21/mm/nommu.c	2007-05-07 20:44:50.000000000 +0200
+++ linux-2.6.21-vm-acct-user/mm/nommu.c	2007-05-07 23:27:03.000000000 +0200
@@ -1240,16 +1240,18 @@ EXPORT_SYMBOL(get_unmapped_area);
 int __vm_enough_memory(long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed;
+	struct vm_acct_values v;
 
+	vm_acct_get_config(&v);
 	vm_acct_memory(pages);
 
 	/*
 	 * Sometimes we want to use more memory than we have
 	 */
-	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+	if (v.overcommit_memory == OVERCOMMIT_ALWAYS)
 		return 0;
 
-	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+	if (v.overcommit_memory == OVERCOMMIT_GUESS) {
 		unsigned long n;
 
 		free = global_page_state(NR_FILE_PAGES);
@@ -1299,7 +1301,7 @@ int __vm_enough_memory(long pages, int c
 		goto error;
 	}
 
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed = totalram_pages * v.overcommit_ratio / 100;
 	/*
 	 * Leave the last 3% for root
 	 */
diff -urpN linux-2.6.21/mm/swap.c linux-2.6.21-vm-acct-user/mm/swap.c
--- linux-2.6.21/mm/swap.c	2007-05-07 20:44:50.000000000 +0200
+++ linux-2.6.21-vm-acct-user/mm/swap.c	2007-05-07 23:33:40.000000000 +0200
@@ -30,6 +30,10 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -455,6 +459,196 @@ unsigned pagevec_lookup_tag(struct pagev
 
 EXPORT_SYMBOL(pagevec_lookup_tag);
 
+#ifdef CONFIG_VM_ACCT_USER
+
+#define VM_ACCT_HASH_SHIFT	10	/* log2 of the number of hash buckets */
+#define VM_ACCT_HASH_SIZE	(1UL << VM_ACCT_HASH_SHIFT)
+#define vm_acct_hashfn(uid) hash_long((unsigned long)(uid), VM_ACCT_HASH_SHIFT)
+
+/* One hash entry: the overcommit policy configured for a single uid */
+typedef struct vm_acct_hash_struct
+{
+	uid_t uid;	/* lookup key */
+	struct vm_acct_values val;	/* overcommit_memory / overcommit_ratio */
+	struct hlist_node vm_acct_chain;	/* link in a vm_acct_hash[] bucket */
+} vm_acct_hash_t;
+
+/* Hash list used to store per-user VM overcommit configurations */
+static struct hlist_head *vm_acct_hash;
+
+/* VM overcommit hash table spinlock */
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(vm_acct_lock);
+
+/*
+ * Get current->uid's policy; sysctl defaults if no entry or no hash table.
+ */
+void vm_acct_get_config(struct vm_acct_values *v)
+{
+	struct hlist_node *elem;
+	vm_acct_hash_t *p;
+	uid_t uid = current->uid;
+
+	v->overcommit_memory = sysctl_overcommit_memory;
+	v->overcommit_ratio = sysctl_overcommit_ratio;
+	if (unlikely(!vm_acct_hash))	/* init_vm_acct() not run or failed */
+		return;
+
+	spin_lock_irq(&vm_acct_lock);
+	hlist_for_each_entry(p, elem, &vm_acct_hash[vm_acct_hashfn(uid)],
+			     vm_acct_chain) {
+		if (p->uid == uid) {
+			*v = p->val;	/* per-uid entry overrides the defaults */
+			break;
+		}
+	}
+	spin_unlock_irq(&vm_acct_lock);
+}
+
+/*
+ * Create or update the hash entry for @uid.
+ * The candidate element is allocated before the lock is taken so that the
+ * lookup and the insertion happen in one critical section; dropping the
+ * lock in between would let two concurrent writers insert duplicates.
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int __vm_acct_set_element(uid_t uid,
+			int overcommit_memory, int overcommit_ratio)
+{
+	struct hlist_head *head = &vm_acct_hash[vm_acct_hashfn(uid)];
+	struct hlist_node *elem;
+	vm_acct_hash_t *p, *new;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (unlikely(!new))
+		return -ENOMEM;
+	new->uid = uid;
+	new->val.overcommit_memory = overcommit_memory;
+	new->val.overcommit_ratio = overcommit_ratio;
+
+	spin_lock_irq(&vm_acct_lock);
+	hlist_for_each_entry(p, elem, head, vm_acct_chain) {
+		if (p->uid == uid) {
+			/* Existing entry: update in place, drop the spare. */
+			p->val = new->val;
+			spin_unlock_irq(&vm_acct_lock);
+			kfree(new);
+			return 0;
+		}
+	}
+	hlist_add_head(&new->vm_acct_chain, head);
+	spin_unlock_irq(&vm_acct_lock);
+
+	return 0;
+}
+
+/*
+ * Write handler for /proc/overcommit_uid. Parses one record of the form
+ *     uid:overcommit_memory:overcommit_ratio
+ * Returns the byte count consumed, or -EFAULT / -EINVAL / -ENOMEM.
+ */
+static ssize_t vm_acct_set(struct file *filp, const char __user *buffer,
+			   size_t count, loff_t *data)
+{
+	char buf[128];
+	char *om, *or;
+	int ret;
+
+	if (count > sizeof(buf) - 1)
+		return -EFAULT;
+
+	if (copy_from_user(buf, buffer, count))
+		return -EFAULT;
+
+	/*
+	 * Terminate right after the copied bytes; everything past count is
+	 * uninitialized stack and must not be seen by the parser.
+	 */
+	buf[count] = '\0';
+
+	om = strstr(buf, ":");
+	if ((om == NULL) || (*++om == '\0'))
+		return -EINVAL;
+
+	or = strstr(om, ":");
+	if ((or == NULL) || (*++or == '\0'))
+		return -EINVAL;
+
+	/* simple_strtoul/simple_strtol stop at the first non-digit (':', '\n') */
+	ret = __vm_acct_set_element((uid_t)simple_strtoul(buf, NULL, 10),
+			    (int)simple_strtol(om, NULL, 10),
+			    (int)simple_strtol(or, NULL, 10));
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+/*
+ * seq_file show callback: print each configured entry on its own line as
+ * uid:overcommit_memory:overcommit_ratio (uid is unsigned, hence %u).
+ */
+static int vm_acct_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *elem;
+	vm_acct_hash_t *p;
+	int i;
+
+	spin_lock_irq(&vm_acct_lock);
+	for (i = 0; i < VM_ACCT_HASH_SIZE; i++) {
+		/* An empty bucket simply yields no iterations. */
+		hlist_for_each_entry(p, elem, &vm_acct_hash[i],
+				vm_acct_chain) {
+			seq_printf(m, "%u:%i:%i\n",
+				   p->uid, p->val.overcommit_memory,
+				   p->val.overcommit_ratio);
+		}
+	}
+	spin_unlock_irq(&vm_acct_lock);
+
+	return 0;
+}
+
+static int vm_acct_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, vm_acct_show, NULL);	/* NULL: no private data */
+}
+
+static struct file_operations vm_acct_ops = {
+	.open		= vm_acct_open,
+	.read		= seq_read,
+	.write		= vm_acct_set,
+	.llseek		= seq_lseek,
+	.release	= single_release,	/* pairs with single_open() */
+};
+
+/* Allocate the per-uid hash table and register /proc/overcommit_uid. */
+static int __init init_vm_acct(void)
+{
+	struct proc_dir_entry *pe;
+	int i;
+
+	vm_acct_hash = kmalloc(VM_ACCT_HASH_SIZE * sizeof(*(vm_acct_hash)),
+			       GFP_KERNEL);
+	if (!vm_acct_hash)
+		return -ENOMEM;
+	/* VM_ACCT_HASH_SIZE is already the bucket count, not a byte size. */
+	printk(KERN_INFO "vm_acct_uid hash table entries: %lu\n",
+	       VM_ACCT_HASH_SIZE);
+
+	spin_lock_irq(&vm_acct_lock);
+	for (i = 0; i < VM_ACCT_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&vm_acct_hash[i]);
+	spin_unlock_irq(&vm_acct_lock);
+
+	pe = create_proc_entry("overcommit_uid", 0600, NULL);
+	if (!pe)
+		return -ENOMEM;
+	pe->proc_fops = &vm_acct_ops;
+	return 0;
+}
+__initcall(init_vm_acct);
+#endif /* CONFIG_VM_ACCT_USER */
+
 #ifdef CONFIG_SMP
 /*
  * We tolerate a little inaccuracy to avoid ping-ponging the counter between

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] VM: per-user overcommit policy
  2007-05-07 20:23 ` Alan Cox
@ 2007-05-07 22:49   ` Andrea Righi
  2007-05-08 11:54     ` Alan Cox
  0 siblings, 1 reply; 8+ messages in thread
From: Andrea Righi @ 2007-05-07 22:49 UTC (permalink / raw)
  To: Alan Cox; +Cc: LKML, linux-mm

Alan Cox wrote:
>> - allow uid=1001 and uid=1002 (common users) to allocate memory only if the
>>   total committed space is below the 50% of the physical RAM + the size of
>>   swap:
>> root@host # echo 1001:2:50 > /proc/overcommit_uid
>> root@host # echo 1002:2:50 > /proc/overcommit_uid
> 
> There are some fundamental problems with this model - the moment you mix
> strict overcommit with anything else it ceases to be a strict overcommit
> and you might as well use existing overcommit rules for most stuff
> 
> The other thing you are sort of faking is per user resource management -
> which is a subset of per group of users resource management which is
> useful - eg "students can't hog the machine"
> 
> I don't see that this is the right approach compared with the container
> work and openvz work that is currently active and far more flexible.
> 

Obviously I was not proposing a nice theoretical model; my work is more of a
quick and dirty hack that could solve some problems (at least in my case),
like the crash of critical services due to OOM-killing (or due to the failure of
a malloc() when the OOM-killer is disabled).

When $VERY_CRITICAL_DAEMON dies *all* the users blame the sysadmin [me]. If a
user application dies because a malloc() returns NULL, the sysadmin [I] can
blame the user saying: "hey! _you_ tried to hog the machine and _your_
application is not able to handle the NULL result of the malloc()s!"... :-)

A solution could be to define the critical processes unkillable via
/proc/<pid>/oom_adj, but the per-process approach doesn't resolve all the
possible cases and it's quite difficult to manage in big environments, like HPC
clusters.

Anyway, it seems that I need to deepen my knowledge about the recent development
of process containers and openvz...

Thanks,
-Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] VM: per-user overcommit policy
  2007-05-07 22:49   ` Andrea Righi
@ 2007-05-08 11:54     ` Alan Cox
  0 siblings, 0 replies; 8+ messages in thread
From: Alan Cox @ 2007-05-08 11:54 UTC (permalink / raw)
  To: righiandr; +Cc: LKML, linux-mm

> When $VERY_CRITICAL_DAEMON dies *all* the users blame the sysadmin [me]. If a
> user application dies because a malloc() returns NULL, the sysadmin [I] can
> blame the user saying: "hey! _you_ tried to hog the machine and _your_
> application is not able to handle the NULL result of the malloc()s!"... :-)

If you allow overcommit by the daemons and not user space then some of
the time you will still get out of memory kills which may well hit your
daemon process.

> A solution could be to define the critical processes unkillable via
> /proc/<pid>/oom_adj, but the per-process approach doesn't resolve all the
> possible cases and it's quite difficult to manage in big environments, like HPC
> clusters.

If you are running no overcommit you should never get an out of memory
kill.

> Anyway, it seems that I need to deepen my knowledge about the recent development
> of process containers and openvz...

I think that does what you need - you'd create containers for critical
services and for the users and split resources to protect one from the
other.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2007-05-08 11:54 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-05-07 18:56 [RFC][PATCH] VM: per-user overcommit policy Andrea Righi
2007-05-07 19:16 ` William Lee Irwin III
2007-05-07 19:49   ` William Lee Irwin III
2007-05-07 22:48     ` Andrea Righi
2007-05-07 19:31 ` Luca Tettamanti
2007-05-07 20:23 ` Alan Cox
2007-05-07 22:49   ` Andrea Righi
2007-05-08 11:54     ` Alan Cox

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox