diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/arch/i386/kernel/entry.S linux-2.5.41-memory_binding_api/arch/i386/kernel/entry.S
--- linux-2.5.41-vanilla/arch/i386/kernel/entry.S	Mon Oct  7 11:23:58 2002
+++ linux-2.5.41-memory_binding_api/arch/i386/kernel/entry.S	Wed Oct  9 17:54:31 2002
@@ -736,6 +736,8 @@
 	.long sys_alloc_hugepages	/* 250 */
 	.long sys_free_hugepages
 	.long sys_exit_group
+	.long sys_mem_setbinding
+	.long sys_mem_getbinding
 
 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long sys_ni_syscall
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/arch/i386/kernel/numaq.c linux-2.5.41-memory_binding_api/arch/i386/kernel/numaq.c
--- linux-2.5.41-vanilla/arch/i386/kernel/numaq.c	Mon Oct  7 11:23:33 2002
+++ linux-2.5.41-memory_binding_api/arch/i386/kernel/numaq.c	Wed Oct  9 17:54:16 2002
@@ -52,6 +52,10 @@
 	numnodes = 0;
 	for(node = 0; node < MAX_NUMNODES; node++) {
 		if(scd->quads_present31_0 & (1 << node)) {
+			if (test_and_set_bit(numnodes, &node_online_map)) {
+				printk("smp_dump_qct: node already counted?!?!\n");
+				BUG();
+			}
 			numnodes++;
 			eq = &scd->eq[node];
 			/* Convert to pages */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/arch/i386/kernel/smpboot.c linux-2.5.41-memory_binding_api/arch/i386/kernel/smpboot.c
--- linux-2.5.41-vanilla/arch/i386/kernel/smpboot.c	Mon Oct  7 11:24:14 2002
+++ linux-2.5.41-memory_binding_api/arch/i386/kernel/smpboot.c	Wed Oct  9 17:54:16 2002
@@ -61,6 +61,10 @@
 
 /* Bitmask of currently online CPUs */
 unsigned long cpu_online_map;
+/* Bitmask of currently online memory blocks */
+unsigned long memblk_online_map = 0UL;
+/* Bitmask of currently online nodes */
+unsigned long node_online_map = 0UL;
 
 static volatile unsigned long cpu_callin_map;
 volatile unsigned long cpu_callout_map;
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/asm-i386/smp.h linux-2.5.41-memory_binding_api/include/asm-i386/smp.h
--- linux-2.5.41-vanilla/include/asm-i386/smp.h	Mon Oct  7 11:23:22 2002
+++ linux-2.5.41-memory_binding_api/include/asm-i386/smp.h	Wed Oct  9 17:54:16 2002
@@ -54,6 +54,8 @@
 extern void smp_alloc_memory(void);
 extern unsigned long phys_cpu_present_map;
 extern unsigned long cpu_online_map;
+extern unsigned long memblk_online_map;
+extern unsigned long node_online_map;
 extern volatile unsigned long smp_invalidate_needed;
 extern int pic_mode;
 extern int smp_num_siblings;
@@ -102,6 +104,20 @@
 	return -1;
 }
 
+#define memblk_online(memblk)	(memblk_online_map & (1UL << (memblk)))
+
+extern inline unsigned int num_online_memblks(void)
+{
+	return hweight32(memblk_online_map);
+}
+
+#define node_online(node)	(node_online_map & (1UL << (node)))
+
+extern inline unsigned int num_online_nodes(void)
+{
+	return hweight32(node_online_map);
+}
+
 static __inline int hard_smp_processor_id(void)
 {
 	/* we don't want to mark this access volatile - bad code generation */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/asm-i386/unistd.h linux-2.5.41-memory_binding_api/include/asm-i386/unistd.h
--- linux-2.5.41-vanilla/include/asm-i386/unistd.h	Mon Oct  7 11:24:44 2002
+++ linux-2.5.41-memory_binding_api/include/asm-i386/unistd.h	Wed Oct  9 17:54:31 2002
@@ -257,6 +257,8 @@
 #define __NR_alloc_hugepages	250
 #define __NR_free_hugepages	251
 #define __NR_exit_group		252
+#define __NR_mem_setbinding	253
+#define __NR_mem_getbinding	254
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/init_task.h linux-2.5.41-memory_binding_api/include/linux/init_task.h
--- linux-2.5.41-vanilla/include/linux/init_task.h	Mon Oct  7 11:23:25 2002
+++ linux-2.5.41-memory_binding_api/include/linux/init_task.h	Wed Oct  9 17:54:08 2002
@@ -76,6 +76,10 @@
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
 	.group_leader	= &tsk,						\
+	.memblk_binding	= {						\
+		.bitmask	= MEMBLK_NO_BINDING,			\
+		.behavior	= MPOL_STRICT,				\
+	},								\
 	.wait_chldexit	= __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
 	.real_timer	= {						\
 		.function	= it_real_fn				\
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/membind.h linux-2.5.41-memory_binding_api/include/linux/membind.h
--- linux-2.5.41-vanilla/include/linux/membind.h	Wed Dec 31 16:00:00 1969
+++ linux-2.5.41-memory_binding_api/include/linux/membind.h	Wed Oct  9 17:54:08 2002
@@ -0,0 +1,50 @@
+/*
+ * linux/include/linux/membind.h
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#ifndef _LINUX_MEMBIND_H
+#define _LINUX_MEMBIND_H
+
+#include
+
+#define MEMBLK_NO_BINDING	(~0UL)
+
+typedef struct memblk_list {
+	unsigned long	bitmask;
+	int		behavior;
+} memblk_list_t;
+
+
+#define is_valid_memblk_behavior(x)	(1)	/* for now */
+#define is_memblk_subset(x, y)	(!(~(x) & (y)))	/* true iff y is a subset of x */
+
+#define MPOL_STRICT	0	/* Memory MUST be allocated according to binding */
+#define MPOL_LOOSE	1	/* Memory will be allocated according to binding, but
+			   can fall back to other memory blocks if necessary. */
+#define MPOL_FIRST	2	/* UNUSED FOR NOW */
+#define MPOL_STRIPE	4	/* UNUSED FOR NOW */
+#define MPOL_RR		8	/* UNUSED FOR NOW */
+
+#endif /* _LINUX_MEMBIND_H */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/mmzone.h linux-2.5.41-memory_binding_api/include/linux/mmzone.h
--- linux-2.5.41-vanilla/include/linux/mmzone.h	Mon Oct  7 11:22:55 2002
+++ linux-2.5.41-memory_binding_api/include/linux/mmzone.h	Wed Oct  9 17:54:25 2002
@@ -167,8 +167,9 @@
 	unsigned long node_start_pfn;
 	unsigned long node_size;
 	int node_id;
+	int memblk_id;			/* A unique ID for each memory block */
 	struct pglist_data *pgdat_next;
-	wait_queue_head_t kswapd_wait;
+	wait_queue_head_t	kswapd_wait;
 } pg_data_t;
 
 extern int numnodes;
@@ -249,6 +250,29 @@
 #define for_each_zone(zone) \
 	for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
 
+/**
+ * for_each_valid_zone - helper macro to iterate over all memory zones
+ * in a zonelist
+ * @zone - pointer to struct zone variable
+ * @zonelist - pointer to struct zonelist variable
+ * @zonep - struct zone ** cursor used to walk the zonelist
+ *
+ * for_each_valid_zone() is basically an easier to read version of this
+ * piece of code:
+ *
+ *	for (i = 0; zonelist->zones[i] != NULL; i++) {
+ *		struct zone *z = zonelist->zones[i];
+ *		...
+ *	}
+ *
+ * except that zones whose memory block is not set in the caller's memblk
+ * binding are skipped.  Useful for several loops in __alloc_pages.
+ */
+#define for_each_valid_zone(zone, zonelist, zonep) \
+	for (zonep = (zonelist)->zones; (zone = *(zonep)) != NULL; (zonep)++) \
+		if (current->memblk_binding.bitmask & \
+		    (1UL << (zone)->zone_pgdat->memblk_id))
+
 #ifdef CONFIG_NUMA
 #define MAX_NR_MEMBLKS	BITS_PER_LONG /* Max number of Memory Blocks */
 #else /* !CONFIG_NUMA */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/sched.h linux-2.5.41-memory_binding_api/include/linux/sched.h
--- linux-2.5.41-vanilla/include/linux/sched.h	Mon Oct  7 11:23:25 2002
+++ linux-2.5.41-memory_binding_api/include/linux/sched.h	Wed Oct  9 17:54:08 2002
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include <linux/membind.h>
 
 struct exec_domain;
 
@@ -335,6 +336,9 @@
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 
+	/* additional Memory Binding stuff */
+	memblk_list_t memblk_binding;
+
 	wait_queue_head_t wait_chldexit;	/* for wait4() */
 	struct completion *vfork_done;		/* for vfork() */
 	int *user_tid;				/* for CLONE_CLEARTID */
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/include/linux/smp.h linux-2.5.41-memory_binding_api/include/linux/smp.h
--- linux-2.5.41-vanilla/include/linux/smp.h	Mon Oct  7 11:24:39 2002
+++ linux-2.5.41-memory_binding_api/include/linux/smp.h	Wed Oct  9 17:54:16 2002
@@ -94,7 +94,13 @@
 #define cpu_online(cpu)			({ BUG_ON((cpu) != 0); 1; })
 #define num_online_cpus()		1
 #define num_booting_cpus()		1
-#define cpu_possible(cpu)	({ BUG_ON((cpu) != 0); 1; })
+#define cpu_possible(cpu)		({ BUG_ON((cpu) != 0); 1; })
+#define memblk_online_map		1
+#define memblk_online(memblk)		({ BUG_ON((memblk) != 0); 1; })
+#define num_online_memblks()		1
+#define node_online_map			1
+#define node_online(node)		({ BUG_ON((node) != 0); 1; })
+#define num_online_nodes()		1
 
 struct notifier_block;
 
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/kernel/sys.c linux-2.5.41-memory_binding_api/kernel/sys.c
--- linux-2.5.41-vanilla/kernel/sys.c	Mon Oct  7 11:23:25 2002
+++ linux-2.5.41-memory_binding_api/kernel/sys.c	Wed Oct  9 17:54:31 2002
@@ -1303,6 +1303,96 @@
 	return mask;
 }
 
+/**
+ * sys_mem_setbinding - set the memory binding of a process
+ * @pid: pid of the process
+ * @memblks: new bitmask of memory blocks
+ * @behavior: new behavior
+ */
+asmlinkage long sys_mem_setbinding(pid_t pid, unsigned long memblks,
+				   unsigned int behavior)
+{
+	long ret;
+	struct task_struct *p;
+
+	/*
+	 * Make sure that at least one of the memblks in the
+	 * new mask is online.
+	 */
+	memblks &= memblk_online_map;
+	if (!memblks)
+		return -EINVAL;
+
+	/*
+	 * Test to make sure the behavior argument is valid.
+	 */
+	if (!is_valid_memblk_behavior(behavior))
+		return -EINVAL;
+
+	read_lock(&tasklist_lock);
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * The caller must either own the process or have CAP_SYS_NICE.
+	 */
+	ret = -EPERM;
+	if ((current->euid != p->euid) && (current->euid != p->uid) &&
+	    !capable(CAP_SYS_NICE))
+		goto out_put;
+
+	ret = 0;
+	p->memblk_binding.bitmask = memblks;
+	p->memblk_binding.behavior = behavior;
+
+out_put:
+	put_task_struct(p);
+	return ret;
+}
+
+/**
+ * sys_mem_getbinding - get the memory binding of a process
+ * @pid: pid of the process
+ * @user_bitmask: bitmask of memory blocks
+ * @user_behavior: behavior
+ */
+asmlinkage long sys_mem_getbinding(pid_t pid, unsigned long *user_bitmask,
+				   unsigned int *user_behavior)
+{
+	long ret;
+	unsigned long bitmask;
+	unsigned int behavior;
+	struct task_struct *p;
+
+	read_lock(&tasklist_lock);
+
+	ret = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	ret = 0;
+	bitmask = p->memblk_binding.bitmask;
+	behavior = p->memblk_binding.behavior;
+
+out_unlock:
+	read_unlock(&tasklist_lock);
+	if (ret)
+		return ret;
+	if (copy_to_user(user_bitmask, &bitmask, sizeof(unsigned long)))
+		return -EFAULT;
+	if (copy_to_user(user_behavior, &behavior, sizeof(unsigned int)))
+		return -EFAULT;
+	return ret;
+}
+
 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5)
 {
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/mm/numa.c linux-2.5.41-memory_binding_api/mm/numa.c
--- linux-2.5.41-vanilla/mm/numa.c	Mon Oct  7 11:24:50 2002
+++ linux-2.5.41-memory_binding_api/mm/numa.c	Wed Oct  9 17:54:16 2002
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 
 int numnodes = 1;	/* Initialized for UMA platforms */
 
@@ -29,6 +30,7 @@
 
 	pgdat = &contig_page_data;
 	contig_page_data.node_id = 0;
+	contig_page_data.memblk_id = 0;
 	contig_page_data.node_start_pfn = node_start_pfn;
 	calculate_totalpages (&contig_page_data, zones_size, zholes_size);
 	if (pmap == (struct page *)0) {
@@ -37,6 +39,7 @@
 	}
 	contig_page_data.node_mem_map = pmap;
 	free_area_init_core(&contig_page_data, zones_size, zholes_size);
+	memblk_online_map = 1UL;
 	mem_map = contig_page_data.node_mem_map;
 }
 
@@ -66,6 +69,7 @@
 	unsigned long size;
 
 	pgdat->node_id = nid;
+	pgdat->memblk_id = __node_to_memblk(nid);
 	pgdat->node_start_pfn = node_start_pfn;
 	calculate_totalpages (pgdat, zones_size, zholes_size);
 	if (pmap == (struct page *)0) {
@@ -74,6 +78,10 @@
 	}
 	pgdat->node_mem_map = pmap;
 	free_area_init_core(pgdat, zones_size, zholes_size);
+	if (test_and_set_bit(num_online_memblks(), &memblk_online_map)) {
+		printk("free_area_init_node: memblk already counted?!?!\n");
+		BUG();
+	}
 
 	/*
 	 * Get space for the valid bitmap.
diff -Nur --exclude-from=/usr/src/.dontdiff linux-2.5.41-vanilla/mm/page_alloc.c linux-2.5.41-memory_binding_api/mm/page_alloc.c
--- linux-2.5.41-vanilla/mm/page_alloc.c	Mon Oct  7 11:23:24 2002
+++ linux-2.5.41-memory_binding_api/mm/page_alloc.c	Wed Oct  9 17:54:25 2002
@@ -318,57 +318,46 @@
 		struct zonelist *zonelist)
 {
 	unsigned long min;
-	struct zone **zones, *classzone;
+	struct zone *classzone, *zone, **zonep;
 	struct page * page;
-	int freed, i;
+	int freed;
 
 	if (gfp_mask & __GFP_WAIT)
 		might_sleep();
 
 	mod_page_state(pgalloc, 1<<order);
 
-	zones = zonelist->zones;	/* the list of zones suitable for gfp_mask */
-	classzone = zones[0];
+	classzone = zonelist->zones[0];
 	if (classzone == NULL)	/* no zones in the zonelist */
 		return NULL;
 
 	/* Go through the zonelist once, looking for a zone with enough free */
 	min = 1UL << order;
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-
+	for_each_valid_zone(zone, zonelist, zonep) {
 		/* the incremental min is allegedly to discourage fallback */
-		min += z->pages_low;
-		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
-			if (page)
+		min += zone->pages_low;
+		if (zone->free_pages > min || zone->free_pages >= zone->pages_high)
+			if ((page = rmqueue(zone, order)))
 				return page;
-		}
 	}
 
 	/* we're somewhat low on memory, failed to find what we needed */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-		if (z->free_pages <= z->pages_low &&
-		    waitqueue_active(&z->zone_pgdat->kswapd_wait))
-			wake_up_interruptible(&z->zone_pgdat->kswapd_wait);
+	for_each_valid_zone(zone, zonelist, zonep) {
+		if (zone->free_pages <= zone->pages_low &&
+		    waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+			wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
 	}
 
 	/* Go through the zonelist again, taking __GFP_HIGH into account */
 	min = 1UL << order;
-	for (i = 0; zones[i] != NULL; i++) {
-		unsigned long local_min;
-		struct zone *z = zones[i];
-
-		local_min = z->pages_min;
+	for_each_valid_zone(zone, zonelist, zonep) {
 		if (gfp_mask & __GFP_HIGH)
-			local_min >>= 2;
-		min += local_min;
-		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
-			if (page)
+			min += zone->pages_min >> 2;
+		else
+			min += zone->pages_min;
+		if (zone->free_pages > min || zone->free_pages >= zone->pages_high)
+			if ((page = rmqueue(zone, order)))
 				return page;
-		}
 	}
 
 	/* here we're in the low on memory slow path */
@@ -376,13 +365,9 @@
 rebalance:
 	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
 		/* go through the zonelist yet again, ignoring mins */
-		for (i = 0; zones[i] != NULL; i++) {
-			struct zone *z = zones[i];
-
-			page = rmqueue(z, order);
-			if (page)
+		for_each_valid_zone(zone, zonelist, zonep)
+			if ((page = rmqueue(zone, order)))
 				return page;
-		}
 nopage:
 		if (!(current->flags & PF_NOWARN)) {
 			printk("%s: page allocation failure."
@@ -403,15 +388,11 @@
 
 	/* go through the zonelist yet one more time */
 	min = 1UL << order;
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-
-		min += z->pages_min;
-		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
-			if (page)
+	for_each_valid_zone(zone, zonelist, zonep) {
+		min += zone->pages_min;
+		if (zone->free_pages > min || zone->free_pages >= zone->pages_high)
+			if ((page = rmqueue(zone, order)))
 				return page;
-		}
 	}
 
 	/* Don't let big-order allocations loop */
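
For completeness, here is how a process would exercise the two new calls from
userspace. This is only a minimal sketch, not part of the patch: it assumes an
i386 build, calls through syscall(2) since no glibc wrappers exist yet, and
copies the syscall numbers and MPOL_* values from the unistd.h and membind.h
hunks above rather than including any kernel headers.

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_mem_setbinding	253	/* from include/asm-i386/unistd.h above */
#define __NR_mem_getbinding	254

#define MPOL_STRICT		0	/* from include/linux/membind.h above */
#define MPOL_LOOSE		1

int main(void)
{
	unsigned long mask;
	unsigned int behavior;

	/* Strictly bind this process to memory blocks 0 and 1. */
	if (syscall(__NR_mem_setbinding, getpid(), 0x3UL, MPOL_STRICT) < 0) {
		fprintf(stderr, "mem_setbinding: %s\n", strerror(errno));
		return 1;
	}

	/* Read the binding back. */
	if (syscall(__NR_mem_getbinding, getpid(), &mask, &behavior) < 0) {
		fprintf(stderr, "mem_getbinding: %s\n", strerror(errno));
		return 1;
	}
	printf("bitmask 0x%lx, behavior %s\n", mask,
	       behavior == MPOL_STRICT ? "MPOL_STRICT" : "MPOL_LOOSE");
	return 0;
}

Note that sys_mem_setbinding() masks the requested bitmask against
memblk_online_map, so the call fails with -EINVAL if none of the requested
memory blocks is online.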