* [MM] Make mm counters per cpu instead of atomic
@ 2009-11-04 19:14 Christoph Lameter
2009-11-04 19:17 ` [MM] Remove rss batching from copy_page_range() Christoph Lameter
` (4 more replies)
0 siblings, 5 replies; 33+ messages in thread
From: Christoph Lameter @ 2009-11-04 19:14 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo
From: Christoph Lameter <cl@linux-foundation.org>
Subject: Make mm counters per cpu
Changing the mm counters to per cpu counters is possible after the introduction
of the generic per cpu operations (currently in percpu and -next).
With that the contention on the counters in mm_struct can be avoided. The
USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
need to perform atomic updates to mm counters anymore. Various code paths
can be simplified since per cpu counter updates are fast and batching
of counter updates is no longer needed.
One price to pay for these improvements is the need to scan over all percpu
counters when the actual count values are needed.
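For illustration, the two resulting paths look like this (sketch only, not
part of the diff below; cpu and total are assumed locals of the caller):

	/* Update side: a raw per cpu increment, no atomic rmw, no lock */
	__this_cpu_inc(mm->rss->anon);

	/* Read side: fold the per cpu values whenever a total is needed */
	for_each_possible_cpu(cpu)
		total += per_cpu_ptr(mm->rss, cpu)->anon;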
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
---
fs/proc/task_mmu.c | 14 +++++++++-
include/linux/mm_types.h | 16 ++++--------
include/linux/sched.h | 61 ++++++++++++++++++++---------------------------
kernel/fork.c | 25 ++++++++++++++-----
mm/filemap_xip.c | 2 -
mm/fremap.c | 2 -
mm/init-mm.c | 3 ++
mm/memory.c | 20 +++++++--------
mm/rmap.c | 10 +++----
mm/swapfile.c | 2 -
10 files changed, 84 insertions(+), 71 deletions(-)
Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/include/linux/mm_types.h 2009-11-04 13:13:42.000000000 -0600
@@ -24,11 +24,10 @@ struct address_space;
#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
-#if USE_SPLIT_PTLOCKS
-typedef atomic_long_t mm_counter_t;
-#else /* !USE_SPLIT_PTLOCKS */
-typedef unsigned long mm_counter_t;
-#endif /* !USE_SPLIT_PTLOCKS */
+struct mm_counter {
+ long file;
+ long anon;
+};
/*
* Each physical page in the system has a struct page associated with
@@ -223,11 +222,8 @@ struct mm_struct {
* by mmlist_lock
*/
- /* Special counters, in some configurations protected by the
- * page_table_lock, in other configurations by being atomic.
- */
- mm_counter_t _file_rss;
- mm_counter_t _anon_rss;
+ /* Special percpu counters */
+ struct mm_counter *rss;
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/include/linux/sched.h 2009-11-04 13:13:42.000000000 -0600
@@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
extern void arch_unmap_area(struct mm_struct *, unsigned long);
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
-#if USE_SPLIT_PTLOCKS
-/*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-
-#else /* !USE_SPLIT_PTLOCKS */
-/*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
-#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
-#define get_mm_counter(mm, member) ((mm)->_##member)
-#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
-#define inc_mm_counter(mm, member) (mm)->_##member++
-#define dec_mm_counter(mm, member) (mm)->_##member--
-
-#endif /* !USE_SPLIT_PTLOCKS */
-
-#define get_mm_rss(mm) \
- (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
-#define update_hiwater_rss(mm) do { \
- unsigned long _rss = get_mm_rss(mm); \
- if ((mm)->hiwater_rss < _rss) \
- (mm)->hiwater_rss = _rss; \
-} while (0)
-#define update_hiwater_vm(mm) do { \
- if ((mm)->hiwater_vm < (mm)->total_vm) \
- (mm)->hiwater_vm = (mm)->total_vm; \
-} while (0)
+static inline unsigned long get_mm_rss(struct mm_struct *mm)
+{
+ int cpu;
+ unsigned long r = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+ r = c->file + c->anon;
+ }
+
+ return r;
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+ unsigned long _rss = get_mm_rss(mm);
+ if (mm->hiwater_rss < _rss)
+ mm->hiwater_rss = _rss;
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+ if (mm->hiwater_vm < mm->total_vm)
+ mm->hiwater_vm = mm->total_vm;
+}
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/kernel/fork.c 2009-11-04 13:14:19.000000000 -0600
@@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
{
+ int cpu;
+
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
@@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
mm->nr_ptes = 0;
- set_mm_counter(mm, file_rss, 0);
- set_mm_counter(mm, anon_rss, 0);
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *m;
+
+ memset(m, sizeof(struct mm_counter), 0);
+ }
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
@@ -480,7 +485,13 @@ struct mm_struct * mm_alloc(void)
mm = allocate_mm();
if (mm) {
memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
+ mm->rss = alloc_percpu(struct mm_counter);
+ if (mm->rss)
+ mm = mm_init(mm, current);
+ else {
+ free_mm(mm);
+ mm = NULL;
+ }
}
return mm;
}
@@ -496,6 +507,7 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
+ free_percpu(mm->rss);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -631,6 +643,9 @@ struct mm_struct *dup_mm(struct task_str
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
+ mm->rss = alloc_percpu(struct mm_counter);
+ if (!mm->rss)
+ goto fail_nomem;
/* Initializing for Swap token stuff */
mm->token_priority = 0;
@@ -661,15 +676,13 @@ free_pt:
mm->binfmt = NULL;
mmput(mm);
-fail_nomem:
- return NULL;
-
fail_nocontext:
/*
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
*/
mm_free_pgd(mm);
+fail_nomem:
free_mm(mm);
return NULL;
}
Index: linux-2.6/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.orig/fs/proc/task_mmu.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/fs/proc/task_mmu.c 2009-11-04 13:13:42.000000000 -0600
@@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
int task_statm(struct mm_struct *mm, int *shared, int *text,
int *data, int *resident)
{
- *shared = get_mm_counter(mm, file_rss);
+ int cpu;
+ int anon_rss = 0;
+ int file_rss = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+ anon_rss += c->anon;
+ file_rss += c->file;
+ }
+ *shared = file_rss;
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm;
- *resident = *shared + get_mm_counter(mm, anon_rss);
+ *resident = *shared + anon_rss;
return mm->total_vm;
}
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/filemap_xip.c 2009-11-04 13:13:42.000000000 -0600
@@ -194,7 +194,7 @@ retry:
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush_notify(vma, address, pte);
page_remove_rmap(page);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
page_cache_release(page);
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/fremap.c 2009-11-04 13:13:42.000000000 -0600
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
page_remove_rmap(page);
page_cache_release(page);
update_hiwater_rss(mm);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
}
} else {
if (!pte_file(pte))
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/memory.c 2009-11-04 13:13:42.000000000 -0600
@@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
if (file_rss)
- add_mm_counter(mm, file_rss, file_rss);
+ __this_cpu_add(mm->rss->file, file_rss);
if (anon_rss)
- add_mm_counter(mm, anon_rss, anon_rss);
+ __this_cpu_add(mm->rss->anon, anon_rss);
}
/*
@@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter(mm, file_rss);
+ __this_cpu_inc(mm->rss->file);
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
@@ -2148,11 +2148,11 @@ gotten:
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter(mm, file_rss);
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->file);
+ __this_cpu_inc(mm->rss->anon);
}
} else
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
* discarded at swap_free().
*/
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
if (!pte_none(*page_table))
goto release;
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
page_add_new_anon_rmap(page, vma, address);
setpte:
set_pte_at(mm, address, page_table, entry);
@@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
page_add_new_anon_rmap(page, vma, address);
} else {
- inc_mm_counter(mm, file_rss);
+ __this_cpu_inc(mm->rss->file);
page_add_file_rmap(page);
if (flags & FAULT_FLAG_WRITE) {
dirty_page = page;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/rmap.c 2009-11-04 13:13:42.000000000 -0600
@@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
if (PageAnon(page))
- dec_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->anon);
else
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
} else if (PageAnon(page)) {
@@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
- dec_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->anon);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
@@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
entry = make_migration_entry(page, pte_write(pteval));
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
page_remove_rmap(page);
@@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned
page_remove_rmap(page);
page_cache_release(page);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/swapfile.c 2009-11-04 13:13:42.000000000 -0600
@@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
goto out;
}
- inc_mm_counter(vma->vm_mm, anon_rss);
+ __this_cpu_inc(vma->vm_mm->rss->anon);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6/mm/init-mm.c
===================================================================
--- linux-2.6.orig/mm/init-mm.c 2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/init-mm.c 2009-11-04 13:13:42.000000000 -0600
@@ -8,6 +8,8 @@
#include <asm/atomic.h>
#include <asm/pgtable.h>
+DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
+
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir,
@@ -17,4 +19,5 @@ struct mm_struct init_mm = {
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
+ .rss = &init_mm_counters,
};

* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 19:14 [MM] Make mm counters per cpu instead of atomic Christoph Lameter
@ 2009-11-04 19:17 ` Christoph Lameter
  2009-11-04 21:02   ` Andi Kleen
  2009-11-04 21:01 ` [MM] Make mm counters per cpu instead of atomic Andi Kleen
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2009-11-04 19:17 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

From: Christoph Lameter <cl@linux-foundation.org>
Subject: Remove rss batching from copy_page_range()

With per cpu counters in mm there is no need for batching
mm counter updates anymore. Update counters directly while
copying pages.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 mm/memory.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2009-11-04 12:15:03.000000000 -0600
+++ linux-2.6/mm/memory.c 2009-11-04 13:03:45.000000000 -0600
@@ -376,14 +376,6 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 return 0;
 }
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
-{
- if (file_rss)
- __this_cpu_add(mm->rss->file, file_rss);
- if (anon_rss)
- __this_cpu_add(mm->rss->anon, anon_rss);
-}
-
 /*
  * This function is called to print an error when a bad pte
  * is found. For example, we might have a PFN-mapped pte in
@@ -575,7 +567,7 @@ out:
 static inline void
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr, int *rss)
+ unsigned long addr)
 {
 unsigned long vm_flags = vma->vm_flags;
 pte_t pte = *src_pte;
@@ -630,7 +622,10 @@ copy_one_pte(struct mm_struct *dst_mm, s
 if (page) {
 get_page(page);
 page_dup_rmap(page);
- rss[PageAnon(page)]++;
+ if (PageAnon(page))
+ __this_cpu_inc(dst_mm->rss->anon);
+ else
+ __this_cpu_inc(dst_mm->rss->file);
 }
 out_set_pte:
@@ -645,10 +640,8 @@ static int copy_pte_range(struct mm_stru
 pte_t *src_pte, *dst_pte;
 spinlock_t *src_ptl, *dst_ptl;
 int progress = 0;
- int rss[2];
 again:
- rss[1] = rss[0] = 0;
 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 if (!dst_pte)
 return -ENOMEM;
@@ -674,14 +667,13 @@ again:
 progress++;
 continue;
 }
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr);
 progress += 8;
 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 arch_leave_lazy_mmu_mode();
 spin_unlock(src_ptl);
 pte_unmap_nested(orig_src_pte);
- add_mm_rss(dst_mm, rss[0], rss[1]);
 pte_unmap_unlock(orig_dst_pte, dst_ptl);
 cond_resched();
 if (addr != end)
@@ -803,8 +795,6 @@ static unsigned long zap_pte_range(struc
 struct mm_struct *mm = tlb->mm;
 pte_t *pte;
 spinlock_t *ptl;
- int file_rss = 0;
- int anon_rss = 0;
 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 arch_enter_lazy_mmu_mode();
@@ -850,14 +840,14 @@ static unsigned long zap_pte_range(struc
 set_pte_at(mm, addr, pte,
 pgoff_to_pte(page->index));
 if (PageAnon(page))
- anon_rss--;
+ __this_cpu_dec(mm->rss->anon);
 else {
 if (pte_dirty(ptent))
 set_page_dirty(page);
 if (pte_young(ptent) &&
 likely(!VM_SequentialReadHint(vma)))
 mark_page_accessed(page);
- file_rss--;
+ __this_cpu_dec(mm->rss->file);
 }
 page_remove_rmap(page);
 if (unlikely(page_mapcount(page) < 0))
@@ -880,7 +870,6 @@ static unsigned long zap_pte_range(struc
 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
- add_mm_rss(mm, file_rss, anon_rss);
 arch_leave_lazy_mmu_mode();
 pte_unmap_unlock(pte - 1, ptl);

* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 19:17 ` [MM] Remove rss batching from copy_page_range() Christoph Lameter
@ 2009-11-04 21:02   ` Andi Kleen
  2009-11-04 22:02     ` Christoph Lameter
  0 siblings, 1 reply; 33+ messages in thread
From: Andi Kleen @ 2009-11-04 21:02 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

Christoph Lameter <cl@linux-foundation.org> writes:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Remove rss batching from copy_page_range()
>
> With per cpu counters in mm there is no need for batching
> mm counter updates anymore. Update counters directly while
> copying pages.

Hmm, but with all the inlining with some luck the local
counters will be in registers. That will never be the case
with the per cpu counters.

So I'm not sure it's really an improvement?

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 21:02   ` Andi Kleen
@ 2009-11-04 22:02     ` Christoph Lameter
  2009-11-05 8:27        ` Andi Kleen
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2009-11-04 22:02 UTC (permalink / raw)
To: Andi Kleen
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009, Andi Kleen wrote:

> > With per cpu counters in mm there is no need for batching
> > mm counter updates anymore. Update counters directly while
> > copying pages.
>
> Hmm, but with all the inlining with some luck the local
> counters will be in registers. That will never be the case
> with the per cpu counters.

The function is too big for that to occur and the counters have to be
preserved across function calls. The code is shorter with the patch
applied:

christoph@:~/n/linux-2.6$ size mm/memory.o
   text    data     bss     dec     hex filename
  20140      56      40   20236    4f0c mm/memory.o
christoph@:~/n/linux-2.6$ quilt push
Applying patch mmcounter
patching file include/linux/mm_types.h
patching file include/linux/sched.h
patching file kernel/fork.c
patching file fs/proc/task_mmu.c
patching file mm/filemap_xip.c
patching file mm/fremap.c
patching file mm/memory.c
patching file mm/rmap.c
patching file mm/swapfile.c
patching file mm/init-mm.c

Now at patch mmcounter
christoph@:~/n/linux-2.6$ make mm/memory.o
  CHK     include/linux/version.h
  CHK     include/linux/utsrelease.h
  UPD     include/linux/utsrelease.h
  SYMLINK include/asm -> include/asm-x86
  CC      arch/x86/kernel/asm-offsets.s
  GEN     include/asm/asm-offsets.h
  CALL    scripts/checksyscalls.sh
  CC      mm/memory.o
christoph@:~/n/linux-2.6$ size mm/memory.o
   text    data     bss     dec     hex filename
  20028      56      40   20124    4e9c mm/memory.o
christoph@:~/n/linux-2.6$ quilt push
Applying patch simplify
patching file mm/memory.c

Now at patch simplify
christoph@:~/n/linux-2.6$ make mm/memory.o
  CHK     include/linux/version.h
  CHK     include/linux/utsrelease.h
  SYMLINK include/asm -> include/asm-x86
  CALL    scripts/checksyscalls.sh
  CC      mm/memory.o
christoph@:~/n/linux-2.6$ size mm/memory.o
   text    data     bss     dec     hex filename
  19888      56      40   19984    4e10 mm/memory.o

* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 22:02     ` Christoph Lameter
@ 2009-11-05 8:27        ` Andi Kleen
  0 siblings, 0 replies; 33+ messages in thread
From: Andi Kleen @ 2009-11-05 8:27 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andi Kleen, KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, Nov 04, 2009 at 05:02:12PM -0500, Christoph Lameter wrote:
> On Wed, 4 Nov 2009, Andi Kleen wrote:
>
> > > With per cpu counters in mm there is no need for batching
> > > mm counter updates anymore. Update counters directly while
> > > copying pages.
> >
> > Hmm, but with all the inlining with some luck the local
> > counters will be in registers. That will never be the case
> > with the per cpu counters.
>
> The function is too big for that to occur and the counters have to be

If it's only called once then gcc doesn't care about size.

> preserved across function calls. The code is shorter with the patch
> applied:

I see. Thanks for the data.

-Andi

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 [MM] Make mm counters per cpu instead of atomic Christoph Lameter
  2009-11-04 19:17 ` [MM] Remove rss batching from copy_page_range() Christoph Lameter
@ 2009-11-04 21:01 ` [MM] Make mm counters per cpu instead of atomic Andi Kleen
  2009-11-04 23:49 ` Dave Jones
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 33+ messages in thread
From: Andi Kleen @ 2009-11-04 21:01 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

Christoph Lameter <cl@linux-foundation.org> writes:
>
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.

Do you have numbers how costly alloc_percpu() is? I wonder what
this does to fork() overhead.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 [MM] Make mm counters per cpu instead of atomic Christoph Lameter
  2009-11-04 19:17 ` [MM] Remove rss batching from copy_page_range() Christoph Lameter
  2009-11-04 21:01 ` [MM] Make mm counters per cpu instead of atomic Andi Kleen
@ 2009-11-04 23:49 ` Dave Jones
  2009-11-05 15:04   ` Christoph Lameter
  2009-11-05 1:16 ` [MM] Make mm counters per cpu instead of atomic KAMEZAWA Hiroyuki
  2009-11-17 6:48 ` Zhang, Yanmin
  4 siblings, 1 reply; 33+ messages in thread
From: Dave Jones @ 2009-11-04 23:49 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, Nov 04, 2009 at 02:14:41PM -0500, Christoph Lameter wrote:

 > +	memset(m, sizeof(struct mm_counter), 0);

Args wrong way around.

	Dave
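(For reference, the intended call is presumably

	memset(m, 0, sizeof(struct mm_counter));

and m would also need to be assigned, e.g. via per_cpu_ptr(mm->rss, cpu),
before the memset; the posted loop never initializes it. V2 below drops the
loop entirely.)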

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 23:49 ` Dave Jones
@ 2009-11-05 15:04   ` Christoph Lameter
  2009-11-05 15:36     ` [MM] Make mm counters per cpu instead of atomic V2 Christoph Lameter
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:04 UTC (permalink / raw)
To: Dave Jones
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009, Dave Jones wrote:

> On Wed, Nov 04, 2009 at 02:14:41PM -0500, Christoph Lameter wrote:
>
>  > +	memset(m, sizeof(struct mm_counter), 0);
>
> Args wrong way around.

Right. It works because percpu_alloc zeroes the data anyways.

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:04 ` Christoph Lameter
@ 2009-11-05 15:36   ` Christoph Lameter
  2009-11-06 1:11      ` KAMEZAWA Hiroyuki
                       ` (2 more replies)
  0 siblings, 3 replies; 33+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:36 UTC (permalink / raw)
To: Dave Jones
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

From: Christoph Lameter <cl@linux-foundation.org>
Subject: Make mm counters per cpu V2

Changing the mm counters to per cpu counters is possible after the introduction
of the generic per cpu operations (currently in percpu and -next).

With that the contention on the counters in mm_struct can be avoided. The
USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
need to perform atomic updates to mm counters anymore. Various code paths
can be simplified since per cpu counter updates are fast and batching
of counter updates is no longer needed.

One price to pay for these improvements is the need to scan over all percpu
counters when the actual count values are needed.

V1->V2
- Remove useless and buggy per cpu counter initialization.
  alloc_percpu already zeros the values.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/proc/task_mmu.c | 14 +++++++++-
 include/linux/mm_types.h | 16 ++++--------
 include/linux/sched.h | 61 ++++++++++++++++++++---------------------------
 kernel/fork.c | 18 +++++++++----
 mm/filemap_xip.c | 2 -
 mm/fremap.c | 2 -
 mm/init-mm.c | 3 ++
 mm/memory.c | 20 +++++++--------
 mm/rmap.c | 10 +++----
 mm/swapfile.c | 2 -
 10 files changed, 77 insertions(+), 71 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/include/linux/mm_types.h 2009-11-05 09:22:37.000000000 -0600
@@ -24,11 +24,10 @@ struct address_space;
 #define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
-#if USE_SPLIT_PTLOCKS
-typedef atomic_long_t mm_counter_t;
-#else /* !USE_SPLIT_PTLOCKS */
-typedef unsigned long mm_counter_t;
-#endif /* !USE_SPLIT_PTLOCKS */
+struct mm_counter {
+ long file;
+ long anon;
+};
 /*
  * Each physical page in the system has a struct page associated with
@@ -223,11 +222,8 @@ struct mm_struct {
  * by mmlist_lock
  */
- /* Special counters, in some configurations protected by the
- * page_table_lock, in other configurations by being atomic.
- */
- mm_counter_t _file_rss;
- mm_counter_t _anon_rss;
+ /* Special percpu counters */
+ struct mm_counter *rss;
 unsigned long hiwater_rss; /* High-watermark of RSS usage */
 unsigned long hiwater_vm; /* High-water virtual memory usage */
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/include/linux/sched.h 2009-11-05 09:22:37.000000000 -0600
@@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
-#if USE_SPLIT_PTLOCKS
-/*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-
-#else /* !USE_SPLIT_PTLOCKS */
-/*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
-#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
-#define get_mm_counter(mm, member) ((mm)->_##member)
-#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
-#define inc_mm_counter(mm, member) (mm)->_##member++
-#define dec_mm_counter(mm, member) (mm)->_##member--
-
-#endif /* !USE_SPLIT_PTLOCKS */
-
-#define get_mm_rss(mm) \
- (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
-#define update_hiwater_rss(mm) do { \
- unsigned long _rss = get_mm_rss(mm); \
- if ((mm)->hiwater_rss < _rss) \
- (mm)->hiwater_rss = _rss; \
-} while (0)
-#define update_hiwater_vm(mm) do { \
- if ((mm)->hiwater_vm < (mm)->total_vm) \
- (mm)->hiwater_vm = (mm)->total_vm; \
-} while (0)
+static inline unsigned long get_mm_rss(struct mm_struct *mm)
+{
+ int cpu;
+ unsigned long r = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+ r = c->file + c->anon;
+ }
+
+ return r;
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+ unsigned long _rss = get_mm_rss(mm);
+ if (mm->hiwater_rss < _rss)
+ mm->hiwater_rss = _rss;
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+ if (mm->hiwater_vm < mm->total_vm)
+ mm->hiwater_vm = mm->total_vm;
+}
 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
 {
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/kernel/fork.c 2009-11-05 09:25:30.000000000 -0600
@@ -452,8 +452,6 @@ static struct mm_struct * mm_init(struct
 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 mm->core_state = NULL;
 mm->nr_ptes = 0;
- set_mm_counter(mm, file_rss, 0);
- set_mm_counter(mm, anon_rss, 0);
 spin_lock_init(&mm->page_table_lock);
 mm->free_area_cache = TASK_UNMAPPED_BASE;
 mm->cached_hole_size = ~0UL;
@@ -480,7 +478,13 @@ struct mm_struct * mm_alloc(void)
 mm = allocate_mm();
 if (mm) {
 memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
+ mm->rss = alloc_percpu(struct mm_counter);
+ if (mm->rss)
+ mm = mm_init(mm, current);
+ else {
+ free_mm(mm);
+ mm = NULL;
+ }
 }
 return mm;
 }
@@ -496,6 +500,7 @@ void __mmdrop(struct mm_struct *mm)
 mm_free_pgd(mm);
 destroy_context(mm);
 mmu_notifier_mm_destroy(mm);
+ free_percpu(mm->rss);
 free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -631,6 +636,9 @@ struct mm_struct *dup_mm(struct task_str
 goto fail_nomem;
 memcpy(mm, oldmm, sizeof(*mm));
+ mm->rss = alloc_percpu(struct mm_counter);
+ if (!mm->rss)
+ goto fail_nomem;
 /* Initializing for Swap token stuff */
 mm->token_priority = 0;
@@ -661,15 +669,13 @@ free_pt:
 mm->binfmt = NULL;
 mmput(mm);
-fail_nomem:
- return NULL;
-
 fail_nocontext:
 /*
  * If init_new_context() failed, we cannot use mmput() to free the mm
  * because it calls destroy_context()
  */
 mm_free_pgd(mm);
+fail_nomem:
 free_mm(mm);
 return NULL;
 }
Index: linux-2.6/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.orig/fs/proc/task_mmu.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/fs/proc/task_mmu.c 2009-11-05 09:22:37.000000000 -0600
@@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 int *data, int *resident)
 {
- *shared = get_mm_counter(mm, file_rss);
+ int cpu;
+ int anon_rss = 0;
+ int file_rss = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+ anon_rss += c->anon;
+ file_rss += c->file;
+ }
+ *shared = file_rss;
 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 >> PAGE_SHIFT;
 *data = mm->total_vm - mm->shared_vm;
- *resident = *shared + get_mm_counter(mm, anon_rss);
+ *resident = *shared + anon_rss;
 return mm->total_vm;
 }
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/filemap_xip.c 2009-11-05 09:22:37.000000000 -0600
@@ -194,7 +194,7 @@ retry:
 flush_cache_page(vma, address, pte_pfn(*pte));
 pteval = ptep_clear_flush_notify(vma, address, pte);
 page_remove_rmap(page);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
 BUG_ON(pte_dirty(pteval));
 pte_unmap_unlock(pte, ptl);
 page_cache_release(page);
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/fremap.c 2009-11-05 09:22:37.000000000 -0600
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
 page_remove_rmap(page);
 page_cache_release(page);
 update_hiwater_rss(mm);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
 }
 } else {
 if (!pte_file(pte))
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/memory.c 2009-11-05 09:22:37.000000000 -0600
@@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 {
 if (file_rss)
- add_mm_counter(mm, file_rss, file_rss);
+ __this_cpu_add(mm->rss->file, file_rss);
 if (anon_rss)
- add_mm_counter(mm, anon_rss, anon_rss);
+ __this_cpu_add(mm->rss->anon, anon_rss);
 }
 /*
@@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st
 /* Ok, finally just insert the thing.. */
 get_page(page);
- inc_mm_counter(mm, file_rss);
+ __this_cpu_inc(mm->rss->file);
 page_add_file_rmap(page);
 set_pte_at(mm, addr, pte, mk_pte(page, prot));
@@ -2148,11 +2148,11 @@ gotten:
 if (likely(pte_same(*page_table, orig_pte))) {
 if (old_page) {
 if (!PageAnon(old_page)) {
- dec_mm_counter(mm, file_rss);
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->file);
+ __this_cpu_inc(mm->rss->anon);
 }
 } else
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
 flush_cache_page(vma, address, pte_pfn(orig_pte));
 entry = mk_pte(new_page, vma->vm_page_prot);
 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
 * discarded at swap_free().
 */
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
 pte = mk_pte(page, vma->vm_page_prot);
 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
 if (!pte_none(*page_table))
 goto release;
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
 page_add_new_anon_rmap(page, vma, address);
 setpte:
 set_pte_at(mm, address, page_table, entry);
@@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
 if (flags & FAULT_FLAG_WRITE)
 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 if (anon) {
- inc_mm_counter(mm, anon_rss);
+ __this_cpu_inc(mm->rss->anon);
 page_add_new_anon_rmap(page, vma, address);
 } else {
- inc_mm_counter(mm, file_rss);
+ __this_cpu_inc(mm->rss->file);
 page_add_file_rmap(page);
 if (flags & FAULT_FLAG_WRITE) {
 dirty_page = page;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/rmap.c 2009-11-05 09:22:37.000000000 -0600
@@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page
 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 if (PageAnon(page))
- dec_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->anon);
 else
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
 set_pte_at(mm, address, pte,
 swp_entry_to_pte(make_hwpoison_entry(page)));
 } else if (PageAnon(page)) {
@@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
 list_add(&mm->mmlist, &init_mm.mmlist);
 spin_unlock(&mmlist_lock);
 }
- dec_mm_counter(mm, anon_rss);
+ __this_cpu_dec(mm->rss->anon);
 } else if (PAGE_MIGRATION) {
 /*
 * Store the pfn of the page in a special migration
@@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
 entry = make_migration_entry(page, pte_write(pteval));
 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 } else
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
 page_remove_rmap(page);
@@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned
 page_remove_rmap(page);
 page_cache_release(page);
- dec_mm_counter(mm, file_rss);
+ __this_cpu_dec(mm->rss->file);
 (*mapcount)--;
 }
 pte_unmap_unlock(pte - 1, ptl);
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/swapfile.c 2009-11-05 09:22:37.000000000 -0600
@@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
 goto out;
 }
- inc_mm_counter(vma->vm_mm, anon_rss);
+ __this_cpu_inc(vma->vm_mm->rss->anon);
 get_page(page);
 set_pte_at(vma->vm_mm, addr, pte,
 pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6/mm/init-mm.c
===================================================================
--- linux-2.6.orig/mm/init-mm.c 2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/init-mm.c 2009-11-05 09:22:37.000000000 -0600
@@ -8,6 +8,8 @@
 #include <asm/atomic.h>
 #include <asm/pgtable.h>
+DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
+
 struct mm_struct init_mm = {
 .mm_rb = RB_ROOT,
 .pgd = swapper_pg_dir,
@@ -17,4 +19,5 @@ struct mm_struct init_mm = {
 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
 .cpu_vm_mask = CPU_MASK_ALL,
+ .rss = &init_mm_counters,
 };

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:36 ` [MM] Make mm counters per cpu instead of atomic V2 Christoph Lameter
@ 2009-11-06 1:11   ` KAMEZAWA Hiroyuki
  2009-11-06 3:23     ` KAMEZAWA Hiroyuki
  2009-11-10 22:44    ` Andrew Morton
  2009-11-06 4:08   ` KAMEZAWA Hiroyuki
  2009-11-06 4:15   ` KAMEZAWA Hiroyuki
  2 siblings, 2 replies; 33+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 1:11 UTC (permalink / raw)
To: Christoph Lameter
Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu V2
>
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
>
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
>
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
>
> V1->V2
> - Remove useless and buggy per cpu counter initialization.
>   alloc_percpu already zeros the values.
>
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
>

Thanks. My small concern is read-side.

This is the result of 'top -b -n 1' with 2000 processes (most of them just
sleep) on my 8cpu, SMP box.

== [Before]
 Performance counter stats for 'top -b -n 1' (5 runs):

     406.690304  task-clock-msecs   #   0.442 CPUs    ( +-  3.327% )
             32  context-switches   #   0.000 M/sec   ( +-  0.000% )
              0  CPU-migrations     #   0.000 M/sec   ( +-  0.000% )
            718  page-faults        #   0.002 M/sec   ( +-  0.000% )
      987832447  cycles             # 2428.955 M/sec  ( +-  2.655% )
      933831356  instructions       #   0.945 IPC     ( +-  2.585% )
       17383990  cache-references   #  42.745 M/sec   ( +-  1.676% )
         353620  cache-misses       #   0.870 M/sec   ( +-  0.614% )

    0.920712639  seconds time elapsed   ( +-  1.609% )

== [After]
 Performance counter stats for 'top -b -n 1' (5 runs):

     675.926348  task-clock-msecs   #   0.568 CPUs    ( +-  0.601% )
             62  context-switches   #   0.000 M/sec   ( +-  1.587% )
              0  CPU-migrations     #   0.000 M/sec   ( +-  0.000% )
           1095  page-faults        #   0.002 M/sec   ( +-  0.000% )
     1896320818  cycles             # 2805.514 M/sec  ( +-  1.494% )
     1790600289  instructions       #   0.944 IPC     ( +-  1.333% )
       35406398  cache-references   #  52.382 M/sec   ( +-  0.876% )
         722781  cache-misses       #   1.069 M/sec   ( +-  0.192% )

    1.190605561  seconds time elapsed   ( +-  0.417% )

Because I know 'ps' related workload is used in various ways, "How this will
be in large smp" is my concern.

Maybe usual use of 'ps -elf' will not read RSS value and not affected by this.
If this counter supports single-thread-mode (most of apps are single threaded),
impact will not be big.

Thanks,
-Kame
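(The figures above were presumably collected with perf's repeat mode, along
the lines of:

	perf stat -r 5 top -b -n 1

the exact command line is not given in the message.)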

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 1:11 ` KAMEZAWA Hiroyuki
@ 2009-11-06 3:23   ` KAMEZAWA Hiroyuki
  2009-11-06 17:32    ` Christoph Lameter
  0 siblings, 1 reply; 33+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 3:23 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Christoph Lameter, Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Fri, 6 Nov 2009 10:11:06 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> This is the result of 'top -b -n 1' with 2000 processes (most of them just
> sleep) on my 8cpu, SMP box.
>
> == [Before]
>  Performance counter stats for 'top -b -n 1' (5 runs):
>
>      406.690304  task-clock-msecs   #   0.442 CPUs    ( +-  3.327% )
>              32  context-switches   #   0.000 M/sec   ( +-  0.000% )
>               0  CPU-migrations     #   0.000 M/sec   ( +-  0.000% )
>             718  page-faults        #   0.002 M/sec   ( +-  0.000% )
>       987832447  cycles             # 2428.955 M/sec  ( +-  2.655% )
>       933831356  instructions       #   0.945 IPC     ( +-  2.585% )
>        17383990  cache-references   #  42.745 M/sec   ( +-  1.676% )
>          353620  cache-misses       #   0.870 M/sec   ( +-  0.614% )
>
>     0.920712639  seconds time elapsed   ( +-  1.609% )
>
> == [After]
>  Performance counter stats for 'top -b -n 1' (5 runs):
>
>      675.926348  task-clock-msecs   #   0.568 CPUs    ( +-  0.601% )
>              62  context-switches   #   0.000 M/sec   ( +-  1.587% )
>               0  CPU-migrations     #   0.000 M/sec   ( +-  0.000% )
>            1095  page-faults        #   0.002 M/sec   ( +-  0.000% )
>      1896320818  cycles             # 2805.514 M/sec  ( +-  1.494% )
>      1790600289  instructions       #   0.944 IPC     ( +-  1.333% )
>        35406398  cache-references   #  52.382 M/sec   ( +-  0.876% )
>          722781  cache-misses       #   1.069 M/sec   ( +-  0.192% )
>
>     1.190605561  seconds time elapsed   ( +-  0.417% )
>
> Because I know 'ps' related workload is used in various ways, "How this will
> be in large smp" is my concern.
>
> Maybe usual use of 'ps -elf' will not read RSS value and not affected by this.
> If this counter supports single-thread-mode (most of apps are single threaded),
> impact will not be big.
>

Measured extreme case benefits with attached program. Please see # of page
faults. Bigger is better. Please let me know if my program is buggy.

Excuse: My .config may not be for extreme performance challenge, and my host
only have 8cpus. (memcg is enabled, hahaha...)

# of page fault is not very stable (affected by task-clock-msecs.) but maybe
we have some improvements. I'd like to see score of "top" and this in big
servers......

BTW, can't we have single-thread-mode for this counter ?
Usual program's read-side will get much benefit.....

==[Before]==
 Performance counter stats for './multi-fault 8' (5 runs):

  474810.516710  task-clock-msecs   #   7.912 CPUs    ( +-  0.006% )
          10713  context-switches   #   0.000 M/sec   ( +-  2.529% )
              8  CPU-migrations     #   0.000 M/sec   ( +-  0.000% )
       16669105  page-faults        #   0.035 M/sec   ( +-  0.449% )
  1487101488902  cycles             # 3131.989 M/sec  ( +-  0.012% )
   307164795479  instructions       #   0.207 IPC     ( +-  0.177% )
     2355518599  cache-references   #   4.961 M/sec   ( +-  0.420% )
      901969818  cache-misses       #   1.900 M/sec   ( +-  0.824% )

   60.008425257  seconds time elapsed   ( +-  0.004% )

==[After]==
 Performance counter stats for './multi-fault 8' (5 runs):

  474212.969563  task-clock-msecs   #   7.902 CPUs    ( +-  0.007% )
          10281  context-switches   #   0.000 M/sec   ( +-  0.156% )
              9  CPU-migrations     #   0.000 M/sec   ( +-  0.000% )
       16795696  page-faults        #   0.035 M/sec   ( +-  2.218% )
  1485411063159  cycles             # 3132.371 M/sec  ( +-  0.014% )
   305810331186  instructions       #   0.206 IPC     ( +-  0.133% )
     2391293765  cache-references   #   5.043 M/sec   ( +-  0.737% )
      890490519  cache-misses       #   1.878 M/sec   ( +-  0.212% )

   60.010631769  seconds time elapsed   ( +-  0.004% )

Thanks,
-Kame
==
/*
 * multi-fault.c :: causes 60secs of parallel page fault in multi-thread.
 * % gcc -O2 -o multi-fault multi-fault.c -lpthread
 * % multi-fault # of cpus.
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>	/* for atoi(); missing in the original posting */
#include <unistd.h>	/* for sleep(); missing in the original posting */
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#define NR_THREADS 32
pthread_t threads[NR_THREADS];

/*
 * For avoiding contention in page table lock, FAULT area is
 * sparse. If FAULT_LENGTH is too large for your cpus, decrease it.
 */
#define MMAP_LENGTH	(8 * 1024 * 1024)
#define FAULT_LENGTH	(2 * 1024 * 1024)
void *mmap_area[NR_THREADS];
#define PAGE_SIZE	4096

pthread_barrier_t barrier;
int name[NR_THREADS];

void *worker(void *data)
{
	int cpu = *(int *)data;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);

	pthread_barrier_wait(&barrier);

	while (1) {
		char *c;
		char *start = mmap_area[cpu];
		char *end = mmap_area[cpu] + FAULT_LENGTH;

		/* touch one byte per page to fault the range in ... */
		for (c = start; c < end; c += PAGE_SIZE)
			*c = 0;
		/* ... then drop it again so the next pass faults anew */
		madvise(start, FAULT_LENGTH, MADV_DONTNEED);
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	int i, num, ret;

	if (argc < 2)
		return 0;
	num = atoi(argv[1]);
	pthread_barrier_init(&barrier, NULL, num + 1);

	for (i = 0; i < num; i++) {
		name[i] = i;
		ret = pthread_create(&threads[i], NULL, worker, &name[i]);
		if (ret < 0) {
			perror("pthread create");
			return 0;
		}
		mmap_area[i] = mmap(NULL, MMAP_LENGTH,
				    PROT_WRITE | PROT_READ,
				    MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
	}
	pthread_barrier_wait(&barrier);
	sleep(60);
	return 0;
}

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 3:23 ` KAMEZAWA Hiroyuki
@ 2009-11-06 17:32   ` Christoph Lameter
  2009-11-06 19:03     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2009-11-06 17:32 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:

> BTW, can't we have single-thread-mode for this counter ?
> Usual program's read-side will get much benefit.....

Thanks for the measurements.

A single thread mode would be good. Ideas on how to add that would be
appreciated.

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 17:32 ` Christoph Lameter
@ 2009-11-06 19:03   ` KAMEZAWA Hiroyuki
  2009-11-06 19:13     ` Christoph Lameter
  0 siblings, 1 reply; 33+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 19:03 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

Christoph Lameter wrote:
> On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
>> BTW, can't we have single-thread-mode for this counter ?
>> Usual program's read-side will get much benefit.....
>
> Thanks for the measurements.
>
> A single thread mode would be good. Ideas on how to add that would be
> appreciated.
>
Maybe there are some ways....At brief thought....
==
struct usage_counter {
	long rss;
	long file;
};

struct mm_struct {
	....
	atomic_long_t rss;	/* only updated when usage_counter is NULL */
	atomic_long_t file;	/* only updated when usage_counter is NULL */
	struct usage_counter *usage; /* percpu counter used when multi-threaded */
	.....
};

And allocate mm->usage only when the first CLONE_THREAD is specified.

	if (mm->usage)
		access per cpu
	else
		atomic_long_xxx

and read operation will be

	val = atomic_read(mm->rss);
	if (mm->usage)
		for_each_possible_cpu()....
==
Does "if" seem too costly ?

If this idea is bad, I think moving mm_counter to task_struct from
mm_struct and doing slow-sync is an idea instead of percpu.

for example

struct task_struct {
	....
	mm_counter_t temp_counter;
	....
};

struct mm_struct {
	.....
	atomic_long_t rss;
	atomic_long_t file;
};

And adds temp_counter's value to mm_struct at some good point....before
sleep ? kswapd and reclaim routine can update mm_struct's counter, directly.
Readers just read mm_struct's counter.

Thanks,
-Kame
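(To make the first scheme concrete, a minimal sketch; all field and helper
names below are invented here, and the atomic fallback stays because kswapd
and reclaim can update the counters from other cpus, as the following
messages discuss:

	static inline void mm_inc_anon(struct mm_struct *mm)
	{
		if (mm->usage)		/* multi threaded: raw percpu op */
			__this_cpu_inc(mm->usage->anon);
		else			/* single threaded */
			atomic_long_inc(&mm->anon_rss);
	}

	static inline unsigned long mm_read_anon(struct mm_struct *mm)
	{
		unsigned long val = atomic_long_read(&mm->anon_rss);
		int cpu;

		if (mm->usage)
			for_each_possible_cpu(cpu)
				val += per_cpu_ptr(mm->usage, cpu)->anon;
		return val;
	}

assuming mm_struct carries an atomic_long_t anon_rss and a percpu
struct mm_counter *usage that stays NULL until the first CLONE_THREAD.)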

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 19:03 ` KAMEZAWA Hiroyuki
@ 2009-11-06 19:13   ` Christoph Lameter
  2009-11-06 19:20     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2009-11-06 19:13 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:

> And allocate mm->usage only when the first CLONE_THREAD is specified.

Ok.

> 	if (mm->usage)
> 		access per cpu
> 	else
> 		atomic_long_xxx

If we just have one thread: Do we need atomic access at all?

> and read operation will be
>
> 	val = atomic_read(mm->rss);
> 	if (mm->usage)
> 		for_each_possible_cpu()....

or

	val = m->rss
	for_each_cpu(cpu)
		val += percpu ...

> ==
> Does "if" seem too costly ?

The above method would avoid the if.

> If this idea is bad, I think moving mm_counter to task_struct from
> mm_struct and doing slow-sync is an idea instead of percpu.

Yeah then the access is effectively percpu as long as preempt is disabled.
But then for the mmap_writer_lock we would need to traverse a doubly linked
list to add up the counters. Bad caching on that one and we would have to
lock the list too. Sigh.

> kswapd and reclaim routine can update mm_struct's counter, directly.
> Readers just read mm_struct's counter.

Would work for rss counters but not for avoiding the rw semaphore I guess.

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 19:13 ` Christoph Lameter
@ 2009-11-06 19:20   ` KAMEZAWA Hiroyuki
  2009-11-06 19:47     ` Christoph Lameter
  0 siblings, 1 reply; 33+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 19:20 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

Christoph Lameter wrote:
> On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
>> And allocate mm->usage only when the first CLONE_THREAD is specified.
>
> Ok.
>
>> 	if (mm->usage)
>> 		access per cpu
>> 	else
>> 		atomic_long_xxx
>
> If we just have one thread: Do we need atomic access at all?
>
Unfortunately, kswapd/vmscan touch this.

Thanks,
-Kame

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 19:20 ` KAMEZAWA Hiroyuki
@ 2009-11-06 19:47   ` Christoph Lameter
  0 siblings, 0 replies; 33+ messages in thread
From: Christoph Lameter @ 2009-11-06 19:47 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:

> > If we just have one thread: Do we need atomic access at all?
> >
> Unfortunately, kswapd/vmscan touch this.

Right. And those can also occur from another processor that the process
never has run on before. Argh.

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 1:11 ` KAMEZAWA Hiroyuki
  2009-11-06 3:23   ` KAMEZAWA Hiroyuki
@ 2009-11-10 22:44  ` Andrew Morton
  2009-11-10 23:20    ` Christoph Lameter
  1 sibling, 1 reply; 33+ messages in thread
From: Andrew Morton @ 2009-11-10 22:44 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Christoph Lameter, Dave Jones, hugh.dickins, linux-mm, linux-kernel, Tejun Heo

On Fri, 6 Nov 2009 10:11:06 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
> Christoph Lameter <cl@linux-foundation.org> wrote:
>
> > From: Christoph Lameter <cl@linux-foundation.org>
> > Subject: Make mm counters per cpu V2
> >
> > Changing the mm counters to per cpu counters is possible after the introduction
> > of the generic per cpu operations (currently in percpu and -next).
> >
> > With that the contention on the counters in mm_struct can be avoided. The
> > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > need to perform atomic updates to mm counters anymore. Various code paths
> > can be simplified since per cpu counter updates are fast and batching
> > of counter updates is no longer needed.
> >
> > One price to pay for these improvements is the need to scan over all percpu
> > counters when the actual count values are needed.
> >
> > V1->V2
> > - Remove useless and buggy per cpu counter initialization.
> >   alloc_percpu already zeros the values.
> >
> > Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> >
> Thanks. My small concern is read-side.

Me too.

For example, with 1000 possible CPUs (possible, not present and not
online), and 1000 processes, ps(1) will have to wallow through a million
cachelines in task_statm().

And then we have get_mm_rss(), which now will hit 1000 cachelines. And
get_mm_rss() is called (via account_user_time()->acct_update_integrals())
from the clock tick. Adding a thousand cache misses to the timer interrupt
is the sort of thing which makes people unhappy?
* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-10 22:44         ` Andrew Morton
@ 2009-11-10 23:20           ` Christoph Lameter
  0 siblings, 0 replies; 33+ messages in thread
From: Christoph Lameter @ 2009-11-10 23:20 UTC (permalink / raw)
To: Andrew Morton
Cc: KAMEZAWA Hiroyuki, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, Tejun Heo

On Tue, 10 Nov 2009, Andrew Morton wrote:

> Adding a thousand cache misses to the timer interrupt is the sort of
> thing which makes people unhappy?

Obviously I was hoping for new ideas instead of just restatements of the
problem.
* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:36     ` [MM] Make mm counters per cpu instead of atomic V2 Christoph Lameter
  2009-11-06  1:11       ` KAMEZAWA Hiroyuki
@ 2009-11-06  4:08       ` KAMEZAWA Hiroyuki
  2009-11-06  4:15       ` KAMEZAWA Hiroyuki
  2 siblings, 0 replies; 33+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 4:08 UTC (permalink / raw)
To: Christoph Lameter
Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r = c->file + c->anon;

	r += c->file + c->anon;

Thanks,
-Kame
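For readers collecting the fixes: with KAMEZAWA's one-character correction
folded in, the V2 read side becomes the following. The function body is
reconstructed from the hunk quoted above; only the `r +=` line differs from
the posted patch.

```c
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
	int cpu;
	unsigned long r = 0;

	for_each_possible_cpu(cpu) {
		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);

		/* the posted "r =" kept only the last cpu's contribution */
		r += c->file + c->anon;
	}

	return r;
}
```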
* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:36     ` [MM] Make mm counters per cpu instead of atomic V2 Christoph Lameter
  2009-11-06  1:11       ` KAMEZAWA Hiroyuki
  2009-11-06  4:08       ` KAMEZAWA Hiroyuki
@ 2009-11-06  4:15       ` KAMEZAWA Hiroyuki
  2 siblings, 0 replies; 33+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 4:15 UTC (permalink / raw)
To: Christoph Lameter
Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r = c->file + c->anon;
> +	}
> +
> +	return r;
> +}
> +
> +static inline void update_hiwater_rss(struct mm_struct *mm)
> +{
> +	unsigned long _rss = get_mm_rss(mm);
> +	if (mm->hiwater_rss < _rss)
> +		mm->hiwater_rss = _rss;
> +}
> +

I'm sorry my replies are scattered.

Isn't it better to add some filter in the following path?
==
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
				enum ttu_flags flags)
{
	<snip>
	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);
==

Thanks,
-Kame
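The thread does not spell out what such a filter would look like. One
possible shape, sketched here purely as an assumption, is to skip the
for_each_possible_cpu() scan unless the counters were actually dirtied
since the last update. The MM_RSS_DIRTY bit and the helper name below are
invented for illustration; nothing like them exists in the posted patch.

```c
/*
 * Hypothetical filter for the try_to_unmap_one() path: only pay for
 * the full per-cpu scan in update_hiwater_rss() when the rss counters
 * changed since the last scan.  MM_RSS_DIRTY is an invented bit in
 * mm->flags; setting it on every counter update would itself cost an
 * atomic operation, which is the obvious trade-off of this approach.
 */
#define MM_RSS_DIRTY	20	/* hypothetical flag bit */

static inline void update_hiwater_rss_filtered(struct mm_struct *mm)
{
	if (!test_and_clear_bit(MM_RSS_DIRTY, &mm->flags))
		return;		/* counters untouched, skip the scan */
	update_hiwater_rss(mm);
}
```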
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 [MM] Make mm counters per cpu instead of atomic Christoph Lameter
                   ` (2 preceding siblings ...)
  2009-11-04 23:49 ` Dave Jones
@ 2009-11-05  1:16 ` KAMEZAWA Hiroyuki
  2009-11-05 15:10   ` Christoph Lameter
  2009-11-17  6:48 ` Zhang, Yanmin
  4 siblings, 1 reply; 33+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-05 1:16 UTC (permalink / raw)
To: Christoph Lameter; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009 14:14:41 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu
>
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
>
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
>
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
>
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
>
Hmm, I don't fully understand the _new_ percpu but...
In logical terms (even if not realistic), x86-32 supports up to 512? cpus
in Kconfig, BIGSMP. Then, if 65536 processes run, this consumes

	65536 (nr_proc) * 8 (size) * 512 (cpus) = 256 MBytes.

But x86's vmalloc area just has 80? MBytes.

I (and my customers) don't have this kind of extreme machine, but cpus
tend to be many-core (and still support 32bit mode) now. With 32 or 64
cpus,

	65536 * 8 * 32 = 16 MB
	65536 * 8 * 64 = 32 MB

And if I add swap_usage,

	65536 * 12 * 32 = 24 MB.

It's influenced by the number of devices attached to the system, but
people will see more -ENOMEM. It seems this consumption/footprint is
very big.

Thanks,
-Kame

> ---
<snip>
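A quick back-of-envelope check of the numbers above. All inputs are the
hypotheticals from the mail, and sizeof(struct mm_counter) is taken as two
4-byte longs on 32-bit x86; none of this is measured data.

```c
#include <stdio.h>

int main(void)
{
	unsigned long long nr_proc = 65536;	/* processes, from the mail */
	unsigned long long size = 8;		/* two longs on 32-bit x86 */
	unsigned long long cpus[] = { 32, 64, 512 };

	/* percpu footprint = processes * counter size * possible cpus */
	for (int i = 0; i < 3; i++)
		printf("%llu cpus: %llu MB of percpu space\n",
		       cpus[i], nr_proc * size * cpus[i] >> 20);

	/* prints 16, 32 and 256 MB; with a 12-byte counter (swap_usage
	   added) the 32-cpu case grows to 24 MB, matching the mail */
	return 0;
}
```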
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-05  1:16 ` [MM] Make mm counters per cpu instead of atomic KAMEZAWA Hiroyuki
@ 2009-11-05 15:10   ` Christoph Lameter
  2009-11-05 23:42     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:10 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:

> Hmm, I don't fully understand the _new_ percpu but...
> In logical terms (even if not realistic), x86-32 supports up to 512? cpus
> in Kconfig, BIGSMP.

x86-32 only supports 32 processors. Plus per cpu areas are only allocated
for the possible processors.

> Then, if 65536 processes run, this consumes
>
> 	65536 (nr_proc) * 8 (size) * 512 (cpus) = 256 MBytes.

With 32 possible cpus this results in 16m of per cpu space use.
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-05 15:10   ` Christoph Lameter
@ 2009-11-05 23:42     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 33+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-05 23:42 UTC (permalink / raw)
To: Christoph Lameter; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:10:56 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
> > Hmm, I don't fully understand the _new_ percpu but...
> > In logical terms (even if not realistic), x86-32 supports up to 512? cpus
> > in Kconfig, BIGSMP.
>
> x86-32 only supports 32 processors. Plus per cpu areas are only allocated
> for the possible processors.
>
My number is just from Kconfig.

> > Then, if 65536 processes run, this consumes
> >
> > 	65536 (nr_proc) * 8 (size) * 512 (cpus) = 256 MBytes.
>
> With 32 possible cpus this results in 16m of per cpu space use.
>
If swap_usage is added, 24m, 25% of the vmalloc area.
(But, yes, returning -ENOMEM to fork() is ok to me; 65536 procs are
extreme.)

Thanks,
-Kame
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 [MM] Make mm counters per cpu instead of atomic Christoph Lameter
                   ` (3 preceding siblings ...)
  2009-11-05  1:16 ` [MM] Make mm counters per cpu instead of atomic KAMEZAWA Hiroyuki
@ 2009-11-17  6:48 ` Zhang, Yanmin
  2009-11-17  7:31   ` Zhang, Yanmin
  4 siblings, 1 reply; 33+ messages in thread
From: Zhang, Yanmin @ 2009-11-17 6:48 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu
<snip>
> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
>
>  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
>  {
> +	int cpu;
> +
>  	atomic_set(&mm->mm_users, 1);
>  	atomic_set(&mm->mm_count, 1);
>  	init_rwsem(&mm->mmap_sem);
> @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
>  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
>  	mm->core_state = NULL;
>  	mm->nr_ptes = 0;
> -	set_mm_counter(mm, file_rss, 0);
> -	set_mm_counter(mm, anon_rss, 0);
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *m;
> +
> +		memset(m, sizeof(struct mm_counter), 0);
Above memset is wrong:
1) m isn't initialized;
2) it seems the 2nd and the 3rd parameters should be interchanged.
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17  6:48 ` Zhang, Yanmin
@ 2009-11-17  7:31   ` Zhang, Yanmin
  2009-11-17  9:34     ` Zhang, Yanmin
  0 siblings, 1 reply; 33+ messages in thread
From: Zhang, Yanmin @ 2009-11-17 7:31 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 14:48 +0800, Zhang, Yanmin wrote:
> On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
<snip>
> > +	for_each_possible_cpu(cpu) {
> > +		struct mm_counter *m;
> > +
> > +		memset(m, sizeof(struct mm_counter), 0);
> Above memset is wrong:
> 1) m isn't initialized;
> 2) it seems the 2nd and the 3rd parameters should be interchanged.
Changing it to below fixes the command hang issue.

	for_each_possible_cpu(cpu) {
		struct mm_counter *m = per_cpu(mm->rss->readers, cpu);

		memset(m, 0, sizeof(struct mm_counter));
	}
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17  7:31   ` Zhang, Yanmin
@ 2009-11-17  9:34     ` Zhang, Yanmin
  2009-11-17 17:25       ` Christoph Lameter
  0 siblings, 1 reply; 33+ messages in thread
From: Zhang, Yanmin @ 2009-11-17 9:34 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 15:31 +0800, Zhang, Yanmin wrote:
> On Tue, 2009-11-17 at 14:48 +0800, Zhang, Yanmin wrote:
<snip>
> Changing it to below fixes the command hang issue.
>
> 	for_each_possible_cpu(cpu) {
> 		struct mm_counter *m = per_cpu(mm->rss->readers, cpu);
>
> 		memset(m, 0, sizeof(struct mm_counter));
> 	}
Sorry. I was too optimistic and used another kernel to boot.
The right change above should be:

	struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

The compiler doesn't report an error/warning when I use any member.

With the change, the commands 'make oldconfig' and a boot command still
hang.

Yanmin
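Putting Yanmin's two corrections together, the V1 initialization loop would
have to read as below. Note that the V2 changelog removes the loop entirely,
since alloc_percpu() already returns zeroed memory, which makes the explicit
clearing redundant.

```c
	mm->nr_ptes = 0;
	for_each_possible_cpu(cpu) {
		/* per_cpu_ptr(), not per_cpu(): mm->rss is a percpu pointer */
		struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

		/* arguments in the right order: (dest, value, size) */
		memset(m, 0, sizeof(struct mm_counter));
	}
```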
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17  9:34     ` Zhang, Yanmin
@ 2009-11-17 17:25       ` Christoph Lameter
  2009-11-19  0:48         ` Zhang, Yanmin
  2009-11-23  8:51         ` Zhang, Yanmin
  0 siblings, 2 replies; 33+ messages in thread
From: Christoph Lameter @ 2009-11-17 17:25 UTC (permalink / raw)
To: Zhang, Yanmin
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 17 Nov 2009, Zhang, Yanmin wrote:

> The right change above should be:
> 	struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

Right.

> With the change, the commands 'make oldconfig' and a boot command still
> hang.

Not sure if it's worth spending more time on this but if you want I will
consolidate the fixes so far and put out another patchset.

Where does it hang during boot?
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17 17:25       ` Christoph Lameter
@ 2009-11-19  0:48         ` Zhang, Yanmin
  0 siblings, 0 replies; 33+ messages in thread
From: Zhang, Yanmin @ 2009-11-19 0:48 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 12:25 -0500, Christoph Lameter wrote:
> Not sure if it's worth spending more time on this but if you want I will
> consolidate the fixes so far and put out another patchset.
>
> Where does it hang during boot?
>
1) An init boot script calls pidof, and pidof hangs in access_process_vm
   => (mutex_lock <=> mutex_unlock), so actually in mm_reader_lock.
2) 'make oldconfig' hangs in sys_map => msleep, actually in
   mm_writer_lock.

I will check it today.
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17 17:25       ` Christoph Lameter
  2009-11-19  0:48         ` Zhang, Yanmin
@ 2009-11-23  8:51         ` Zhang, Yanmin
  2009-11-23 14:31           ` Christoph Lameter
  1 sibling, 1 reply; 33+ messages in thread
From: Zhang, Yanmin @ 2009-11-23 8:51 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 12:25 -0500, Christoph Lameter wrote:
> Not sure if it's worth spending more time on this but if you want I will
> consolidate the fixes so far and put out another patchset.
>
> Where does it hang during boot?

Definitely faint.

1) In function exec_mmap: in the 2nd 'if (old_mm) {', mm_reader_unlock
   should be used. Your patch uses mm_reader_lock. I found it when
   reviewing your patch, but forgot to fix it when testing.
2) In function madvise: the last unlock should be mm_reader_unlock.
   Your patch uses mm_writer_unlock.

It's easy to hit these issues with normal testing. I'm surprised you
didn't hit them.

Another theoretical issue is the below scenario: process A takes the
read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then it's
scheduled back to cpu 0 to repeat the step. Eventually, the per-cpu
reader counter will overflow. Considering multi-threaded cases, it
might overflow faster than we imagine. When it overflows, processes
will hang there.
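The drift Yanmin describes is easiest to see as a trace of hypothetical
per-cpu reader counts for one mm. This is a sketch of the failure mode
only, not code from any posted patch:

```c
/*
 * Sketch of per-cpu reader-count drift.  The logical sum stays
 * correct, but individual per-cpu slots grow without bound:
 *
 *   lock   on cpu0:  cpu0 = +1, cpu2 =  0    (sum 1, correct)
 *   unlock on cpu2:  cpu0 = +1, cpu2 = -1    (sum 0, correct)
 *   ... repeat N times ...
 *                    cpu0 = +N, cpu2 = -N    (sum 0, still correct)
 *
 * Once +N wraps the counter type, any code that inspects a single
 * per-cpu slot, or sums into a narrower type, misbehaves, and a
 * writer waiting for the readers to drain can wait forever.
 */
```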
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-23  8:51         ` Zhang, Yanmin
@ 2009-11-23 14:31           ` Christoph Lameter
  2009-11-24  8:02             ` Zhang, Yanmin
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2009-11-23 14:31 UTC (permalink / raw)
To: Zhang, Yanmin
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Mon, 23 Nov 2009, Zhang, Yanmin wrote:

> Another theoretical issue is the below scenario: process A takes the
> read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then it's
> scheduled back to cpu 0 to repeat the step. Eventually, the per-cpu
> reader counter will overflow. Considering multi-threaded cases, it
> might overflow faster than we imagine. When it overflows, processes
> will hang there.

True.... We need to find some alternative to per cpu data to scale
mmap_sem then.
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-23 14:31           ` Christoph Lameter
@ 2009-11-24  8:02             ` Zhang, Yanmin
  2009-11-24 15:17               ` Christoph Lameter
  0 siblings, 1 reply; 33+ messages in thread
From: Zhang, Yanmin @ 2009-11-24 8:02 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Mon, 2009-11-23 at 08:31 -0600, Christoph Lameter wrote:
> True.... We need to find some alternative to per cpu data to scale
> mmap_sem then.

I ran lots of benchmarks, such as specjbb2005/hackbench/tbench/dbench/
iozone/sysbench_oltp(mysql)/aim7, against the percpu tree (based on
2.6.32-rc7) on a 4*8*2 logical cpu machine, and didn't find a big
result difference between with your patch and without your patch.
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-24  8:02             ` Zhang, Yanmin
@ 2009-11-24 15:17               ` Christoph Lameter
  2009-11-25  1:23                 ` Zhang, Yanmin
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2009-11-24 15:17 UTC (permalink / raw)
To: Zhang, Yanmin
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 24 Nov 2009, Zhang, Yanmin wrote:

> > True.... We need to find some alternative to per cpu data to scale
> > mmap_sem then.
>
> I ran lots of benchmarks, such as specjbb2005/hackbench/tbench/dbench/
> iozone/sysbench_oltp(mysql)/aim7, against the percpu tree (based on
> 2.6.32-rc7) on a 4*8*2 logical cpu machine, and didn't find a big
> result difference between with your patch and without your patch.

This affects loads that heavily use mmap_sem. You won't find too many
issues in tests that do not run processes with a large thread count and
cause lots of faults or uses of get_user_pages(). The tests you list
are not of that nature.
* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-24 15:17               ` Christoph Lameter
@ 2009-11-25  1:23                 ` Zhang, Yanmin
  0 siblings, 0 replies; 33+ messages in thread
From: Zhang, Yanmin @ 2009-11-25 1:23 UTC (permalink / raw)
To: Christoph Lameter
Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-24 at 09:17 -0600, Christoph Lameter wrote:
> This affects loads that heavily use mmap_sem. You won't find too many
> issues in tests that do not run processes with a large thread count and
> cause lots of faults or uses of get_user_pages(). The tests you list
> are not of that nature.

sysbench_oltp(mysql) is that kind of workload. Both sysbench and mysql
are multi-threaded. 2 years ago, I investigated a scalability issue of
such a workload and found mysql caused frequent down_read(mm->mmap_sem).
Nick changed it to down_read to fix it. But this workload doesn't work
well with more than 64 threads because mysql has some unreasonably big
locks in userspace (implemented as a conditional spinlock in userspace).

Yanmin
end of thread, other threads: [~2009-11-25  1:20 UTC | newest]

Thread overview: 33+ messages
2009-11-04 19:14 [MM] Make mm counters per cpu instead of atomic Christoph Lameter
2009-11-04 19:17 ` [MM] Remove rss batching from copy_page_range() Christoph Lameter
2009-11-04 21:02   ` Andi Kleen
2009-11-04 22:02     ` Christoph Lameter
2009-11-05  8:27       ` Andi Kleen
2009-11-04 21:01 ` [MM] Make mm counters per cpu instead of atomic Andi Kleen
2009-11-04 23:49 ` Dave Jones
2009-11-05 15:04   ` Christoph Lameter
2009-11-05 15:36     ` [MM] Make mm counters per cpu instead of atomic V2 Christoph Lameter
2009-11-06  1:11       ` KAMEZAWA Hiroyuki
2009-11-06  3:23         ` KAMEZAWA Hiroyuki
2009-11-06 17:32           ` Christoph Lameter
2009-11-06 19:03             ` KAMEZAWA Hiroyuki
2009-11-06 19:13               ` Christoph Lameter
2009-11-06 19:20                 ` KAMEZAWA Hiroyuki
2009-11-06 19:47                   ` Christoph Lameter
2009-11-10 22:44             ` Andrew Morton
2009-11-10 23:20               ` Christoph Lameter
2009-11-06  4:08       ` KAMEZAWA Hiroyuki
2009-11-06  4:15       ` KAMEZAWA Hiroyuki
2009-11-05  1:16 ` [MM] Make mm counters per cpu instead of atomic KAMEZAWA Hiroyuki
2009-11-05 15:10   ` Christoph Lameter
2009-11-05 23:42     ` KAMEZAWA Hiroyuki
2009-11-17  6:48 ` Zhang, Yanmin
2009-11-17  7:31   ` Zhang, Yanmin
2009-11-17  9:34     ` Zhang, Yanmin
2009-11-17 17:25       ` Christoph Lameter
2009-11-19  0:48         ` Zhang, Yanmin
2009-11-23  8:51         ` Zhang, Yanmin
2009-11-23 14:31           ` Christoph Lameter
2009-11-24  8:02             ` Zhang, Yanmin
2009-11-24 15:17               ` Christoph Lameter
2009-11-25  1:23                 ` Zhang, Yanmin