From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Tue, 24 Jul 2001 16:30:10 -0700
From: William Lee Irwin III
Subject: __alloc_pages_core speedup
Message-ID: <20010724163010.A3593@holomorphy.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Description: brief message
Content-Disposition: inline
Sender: owner-linux-mm@kvack.org
Return-Path:
To: linux-mm@kvack.org
List-ID:

On IBM NUMA hardware, there was a peculiar delay during boot just prior
to "ACPI: Installing SCI 9 handler pass". It was determined that this
delay (of approximately 80 seconds) consisted of calls to
__alloc_bootmem_core. It appears that it is only called when needed,
but that an individual call takes a great deal of time.

I'm not sure what the right way to fix this is, but I've tried a few
approaches, and I am interested in hearing of other ways to cope with
this situation that would be more effective or more palatable.

First, I tried to alter the bitmap scan so that it read blocks of some
compiler-supported size (e.g. u64) and performed its checks 64 bits at
a time, with (of course) some provisions for the end cases. This was
not successful, whether due to unexpected interactions or to
implementation errors.
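Roughly, the sort of scan I had in mind was something like the
following (an untested userspace sketch, not the code I actually tried;
find_first_zero_u64() and its types are just stand-ins):

#include <stdint.h>

/*
 * Hypothetical illustration: scan a bootmem-style bitmap 64 bits at a
 * time, skipping any word that is entirely set (all pages reserved),
 * and only falling back to a bit-by-bit scan inside a word known to
 * contain a clear bit.  The end case of a bitmap whose size is not a
 * multiple of 64 is handled by bounds-checking the index against nbits.
 */
static unsigned long
find_first_zero_u64(const uint64_t *map, unsigned long nbits)
{
        unsigned long i, bit;

        for (i = 0; i * 64 < nbits; i++) {
                uint64_t word = map[i];

                if (word == UINT64_MAX)
                        continue;       /* all 64 pages here are reserved */
                for (bit = 0; bit < 64; bit++) {
                        unsigned long idx = i * 64 + bit;

                        if (idx >= nbits)
                                return nbits;   /* partial final word, nothing free */
                        if (!(word & ((uint64_t)1 << bit)))
                                return idx;     /* first free page index */
                }
        }
        return nbits;   /* no zero bit found */
}

The hoped-for win is that a fully reserved word costs one load and one
compare instead of 64 separate test_bit() calls.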
The following merely provides some non-atomic bit operations and
replaces the calls to the atomic versions in bootmem.c with calls to
them. It provides a speedup of 8 seconds, with the following timings:

----
seconds from EFI to "ACPI: Installing SCI 9 handler pass"
        Atomic:     79
        Non-atomic: 71
----
seconds from above to VFS mount
        Atomic:     122
        Non-atomic: 114

... and other significant landmarks in booting differ by 8 seconds.

I don't believe this issue is truly architecture-specific, as it
appears to me that the properties of the algorithm would scale
similarly on different architectures. And I can also provide
implementations of the non-atomic bit operations for other
architectures.
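Something like the following generic C would probably do on an
architecture that lacks these helpers (again an untested sketch; the
generic_* names are placeholders, and the real ia64 versions are in the
bitops.h diff below):

#include <stdint.h>

/*
 * Plain read-modify-write bit operations with no cmpxchg loop.  These
 * are only safe where there is no concurrency, which is the case for
 * the bootmem allocator running early in boot.
 */
static inline int generic__test_and_set_bit(int nr, void *addr)
{
        uint32_t *m = (uint32_t *) addr + (nr >> 5);
        uint32_t bit = 1U << (nr & 0x1f);
        uint32_t old = *m;

        *m = old | bit;
        return (old & bit) != 0;
}

static inline int generic__test_and_clear_bit(int nr, void *addr)
{
        uint32_t *m = (uint32_t *) addr + (nr >> 5);
        uint32_t bit = 1U << (nr & 0x1f);
        uint32_t old = *m;

        *m = old & ~bit;
        return (old & bit) != 0;
}

static inline int generic__test_bit(int nr, const void *addr)
{
        return 1 & (((const uint32_t *) addr)[nr >> 5] >> (nr & 0x1f));
}

Nothing in these is architecture-specific, so the same three helpers
should drop into any port's bitops.h.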
Cheers,
Bill

P.S.: The diffs follow:

--- linux-old/mm/bootmem.c      Mon Jul 23 11:37:58 2001
+++ linux-0626/mm/bootmem.c     Wed Jul 18 14:50:02 2001
@@ -89,17 +89,17 @@
                BUG();
        if (sidx >= eidx)
                BUG();
        if ((addr >> PAGE_SHIFT) >= bdata->node_low_pfn)
                BUG();
        if (end > bdata->node_low_pfn)
                BUG();
        for (i = sidx; i < eidx; i++)
-               if (test_and_set_bit(i, bdata->node_bootmem_map))
+               if (__test_and_set_bit(i, bdata->node_bootmem_map))
                        printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
 }
 
 static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
 {
        unsigned long i;
        unsigned long start;
        /*
@@ -116,17 +116,17 @@
        /*
         * Round up the beginning of the address.
         */
        start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
        sidx = start - (bdata->node_boot_start/PAGE_SIZE);
 
        for (i = sidx; i < eidx; i++) {
-               if (!test_and_clear_bit(i, bdata->node_bootmem_map))
+               if (!__test_and_clear_bit(i, bdata->node_bootmem_map))
                        BUG();
        }
 }
 
 /*
  * We 'merge' subsequent allocations to save space. We might 'lose'
  * some fraction of a page if allocations cannot be satisfied due to
  * size constraints on boxes where there is physical RAM space
@@ -166,22 +166,22 @@
        preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
        areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
        incr = align >> PAGE_SHIFT ? : 1;
 
 restart_scan:
        for (i = preferred; i < eidx; i += incr) {
                unsigned long j;
-               if (test_bit(i, bdata->node_bootmem_map))
+               if (__test_bit(i, bdata->node_bootmem_map))
                        continue;
                for (j = i + 1; j < i + areasize; ++j) {
                        if (j >= eidx)
                                goto fail_block;
-                       if (test_bit (j, bdata->node_bootmem_map))
+                       if (__test_bit (j, bdata->node_bootmem_map))
                                goto fail_block;
                }
                start = i;
                goto found;
        fail_block:;
        }
        if (preferred) {
                preferred = 0;
@@ -222,17 +222,17 @@
                bdata->last_pos = start + areasize - 1;
                bdata->last_offset = size & ~PAGE_MASK;
                ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
        }
        /*
         * Reserve the area now:
         */
        for (i = start; i < start+areasize; i++)
-               if (test_and_set_bit(i, bdata->node_bootmem_map))
+               if (__test_and_set_bit(i, bdata->node_bootmem_map))
                        BUG();
        memset(ret, 0, size);
        return ret;
 }
 
 static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 {
        struct page *page = pgdat->node_mem_map;
@@ -240,17 +240,17 @@
        unsigned long i, count, total = 0;
        unsigned long idx;
 
        if (!bdata->node_bootmem_map) BUG();
 
        count = 0;
        idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
        for (i = 0; i < idx; i++, page++) {
-               if (!test_bit(i, bdata->node_bootmem_map)) {
+               if (!__test_bit(i, bdata->node_bootmem_map)) {
                        count++;
                        ClearPageReserved(page);
                        set_page_count(page, 1);
                        __free_page(page);
                }
        }
        total += count;
--- linux-old/include/asm-ia64/bitops.h Mon Jul 23 11:38:28 2001
+++ linux-0626/include/asm-ia64/bitops.h        Wed Jul 18 14:53:34 2001
@@ -30,16 +30,27 @@
        bit = 1 << (nr & 31);
        do {
                CMPXCHG_BUGCHECK(m);
                old = *m;
                new = old | bit;
        } while (cmpxchg_acq(m, old, new) != old);
 }
 
+static __inline__ void
+__set_bit(int nr, void * addr)
+{
+       __u32 bit, old, *m;
+       m = ((__u32 *)addr) + (nr >> 5);
+       bit = 0x1 << (nr & 0x1f);
+       old = *m;
+       *m |= bit;
+       return;
+}
+
 /*
  * clear_bit() doesn't provide any barrier for the compiler.
  */
 #define smp_mb__before_clear_bit()     smp_mb()
 #define smp_mb__after_clear_bit()      smp_mb()
 
 static __inline__ void
 clear_bit (int nr, volatile void *addr)
 {
@@ -51,16 +62,27 @@
        mask = ~(1 << (nr & 31));
        do {
                CMPXCHG_BUGCHECK(m);
                old = *m;
                new = old & mask;
        } while (cmpxchg_acq(m, old, new) != old);
 }
 
+static __inline__ void
+__clear_bit(int nr, void * addr)
+{
+       __u32 mask, old, *m;
+       m = ((__u32 *)addr) + (nr >> 5);
+       mask = ~(0x1 << (nr & 0x1f));
+       old = *m;
+       *m &= mask;
+       return;
+}
+
 /*
  * WARNING: non atomic version.
  */
 static __inline__ void
 __change_bit (int nr, void *addr)
 {
        volatile __u32 *m = (__u32 *) addr + (nr >> 5);
        __u32 bit = (1 << (nr & 31));
@@ -97,32 +119,55 @@
                CMPXCHG_BUGCHECK(m);
                old = *m;
                new = old | bit;
        } while (cmpxchg_acq(m, old, new) != old);
        return (old & bit) != 0;
 }
 
 static __inline__ int
+__test_and_set_bit(int nr, void * addr)
+{
+       __u32 bit, old;
+       __u32 * m;
+       m = ((__u32 *)addr) + (nr >> 5);
+       bit = 0x1 << (nr & 0x1f);
+       old = *m;
+       *m |= bit;
+       return((old & bit) != 0);
+}
+
+static __inline__ int
 test_and_clear_bit (int nr, volatile void *addr)
 {
        __u32 mask, old, new;
        volatile __u32 *m;
        CMPXCHG_BUGCHECK_DECL
 
        m = (volatile __u32 *) addr + (nr >> 5);
        mask = ~(1 << (nr & 31));
        do {
                CMPXCHG_BUGCHECK(m);
                old = *m;
                new = old & mask;
        } while (cmpxchg_acq(m, old, new) != old);
        return (old & ~mask) != 0;
 }
 
+static __inline__ int
+__test_and_clear_bit(int nr, void * addr)
+{
+       __u32 mask, old, *m;
+       m = ((__u32 *)addr) + (nr >> 5);
+       mask = ~(0x1 << (nr & 0x1f));
+       old = *m;
+       *m &= mask;
+       return((old & ~mask) != 0);
+}
+
 /*
  * WARNING: non atomic version.
  */
 static __inline__ int
 __test_and_change_bit (int nr, void *addr)
 {
        __u32 old, bit = (1 << (nr & 31));
        __u32 *m = (__u32 *) addr + (nr >> 5);
@@ -148,16 +193,22 @@
        } while (cmpxchg_acq(m, old, new) != old);
        return (old & bit) != 0;
 }
 
 static __inline__ int
 test_bit (int nr, volatile void *addr)
 {
        return 1 & (((const volatile __u32 *) addr)[nr >> 5] >> (nr & 31));
+}
+
+static __inline__ int
+__test_bit(int nr, void * addr)
+{
+       return 0x1 & (((__u32 *)addr)[nr >> 5] >> (nr & 0x1f));
 }
 
 /*
  * ffz = "find first zero". Returns the bit number (0..63) of the first (least
  * significant) bit that is zero in X. Undefined if no zero exists, so code should check
  * against ~0UL first...
  */
 static inline unsigned long
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/