linux-mm.kvack.org archive mirror
* [PATCH] Separate global/perzone inactive/free shortage
@ 2001-07-14  5:19 Marcelo Tosatti
From: Marcelo Tosatti @ 2001-07-14  5:19 UTC (permalink / raw)
  To: lkml
  Cc: Rik van Riel, Dirk Wetter, Mike Galbraith, linux-mm, Stephen C. Tweedie

Hi,

As is well known, the VM does not make a distinction between global and
per-zone shortages when trying to free memory. That means that if only one
memory zone is under shortage, the kernel will still scan pages from all zones.

The following patch (against 2.4.6-ac2) changes the kernel behaviour to
avoid freeing pages from zones which do not have an inactive and/or
free shortage.
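
In a nutshell, the approach is the one sketched below (a condensed
paraphrase of the diff that follows, not the literal hunks): each zone
gets its own shortage test, and page_launder()/refill_inactive() first
do a global pass when the whole system is short, then a targeted pass
over each zone that is still short on its own:

unsigned int zone_free_shortage(zone_t *zone)
{
	/* Pages this zone could hand out right now. */
	unsigned int freeable = zone->free_pages + zone->inactive_clean_pages;

	if (zone->size && freeable < zone->pages_min)
		return zone->pages_min - freeable;
	return 0;
}

int page_launder(int gfp_mask, int sync)
{
	pg_data_t *pgdat = pgdat_list;
	int type, ret = 0;

	/* Global pass first, if the system as a whole is short... */
	if (free_shortage())
		ret += do_page_launder(NULL, gfp_mask, sync);

	/* ...then a targeted pass over each zone that is still short. */
	for (type = 0; type < MAX_NR_ZONES; type++) {
		zone_t *zone = pgdat->node_zones + type;

		if (zone_free_shortage(zone))
			ret += do_page_launder(zone, gfp_mask, sync);
	}
	return ret;
}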

Now I'm able to run memory hogs allocating 4GB of memory (on a 4GB machine)
without getting really long hangs in my ssh session (which used to happen
on stock -ac2 due to exhaustion of DMA pages for networking).

Comments?

Dirk, can you please try the patch and tell us whether it fixes your problem?


diff --exclude-from=/home/marcelo/exclude -Nur linux.orig/include/linux/swap.h linux/include/linux/swap.h
--- linux.orig/include/linux/swap.h	Sat Jul 14 02:47:14 2001
+++ linux/include/linux/swap.h	Sat Jul 14 03:27:13 2001
@@ -123,9 +123,14 @@
 extern wait_queue_head_t kreclaimd_wait;
 extern int page_launder(int, int);
 extern int free_shortage(void);
+extern int total_free_shortage(void);
 extern int inactive_shortage(void);
+extern int total_inactive_shortage(void);
 extern void wakeup_kswapd(void);
 extern int try_to_free_pages(unsigned int gfp_mask);
+
+extern unsigned int zone_free_shortage(zone_t *zone);
+extern unsigned int zone_inactive_shortage(zone_t *zone);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *);
diff --exclude-from=/home/marcelo/exclude -Nur linux.orig/mm/page_alloc.c linux/mm/page_alloc.c
--- linux.orig/mm/page_alloc.c	Sat Jul 14 02:47:14 2001
+++ linux/mm/page_alloc.c	Sat Jul 14 02:50:50 2001
@@ -451,7 +451,7 @@
 		 * to give up than to deadlock the kernel looping here.
 		 */
 		if (gfp_mask & __GFP_WAIT) {
-			if (!order || free_shortage()) {
+			if (!order || total_free_shortage()) {
 				int progress = try_to_free_pages(gfp_mask);
 				if (progress || (gfp_mask & __GFP_FS))
 					goto try_again;
@@ -689,6 +689,39 @@
 	return pages;
 }
 #endif
+
+unsigned int zone_free_shortage(zone_t *zone)
+{
+	int sum = 0;
+
+	if (!zone->size)
+		goto ret;
+
+	if (zone->inactive_clean_pages + zone->free_pages
+			< zone->pages_min) {
+		sum += zone->pages_min;
+		sum -= zone->free_pages;
+		sum -= zone->inactive_clean_pages;
+	}
+ret:
+	return sum;
+}
+
+unsigned int zone_inactive_shortage(zone_t *zone) 
+{
+	int sum = 0;
+
+	if (!zone->size)
+		goto ret;
+
+	sum = zone->pages_high;
+	sum -= zone->inactive_dirty_pages;
+	sum -= zone->inactive_clean_pages;
+	sum -= zone->free_pages;
+
+ret:
+	return (sum > 0 ? sum : 0);
+}
 
 /*
  * Show free area list (used inside shift_scroll-lock stuff)
diff --exclude-from=/home/marcelo/exclude -Nur linux.orig/mm/vmscan.c linux/mm/vmscan.c
--- linux.orig/mm/vmscan.c	Sat Jul 14 02:47:14 2001
+++ linux/mm/vmscan.c	Sat Jul 14 03:22:19 2001
@@ -36,11 +36,19 @@
  */
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
+static void try_to_swap_out(zone_t *zone, struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
 {
 	pte_t pte;
 	swp_entry_t entry;
 
+	/* 
+	 * If we are doing a zone-specific scan, do not
+	 * touch pages from zones which don't have a 
+	 * shortage.
+	 */
+	if (zone && !zone_inactive_shortage(page->zone))
+		return;
+
 	/* Don't look at this pte if it's been accessed recently. */
 	if (ptep_test_and_clear_young(page_table)) {
 		page->age += PAGE_AGE_ADV;
@@ -131,7 +139,7 @@
 }
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
+static int swap_out_pmd(zone_t *zone, struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -155,7 +163,7 @@
 			struct page *page = pte_page(*pte);
 
 			if (VALID_PAGE(page) && !PageReserved(page)) {
-				try_to_swap_out(mm, vma, address, pte, page);
+				try_to_swap_out(zone, mm, vma, address, pte, page);
 				if (!--count)
 					break;
 			}
@@ -168,7 +176,7 @@
 }
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
+static inline int swap_out_pgd(zone_t *zone, struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -188,7 +196,7 @@
 		end = pgd_end;
 	
 	do {
-		count = swap_out_pmd(mm, vma, pmd, address, end, count);
+		count = swap_out_pmd(zone, mm, vma, pmd, address, end, count);
 		if (!count)
 			break;
 		address = (address + PMD_SIZE) & PMD_MASK;
@@ -198,7 +206,7 @@
 }
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
+static int swap_out_vma(zone_t *zone, struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -213,7 +221,7 @@
 	if (address >= end)
 		BUG();
 	do {
-		count = swap_out_pgd(mm, vma, pgdir, address, end, count);
+		count = swap_out_pgd(zone, mm, vma, pgdir, address, end, count);
 		if (!count)
 			break;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -225,7 +233,7 @@
 /*
  * Returns non-zero if we scanned all `count' pages
  */
-static int swap_out_mm(struct mm_struct * mm, int count)
+static int swap_out_mm(zone_t *zone, struct mm_struct * mm, int count)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -248,7 +256,7 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			count = swap_out_vma(mm, vma, address, count);
+			count = swap_out_vma(zone, mm, vma, address, count);
 			if (!count)
 				goto out_unlock;
 			vma = vma->vm_next;
@@ -280,7 +288,7 @@
 	return nr;
 }
 
-static void swap_out(unsigned int priority, int gfp_mask)
+static void swap_out(zone_t *zone, unsigned int priority, int gfp_mask)
 {
 	int counter;
 	int retval = 0;
@@ -288,7 +296,7 @@
 
 	/* Always start by trying to penalize the process that is allocating memory */
 	if (mm)
-		retval = swap_out_mm(mm, swap_amount(mm));
+		retval = swap_out_mm(zone, mm, swap_amount(mm));
 
 	/* Then, look at the other mm's */
 	counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority;
@@ -310,7 +318,7 @@
 		spin_unlock(&mmlist_lock);
 
 		/* Walk about 6% of the address space each time */
-		retval |= swap_out_mm(mm, swap_amount(mm));
+		retval |= swap_out_mm(zone, mm, swap_amount(mm));
 		mmput(mm);
 	} while (--counter >= 0);
 	return;
@@ -426,7 +434,7 @@
 #define MAX_LAUNDER 		(4 * (1 << page_cluster))
 #define CAN_DO_FS		(gfp_mask & __GFP_FS)
 #define CAN_DO_IO		(gfp_mask & __GFP_IO)
-int page_launder(int gfp_mask, int sync)
+int do_page_launder(zone_t *zone, int gfp_mask, int sync)
 {
 	int launder_loop, maxscan, cleaned_pages, maxlaunder;
 	struct list_head * page_lru;
@@ -461,6 +469,17 @@
 			continue;
 		}
 
+		/* 
+		 * If we are doing zone-specific laundering, 
+		 * avoid touching pages from zones which do 
+		 * not have a free shortage.
+		 */
+		if (zone && !zone_free_shortage(page->zone)) {
+			list_del(page_lru);
+			list_add(page_lru, &inactive_dirty_list);
+			continue;
+		}
+
 		/*
 		 * The page is locked. IO in progress?
 		 * Move it to the back of the list.
@@ -574,8 +593,13 @@
 			 * If we're freeing buffer cache pages, stop when
 			 * we've got enough free memory.
 			 */
-			if (freed_page && !free_shortage())
-				break;
+			if (freed_page) {
+				if (zone) {
+					if (!zone_free_shortage(zone))
+						break;
+				} else if (!free_shortage())
+					break;
+			}
 			continue;
 		} else if (page->mapping && !PageDirty(page)) {
 			/*
@@ -613,7 +637,8 @@
 	 * loads, flush out the dirty pages before we have to wait on
 	 * IO.
 	 */
-	if (CAN_DO_IO && !launder_loop && free_shortage()) {
+	if (CAN_DO_IO && !launder_loop && (free_shortage() 
+				|| (zone && zone_free_shortage(zone)))) {
 		launder_loop = 1;
 		/* If we cleaned pages, never do synchronous IO. */
 		if (cleaned_pages)
@@ -629,6 +654,34 @@
 	return cleaned_pages;
 }
 
+int page_launder(int gfp_mask, int sync)
+{
+	int type = 0;
+	int ret = 0;
+	pg_data_t *pgdat = pgdat_list;
+	/*
+	 * First do a global scan if there is a 
+	 * global shortage.
+	 */
+	if (free_shortage())
+		ret += do_page_launder(NULL, gfp_mask, sync);
+
+	/*
+	 * Then check whether any specific zone
+	 * needs laundering.
+	 */
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		zone_t *zone = pgdat->node_zones + type;
+		
+		if (zone_free_shortage(zone)) 
+			ret += do_page_launder(zone, gfp_mask, sync);
+	} 
+
+	return ret;
+}
+
+
+
 /**
  * refill_inactive_scan - scan the active list and find pages to deactivate
  * @priority: the priority at which to scan
@@ -637,7 +690,7 @@
  * This function will scan a portion of the active list to find
  * unused pages, those pages will then be moved to the inactive list.
  */
-int refill_inactive_scan(unsigned int priority, int target)
+int refill_inactive_scan(zone_t *zone, unsigned int priority, int target)
 {
 	struct list_head * page_lru;
 	struct page * page;
@@ -665,6 +718,16 @@
 			continue;
 		}
 
+		/*
+		 * If we are doing zone-specific scanning, ignore
+		 * pages from zones without shortage.
+		 */
+
+		if (zone && !zone_inactive_shortage(page->zone)) {
+			page_active = 1;
+			goto skip_page;
+		}
+
 		/* Do aging on the pages. */
 		if (PageTestandClearReferenced(page)) {
 			age_page_up_nolock(page);
@@ -694,6 +757,7 @@
 		 * to the other end of the list. Otherwise we exit if
 		 * we have done enough work.
 		 */
+skip_page:
 		if (page_active || PageActive(page)) {
 			list_del(page_lru);
 			list_add(page_lru, &active_list);
@@ -709,12 +773,10 @@
 }
 
 /*
- * Check if there are zones with a severe shortage of free pages,
- * or if all zones have a minor shortage.
+ * Check if we are low on free pages globally.
  */
 int free_shortage(void)
 {
-	pg_data_t *pgdat = pgdat_list;
 	int sum = 0;
 	int freeable = nr_free_pages() + nr_inactive_clean_pages();
 	int freetarget = freepages.high;
@@ -722,6 +784,22 @@
 	/* Are we low on free pages globally? */
 	if (freeable < freetarget)
 		return freetarget - freeable;
+	return 0;
+}
+
+/*
+ *
+ * Check if there are zones with a severe shortage of free pages,
+ * or if all zones have a minor shortage.
+ */
+int total_free_shortage(void)
+{
+	int sum = 0;
+	pg_data_t *pgdat = pgdat_list;
+
+	/* Do we have a global free shortage? */
+	if ((sum = free_shortage()))
+		return sum;
 
 	/* If not, are we very low on any particular zone? */
 	do {
@@ -739,15 +817,15 @@
 	} while (pgdat);
 
 	return sum;
+
 }
 
 /*
- * How many inactive pages are we short?
+ * How many inactive pages are we short globally?
  */
 int inactive_shortage(void)
 {
 	int shortage = 0;
-	pg_data_t *pgdat = pgdat_list;
 
 	/* Is the inactive dirty list too small? */
 
@@ -759,10 +837,20 @@
 
 	if (shortage > 0)
 		return shortage;
+	return 0;
+}
+/*
+ * Are we low on inactive pages globally or in any zone?
+ */
+int total_inactive_shortage(void)
+{
+	int shortage = 0;
+	pg_data_t *pgdat = pgdat_list;
 
-	/* If not, do we have enough per-zone pages on the inactive list? */
+	if ((shortage = inactive_shortage()))
+		return shortage;
 
-	shortage = 0;
+	shortage = 0;	
 
 	do {
 		int i;
@@ -802,7 +890,7 @@
  * when called from a user process.
  */
 #define DEF_PRIORITY (6)
-static int refill_inactive(unsigned int gfp_mask, int user)
+static int refill_inactive_global(unsigned int gfp_mask, int user)
 {
 	int count, start_count, maxtry;
 
@@ -824,9 +912,9 @@
 		}
 
 		/* Walk the VM space for a bit.. */
-		swap_out(DEF_PRIORITY, gfp_mask);
+		swap_out(NULL, DEF_PRIORITY, gfp_mask);
 
-		count -= refill_inactive_scan(DEF_PRIORITY, count);
+		count -= refill_inactive_scan(NULL, DEF_PRIORITY, count);
 		if (count <= 0)
 			goto done;
 
@@ -839,6 +927,60 @@
 	return (count < start_count);
 }
 
+static int refill_inactive_zone(zone_t *zone, unsigned int gfp_mask, int user) 
+{
+	int count, start_count, maxtry; 
+	
+	count = start_count = zone_inactive_shortage(zone);
+
+	maxtry = (1 << DEF_PRIORITY);
+
+	do {
+		swap_out(zone, DEF_PRIORITY, gfp_mask);
+
+		count -= refill_inactive_scan(zone, DEF_PRIORITY, count);
+
+		if (count <= 0)
+			goto done;
+
+		if (--maxtry <= 0)
+			return 0;
+
+	} while(zone_inactive_shortage(zone));
+done:
+	return (count < start_count);
+}
+
+
+static int refill_inactive(unsigned int gfp_mask, int user) 
+{
+	int type = 0;
+	int ret = 0;
+	pg_data_t *pgdat = pgdat_list;
+	/*
+	 * First do a global scan if there is a 
+	 * global shortage.
+	 */
+	if (inactive_shortage())
+		ret += refill_inactive_global(gfp_mask, user);
+
+	/*
+	 * Then check if there is any specific zone 
+	 * with a shortage and try to refill it if
+	 * so.
+	 */
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		zone_t *zone = pgdat->node_zones + type;
+		
+		if (zone_inactive_shortage(zone)) 
+			ret += refill_inactive_zone(zone, gfp_mask, user);
+	} 
+
+	return ret;
+}
+
+#define DEF_PRIORITY (6)
+
 static int do_try_to_free_pages(unsigned int gfp_mask, int user)
 {
 	int ret = 0;
@@ -851,8 +993,10 @@
 	 * before we get around to moving them to the other
 	 * list, so this is a relatively cheap operation.
 	 */
-	if (free_shortage()) {
-		ret += page_launder(gfp_mask, user);
+
+	ret += page_launder(gfp_mask, user);
+
+	if (total_free_shortage()) {
 		shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
 		shrink_icache_memory(DEF_PRIORITY, gfp_mask);
 	}
@@ -861,8 +1005,7 @@
 	 * If needed, we move pages from the active list
 	 * to the inactive list.
 	 */
-	if (inactive_shortage())
-		ret += refill_inactive(gfp_mask, user);
+	ret += refill_inactive(gfp_mask, user);
 
 	/* 	
 	 * Reclaim unused slab cache if memory is low.
@@ -917,7 +1060,7 @@
 		static long recalc = 0;
 
 		/* If needed, try to free some memory. */
-		if (inactive_shortage() || free_shortage()) 
+		if (total_inactive_shortage() || total_free_shortage()) 
 			do_try_to_free_pages(GFP_KSWAPD, 0);
 
 		/* Once a second ... */
@@ -928,7 +1071,7 @@
 			recalculate_vm_stats();
 
 			/* Do background page aging. */
-			refill_inactive_scan(DEF_PRIORITY, 0);
+			refill_inactive_scan(NULL, DEF_PRIORITY, 0);
 		}
 
 		run_task_queue(&tq_disk);
@@ -944,7 +1087,7 @@
 		 * We go to sleep for one second, but if it's needed
 		 * we'll be woken up earlier...
 		 */
-		if (!free_shortage() || !inactive_shortage()) {
+		if (!total_free_shortage() || !total_inactive_shortage()) {
 			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
 		/*
 		 * If we couldn't free enough memory, we see if it was

* Re: [PATCH] Separate global/perzone inactive/free shortage
@ 2001-07-16 13:56 Bulent Abali
From: Bulent Abali @ 2001-07-16 13:56 UTC (permalink / raw)
  To: Stephen C. Tweedie
  Cc: Mike Galbraith, Marcelo Tosatti, Rik van Riel, Dirk Wetter, linux-mm



>> On Sat, 14 Jul 2001, Marcelo Tosatti wrote:
>
>> On highmem machines, wouldn't it save a LOT of time to prevent allocation
>> of ZONE_DMA as VM pages?  Or, if we really need to, get those pages into
>> the swapcache instantly?  Crawling through nearly 4 gig of VM looking for
>> 16 MB of ram has got to be very expensive.  Besides, those pages are just
>> too precious to allow some user task to sit on them.
>
>Can't we balance that automatically?
>
>Why not just round-robin between the eligible zones when allocating,
>biasing each zone based on size?  On a 4GB box you'd basically end up
>doing 3 times as many allocations from the highmem zone as the normal
>zone and only very occasionally would you try to dig into the dma zone.
>
>Cheers,
> Stephen

If I understood page_alloc.c:build_zonelists() correctly,
the ZONE_HIGHMEM fallback list includes ZONE_NORMAL, which in turn
includes ZONE_DMA. Memory allocators (other than ZONE_DMA ones) will
dip into the DMA zone only when there are no highmem and/or normal zone
pages available. So the current method seems more conservative (better)
than round-robin to me.
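
(As an illustration only, here is a minimal sketch of that fallback
ordering; this is not the actual build_zonelists()/__alloc_pages() code,
and alloc_from_zone() is a hypothetical stand-in for the per-zone
allocation path:)

/*
 * Walk the zonelist built for the request's class, e.g.
 * HIGHMEM -> NORMAL -> DMA for a highmem-capable allocation,
 * falling back to the next (more precious) zone only when the
 * preferred ones cannot satisfy the request.
 */
static struct page *alloc_from_zonelist(zone_t **zonelist, unsigned long order)
{
	zone_t **z;

	for (z = zonelist; *z != NULL; z++) {
		struct page *page = alloc_from_zone(*z, order);
		if (page)
			return page;
	}
	return NULL;
}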

I think Marcelo is proposing to make ZONE_DMA exclusive on large-memory
machines, which might make things better for allocators needing
ZONE_DMA pages...
Bulent


