[rfc] [patch] mm: zone_reclaim fix for pseudo file systems

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
@ 2007-07-27 23:27 Ravikiran G Thirumalai
  2007-07-30 18:12 ` Christoph Lameter
                   ` (2 more replies)
  0 siblings, 3 replies; 34+ messages in thread
From: Ravikiran G Thirumalai @ 2007-07-27 23:27 UTC (permalink / raw)
  To: linux-mm; +Cc: Andrew Morton, Christoph Lameter, shai

Don't go into zone_reclaim if there are no reclaimable pages.

While using RAMFS as scratch space for some tests, we found one of the
processes got into zone reclaim, and got stuck trying to reclaim pages
from a zone.  On examination of the code, we found that the VM was fooled
into believing that the zone had reclaimable pages, when it actually had
RAMFS backed pages, which could not be written back to the disk.

Fix this by adding a zvc "NR_PSEUDO_FS_PAGES" for file pages with no
backing store, and using this counter to determine if reclaim is possible.

Patch tested on 2.6.22.  Fixes the above mentioned problem.

Comments?

Signed-off-by: Alok Kataria <alok.kataria@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>

Index: linux-2.6.22/drivers/base/node.c
===================================================================
--- linux-2.6.22.orig/drivers/base/node.c
+++ linux-2.6.22/drivers/base/node.c
@@ -61,6 +61,7 @@ static ssize_t node_read_meminfo(struct 
 		       "Node %d Mapped:       %8lu kB\n"
 		       "Node %d AnonPages:    %8lu kB\n"
 		       "Node %d PageTables:   %8lu kB\n"
+		       "Node %d PseudoFS:     %8lu kB\n"
 		       "Node %d NFS_Unstable: %8lu kB\n"
 		       "Node %d Bounce:       %8lu kB\n"
 		       "Node %d Slab:         %8lu kB\n"
@@ -83,6 +84,7 @@ static ssize_t node_read_meminfo(struct 
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
 		       nid, K(node_page_state(nid, NR_PAGETABLE)),
+		       nid, K(node_page_state(nid, NR_PSEUDO_FS_PAGES)),
 		       nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
 		       nid, K(node_page_state(nid, NR_BOUNCE)),
 		       nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
Index: linux-2.6.22/include/linux/mmzone.h
===================================================================
--- linux-2.6.22.orig/include/linux/mmzone.h
+++ linux-2.6.22/include/linux/mmzone.h
@@ -55,6 +55,7 @@ enum zone_stat_item {
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
 	NR_FILE_PAGES,
+	NR_PSEUDO_FS_PAGES, /* FS pages with no backing store eg. ramfs */
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
 	/* Second 128 byte cacheline */
Index: linux-2.6.22/mm/filemap.c
===================================================================
--- linux-2.6.22.orig/mm/filemap.c
+++ linux-2.6.22/mm/filemap.c
@@ -119,6 +119,8 @@ void __remove_from_page_cache(struct pag
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
+	if (mapping->backing_dev_info->capabilities & BDI_CAP_NO_WRITEBACK)
+		__dec_zone_page_state(page, NR_PSEUDO_FS_PAGES);
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 }
 
@@ -448,6 +450,9 @@ int add_to_page_cache(struct page *page,
 			page->mapping = mapping;
 			page->index = offset;
 			mapping->nrpages++;
+			if (mapping->backing_dev_info->capabilities
+				& BDI_CAP_NO_WRITEBACK)
+				__inc_zone_page_state(page, NR_PSEUDO_FS_PAGES);
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 		}
 		write_unlock_irq(&mapping->tree_lock);
Index: linux-2.6.22/mm/migrate.c
===================================================================
--- linux-2.6.22.orig/mm/migrate.c
+++ linux-2.6.22/mm/migrate.c
@@ -346,6 +346,11 @@ static int migrate_page_move_mapping(str
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
 
+	if (mapping->backing_dev_info->capabilities & BDI_CAP_NO_WRITEBACK) {
+		__dec_zone_page_state(page, NR_PSEUDO_FS_PAGES);
+		__inc_zone_page_state(newpage, NR_PSEUDO_FS_PAGES);
+	}
+
 	write_unlock_irq(&mapping->tree_lock);
 
 	return 0;
Index: linux-2.6.22/mm/vmscan.c
===================================================================
--- linux-2.6.22.orig/mm/vmscan.c
+++ linux-2.6.22/mm/vmscan.c
@@ -1627,6 +1627,7 @@ static int __zone_reclaim(struct zone *z
 		.swappiness = vm_swappiness,
 	};
 	unsigned long slab_reclaimable;
+	long unmapped_reclaimable;
 
 	disable_swap_token();
 	cond_resched();
@@ -1639,9 +1640,10 @@ static int __zone_reclaim(struct zone *z
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	if (zone_page_state(zone, NR_FILE_PAGES) -
-		zone_page_state(zone, NR_FILE_MAPPED) >
-		zone->min_unmapped_pages) {
+	unmapped_reclaimable = zone_page_state(zone, NR_FILE_PAGES) -
+				zone_page_state(zone, NR_PSEUDO_FS_PAGES) -
+				zone_page_state(zone, NR_FILE_MAPPED);
+	if (unmapped_reclaimable > (long) zone->min_unmapped_pages) {
 		/*
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
@@ -1688,6 +1690,7 @@ int zone_reclaim(struct zone *zone, gfp_
 {
 	cpumask_t mask;
 	int node_id;
+	long unmapped_reclaimable;
 
 	/*
 	 * Zone reclaim reclaims unmapped file backed pages and
@@ -1699,8 +1702,10 @@ int zone_reclaim(struct zone *zone, gfp_
 	 * if less than a specified percentage of the zone is used by
 	 * unmapped file backed pages.
 	 */
-	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+	unmapped_reclaimable = zone_page_state(zone, NR_FILE_PAGES) -
+				zone_page_state(zone, NR_PSEUDO_FS_PAGES) -
+				zone_page_state(zone, NR_FILE_MAPPED);
+	if (unmapped_reclaimable <= (long) zone->min_unmapped_pages
 	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
 			<= zone->min_slab_pages)
 		return 0;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-27 23:27 [rfc] [patch] mm: zone_reclaim fix for pseudo file systems Ravikiran G Thirumalai
@ 2007-07-30 18:12 ` Christoph Lameter
  2007-07-30 20:23 ` Andrew Morton
  2007-07-31  2:19 ` Christoph Lameter
  2 siblings, 0 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-30 18:12 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: linux-mm, Andrew Morton, Christoph Lameter, shai

On Fri, 27 Jul 2007, Ravikiran G Thirumalai wrote:

> Don't go into zone_reclaim if there are no reclaimable pages.
> 
> While using RAMFS as scratch space for some tests, we found one of the
> processes got into zone reclaim, and got stuck trying to reclaim pages
> from a zone.  On examination of the code, we found that the VM was fooled
> into believing that the zone had reclaimable pages, when it actually had
> RAMFS backed pages, which could not be written back to the disk.
> 
> Fix this by adding a zvc "NR_PSEUDO_FS_PAGES" for file pages with no
> backing store, and using this counter to determine if reclaim is possible.

That is another case where we need a counter for unreclaimable pages. The 
other types of pages that need this are mlocked pages and anonymous pages 
if we have no swap. Could you look at Nick's and my work on mlocked pages 
and come up with a general solution that covers all these cases?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-27 23:27 [rfc] [patch] mm: zone_reclaim fix for pseudo file systems Ravikiran G Thirumalai
  2007-07-30 18:12 ` Christoph Lameter
@ 2007-07-30 20:23 ` Andrew Morton
  2007-07-30 20:31   ` Christoph Lameter
  2007-07-31  0:01   ` Ravikiran G Thirumalai
  2007-07-31  2:19 ` Christoph Lameter
  2 siblings, 2 replies; 34+ messages in thread
From: Andrew Morton @ 2007-07-30 20:23 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: linux-mm, Christoph Lameter, shai

On Fri, 27 Jul 2007 16:27:53 -0700
Ravikiran G Thirumalai <kiran@scalex86.org> wrote:

> Don't go into zone_reclaim if there are no reclaimable pages.
> 
> While using RAMFS as scratch space for some tests, we found one of the
> processes got into zone reclaim, and got stuck trying to reclaim pages
> from a zone.

Would like to see an expanded definition of "stuck", please ;)

ie: let's see the bug report before we see the fix?

>  On examination of the code, we found that the VM was fooled
> into believing that the zone had reclaimable pages, when it actually had
> RAMFS backed pages, which could not be written back to the disk.
> 
> Fix this by adding a zvc "NR_PSEUDO_FS_PAGES" for file pages with no
> backing store, and using this counter to determine if reclaim is possible.
> 
> Patch tested,on 2.6.22.  Fixes the above mentioned problem.

The (cheesy) way in which reclaim currently handles this sort of thing is
to scan like mad, then to eventually set zone->all_unreclaimable.  Once
that has been set, the kernel will reduce the amount of scanning effort it
puts into that zone by a very large amount.  If the zone later comes back
to life, all_unreclaimable gets cleared and things proceed as normal.

All a bit nasty, but it has the advantage of covering _all_ these
scenarios, while a more precise fix such as the one you propose covers only
one of them.

So...  perhaps zone_reclaim() is failing to honour the all_unreclaimable
thing in some fashion?

> Comments?

It is a numa-specific change which adds overhead to non-NUMA builds :(


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-30 20:23 ` Andrew Morton
@ 2007-07-30 20:31   ` Christoph Lameter
  2007-07-30 21:12     ` Lee Schermerhorn
  2007-07-31  0:01   ` Ravikiran G Thirumalai
  1 sibling, 1 reply; 34+ messages in thread
From: Christoph Lameter @ 2007-07-30 20:31 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, Christoph Lameter, shai

On Mon, 30 Jul 2007, Andrew Morton wrote:

> It is a numa-specific change which adds overhead to non-NUMA builds :(

It could be generalized to fix the other issues that we have with 
unreclaimable pages.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-30 20:31   ` Christoph Lameter
@ 2007-07-30 21:12     ` Lee Schermerhorn
  0 siblings, 0 replies; 34+ messages in thread
From: Lee Schermerhorn @ 2007-07-30 21:12 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Ravikiran G Thirumalai, linux-mm, Christoph Lameter, shai

On Mon, 2007-07-30 at 13:31 -0700, Christoph Lameter wrote:
> On Mon, 30 Jul 2007, Andrew Morton wrote:
> 
> > It is a numa-specific change which adds overhead to non-NUMA builds :(
> 
> It could be generalized to fix the other issues that we have with 
> unreclaimable pages.
> 

For example, see the following patches that I posted in response to a
discussion between Andrew, Rik van Riel and Andrea Arcangeli to
resounding silence [for which, perhaps, I should be grateful?]: 

http://marc.info/?l=linux-mm&m=118315682007044&w=4
http://marc.info/?l=linux-mm&m=118315703313729&w=4
http://marc.info/?l=linux-mm&m=118315713323641&w=4
http://marc.info/?l=linux-mm&m=118315742025334&w=4

[By the way:  I have another experimental patch in this series that uses
Rik's page_anon() function from his "split LRU lists" patch to detect
swap backed pages and push them to the "no reclaim list" when no swap
space is available.]

I haven't thought about it much, but perhaps my "page_reclaimable()"
function could be taught to exclude RAMFS pages as well?

Later,
Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-30 20:23 ` Andrew Morton
  2007-07-30 20:31   ` Christoph Lameter
@ 2007-07-31  0:01   ` Ravikiran G Thirumalai
  2007-07-31  0:20     ` Andrew Morton
  1 sibling, 1 reply; 34+ messages in thread
From: Ravikiran G Thirumalai @ 2007-07-31  0:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, Christoph Lameter, shai

On Mon, Jul 30, 2007 at 01:23:14PM -0700, Andrew Morton wrote:
>On Fri, 27 Jul 2007 16:27:53 -0700
>Ravikiran G Thirumalai <kiran@scalex86.org> wrote:
>
>> Don't go into zone_reclaim if there are no reclaimable pages.
>> 
>> While using RAMFS as scratch space for some tests, we found one of the
>> processes got into zone reclaim, and got stuck trying to reclaim pages
>> from a zone.
>
>Would like to see an expanded definition of "stuck", please ;)

Well, we were running a multiprocess finite element analysis HPC benchmark,
and one of the processes went into 'system' and the benchmark never completed.
Of course this happens only when we use ramfs for scratch IO.  What I mean
is, on invoking 'top', we could see that one of the processes was spending
all its time in system - 100% system, for a compute benchmark which should
not be spending any time in the system at all.

>
>ie: let's see the bug report before we see the fix?
>
>>  On examination of the code, we found that the VM was fooled
>> into believing that the zone had reclaimable pages, when it actually had
>> RAMFS backed pages, which could not be written back to the disk.
>> 
>> Fix this by adding a zvc "NR_PSEUDO_FS_PAGES" for file pages with no
>> backing store, and using this counter to determine if reclaim is possible.
>> 
>> Patch tested,on 2.6.22.  Fixes the above mentioned problem.
>
>The (cheesy) way in which reclaim currently handles this sort of thing is
>to scan like mad, then to eventually set zone->all_unreclaimable.  Once
>that has been set, the kernel will reduce the amount of scanning effort it
>puts into that zone by a very large amount.  If the zone later comes back
>to life, all_unreclaimable gets cleared and things proceed as normal.

I see.  But this obviously does not work in this case.  I have noticed the
process getting into 'system' and staying there for hours.  I have never
noticed the app complete.  Perhaps because I did not wait long enough.
So do you think a more aggressive auto setting/unsetting of 'all_unreclaimable'
is a better approach?

> ...
>It is a numa-specific change which adds overhead to non-NUMA builds :(

I can (and will) place it with other NUMA specific counters, so the non-NUMA
builds will not have any overhead.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  0:01   ` Ravikiran G Thirumalai
@ 2007-07-31  0:20     ` Andrew Morton
  2007-07-31  0:27       ` Christoph Lameter
  2007-07-31  1:36       ` Ravikiran G Thirumalai
  0 siblings, 2 replies; 34+ messages in thread
From: Andrew Morton @ 2007-07-31  0:20 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: linux-mm, Christoph Lameter, shai

On Mon, 30 Jul 2007 17:01:38 -0700
Ravikiran G Thirumalai <kiran@scalex86.org> wrote:

> >The (cheesy) way in which reclaim currently handles this sort of thing is
> >to scan like mad, then to eventually set zone->all_unreclaimable.  Once
> >that has been set, the kernel will reduce the amount of scanning effort it
> >puts into that zone by a very large amount.  If the zone later comes back
> >to life, all_unreclaimable gets cleared and things proceed as normal.
> 
> I see.  But this obviously does not work in this case.  I have noticed the
> process getting into 'system' and staying there for hours.  I have never
> noticed the app complete.  Perhaps because I did not wait long enough.
> So do you think a more aggressive auto setting/unsetting of 'all_unreclaimable'
> is a better approach?

The problem is that __zone_reclaim() doesn't use all_unreclaimable at all.
You'll note that all the other callers of shrink_zone() do take avoiding
action if the zone is in all_unreclaimable state, but __zone_reclaim() forgot
to.

Fixing that could/should fix your CPU consumption problem.  It will further
propagate the existing lameness, but replacing all_unreclaimable with something
more efficient, more accurate and more complex is a separate problem.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  0:20     ` Andrew Morton
@ 2007-07-31  0:27       ` Christoph Lameter
  2007-07-31  1:06         ` Andrew Morton
  2007-07-31  1:56         ` Ravikiran G Thirumalai
  2007-07-31  1:36       ` Ravikiran G Thirumalai
  1 sibling, 2 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31  0:27 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, Christoph Lameter, shai

On Mon, 30 Jul 2007, Andrew Morton wrote:

> The problem is that __zone_reclaim() doesn't use all_unreclaimable at all.
> You'll note that all the other callers of shrink_zone() do take avoiding
> action if the zone is in all_unreclaimable state, but __zone_reclaim() forgot
> to.

zone reclaim only runs if there are unmapped file backed pages that can be 
reclaimed. If the pages are all unreclaimable then they are all mapped and 
global reclaim begins to run. The problem is with global reclaim as far as 
I know.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  0:27       ` Christoph Lameter
@ 2007-07-31  1:06         ` Andrew Morton
  2007-07-31  1:52           ` Christoph Lameter
  2007-07-31  1:56         ` Ravikiran G Thirumalai
  1 sibling, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-07-31  1:06 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Ravikiran G Thirumalai, linux-mm, Christoph Lameter, shai

On Mon, 30 Jul 2007 17:27:41 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:

> On Mon, 30 Jul 2007, Andrew Morton wrote:
> 
> > The problem is that __zone_reclaim() doesn't use all_unreclaimable at all.
> > You'll note that all the other callers of shrink_zone() do take avoiding
> > action if the zone is in all_unreclaimable state, but __zone_reclaim() forgot
> > to.
> 
> zone reclaim only runs if there are unmapped file backed pages that can be 
> reclaimed. If the pages are all unreclaimable then they are all mapped and 
> global reclaim begins to run. The problem is with global reclaim as far as 
> I know.

I don't understand how you conclude that.

- Kiran saw CPU meltdown when "one of the processes got into zone reclaim".

- all_unreclaimable is there specifically to prevent CPU meltdown

- zone_reclaim doesn't utilise all_unreclaimable.

so..

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  0:20     ` Andrew Morton
  2007-07-31  0:27       ` Christoph Lameter
@ 2007-07-31  1:36       ` Ravikiran G Thirumalai
  2007-07-31  1:53         ` Andrew Morton
  1 sibling, 1 reply; 34+ messages in thread
From: Ravikiran G Thirumalai @ 2007-07-31  1:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, Christoph Lameter, shai

On Mon, Jul 30, 2007 at 05:20:07PM -0700, Andrew Morton wrote:
>On Mon, 30 Jul 2007 17:01:38 -0700
>Ravikiran G Thirumalai <kiran@scalex86.org> wrote:
>
>> >The (cheesy) way in which reclaim currently handles this sort of thing is
>> >to scan like mad, then to eventually set zone->all_unreclaimable.  Once
>> >that has been set, the kernel will reduce the amount of scanning effort it
>> >puts into that zone by a very large amount.  If the zone later comes back
>> >to life, all_unreclaimable gets cleared and things proceed as normal.
>> 
>> I see.  But this obviously does not work in this case.  I have noticed the
>> process getting into 'system' and staying there for hours.  I have never
>> noticed the app complete.  Perhaps because I did not wait long enough.
>> So do you think a more aggressive auto setting/unsetting of 'all_unreclaimable'
>> is a better approach?
>
>The problem is that __zone_reclaim() doesn't use all_unreclaimable at all.
>You'll note that all the other callers of shrink_zone() do take avoiding
>action if the zone is in all_unreclaimable state, but __zone_reclaim() forgot
>to.

Ummm... zone_reclaim does look at all_unreclaimable:

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
...
...
        /*
         * Avoid concurrent zone reclaims, do not reclaim in a zone that
         * does
         * not have reclaimable pages and if we should not delay the
         * allocation
         * then do not scan.
         */
        if (!(gfp_mask & __GFP_WAIT) ||
                zone->all_unreclaimable ||
                atomic_read(&zone->reclaim_in_progress) > 0 ||
                (current->flags & PF_MEMALLOC))
                        return 0;

I guess it is not being set correctly for unreclaimable (pseudo fs) pages.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  1:06         ` Andrew Morton
@ 2007-07-31  1:52           ` Christoph Lameter
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31  1:52 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007, Andrew Morton wrote:

> > zone reclaim only runs if there are unmapped file backed pages that can be 
> > reclaimed. If the pages are all unreclaimable then they are all mapped and 
> > global reclaim begins to run. The problem is with global reclaim as far as 
> > I know.
> 
> I don't understand how you conclude that.

That is how it is coded:

      if (zone_page_state(zone, NR_FILE_PAGES) -
                zone_page_state(zone, NR_FILE_MAPPED) >
                zone->min_unmapped_pages) {
                /*
                 * Free memory by calling shrink zone with increasing
                 * priorities until we have enough memory freed.
                 */
                priority = ZONE_RECLAIM_PRIORITY;
                do {
                        note_zone_scanning_priority(zone, priority);
                        nr_reclaimed += shrink_zone(priority, zone, &sc);
                        priority--;
                } while (priority >= 0 && nr_reclaimed < nr_pages);
        }
 
> - Kiran saw CPU meltdown when "one of the processes got into zone reclaim".

We have seen the meltdown with regular reclaim in a number of cases due to
unreclaimable pages on the LRU causing heavy lock contention.

AFAIK There must be file backed pages that are unmapped and are not 
reclaimable for zone reclaim to get into this state.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  1:36       ` Ravikiran G Thirumalai
@ 2007-07-31  1:53         ` Andrew Morton
  2007-07-31  1:56           ` Christoph Lameter
  0 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-07-31  1:53 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: linux-mm, Christoph Lameter, shai

On Mon, 30 Jul 2007 18:36:49 -0700 Ravikiran G Thirumalai <kiran@scalex86.org> wrote:

> On Mon, Jul 30, 2007 at 05:20:07PM -0700, Andrew Morton wrote:
> >On Mon, 30 Jul 2007 17:01:38 -0700
> >Ravikiran G Thirumalai <kiran@scalex86.org> wrote:
> >
> >> >The (cheesy) way in which reclaim currently handles this sort of thing is
> >> >to scan like mad, then to eventually set zone->all_unreclaimable.  Once
> >> >that has been set, the kernel will reduce the amount of scanning effort it
> >> >puts into that zone by a very large amount.  If the zone later comes back
> >> >to life, all_unreclaimable gets cleared and things proceed as normal.
> >> 
> >> I see.  But this obviously does not work in this case.  I have noticed the
> >> process getting into 'system' and staying there for hours.  I have never
> >> noticed the app complete.  Perhaps because I did not wait long enough.
> >> So do you think a more aggressive auto setting/unsetting of 'all_unreclaimable'
> >> is a better approach?
> >
> >The problem is that __zone_reclaim() doesn't use all_unreclaimable at all.
> >You'll note that all the other callers of shrink_zone() do take avoiding
> >action if the zone is in all_unreclaimable state, but __zone_reclaim() forgot
> >to.
> 
> Ummm... zone_reclaim does look at all_unreclaimable:

oh crap then we don't know what's going on.  At least, I don't.

> int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
> ...
> ...
>         /*
>          * Avoid concurrent zone reclaims, do not reclaim in a zone that
>          * does
>          * not have reclaimable pages and if we should not delay the
>          * allocation
>          * then do not scan.
>          */
>         if (!(gfp_mask & __GFP_WAIT) ||
>                 zone->all_unreclaimable ||
>                 atomic_read(&zone->reclaim_in_progress) > 0 ||
>                 (current->flags & PF_MEMALLOC))
>                         return 0;
> 
> I guess it is not being set correctly for unreclaimable (pseudo fs) pages.

It doesn't care what type of page we're looking at.

umm, OK, perhaps the problem is that all_unreclaimable isn't getting set,
rather than that we aren't testing it.

Note that shrink_zones() and balance_pgdat() will set all_unreclaimable if
things get screwed up, but afaict zone_reclaim() doesn't.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  1:53         ` Andrew Morton
@ 2007-07-31  1:56           ` Christoph Lameter
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31  1:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007, Andrew Morton wrote:

> Note that shrink_zones() and balance_pgdat() will set all_unreclaimable if
> things get screwed up, but afaict zone_reclaim() doesn't.

I also do not get this. Looks like some VM counters must have gotten 
screwed up?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  0:27       ` Christoph Lameter
  2007-07-31  1:06         ` Andrew Morton
@ 2007-07-31  1:56         ` Ravikiran G Thirumalai
  2007-07-31  2:01           ` Christoph Lameter
  1 sibling, 1 reply; 34+ messages in thread
From: Ravikiran G Thirumalai @ 2007-07-31  1:56 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrew Morton, linux-mm, Christoph Lameter, shai

On Mon, Jul 30, 2007 at 05:27:41PM -0700, Christoph Lameter wrote:
>On Mon, 30 Jul 2007, Andrew Morton wrote:
>
>> The problem is that __zone_reclaim() doesn't use all_unreclaimable at all.
>> You'll note that all the other callers of shrink_zone() do take avoiding
>> action if the zone is in all_unreclaimable state, but __zone_reclaim() forgot
>> to.
>
>zone reclaim only runs if there are unmapped file backed pages that can be 
>reclaimed. 

Yes, and in this case, without the patch, VM considers RAMFS pages to be
file backed, thus being fooled into entering reclaim.  The process entering
into reclaim in our tests gets in through zone_reclaim.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  1:56         ` Ravikiran G Thirumalai
@ 2007-07-31  2:01           ` Christoph Lameter
  2007-07-31  2:27             ` Andrew Morton
  0 siblings, 1 reply; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31  2:01 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: Andrew Morton, linux-mm, shai

On Mon, 30 Jul 2007, Ravikiran G Thirumalai wrote:

> On Mon, Jul 30, 2007 at 05:27:41PM -0700, Christoph Lameter wrote:
> >On Mon, 30 Jul 2007, Andrew Morton wrote:
> >
> >> The problem is that __zone_reclaim() doesn't use all_unreclaimable at all.
> >> You'll note that all the other callers of shrink_zone() do take avoiding
> >> action if the zone is in all_unreclaimable state, but __zone_reclaim() forgot
> >> to.
> >
> >zone reclaim only runs if there are unmapped file backed pages that can be 
> >reclaimed. 
> 
> Yes, and in this case, without the patch, VM considers RAMFS pages to be
> file backed, thus being fooled into entering reclaim.  The process entering
> into reclaim in our tests gets in through zone_reclaim.

That means RAMFS pages are accounted as NR_FILE_PAGES but not as 
NR_FILE_MAPPED..... So we have unmapped pages that are not reclaimable.

But they are not really file backed pages. They are backed by memory. Can 
we just not increment NR_FILE_MAPPED? Should they not be accounted for as 
NR_ANON_PAGES or so?


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-27 23:27 [rfc] [patch] mm: zone_reclaim fix for pseudo file systems Ravikiran G Thirumalai
  2007-07-30 18:12 ` Christoph Lameter
  2007-07-30 20:23 ` Andrew Morton
@ 2007-07-31  2:19 ` Christoph Lameter
  2 siblings, 0 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31  2:19 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: linux-mm, Andrew Morton, shai

On Fri, 27 Jul 2007, Ravikiran G Thirumalai wrote:

>  		       "Node %d Mapped:       %8lu kB\n"
>  		       "Node %d AnonPages:    %8lu kB\n"
>  		       "Node %d PageTables:   %8lu kB\n"
> +		       "Node %d PseudoFS:     %8lu kB\n"
>  		       "Node %d NFS_Unstable: %8lu kB\n"

			 Exempt from Reclaim: %8lu kb ?
Call this NR_FILE_UNRECLAIMABLE? Those pages should not end up on the LRU.

We likely will need NR_ANON_UNRECLAIMABLE if we do the removal of mlocked 
pages from the LRU. Mlocked pages then may have
to be accounted depending on them being file backed or not.

And keep this count out of NR_FILE_PAGES. Then we won't have to change zone 
reclaim.

> Index: linux-2.6.22/mm/filemap.c
> ===================================================================
> --- linux-2.6.22.orig/mm/filemap.c
> +++ linux-2.6.22/mm/filemap.c
> @@ -119,6 +119,8 @@ void __remove_from_page_cache(struct pag
>  	radix_tree_delete(&mapping->page_tree, page->index);
>  	page->mapping = NULL;
>  	mapping->nrpages--;
> +	if (mapping->backing_dev_info->capabilities & BDI_CAP_NO_WRITEBACK)
> +		__dec_zone_page_state(page, NR_PSEUDO_FS_PAGES);

We probably need a BDI_CAP_UNRECLAIMABLE or so?
Do not increment NR_FILE_PAGES for BDI_CAP_UNRECLAIMABLE.


	else

	>  	__dec_zone_page_state(page, NR_FILE_PAGES);
>  }
>  
> @@ -448,6 +450,9 @@ int add_to_page_cache(struct page *page,
>  			page->mapping = mapping;
>  			page->index = offset;
>  			mapping->nrpages++;
> +			if (mapping->backing_dev_info->capabilities
> +				& BDI_CAP_NO_WRITEBACK)
> +				__inc_zone_page_state(page, NR_PSEUDO_FS_PAGES);
			else
	>  			__inc_zone_page_state(page, NR_FILE_PAGES);
>  		}
>  		write_unlock_irq(&mapping->tree_lock);

> Index: linux-2.6.22/mm/migrate.c
> ===================================================================
> --- linux-2.6.22.orig/mm/migrate.c
> +++ linux-2.6.22/mm/migrate.c
> @@ -346,6 +346,11 @@ static int migrate_page_move_mapping(str
>  	__dec_zone_page_state(page, NR_FILE_PAGES);
>  	__inc_zone_page_state(newpage, NR_FILE_PAGES);
>  
> +	if (mapping->backing_dev_info->capabilities & BDI_CAP_NO_WRITEBACK) {
> +		__dec_zone_page_state(page, NR_PSEUDO_FS_PAGES);
> +		__inc_zone_page_state(newpage, NR_PSEUDO_FS_PAGES);
> +	}
> +
>  	write_unlock_irq(&mapping->tree_lock);

If unreclaimable pages are not on the LRU then you do not need this.
  
>  	return 0;
> Index: linux-2.6.22/mm/vmscan.c
> ===================================================================
> --- linux-2.6.22.orig/mm/vmscan.c
> +++ linux-2.6.22/mm/vmscan.c

None of the modifications to vmscan.c are needed.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  2:01           ` Christoph Lameter
@ 2007-07-31  2:27             ` Andrew Morton
  2007-07-31  2:36               ` Christoph Lameter
  0 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-07-31  2:27 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007 19:01:07 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:

> On Mon, 30 Jul 2007, Ravikiran G Thirumalai wrote:
> 
> > On Mon, Jul 30, 2007 at 05:27:41PM -0700, Christoph Lameter wrote:
> > >On Mon, 30 Jul 2007, Andrew Morton wrote:
> > >
> > >> The problem is that __zone_reclaim() doesn't use all_unreclaimable at all.
> > >> You'll note that all the other callers of shrink_zone() do take avoiding
> > >> action if the zone is in all_unreclaimable state, but __zone_reclaim() forgot
> > >> to.
> > >
> > >zone reclaim only runs if there are unmapped file backed pages that can be 
> > >reclaimed. 
> > 
> > Yes, and in this case, without the patch, VM considers RAMFS pages to be
> > file backed, thus being fooled into entering reclaim.  The process entering
> > into reclaim in our tests gets in through zone_reclaim.

Oh.. So this:

	/*
	 * Zone reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated. So we do not reclaim
	 * if less than a specified percentage of the zone is used by
	 * unmapped file backed pages.
	 */
	if (zone_page_state(zone, NR_FILE_PAGES) -
	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
			<= zone->min_slab_pages)
		return 0;

is being fooled.

That makes sense, but any fix we do here won't fix things for regular
reclaim.

Sigh, I should have spotted that bug on day one - it's pretty gross.  Too
many patches, too little akpm.

> That means RAMFS pages are accounted as NR_FILE_PAGES but not as 
> NR_FILE_MAPPED..... So we have unmapped pages that are not reclaimable.
> 
> But they are not really file backed pages. They are backed by memory. Can 
> we just not increment NR_FILE_MAPPED? Should they not be accounted for an 
> NR_ANON_PAGES or so?

Or we change NR_FILE_MAPPED accounting so that it doesn't account
BDI_CAP_foo pages, where foo is, I guess, NO_WRITEBACK.

We're going to create a mess here, I can feel it.  Please, ignore "what works".
What is _right_ here?  What is our design?  Our guiding principle?  Because we
already have a mess.

Straw man proposal:

- account file-backed pages, BDI_CAP_NO_ACCT_DIRTY pages and
  BDI_CAP_NO_WRITEBACK separately.  ie: zone accounting pretty
  much follows the BDI_CAP_ selectors.

- work out, then document what those BDI_CAP_* things actually _mean_.  ie:
  which sorts of callers should set them, and why.

- do the appropriate arith at reclaim-time.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  2:27             ` Andrew Morton
@ 2007-07-31  2:36               ` Christoph Lameter
  2007-07-31  4:47                 ` Andrew Morton
  0 siblings, 1 reply; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31  2:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007, Andrew Morton wrote:

> That makes sense, but any fix we do here won't fix things for regular
> reclaim.

Standard reclaim has the same issues. It uselessly keeps 
scanning the unreclaimable file backed pages. Fixing this will also 
enhance regular reclaim.

> - account file-backed pages, BDI_CAP_NO_ACCT_DIRTY pages and
>   BDI_CAP_NO_WRITEBACK separately.  ie: zone accounting pretty
>   much follows the BDI_CAP_ selectors.

Or BDI_CAP_UNRECLAIMABLE.... 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  2:36               ` Christoph Lameter
@ 2007-07-31  4:47                 ` Andrew Morton
  2007-07-31  5:00                   ` Christoph Lameter
  0 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-07-31  4:47 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007 19:36:04 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:

> On Mon, 30 Jul 2007, Andrew Morton wrote:
> 
> > That makes sense, but any fix we do here won't fix things for regular
> > reclaim.
> 
> Standard reclaim has the same issues. It uselessly keeps 
> scanning the unreclaimable file backed pages.

Well it shouldn't.  That's what all_unreclaimable is for.  And it does
work.  Or used to, five years ago.  Stuff like this has a habit of breaking
because we don't have a test suite.

> Fixing this will also 
> enhance regular reclaim.
> 
> > - account file-backed pages, BDI_CAP_NO_ACCT_DIRTY pages and
> >   BDI_CAP_NO_WRITEBACK separately.  ie: zone accounting pretty
> >   much follows the BDI_CAP_ selectors.
> 
> Or BDI_CAP_UNRECLAIMABLE.... 

Yeah, that's nice and direct.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  4:47                 ` Andrew Morton
@ 2007-07-31  5:00                   ` Christoph Lameter
  2007-07-31  5:17                     ` Andrew Morton
  2007-07-31  7:15                     ` Ravikiran G Thirumalai
  0 siblings, 2 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31  5:00 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007, Andrew Morton wrote:

> On Mon, 30 Jul 2007 19:36:04 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:
> 
> > On Mon, 30 Jul 2007, Andrew Morton wrote:
> > 
> > > That makes sense, but any fix we do here won't fix things for regular
> > > reclaim.
> > 
> > Standard reclaim has the same issues. It uselessly keeps 
> > scanning the unreclaimable file backed pages.
> 
> Well it shouldn't.  That's what all_unreclaimable is for.  And it does
> work.  Or used to, five years ago.  Stuff like this has a habit of breaking
> because we don't have a test suite.

The current VM has never been able to handle it since we have never had 
logic to remove unreclaimable pages from the LRU.

Let's bring up the patchsets for the handling of unreclaimable pages 
(mlocked and anonymous/no swap) again and make sure that they also 
address the issue here so that we have a comprehensive solution.

I am going over my old patchsets anyways. Kiran: Did you have a look at 
the patches Nick and I did earlier this year for mlocked pages?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  5:00                   ` Christoph Lameter
@ 2007-07-31  5:17                     ` Andrew Morton
  2007-07-31  5:33                       ` Christoph Lameter
  2007-07-31  7:15                     ` Ravikiran G Thirumalai
  1 sibling, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-07-31  5:17 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007 22:00:15 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:

> On Mon, 30 Jul 2007, Andrew Morton wrote:
> 
> > On Mon, 30 Jul 2007 19:36:04 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:
> > 
> > > On Mon, 30 Jul 2007, Andrew Morton wrote:
> > > 
> > > > That makes sense, but any fix we do here won't fix things for regular
> > > > reclaim.
> > > 
> > > Standard reclaim has the same issues. It uselessly keeps 
> > > scanning the unreclaimable file backed pages.
> > 
> > Well it shouldn't.  That's what all_unreclaimable is for.  And it does
> > work.  Or used to, five years ago.  Stuff like this has a habit of breaking
> > because we don't have a test suite.
> 
> The current VM has never been able to handle it since we have never had 
> logic to remove unreclaimable pages from the LRU.

Nonsense.  The VM used to handle it just fine.  That's what I wrote the
all_unreclaimable logic *for*.  It wasn't just added as typing practice.

Here's the changelog, from 22 Nov 2002:

	[PATCH] handle zones which are full of unreclaimable pages
	
	This patch is a general solution to the situation where a zone is full
	of pinned pages.
	
	This can come about if:
	
	a) Someone has allocated all of ZONE_DMA for IO buffers
	
	b) Some application is mlocking some memory and a zone ends up full
	   of mlocked pages (can happen on a 1G ia32 system)
	
	c) All of ZONE_HIGHMEM is pinned in hugetlb pages (can happen on 1G
	   machines)
	
	We'll currently burn 10% of CPU in kswapd when this happens, although
	it is quite hard to trigger.
	
	The algorithm is:
	
	- If page reclaim has scanned 2 * the total number of pages in the
	  zone and there have been no pages freed in that zone then mark the
	  zone as "all unreclaimable".
	
	- When a zone is "all unreclaimable" page reclaim almost ignores it.
	  We will perform a "light" scan at DEF_PRIORITY (typically 1/4096'th of
	  the zone, or 64 pages) and then forget about the zone.
	
	- When a batch of pages are freed into the zone, clear its "all
	  unreclaimable" state and start full scanning again.  The assumption
	  being that some state change has come about which will make reclaim
	  successful again.
	
	  So if a "light scan" actually frees some pages, the zone will revert to
	  normal state immediately.
	
	So we're effectively putting the zone into "low power" mode, and lightly
	polling it to see if something has changed.
	
	The code works OK, but is quite hard to test - I mainly tested it by
	pinning all highmem in hugetlb pages.


See?  "general".

Now it may be that someone broke it since then, sure.  But I haven't seen a
cogent bug report or test case for all the things you're waving hands at me
over.  Where's the git-bisect result?

> Lets bring up the patchsets for the handling of unreclaimable pages up 
> again (mlocked and anonymous/no swap) again and make sure that it also 
> addresses the issue issue here so that we have a comprehensive solution.

No, let us not.  If the existing crap isn't working as it should (and as it
used to) let us first fix (or at least understand) that before adding more
crap.

No?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  5:17                     ` Andrew Morton
@ 2007-07-31  5:33                       ` Christoph Lameter
  2007-07-31  5:58                         ` Andrew Morton
  0 siblings, 1 reply; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31  5:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007, Andrew Morton wrote:

> Nonsense.  The VM used to handle it just fine.  That's what I wrote the
> all_unreclaimable logic *for*.  It wasn't just added as typing practice.

That is if the whole zone is unreclaimable. The problems that we want to 
solve are due to parts of a zone being unreclaimable and due to the VM 
counters giving an inaccurate picture of the memory situation.

> Here's the changelog, from 22 Nov 2002:
> 
> 	[PATCH] handle zones which are full of unreclaimable pages
> 	
> 	This patch is a general solution to the situation where a zone is full
> 	of pinned pages.
> 	
> 	This can come about if:
> 	
> 	a) Someone has allocated all of ZONE_DMA for IO buffers
> 	
> 	b) Some application is mlocking some memory and a zone ends up full
> 	   of mlocked pages (can happen on a 1G ia32 system)
> 	
> 	c) All of ZONE_HIGHMEM is pinned in hugetlb pages (can happen on 1G
> 	   machines)
> 	
> 	We'll currently burn 10% of CPU in kswapd when this happens, although
> 	it is quite hard to trigger.
> 	
> 	The algorithm is:
> 	
> 	- If page reclaim has scanned 2 * the total number of pages in the
> 	  zone and there have been no pages freed in that zone then mark the
> 	  zone as "all unreclaimable".
> 	
> 	- When a zone is "all unreclaimable" page reclaim almost ignores it.
> 	  We will perform a "light" scan at DEF_PRIORITY (typically 1/4096'th of
> 	  the zone, or 64 pages) and then forget about the zone.
> 	
> 	- When a batch of pages are freed into the zone, clear its "all
> 	  unreclaimable" state and start full scanning again.  The assumption
> 	  being that some state change has come about which will make reclaim
> 	  successful again.
> 	
> 	  So if a "light scan" actually frees some pages, the zone will revert to
> 	  normal state immediately.
> 	
> 	So we're effectively putting the zone into "low power" mode, and lightly
> 	polling it to see if something has changed.
> 	
> 	The code works OK, but is quite hard to test - I mainly tested it by
> 	pinning all highmem in hugetlb pages.
> 
> 
> See?  "general".

Nope. It's a special situation in which the whole zone has become 
unhandleable by the reclaim logic so it gives up and waits for things 
somehow to get better. During that time we cannot allocate from a zone 
which typically makes a vital zone or a node unusable. In a NUMA 
configuration performance degrades in unacceptable ways.

What we want is to remove the unreclaimable pages from the LRU and have 
reclaim continue on the remainder of the zone.

> No, let us not.  If the existing crap isn't working as it should (and as it
> used to) let us first fix (or at least understand) that before adding more
> crap.
> 
> No?

The all_unreclaimable logic is different. It was never designed to 
remove the unreclaimable pages.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  5:33                       ` Christoph Lameter
@ 2007-07-31  5:58                         ` Andrew Morton
  2007-07-31  6:09                           ` Christoph Lameter
  2007-07-31  8:27                           ` Ravikiran G Thirumalai
  0 siblings, 2 replies; 34+ messages in thread
From: Andrew Morton @ 2007-07-31  5:58 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007 22:33:03 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:

> On Mon, 30 Jul 2007, Andrew Morton wrote:
> 
> > Nonsense.  The VM used to handle it just fine.  That's what I wrote the
> > all_unreclaimable logic *for*.  It wasn't just added as typing practice.
> 
> That is if the whole zone is unreclaimable. The problems that we want to 
> solve are due to parts of a zone being unreclaimable and due to the VM 
> counters giving an inaccurate picture of the memory situation.

Where is the evidence that this is happening in Kiran's situation?

> > See?  "general".
> 
> Nope. Its a special situation in which the whole zone has become 
> unhandleable by the reclaim logic so it gives up and waits for things 
> somehow to get better.

yes.

> During that time we cannot allocate from a zone 
> which typically makes a vital zone or a node unusuable.

Of course you can't - there are no free pages and none are reclaimable.

> In a NUMA 
> configuration performance degrades in unacceptable ways.

No it won't - you must be referring to something else, or speculating.

> What we want is to remove the unreclaimable pages from the LRU and have 
> reclaim continue on the remainder of the zone.

Well that might be what we want.  afacit we don't know yet.

> > No, let us not.  If the existing crap isn't working as it should (and as it
> > used to) let us first fix (or at least understand) that before adding more
> > crap.
> > 
> > No?
> 
> The all_reclaimable logic is different. It was never been designed to 
> remove the unreclaimable pages.

Of course not.  But I don't know how you can be proposing solutions
without yet knowing what the problem is.

The first thing Kiran should have done was to gather a kernel profile.  If
we're spending a lot (probably half) of time in shrink_active_list() then
yeah, that's a plausible theory.

And yes, keeping these pages off the LRU does make sense, and it is heaps
easier to handle than mlocked pages.

Sorry, I just go crazy when I see these random pokes at the VM which
are nowhere near being backed by sufficient analysis of the problem
which they allegedly solve.

The _theory_ here is that a large number (but not all) of the pages
in the zone are in ramfs and so page reclaim is making some progress,
but reclaim efficiency is low, hence there is high CPU consumption.

OK, plausible.  But where's the *proof*?  We probably already have 
sufficient statistics to be able to prove this.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  5:58                         ` Andrew Morton
@ 2007-07-31  6:09                           ` Christoph Lameter
  2007-07-31  6:18                             ` Andrew Morton
  2007-07-31  8:27                           ` Ravikiran G Thirumalai
  1 sibling, 1 reply; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31  6:09 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007, Andrew Morton wrote:

> > That is if the whole zone is unreclaimable. The problems that we want to 
> > solve are due to parts of a zone being unreclaimable and due to the VM 
> > counters giving an inaccurate picture of the memory situation.
> 
> Where is the evidence that this is happening in Kiran's situation?

He used ramfs for some of this memory. As a result some memory became
unreclaimable but it was put on the LRU. Zone reclaim understood that
as reclaimable memory since unmapped file backed pages were on the LRU and 
scanned for them.

> > During that time we cannot allocate from a zone 
> > which typically makes a vital zone or a node unusuable.
> 
> Of course you can't - there are no free pages and none are reclaimable.

There may be free pages that we cannot get to because too many 
unreclaimable pages have to be scanned until we get there.

> > In a NUMA 
> > configuration performance degrades in unacceptable ways.
> 
> No it won't - you must be referring to something else, or speculating.

Sorry, that occurs at SGI, typically when a customer uses XPMEM to pin too 
many pages in a zone.

> > The all_reclaimable logic is different. It was never been designed to 
> > remove the unreclaimable pages.
> 
> Of course not.  But I don't know how you can be proposing solutions
> without yet knowing what the problem is.

We know the problem and have seen it repeatedly.

> The first thing Kiran should have done was to gather a kernel profile.  If
> we're spending a lot (proably half) of time in shrink_active_lsit() then
> yeah, that's a plausible theory.

Well that is what the traces show here in these scenarios. I have never
seen it in zone_reclaim (guess we do not use ramfs that warps the 
counters)

> And yes, keeping these pages off the LRU does make sense, and it heaps
> easier to handle than mlocked pages.

I think this is pretty straightforward.

> The _theory_ here is that a large number (but not all) of the pages
> in the zone are in ramfs and so page reclaim is making some progress,
> but reclaim efficiency is low, hence there is high CPU consumption.

No theory. The problem here is that the VM counters are off. RAMFS puts 
unmapped pages on the LRU that are not reclaimable, and zone reclaim 
will continually run to get rid of these pages, counting on the ability to 
throw out unmapped pages. 

What are these pages doing on the LRU if they cannot be reclaimed anyways? 
There is no point on putting them on it in the first place.

> OK, plausible.  But where's the *proof*?  We probably already have 
> sufficient statistics to be able to prove this.

Rik has shown this repeatedly. You want metaphysical certainty?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  6:09                           ` Christoph Lameter
@ 2007-07-31  6:18                             ` Andrew Morton
  2007-07-31 19:35                               ` Christoph Lameter
  0 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-07-31  6:18 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007 23:09:09 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:

> > OK, plausible.  But where's the *proof*?  We probably already have 
> > sufficient statistics to be able to prove this.
> 
> Rik has shown this repeatedly.

url?

> You want metaphysical certainty?

I want sufficient analysis of this particular problem to know that
we're fixing the right thing, and in the most appropriate fashion.

As is usual when a bug report starts with the text "patch", this
is like pulling teeth.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  5:00                   ` Christoph Lameter
  2007-07-31  5:17                     ` Andrew Morton
@ 2007-07-31  7:15                     ` Ravikiran G Thirumalai
  2007-07-31 19:18                       ` Christoph Lameter
  1 sibling, 1 reply; 34+ messages in thread
From: Ravikiran G Thirumalai @ 2007-07-31  7:15 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrew Morton, linux-mm, shai

On Mon, Jul 30, 2007 at 10:00:15PM -0700, Christoph Lameter wrote:
>On Mon, 30 Jul 2007, Andrew Morton wrote:
>
>> On Mon, 30 Jul 2007 19:36:04 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:
>> 
>> > On Mon, 30 Jul 2007, Andrew Morton wrote:
>> > 
>> > > That makes sense, but any fix we do here won't fix things for regular
>> > > reclaim.
>> > 
>> > Standard reclaim has the same issues. It uselessly keeps 
>> > scanning the unreclaimable file backed pages.
>> 
>> Well it shouldn't.  That's what all_unreclaimable is for.  And it does
>> work.  Or used to, five years ago.  Stuff like this has a habit of breaking
>> because we don't have a test suite.
>
>The current VM has never been able to handle it since we have never had 
>logic to remove unreclaimable pages from the LRU.
>
>Lets bring up the patchsets for the handling of unreclaimable pages up 
>again (mlocked and anonymous/no swap) again and make sure that it also 
>addresses the issue issue here so that we have a comprehensive solution.
>
>I am going over my old patchsets anyways. Kiran: Did you have a look at 
>the patches Nick and I did earlier this year for mlocked pages?

Yes.  I guess it is good to move unreclaimable pages off LRU.  But we still
need to not get into reclaim when we don't have pages to reclaim.  That is,
fix the arithmetic here.  No?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  5:58                         ` Andrew Morton
  2007-07-31  6:09                           ` Christoph Lameter
@ 2007-07-31  8:27                           ` Ravikiran G Thirumalai
  2007-07-31  8:35                             ` Andrew Morton
  2007-07-31 19:20                             ` Christoph Lameter
  1 sibling, 2 replies; 34+ messages in thread
From: Ravikiran G Thirumalai @ 2007-07-31  8:27 UTC (permalink / raw)
  To: Andrew Morton, clameter; +Cc: linux-mm, shai

On Mon, Jul 30, 2007 at 10:58:09PM -0700, Andrew Morton wrote:
>On Mon, 30 Jul 2007 22:33:03 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:
>
>
>Of course not.  But I don't know how you can be proposing solutions
>without yet knowing what the problem is.
>
>The first thing Kiran should have done was to gather a kernel profile.  If
>we're spending a lot (proably half) of time in shrink_active_lsit() then
>yeah, that's a plausible theory.

Well, we have used RAMFS with 2.6.17 kernels with reasonable performance.
What we saw here was a regression from earlier behavior.  2.6.17 never went
into reclaim with this kind of workload:

Quote 2.6.17

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
        cpumask_t mask;
        int node_id;

        /*
         * Do not reclaim if there was a recent unsuccessful attempt at zone
         * reclaim.  In that case we let allocations go off node for the
         * zone_reclaim_interval.  Otherwise we would scan for each off-node
         * page allocation.
         */
        if (time_before(jiffies,
                zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
                        return 0;


>From what I can see with .21 and .22, going into reclaim is a problem rather
than reclaim efficiency itself. Sure, if unreclaimable pages are not on LRU
it would be good, but the main problem for my narrow eyes is going into
reclaim when there are no reclaimable pages, and the fact that benchmark
works as expected with the fixed arithmetic reinforces that impression.

What am I missing?

Thanks,
Kiran

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  8:27                           ` Ravikiran G Thirumalai
@ 2007-07-31  8:35                             ` Andrew Morton
  2007-07-31 19:30                               ` Christoph Lameter
  2007-07-31 19:20                             ` Christoph Lameter
  1 sibling, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-07-31  8:35 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: clameter, linux-mm, shai

On Tue, 31 Jul 2007 01:27:51 -0700 Ravikiran G Thirumalai <kiran@scalex86.org> wrote:

> >From what I can see with .21 and .22, going into reclaim is a problem rather
> than reclaim efficiency itself. Sure, if unreclaimable pages are not on LRU
> it would be good, but the main problem for my narrow eyes is going into
> reclaim when there are no reclaimable pages, and the fact that benchmark
> works as expected with the fixed arithmetic reinforces that impression.
> 
> What am I missing?

The fact is that if there are "no reclaimable pages" then the all_unreclaimable
logic should kick in and fix the problem.

Except zone_reclaim() fails to implement it.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  7:15                     ` Ravikiran G Thirumalai
@ 2007-07-31 19:18                       ` Christoph Lameter
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31 19:18 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: Andrew Morton, linux-mm, shai

On Tue, 31 Jul 2007, Ravikiran G Thirumalai wrote:

> >I am going over my old patchsets anyways. Kiran: Did you have a look at 
> >the patches Nick and I did earlier this year for mlocked pages?
> 
> Yes.  I guess it is good to move unreclaimable pages off LRU.  But we still
> need to not get into reclaim when we don't have pages to reclaim.  That is,
> fix the arithmetic here.  No?

The arithmetic will be fixed automatically if these pages do not end up
on the LRU. 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  8:27                           ` Ravikiran G Thirumalai
  2007-07-31  8:35                             ` Andrew Morton
@ 2007-07-31 19:20                             ` Christoph Lameter
  1 sibling, 0 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31 19:20 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: Andrew Morton, linux-mm, shai

On Tue, 31 Jul 2007, Ravikiran G Thirumalai wrote:

> Well, we have used RAMFS with 2.6.17 kernels with reasonable performance.
> What we saw here was a regression from earlier behavior.  2.6.17 never went
> into reclaim with this kind of workload:

2.6.17 had a time based heuristic. It would frequently needlessly scan for 
pages. The VM counters enabled the determination of unmapped pages which 
allowed the determination if it makes sense to scan without timeout.

> From what I can see with .21 and .22, going into reclaim is a problem rather
> than reclaim efficiency itself. Sure, if unreclaimable pages are not on LRU
> it would be good, but the main problem for my narrow eyes is going into
> reclaim when there are no reclaimable pages, and the fact that benchmark
> works as expected with the fixed arithmetic reinforces that impression.

The problem is that zone reclaim assumes unmapped pagecache pages are 
easily reclaimable. That is the only thing that zone reclaim is after. All 
other reclaim happens in regular reclaim.
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  8:35                             ` Andrew Morton
@ 2007-07-31 19:30                               ` Christoph Lameter
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31 19:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Tue, 31 Jul 2007, Andrew Morton wrote:

> On Tue, 31 Jul 2007 01:27:51 -0700 Ravikiran G Thirumalai <kiran@scalex86.org> wrote:
> 
> > From what I can see with .21 and .22, going into reclaim is a problem rather
> > than reclaim efficiency itself. Sure, if unreclaimable pages are not on LRU
> > it would be good, but the main problem for my narrow eyes is going into
> > reclaim when there are no reclaimable pages, and the fact that benchmark
> > works as expected with the fixed arithmetic reinforces that impression.
> > 
> > What am I missing?
> 
> The fact that if there are "no reclaimable pages" then the all_unreclaimable
> logic should kick in and fix the problem.
> 
> Except zone_reclaim() fails to implement it.

It would be easy to implement. Just set a flag when we fail to reclaim. 
But this will result in the same deadbeat behavior as regular reclaim.

If the unmapped pages turn out to be unreclaimable then we essentially 
switch off zone reclaim and do small attempts at reclaim until we are 
successful. This may take a long time and we may be unsuccessful in 
detecting unmapped pages that become reclaimable.

Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h	2007-07-31 12:25:18.000000000 -0700
+++ linux-2.6/include/linux/mmzone.h	2007-07-31 12:25:41.000000000 -0700
@@ -234,6 +234,7 @@ struct zone {
 	unsigned long		nr_scan_inactive;
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
+	int			unmapped_unreclaimable;	/* Unmapped pages are unreclaimable */
 
 	/* A count of how many reclaimers are scanning this zone */
 	atomic_t		reclaim_in_progress;
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2007-07-31 12:21:23.000000000 -0700
+++ linux-2.6/mm/vmscan.c	2007-07-31 12:29:27.000000000 -0700
@@ -1759,7 +1759,10 @@ static int __zone_reclaim(struct zone *z
 			note_zone_scanning_priority(zone, priority);
 			nr_reclaimed += shrink_zone(priority, zone, &sc);
 			priority--;
-		} while (priority >= 0 && nr_reclaimed < nr_pages);
+		} while (priority >= 0 && nr_reclaimed < nr_pages &&
+			!zone->unmapped_unreclaimable);
+
+		zone->unmapped_reclaimable = nr_reclaimed > 0;
 	}
 
 	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31  6:18                             ` Andrew Morton
@ 2007-07-31 19:35                               ` Christoph Lameter
  2007-07-31 19:46                                 ` Andrew Morton
  0 siblings, 1 reply; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31 19:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Mon, 30 Jul 2007, Andrew Morton wrote:

> On Mon, 30 Jul 2007 23:09:09 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:
> 
> > > OK, plausible.  But where's the *proof*?  We probably already have 
> > > sufficient statistics to be able to prove this.
> > 
> > Rik has shown this repeatedly.
> 
> url?

F.e.

http://linux-mm.org/ProblemWorkloads

http://lwn.net/Articles/224850/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31 19:35                               ` Christoph Lameter
@ 2007-07-31 19:46                                 ` Andrew Morton
  2007-07-31 19:50                                   ` Christoph Lameter
  0 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2007-07-31 19:46 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Tue, 31 Jul 2007 12:35:03 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:

> On Mon, 30 Jul 2007, Andrew Morton wrote:
> 
> > On Mon, 30 Jul 2007 23:09:09 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:
> > 
> > > > OK, plausible.  But where's the *proof*?  We probably already have 
> > > > sufficient statistics to be able to prove this.
> > > 
> > > Rik has shown this repeatedly.
> > 
> > url?
> 
> F.e.
> 
> http://linux-mm.org/ProblemWorkloads
> 
> http://lwn.net/Articles/224850/

They're different from Kiran's problem.  Not specifically ramfs and
zone-reclaim isn't (obviously) involved.  Yes, the solution is probably the
same one, but it'd be sad to "fix" Kiran's problem via finer-grained zone
accounting while leaving an undiscovered bug behind.

If we're going further down that path we should aim at removing the
all_unreclaimable logic completely.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [rfc] [patch] mm: zone_reclaim fix for pseudo file systems
  2007-07-31 19:46                                 ` Andrew Morton
@ 2007-07-31 19:50                                   ` Christoph Lameter
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Lameter @ 2007-07-31 19:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-mm, shai

On Tue, 31 Jul 2007, Andrew Morton wrote:

> They're different from Kiran's problem.  Not specifically ramfs and
> zone-reclaim isn't (obviously) involved.  Yes, the solution is probably the
> same one, but it'd be sad to "fix" Kiran's problem via finer-grained zone
> accounting while leaving an undiscovered bug behind.

Zone reclaim would not occur if the counters accurately described 
the unmapped pagecache pages that are presumably very easy to reclaim. 
Zone reclaim is not a full reclaim implementation. It's just superficial 
removal of easy to get pages.

> If we're going further down that path we should aim at removing the
> all_unreclaimable logic completely.

I think that is doable if we account for the unreclaimable pages and move 
them off the LRU.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2007-07-31 19:50 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-07-27 23:27 [rfc] [patch] mm: zone_reclaim fix for pseudo file systems Ravikiran G Thirumalai
2007-07-30 18:12 ` Christoph Lameter
2007-07-30 20:23 ` Andrew Morton
2007-07-30 20:31   ` Christoph Lameter
2007-07-30 21:12     ` Lee Schermerhorn
2007-07-31  0:01   ` Ravikiran G Thirumalai
2007-07-31  0:20     ` Andrew Morton
2007-07-31  0:27       ` Christoph Lameter
2007-07-31  1:06         ` Andrew Morton
2007-07-31  1:52           ` Christoph Lameter
2007-07-31  1:56         ` Ravikiran G Thirumalai
2007-07-31  2:01           ` Christoph Lameter
2007-07-31  2:27             ` Andrew Morton
2007-07-31  2:36               ` Christoph Lameter
2007-07-31  4:47                 ` Andrew Morton
2007-07-31  5:00                   ` Christoph Lameter
2007-07-31  5:17                     ` Andrew Morton
2007-07-31  5:33                       ` Christoph Lameter
2007-07-31  5:58                         ` Andrew Morton
2007-07-31  6:09                           ` Christoph Lameter
2007-07-31  6:18                             ` Andrew Morton
2007-07-31 19:35                               ` Christoph Lameter
2007-07-31 19:46                                 ` Andrew Morton
2007-07-31 19:50                                   ` Christoph Lameter
2007-07-31  8:27                           ` Ravikiran G Thirumalai
2007-07-31  8:35                             ` Andrew Morton
2007-07-31 19:30                               ` Christoph Lameter
2007-07-31 19:20                             ` Christoph Lameter
2007-07-31  7:15                     ` Ravikiran G Thirumalai
2007-07-31 19:18                       ` Christoph Lameter
2007-07-31  1:36       ` Ravikiran G Thirumalai
2007-07-31  1:53         ` Andrew Morton
2007-07-31  1:56           ` Christoph Lameter
2007-07-31  2:19 ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox