* [patch 01/10] SLUB: add support for kmem_cache_ops
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
@ 2007-05-18 18:10 ` clameter
2007-05-19 12:53 ` Pekka Enberg
2007-05-18 18:10 ` [patch 02/10] SLUB: slab defragmentation and kmem_cache_vacate clameter
` (10 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: kmem_cache_ops --]
[-- Type: text/plain, Size: 8349 bytes --]
We use the parameter formerly occupied by the destructor to pass an optional
pointer to a kmem_cache_ops structure to kmem_cache_create().
kmem_cache_ops starts out empty. Later patches populate it.
Create a KMEM_CACHE_OPS macro that allows a kmem_cache_ops structure to be
specified.
Code to handle kmem_cache_ops is added to SLUB. SLAB and SLOB are updated
to accept a kmem_cache_ops structure but will ignore it.
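For illustration (not part of this patch), a cache that wants to supply ops
would then be set up roughly as follows. The struct and ops names are made up,
and the ops structure is still empty at this point in the series:

	#include <linux/list.h>
	#include <linux/slab.h>

	struct example_object {
		struct list_head list;
		int data;
	};

	static const struct kmem_cache_ops example_ops = {
		/* .get and .kick callbacks are introduced by later patches */
	};

	static struct kmem_cache *example_cachep;

	static void example_cache_init(void)
	{
		example_cachep = KMEM_CACHE_OPS(example_object,
				SLAB_RECLAIM_ACCOUNT, &example_ops);
	}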
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
include/linux/slab.h | 13 +++++++++----
include/linux/slub_def.h | 1 +
mm/slab.c | 6 +++---
mm/slob.c | 2 +-
mm/slub.c | 44 ++++++++++++++++++++++++++++++--------------
5 files changed, 44 insertions(+), 22 deletions(-)
Index: slub/include/linux/slab.h
===================================================================
--- slub.orig/include/linux/slab.h 2007-05-15 21:19:51.000000000 -0700
+++ slub/include/linux/slab.h 2007-05-15 21:27:07.000000000 -0700
@@ -38,10 +38,13 @@ typedef struct kmem_cache kmem_cache_t _
void __init kmem_cache_init(void);
int slab_is_available(void);
+struct kmem_cache_ops {
+};
+
struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
unsigned long,
void (*)(void *, struct kmem_cache *, unsigned long),
- void (*)(void *, struct kmem_cache *, unsigned long));
+ const struct kmem_cache_ops *s);
void kmem_cache_destroy(struct kmem_cache *);
int kmem_cache_shrink(struct kmem_cache *);
void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
@@ -59,9 +62,11 @@ int kmem_ptr_validate(struct kmem_cache
* f.e. add ____cacheline_aligned_in_smp to the struct declaration
* then the objects will be properly aligned in SMP configurations.
*/
-#define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
- sizeof(struct __struct), __alignof__(struct __struct),\
- (__flags), NULL, NULL)
+#define KMEM_CACHE_OPS(__struct, __flags, __ops) \
+ kmem_cache_create(#__struct, sizeof(struct __struct), \
+ __alignof__(struct __struct), (__flags), NULL, (__ops))
+
+#define KMEM_CACHE(__struct, __flags) KMEM_CACHE_OPS(__struct, __flags, NULL)
#ifdef CONFIG_NUMA
extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c 2007-05-15 21:25:46.000000000 -0700
+++ slub/mm/slub.c 2007-05-15 21:29:36.000000000 -0700
@@ -294,6 +294,9 @@ static inline int check_valid_pointer(st
return 1;
}
+struct kmem_cache_ops slub_default_ops = {
+};
+
/*
* Slow version of get and set free pointer.
*
@@ -2003,11 +2006,13 @@ static int calculate_sizes(struct kmem_c
static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
const char *name, size_t size,
size_t align, unsigned long flags,
- void (*ctor)(void *, struct kmem_cache *, unsigned long))
+ void (*ctor)(void *, struct kmem_cache *, unsigned long),
+ const struct kmem_cache_ops *ops)
{
memset(s, 0, kmem_size);
s->name = name;
s->ctor = ctor;
+ s->ops = ops;
s->objsize = size;
s->flags = flags;
s->align = align;
@@ -2191,7 +2196,7 @@ static struct kmem_cache *create_kmalloc
down_write(&slub_lock);
if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
- flags, NULL))
+ flags, NULL, &slub_default_ops))
goto panic;
list_add(&s->list, &slab_caches);
@@ -2505,12 +2510,16 @@ static int slab_unmergeable(struct kmem_
if (s->ctor)
return 1;
+ if (s->ops != &slub_default_ops)
+ return 1;
+
return 0;
}
static struct kmem_cache *find_mergeable(size_t size,
size_t align, unsigned long flags,
- void (*ctor)(void *, struct kmem_cache *, unsigned long))
+ void (*ctor)(void *, struct kmem_cache *, unsigned long),
+ const struct kmem_cache_ops *ops)
{
struct list_head *h;
@@ -2520,6 +2529,9 @@ static struct kmem_cache *find_mergeable
if (ctor)
return NULL;
+ if (ops != &slub_default_ops)
+ return NULL;
+
size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
@@ -2555,13 +2567,15 @@ static struct kmem_cache *find_mergeable
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags,
void (*ctor)(void *, struct kmem_cache *, unsigned long),
- void (*dtor)(void *, struct kmem_cache *, unsigned long))
+ const struct kmem_cache_ops *ops)
{
struct kmem_cache *s;
- BUG_ON(dtor);
+ if (!ops)
+ ops = &slub_default_ops;
+
down_write(&slub_lock);
- s = find_mergeable(size, align, flags, ctor);
+ s = find_mergeable(size, align, flags, ctor, ops);
if (s) {
s->refcount++;
/*
@@ -2575,7 +2589,7 @@ struct kmem_cache *kmem_cache_create(con
} else {
s = kmalloc(kmem_size, GFP_KERNEL);
if (s && kmem_cache_open(s, GFP_KERNEL, name,
- size, align, flags, ctor)) {
+ size, align, flags, ctor, ops)) {
if (sysfs_slab_add(s)) {
kfree(s);
goto err;
@@ -3206,16 +3220,18 @@ static ssize_t order_show(struct kmem_ca
}
SLAB_ATTR_RO(order);
-static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+static ssize_t ops_show(struct kmem_cache *s, char *buf)
{
- if (s->ctor) {
- int n = sprint_symbol(buf, (unsigned long)s->ctor);
+ int x = 0;
- return n + sprintf(buf + n, "\n");
+ if (s->ctor) {
+ x += sprintf(buf + x, "ctor : ");
+ x += sprint_symbol(buf + x, (unsigned long)s->ctor);
+ x += sprintf(buf + x, "\n");
}
- return 0;
+ return x;
}
-SLAB_ATTR_RO(ctor);
+SLAB_ATTR_RO(ops);
static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
@@ -3447,7 +3463,7 @@ static struct attribute * slab_attrs[] =
&slabs_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
- &ctor_attr.attr,
+ &ops_attr.attr,
&aliases_attr.attr,
&align_attr.attr,
&sanity_checks_attr.attr,
Index: slub/include/linux/slub_def.h
===================================================================
--- slub.orig/include/linux/slub_def.h 2007-05-15 21:21:27.000000000 -0700
+++ slub/include/linux/slub_def.h 2007-05-15 21:26:13.000000000 -0700
@@ -40,6 +40,7 @@ struct kmem_cache {
int objects; /* Number of objects in slab */
int refcount; /* Refcount for slab cache destroy */
void (*ctor)(void *, struct kmem_cache *, unsigned long);
+ const struct kmem_cache_ops *ops;
int inuse; /* Offset to metadata */
int align; /* Alignment */
const char *name; /* Name (only for display!) */
Index: slub/mm/slab.c
===================================================================
--- slub.orig/mm/slab.c 2007-05-15 21:19:51.000000000 -0700
+++ slub/mm/slab.c 2007-05-15 21:26:13.000000000 -0700
@@ -2100,7 +2100,7 @@ static int setup_cpu_cache(struct kmem_c
* @align: The required alignment for the objects.
* @flags: SLAB flags
* @ctor: A constructor for the objects.
- * @dtor: A destructor for the objects (not implemented anymore).
+ * @ops: A kmem_cache_ops structure (ignored).
*
* Returns a ptr to the cache on success, NULL on failure.
* Cannot be called within a int, but can be interrupted.
@@ -2126,7 +2126,7 @@ struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
unsigned long flags,
void (*ctor)(void*, struct kmem_cache *, unsigned long),
- void (*dtor)(void*, struct kmem_cache *, unsigned long))
+ const struct kmem_cache_ops *ops)
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
@@ -2135,7 +2135,7 @@ kmem_cache_create (const char *name, siz
* Sanity checks... these are all serious usage bugs.
*/
if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
- size > KMALLOC_MAX_SIZE || dtor) {
+ size > KMALLOC_MAX_SIZE) {
printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
name);
BUG();
Index: slub/mm/slob.c
===================================================================
--- slub.orig/mm/slob.c 2007-05-15 21:17:15.000000000 -0700
+++ slub/mm/slob.c 2007-05-15 21:28:06.000000000 -0700
@@ -285,7 +285,7 @@ struct kmem_cache {
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags,
void (*ctor)(void*, struct kmem_cache *, unsigned long),
- void (*dtor)(void*, struct kmem_cache *, unsigned long))
+ const struct kmem_cache_ops *o)
{
struct kmem_cache *c;
--
* Re: [patch 01/10] SLUB: add support for kmem_cache_ops
2007-05-18 18:10 ` [patch 01/10] SLUB: add support for kmem_cache_ops clameter
@ 2007-05-19 12:53 ` Pekka Enberg
2007-05-19 18:19 ` Christoph Lameter
0 siblings, 1 reply; 24+ messages in thread
From: Pekka Enberg @ 2007-05-19 12:53 UTC (permalink / raw)
To: clameter; +Cc: akpm, linux-kernel, linux-mm, dgc, Hugh Dickins
On 5/18/07, clameter@sgi.com <clameter@sgi.com> wrote:
> kmem_cache_ops is created as empty. Later patches populate kmem_cache_ops.
Hmm, would make more sense to me to move "ctor" in kmem_cache_ops in
this patch and not make kmem_cache_create() take both as parameters...
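(For illustration only, that alternative would look roughly like the sketch
below; it is not what this patch implements:)

	struct kmem_cache_ops {
		void (*ctor)(void *, struct kmem_cache *, unsigned long);
	};

	struct kmem_cache *kmem_cache_create(const char *name, size_t size,
			size_t align, unsigned long flags,
			const struct kmem_cache_ops *ops);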
* Re: [patch 01/10] SLUB: add support for kmem_cache_ops
2007-05-19 12:53 ` Pekka Enberg
@ 2007-05-19 18:19 ` Christoph Lameter
2007-05-20 21:16 ` Pekka Enberg
0 siblings, 1 reply; 24+ messages in thread
From: Christoph Lameter @ 2007-05-19 18:19 UTC (permalink / raw)
To: Pekka Enberg; +Cc: akpm, linux-kernel, linux-mm, dgc, Hugh Dickins
On Sat, 19 May 2007, Pekka Enberg wrote:
> On 5/18/07, clameter@sgi.com <clameter@sgi.com> wrote:
> > kmem_cache_ops is created as empty. Later patches populate kmem_cache_ops.
>
> Hmm, would make more sense to me to move "ctor" in kmem_cache_ops in
> this patch and not make kmem_cache_create() take both as parameters...
Yeah, earlier versions did this, but then I would have to do a patch that
changes all destructors and all kmem_cache_create() calls in the kernel.
* Re: [patch 01/10] SLUB: add support for kmem_cache_ops
2007-05-19 18:19 ` Christoph Lameter
@ 2007-05-20 21:16 ` Pekka Enberg
0 siblings, 0 replies; 24+ messages in thread
From: Pekka Enberg @ 2007-05-20 21:16 UTC (permalink / raw)
To: Christoph Lameter; +Cc: akpm, linux-kernel, linux-mm, dgc, Hugh Dickins
Christoph Lameter wrote:
> Yeah earlier versions did this but then I have to do a patch that changes
> all destructors and all kmem_cache_create calls in the kernel.
Yes, please ;-)
* [patch 02/10] SLUB: slab defragmentation and kmem_cache_vacate
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
2007-05-18 18:10 ` [patch 01/10] SLUB: add support for kmem_cache_ops clameter
@ 2007-05-18 18:10 ` clameter
2007-05-21 14:10 ` Mel Gorman
2007-05-18 18:10 ` [patch 03/10] Dentry defragmentation clameter
` (9 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: get_ref_kick --]
[-- Type: text/plain, Size: 14687 bytes --]
Slab defragmentation occurs when slabs are shrunk (after the inode and dentry
shrinkers have been run from the reclaim code) or when a manual shrink is
requested via slabinfo. During the shrink operation SLUB generates a list of
partially populated slabs sorted by the number of objects in use.
We take pages off that list that are less than a quarter full and attempt to
motivate the users of those slabs to either remove or move the objects.
Targeted reclaim allows a single slab page to be targeted for reclaim. This is
done by calling
	kmem_cache_vacate(page);
It returns 1 on success and 0 if the operation failed.
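A hypothetical caller in targeted reclaim code could use it roughly like this
(the helper below is illustrative and not part of the patch):

	/*
	 * Try to empty the slab backing @page so that the page itself can
	 * be freed or migrated. Returns 1 if the slab page was vacated.
	 */
	static int try_to_vacate_slab_page(struct page *page)
	{
		if (!PageSlab(page))
			return 0;
		return kmem_cache_vacate(page);
	}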
In order for a slab cache to support defragmentation, two functions must be
defined via kmem_cache_ops. These are
void *get(struct kmem_cache *s, int nr, void **objects)
Must obtain a reference to the listed objects. SLUB guarantees that
the objects are still allocated. However, other threads may be blocked
in slab_free attempting to free objects in the slab. These may succeed
as soon as get() returns to the slab allocator. The function must
be able to detect this situation and abandon the handling of such
objects (for example by clearing the corresponding entry in the objects
array).
No slab operations may be performed in get(). Interrupts
are disabled. What can be done is very limited. The slab lock
for the page with the object is taken. Any attempt to perform a slab
operation may lead to a deadlock.
get() returns a private pointer that is passed to kick(). Should we
be unable to obtain all references, that pointer may indicate
to the kick() function that it should not attempt any object removal
or move but simply drop the references that were obtained.
void kick(struct kmem_cache *, int nr, void **objects, void *get_result)
After SLUB has established references to the objects in a
slab it will drop all locks and then use kick() to move objects out
of the slab. The existence of the object is guaranteed by virtue of
the earlier obtained references via get(). The callback may perform
any slab operation since no locks are held at the time of call.
The callback should remove the object from the slab in some way. This
may be accomplished by reclaiming the object and then running
kmem_cache_free(), or by moving it to a newly allocated object and then
running kmem_cache_free() on the old one. Reallocation is advantageous
because the partial slabs were just sorted to put those with the most
objects in use first, so an allocation is likely to fill up a slab and
allow it to be removed from the partial list.
Kick() does not return a result. SLUB will check the number of
remaining objects in the slab. If all objects were removed then
we know that the operation was successful.
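As a purely illustrative sketch (the object type, lock, and refcount scheme
below are made up; the dentry and inode patches later in this series show
real implementations), the two callbacks for a cache of refcounted objects
might look like this:

	/* Toy object: idle objects sit in the cache with refcount 0. */
	struct foo {
		atomic_t refcount;
		int dying;		/* set while the object is being freed */
	};

	static DEFINE_SPINLOCK(foo_lock);	/* protects refcount/dying transitions */

	static void *foo_get(struct kmem_cache *s, int nr, void **v)
	{
		int i;

		/* Slab lock is held, interrupts are off: only take our own lock. */
		spin_lock(&foo_lock);
		for (i = 0; i < nr; i++) {
			struct foo *foo = v[i];

			if (foo->dying)
				v[i] = NULL;		/* tell kick() to skip it */
			else
				atomic_inc(&foo->refcount);
		}
		spin_unlock(&foo_lock);
		return NULL;			/* private pointer passed to kick() */
	}

	static void foo_kick(struct kmem_cache *s, int nr, void **v, void *private)
	{
		int i;

		for (i = 0; i < nr; i++) {
			struct foo *foo = v[i];

			if (!foo)
				continue;
			/* No locks held: any slab operation is allowed here. */
			if (atomic_dec_and_test(&foo->refcount))
				kmem_cache_free(s, foo);
		}
	}

	static const struct kmem_cache_ops foo_kmem_cache_ops = {
		.get	= foo_get,
		.kick	= foo_kick,
	};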
If kmem_cache_vacate() on a page fails then the slab usually has a pretty
low usage ratio. In that case we go through the slab and resequence the
freelist so that object addresses increase as objects are allocated. This
will trigger the cacheline prefetcher when we start allocating from the
slab again and thereby increase allocation speed.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
include/linux/slab.h | 31 +++++
mm/slab.c | 9 +
mm/slob.c | 9 +
mm/slub.c | 264 +++++++++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 303 insertions(+), 10 deletions(-)
Index: slub/include/linux/slab.h
===================================================================
--- slub.orig/include/linux/slab.h 2007-05-18 00:13:39.000000000 -0700
+++ slub/include/linux/slab.h 2007-05-18 00:13:40.000000000 -0700
@@ -39,6 +39,36 @@ void __init kmem_cache_init(void);
int slab_is_available(void);
struct kmem_cache_ops {
+ /*
+ * Called with slab lock held and interrupts disabled.
+ * No slab operation may be performed.
+ *
+ * Parameters passed are the number of objects to process
+ * and a an array of pointers to objects for which we
+ * need references.
+ *
+ * Returns a pointer that is passed to the kick function.
+ * If all objects cannot be moved then the pointer may
+ * indicate that this wont work and then kick can simply
+ * remove the references that were already obtained.
+ *
+ * The array passed to get() is also passed to kick(). The
+ * function may remove objects by setting array elements to NULL.
+ */
+ void *(*get)(struct kmem_cache *, int nr, void **);
+
+ /*
+ * Called with no locks held and interrupts enabled.
+ * Any operation may be performed in kick().
+ *
+ * Parameters passed are the number of objects in the array,
+ * the array of pointers to the objects and the pointer
+ * returned by get().
+ *
+ * Success is checked by examining the number of remaining
+ * objects in the slab.
+ */
+ void (*kick)(struct kmem_cache *, int nr, void **, void *private);
};
struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
@@ -53,6 +83,7 @@ void kmem_cache_free(struct kmem_cache *
unsigned int kmem_cache_size(struct kmem_cache *);
const char *kmem_cache_name(struct kmem_cache *);
int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
+int kmem_cache_vacate(struct page *);
/*
* Please use this macro to create slab caches. Simply specify the
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c 2007-05-18 00:13:39.000000000 -0700
+++ slub/mm/slub.c 2007-05-18 09:55:47.000000000 -0700
@@ -1043,12 +1043,11 @@ static struct page *new_slab(struct kmem
n = get_node(s, page_to_nid(page));
if (n)
atomic_long_inc(&n->nr_slabs);
+
+ page->inuse = 0;
+ page->lockless_freelist = NULL;
page->offset = s->offset / sizeof(void *);
page->slab = s;
- page->flags |= 1 << PG_slab;
- if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
- SLAB_STORE_USER | SLAB_TRACE))
- SetSlabDebug(page);
start = page_address(page);
end = start + s->objects * s->size;
@@ -1066,11 +1065,20 @@ static struct page *new_slab(struct kmem
set_freepointer(s, last, NULL);
page->freelist = start;
- page->lockless_freelist = NULL;
- page->inuse = 0;
-out:
- if (flags & __GFP_WAIT)
- local_irq_disable();
+
+ /*
+ * page->inuse must be 0 when PageSlab(page) becomes
+ * true so that defrag knows that this slab is not in use.
+ */
+ smp_wmb();
+ __SetPageSlab(page);
+ if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
+ SLAB_STORE_USER | SLAB_TRACE))
+ SetSlabDebug(page);
+
+ out:
+ if (flags & __GFP_WAIT)
+ local_irq_disable();
return page;
}
@@ -2323,6 +2331,191 @@ void kfree(const void *x)
EXPORT_SYMBOL(kfree);
/*
+ * Order the freelist so that addresses increase as object are allocated.
+ * This is useful to trigger the cpu cacheline prefetching logic.
+ */
+void resequence_freelist(struct kmem_cache *s, struct page *page)
+{
+ void *p;
+ void *last;
+ void *addr = page_address(page);
+ DECLARE_BITMAP(map, s->objects);
+
+ bitmap_zero(map, s->objects);
+
+ /* Figure out which objects are on the freelist */
+ for_each_free_object(p, s, page->freelist)
+ set_bit(slab_index(p, s, addr), map);
+
+ last = NULL;
+ for_each_object(p, s, addr)
+ if (test_bit(slab_index(p, s, addr), map)) {
+ if (last)
+ set_freepointer(s, last, p);
+ else
+ page->freelist = p;
+ last = p;
+ }
+
+ if (last)
+ set_freepointer(s, last, NULL);
+ else
+ page->freelist = NULL;
+}
+
+/*
+ * Vacate all objects in the given slab.
+ *
+ * Slab must be locked and frozen. Interrupts are disabled (flags must
+ * be passed).
+ *
+ * Will drop and regain and drop the slab lock. At the end the slab will
+ * either be freed or returned to the partial lists.
+ *
+ * Returns the number of remaining objects
+ */
+static int __kmem_cache_vacate(struct kmem_cache *s,
+ struct page *page, unsigned long flags, void **vector)
+{
+ void *p;
+ void *addr = page_address(page);
+ DECLARE_BITMAP(map, s->objects);
+ int leftover;
+ int objects;
+ void *private;
+
+ if (!page->inuse)
+ goto out;
+
+ /* Determine used objects */
+ bitmap_fill(map, s->objects);
+ for_each_free_object(p, s, page->freelist)
+ __clear_bit(slab_index(p, s, addr), map);
+
+ objects = 0;
+ memset(vector, 0, s->objects * sizeof(void **));
+ for_each_object(p, s, addr) {
+ if (test_bit(slab_index(p, s, addr), map))
+ vector[objects++] = p;
+ }
+
+ private = s->ops->get(s, objects, vector);
+
+ /*
+ * Got references. Now we can drop the slab lock. The slab
+ * is frozen so it cannot vanish from under us nor will
+ * allocations be performed on the slab. However, unlocking the
+ * slab will allow concurrent slab_frees to proceed.
+ */
+ slab_unlock(page);
+ local_irq_restore(flags);
+
+ /*
+ * Perform the KICK callbacks to remove the objects.
+ */
+ s->ops->kick(s, objects, vector, private);
+
+ local_irq_save(flags);
+ slab_lock(page);
+out:
+ /*
+ * Check the result and unfreeze the slab
+ */
+ leftover = page->inuse;
+ if (leftover > 0)
+ /*
+ * Cannot free. Lets at least optimize the freelist. We have
+ * likely touched all the cachelines with the free pointers
+ * already so it is cheap to do here.
+ */
+ resequence_freelist(s, page);
+ unfreeze_slab(s, page);
+ local_irq_restore(flags);
+ return leftover;
+}
+
+/*
+ * Get a page off a list and freeze it. Must be holding slab lock.
+ */
+static void freeze_from_list(struct kmem_cache *s, struct page *page)
+{
+ if (page->inuse < s->objects)
+ remove_partial(s, page);
+ else if (s->flags & SLAB_STORE_USER)
+ remove_full(s, page);
+ SetSlabFrozen(page);
+}
+
+/*
+ * Attempt to free objects in a page. Return 1 if succesful.
+ */
+int kmem_cache_vacate(struct page *page)
+{
+ unsigned long flags;
+ struct kmem_cache *s;
+ int vacated = 0;
+ void **vector = NULL;
+
+ /*
+ * Get a reference to the page. Return if its freed or being freed.
+ * This is necessary to make sure that the page does not vanish
+ * from under us before we are able to check the result.
+ */
+ if (!get_page_unless_zero(page))
+ return 0;
+
+ if (!PageSlab(page))
+ goto out;
+
+ s = page->slab;
+ if (!s)
+ goto out;
+
+ vector = kmalloc(s->objects * sizeof(void *), GFP_KERNEL);
+ if (!vector)
+ return 0;
+
+ local_irq_save(flags);
+ /*
+ * The implicit memory barrier in slab_lock guarantees that page->inuse
+ * is loaded after PageSlab(page) has been established to be true. This is
+ * only revelant for a newly created slab.
+ */
+ slab_lock(page);
+
+ /*
+ * We may now have locked a page that may be in various stages of
+ * being freed. If the PageSlab bit is off then we have already
+ * reached the page allocator. If page->inuse is zero then we are
+ * in SLUB but freeing or allocating the page.
+ * page->inuse is never modified without the slab lock held.
+ *
+ * Also abort if the page happens to be already frozen. If its
+ * frozen then a concurrent vacate may be in progress.
+ */
+ if (!PageSlab(page) || SlabFrozen(page) || !page->inuse)
+ goto out_locked;
+
+ /*
+ * We are holding a lock on a slab page and all operations on the
+ * slab are blocking.
+ */
+ if (!s->ops->get || !s->ops->kick)
+ goto out_locked;
+ freeze_from_list(s, page);
+ vacated = __kmem_cache_vacate(s, page, flags, vector) == 0;
+out:
+ put_page(page);
+ kfree(vector);
+ return vacated;
+out_locked:
+ slab_unlock(page);
+ local_irq_restore(flags);
+ goto out;
+
+}
+
+/*
* kmem_cache_shrink removes empty slabs from the partial lists and sorts
* the remaining slabs by the number of items in use. The slabs with the
* most items in use come first. New allocations will then fill those up
@@ -2337,11 +2530,12 @@ int kmem_cache_shrink(struct kmem_cache
int node;
int i;
struct kmem_cache_node *n;
- struct page *page;
+ struct page *page, *page2;
struct page *t;
struct list_head *slabs_by_inuse =
kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
unsigned long flags;
+ LIST_HEAD(zaplist);
if (!slabs_by_inuse)
return -ENOMEM;
@@ -2392,8 +2586,44 @@ int kmem_cache_shrink(struct kmem_cache
for (i = s->objects - 1; i >= 0; i--)
list_splice(slabs_by_inuse + i, n->partial.prev);
+ /*
+ * If we have no functions available to defragment the slabs
+ * then we are done.
+ */
+ if (!s->ops->get || !s->ops->kick)
+ goto out;
+
+ /* Take objects with just a few objects off the tail */
+ while (n->nr_partial > MAX_PARTIAL) {
+ page = container_of(n->partial.prev, struct page, lru);
+
+ /*
+ * We are holding the list_lock so we can only
+ * trylock the slab
+ */
+ if (page->inuse > s->objects / 4)
+ break;
+
+ if (!slab_trylock(page))
+ break;
+
+ list_move_tail(&page->lru, &zaplist);
+ n->nr_partial--;
+ SetSlabFrozen(page);
+ slab_unlock(page);
+ }
out:
spin_unlock_irqrestore(&n->list_lock, flags);
+
+ /* Now we can free objects in the slabs on the zaplist */
+ list_for_each_entry_safe(page, page2, &zaplist, lru) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ slab_lock(page);
+ __kmem_cache_vacate(s, page, flags,
+ (void **)slabs_by_inuse);
+ }
}
kfree(slabs_by_inuse);
@@ -3229,6 +3459,20 @@ static ssize_t ops_show(struct kmem_cach
x += sprint_symbol(buf + x, (unsigned long)s->ctor);
x += sprintf(buf + x, "\n");
}
+
+ if (s->ops->get) {
+ x += sprintf(buf + x, "get : ");
+ x += sprint_symbol(buf + x,
+ (unsigned long)s->ops->get);
+ x += sprintf(buf + x, "\n");
+ }
+
+ if (s->ops->kick) {
+ x += sprintf(buf + x, "kick : ");
+ x += sprint_symbol(buf + x,
+ (unsigned long)s->ops->kick);
+ x += sprintf(buf + x, "\n");
+ }
return x;
}
SLAB_ATTR_RO(ops);
Index: slub/mm/slab.c
===================================================================
--- slub.orig/mm/slab.c 2007-05-18 00:13:39.000000000 -0700
+++ slub/mm/slab.c 2007-05-18 00:13:40.000000000 -0700
@@ -2516,6 +2516,15 @@ int kmem_cache_shrink(struct kmem_cache
}
EXPORT_SYMBOL(kmem_cache_shrink);
+/*
+ * SLAB does not support slab defragmentation
+ */
+int kmem_cache_vacate(struct page *page)
+{
+ return 0;
+}
+EXPORT_SYMBOL(kmem_cache_vacate);
+
/**
* kmem_cache_destroy - delete a cache
* @cachep: the cache to destroy
Index: slub/mm/slob.c
===================================================================
--- slub.orig/mm/slob.c 2007-05-18 00:13:39.000000000 -0700
+++ slub/mm/slob.c 2007-05-18 00:13:40.000000000 -0700
@@ -394,6 +394,15 @@ int kmem_cache_shrink(struct kmem_cache
}
EXPORT_SYMBOL(kmem_cache_shrink);
+/*
+ * SLOB does not support slab defragmentation
+ */
+int kmem_cache_vacate(struct page *page)
+{
+ return 0;
+}
+EXPORT_SYMBOL(kmem_cache_vacate);
+
int kmem_ptr_validate(struct kmem_cache *a, const void *b)
{
return 0;
--
* Re: [patch 02/10] SLUB: slab defragmentation and kmem_cache_vacate
2007-05-18 18:10 ` [patch 02/10] SLUB: slab defragmentation and kmem_cache_vacate clameter
@ 2007-05-21 14:10 ` Mel Gorman
2007-05-21 17:01 ` Christoph Lameter
0 siblings, 1 reply; 24+ messages in thread
From: Mel Gorman @ 2007-05-21 14:10 UTC (permalink / raw)
To: clameter; +Cc: akpm, linux-kernel, linux-mm, dgc, Hugh Dickins
On (18/05/07 11:10), clameter@sgi.com didst pronounce:
> Slab defragmentation occurs when the slabs are shrunk (after inode, dentry
> shrinkers have been run from the reclaim code) or when a manual shrinking
> is requested via slabinfo. During the shrink operation SLUB will generate a
> list of partially populated slabs sorted by the number of objects in use.
>
> We extract pages off that list that are only filled less than a quarter and
> attempt to motivate the users of those slabs to either remove the objects
> or move the objects.
>
I know I brought up this "less than a quarter" thing before and I
haven't thought of a better alternative. However, it occurs to me that
shrink_slab() is called when there is awareness of a reclaim priority.
It may be worth passing that down so that the fraction of candidate
pages is calculated based on priority.
That said..... where is kmem_cache_shrink() ever called? The freeing of
slab pages seems to be indirect these days. Way back,
kmem_cache_shrink() used to be called directly but I'm not sure where it
happens now.
> Targeted reclaim allows to target a single slab for reclaim. This is done by
> calling
>
> kmem_cache_vacate(page);
>
> It will return 1 on success, 0 if the operation failed.
>
>
> In order for a slabcache to support defragmentation a couple of functions
> must be defined via kmem_cache_ops. These are
>
> void *get(struct kmem_cache *s, int nr, void **objects)
>
> Must obtain a reference to the listed objects. SLUB guarantees that
> the objects are still allocated. However, other threads may be blocked
> in slab_free attempting to free objects in the slab. These may succeed
> as soon as get() returns to the slab allocator. The function must
> be able to detect the situation and void the attempts to handle such
> objects (by for example voiding the corresponding entry in the objects
> array).
>
> No slab operations may be performed in get_reference(). Interrupts
> are disabled. What can be done is very limited. The slab lock
> for the page with the object is taken. Any attempt to perform a slab
> operation may lead to a deadlock.
>
> get() returns a private pointer that is passed to kick. Should we
> be unable to obtain all references then that pointer may indicate
> to the kick() function that it should not attempt any object removal
> or move but simply remove the reference counts.
>
Much clearer than before.
> void kick(struct kmem_cache *, int nr, void **objects, void *get_result)
>
> After SLUB has established references to the objects in a
> slab it will drop all locks and then use kick() to move objects out
> of the slab. The existence of the object is guaranteed by virtue of
> the earlier obtained references via get(). The callback may perform
> any slab operation since no locks are held at the time of call.
>
> The callback should remove the object from the slab in some way. This
> may be accomplished by reclaiming the object and then running
> kmem_cache_free() or reallocating it and then running
> kmem_cache_free(). Reallocation is advantageous because the partial
> slabs were just sorted to have the partial slabs with the most objects
> first. Allocation is likely to result in filling up a slab so that
> it can be removed from the partial list.
>
> Kick() does not return a result. SLUB will check the number of
> remaining objects in the slab. If all objects were removed then
> we know that the operation was successful.
>
Again, much clearer.
> If a kmem_cache_vacate on a page fails then the slab has usually a pretty
> low usage ratio. Go through the slab and resequence the freelist so that
> object addresses increase as we allocate objects. This will trigger the
> cacheline prefetcher when we start allocating from the slab again and
> thereby increase allocations speed.
>
Nice idea.
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
>
> ---
> include/linux/slab.h | 31 +++++
> mm/slab.c | 9 +
> mm/slob.c | 9 +
> mm/slub.c | 264 +++++++++++++++++++++++++++++++++++++++++++++++++--
> 4 files changed, 303 insertions(+), 10 deletions(-)
>
> Index: slub/include/linux/slab.h
> ===================================================================
> --- slub.orig/include/linux/slab.h 2007-05-18 00:13:39.000000000 -0700
> +++ slub/include/linux/slab.h 2007-05-18 00:13:40.000000000 -0700
> @@ -39,6 +39,36 @@ void __init kmem_cache_init(void);
> int slab_is_available(void);
>
> struct kmem_cache_ops {
> + /*
> + * Called with slab lock held and interrupts disabled.
> + * No slab operation may be performed.
> + *
> + * Parameters passed are the number of objects to process
> + * and a an array of pointers to objects for which we
> + * need references.
> + *
s/a an/an/
> + * Returns a pointer that is passed to the kick function.
> + * If all objects cannot be moved then the pointer may
> + * indicate that this wont work and then kick can simply
> + * remove the references that were already obtained.
> + *
> + * The array passed to get() is also passed to kick(). The
> + * function may remove objects by setting array elements to NULL.
> + */
> + void *(*get)(struct kmem_cache *, int nr, void **);
> +
> + /*
> + * Called with no locks held and interrupts enabled.
> + * Any operation may be performed in kick().
> + *
> + * Parameters passed are the number of objects in the array,
> + * the array of pointers to the objects and the pointer
> + * returned by get().
> + *
> + * Success is checked by examining the number of remaining
> + * objects in the slab.
> + */
> + void (*kick)(struct kmem_cache *, int nr, void **, void *private);
> };
>
> struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
> @@ -53,6 +83,7 @@ void kmem_cache_free(struct kmem_cache *
> unsigned int kmem_cache_size(struct kmem_cache *);
> const char *kmem_cache_name(struct kmem_cache *);
> int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
> +int kmem_cache_vacate(struct page *);
>
> /*
> * Please use this macro to create slab caches. Simply specify the
> Index: slub/mm/slub.c
> ===================================================================
> --- slub.orig/mm/slub.c 2007-05-18 00:13:39.000000000 -0700
> +++ slub/mm/slub.c 2007-05-18 09:55:47.000000000 -0700
> @@ -1043,12 +1043,11 @@ static struct page *new_slab(struct kmem
> n = get_node(s, page_to_nid(page));
> if (n)
> atomic_long_inc(&n->nr_slabs);
> +
> + page->inuse = 0;
> + page->lockless_freelist = NULL;
> page->offset = s->offset / sizeof(void *);
> page->slab = s;
> - page->flags |= 1 << PG_slab;
> - if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
> - SLAB_STORE_USER | SLAB_TRACE))
> - SetSlabDebug(page);
>
> start = page_address(page);
> end = start + s->objects * s->size;
> @@ -1066,11 +1065,20 @@ static struct page *new_slab(struct kmem
> set_freepointer(s, last, NULL);
>
> page->freelist = start;
> - page->lockless_freelist = NULL;
> - page->inuse = 0;
> -out:
> - if (flags & __GFP_WAIT)
> - local_irq_disable();
> +
> + /*
> + * page->inuse must be 0 when PageSlab(page) becomes
> + * true so that defrag knows that this slab is not in use.
> + */
> + smp_wmb();
> + __SetPageSlab(page);
> + if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
> + SLAB_STORE_USER | SLAB_TRACE))
> + SetSlabDebug(page);
> +
> + out:
> + if (flags & __GFP_WAIT)
> + local_irq_disable();
> return page;
> }
>
> @@ -2323,6 +2331,191 @@ void kfree(const void *x)
> EXPORT_SYMBOL(kfree);
>
> /*
> + * Order the freelist so that addresses increase as object are allocated.
> + * This is useful to trigger the cpu cacheline prefetching logic.
> + */
makes sense. However, it occurs to me that maybe this should be a
separate patch so it can be measured to be sure. It makes sense though.
> +void resequence_freelist(struct kmem_cache *s, struct page *page)
> +{
> + void *p;
> + void *last;
> + void *addr = page_address(page);
> + DECLARE_BITMAP(map, s->objects);
> +
> + bitmap_zero(map, s->objects);
> +
> + /* Figure out which objects are on the freelist */
> + for_each_free_object(p, s, page->freelist)
> + set_bit(slab_index(p, s, addr), map);
> +
> + last = NULL;
> + for_each_object(p, s, addr)
> + if (test_bit(slab_index(p, s, addr), map)) {
> + if (last)
> + set_freepointer(s, last, p);
> + else
> + page->freelist = p;
> + last = p;
> + }
> +
> + if (last)
> + set_freepointer(s, last, NULL);
> + else
> + page->freelist = NULL;
> +}
> +
> +/*
> + * Vacate all objects in the given slab.
> + *
> + * Slab must be locked and frozen. Interrupts are disabled (flags must
> + * be passed).
> + *
It may not hurt to have a VM_BUG_ON() if interrupts are still enabled when
this is called
> + * Will drop and regain and drop the slab lock. At the end the slab will
> + * either be freed or returned to the partial lists.
> + *
> + * Returns the number of remaining objects
> + */
> +static int __kmem_cache_vacate(struct kmem_cache *s,
> + struct page *page, unsigned long flags, void **vector)
> +{
> + void *p;
> + void *addr = page_address(page);
> + DECLARE_BITMAP(map, s->objects);
> + int leftover;
> + int objects;
> + void *private;
> +
> + if (!page->inuse)
> + goto out;
> +
> + /* Determine used objects */
> + bitmap_fill(map, s->objects);
> + for_each_free_object(p, s, page->freelist)
> + __clear_bit(slab_index(p, s, addr), map);
> +
> + objects = 0;
> + memset(vector, 0, s->objects * sizeof(void **));
> + for_each_object(p, s, addr) {
> + if (test_bit(slab_index(p, s, addr), map))
> + vector[objects++] = p;
> + }
> +
> + private = s->ops->get(s, objects, vector);
> +
> + /*
> + * Got references. Now we can drop the slab lock. The slab
> + * is frozen so it cannot vanish from under us nor will
> + * allocations be performed on the slab. However, unlocking the
> + * slab will allow concurrent slab_frees to proceed.
> + */
> + slab_unlock(page);
> + local_irq_restore(flags);
I recognise that you want to restore interrupts as early as possible but
it should be noted somewhere that kmem_cache_vacate() disables
interrupts and __kmem_cache_vacate() enables them again. I had to go
searching to see where interrupts are enabled again.
Maybe even a slab_lock_irq() and slab_unlock_irq() would clarify things
a little.
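(For illustration, such helpers might look roughly like the sketch below.
This is not part of the patch, and the vacate path would actually need
_irqsave/_irqrestore variants since it passes flags around:)

	static void slab_lock_irq(struct page *page)
	{
		local_irq_disable();
		slab_lock(page);
	}

	static void slab_unlock_irq(struct page *page)
	{
		slab_unlock(page);
		local_irq_enable();
	}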
> +
> + /*
> + * Perform the KICK callbacks to remove the objects.
> + */
> + s->ops->kick(s, objects, vector, private);
> +
> + local_irq_save(flags);
> + slab_lock(page);
> +out:
> + /*
> + * Check the result and unfreeze the slab
> + */
> + leftover = page->inuse;
> + if (leftover > 0)
> + /*
> + * Cannot free. Lets at least optimize the freelist. We have
> + * likely touched all the cachelines with the free pointers
> + * already so it is cheap to do here.
> + */
> + resequence_freelist(s, page);
> + unfreeze_slab(s, page);
> + local_irq_restore(flags);
> + return leftover;
> +}
> +
> +/*
> + * Get a page off a list and freeze it. Must be holding slab lock.
> + */
> +static void freeze_from_list(struct kmem_cache *s, struct page *page)
> +{
> + if (page->inuse < s->objects)
> + remove_partial(s, page);
> + else if (s->flags & SLAB_STORE_USER)
> + remove_full(s, page);
> + SetSlabFrozen(page);
> +}
> +
> +/*
> + * Attempt to free objects in a page. Return 1 if succesful.
> + */
> +int kmem_cache_vacate(struct page *page)
> +{
> + unsigned long flags;
> + struct kmem_cache *s;
> + int vacated = 0;
> + void **vector = NULL;
> +
> + /*
> + * Get a reference to the page. Return if its freed or being freed.
> + * This is necessary to make sure that the page does not vanish
> + * from under us before we are able to check the result.
> + */
> + if (!get_page_unless_zero(page))
> + return 0;
> +
> + if (!PageSlab(page))
> + goto out;
> +
> + s = page->slab;
> + if (!s)
> + goto out;
> +
> + vector = kmalloc(s->objects * sizeof(void *), GFP_KERNEL);
> + if (!vector)
> + return 0;
Is it worth logging this event, returning -ENOMEM or something so that
callers are aware of why kmem_cache_vacate() failed in this instance?
Also.. we have called get_page_unless_zero() but if we are out of memory
here, where have we called put_page()? Maybe we should be "goto out"
here with a
if (vector)
kfree(vector);
> +
> + local_irq_save(flags);
> + /*
> + * The implicit memory barrier in slab_lock guarantees that page->inuse
> + * is loaded after PageSlab(page) has been established to be true. This is
> + * only revelant for a newly created slab.
> + */
> + slab_lock(page);
> +
> + /*
> + * We may now have locked a page that may be in various stages of
> + * being freed. If the PageSlab bit is off then we have already
> + * reached the page allocator. If page->inuse is zero then we are
> + * in SLUB but freeing or allocating the page.
> + * page->inuse is never modified without the slab lock held.
> + *
> + * Also abort if the page happens to be already frozen. If its
> + * frozen then a concurrent vacate may be in progress.
> + */
> + if (!PageSlab(page) || SlabFrozen(page) || !page->inuse)
> + goto out_locked;
> +
> + /*
> + * We are holding a lock on a slab page and all operations on the
> + * slab are blocking.
> + */
> + if (!s->ops->get || !s->ops->kick)
> + goto out_locked;
> + freeze_from_list(s, page);
> + vacated = __kmem_cache_vacate(s, page, flags, vector) == 0;
That is a little funky looking. This may be nicer;
vacated = __kmem_cache_vacate(s, page, flags, vector);
out:
....
return vacated == 0;
> +out:
> + put_page(page);
> + kfree(vector);
> + return vacated;
> +out_locked:
> + slab_unlock(page);
> + local_irq_restore(flags);
> + goto out;
> +
> +}
> +
> +/*
> * kmem_cache_shrink removes empty slabs from the partial lists and sorts
> * the remaining slabs by the number of items in use. The slabs with the
> * most items in use come first. New allocations will then fill those up
> @@ -2337,11 +2530,12 @@ int kmem_cache_shrink(struct kmem_cache
> int node;
> int i;
> struct kmem_cache_node *n;
> - struct page *page;
> + struct page *page, *page2;
> struct page *t;
> struct list_head *slabs_by_inuse =
> kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
> unsigned long flags;
> + LIST_HEAD(zaplist);
>
> if (!slabs_by_inuse)
> return -ENOMEM;
> @@ -2392,8 +2586,44 @@ int kmem_cache_shrink(struct kmem_cache
> for (i = s->objects - 1; i >= 0; i--)
> list_splice(slabs_by_inuse + i, n->partial.prev);
>
> + /*
> + * If we have no functions available to defragment the slabs
> + * then we are done.
> + */
> + if (!s->ops->get || !s->ops->kick)
> + goto out;
> +
> + /* Take objects with just a few objects off the tail */
> + while (n->nr_partial > MAX_PARTIAL) {
> + page = container_of(n->partial.prev, struct page, lru);
> +
> + /*
> + * We are holding the list_lock so we can only
> + * trylock the slab
> + */
> + if (page->inuse > s->objects / 4)
> + break;
> +
> + if (!slab_trylock(page))
> + break;
> +
> + list_move_tail(&page->lru, &zaplist);
> + n->nr_partial--;
> + SetSlabFrozen(page);
> + slab_unlock(page);
> + }
> out:
> spin_unlock_irqrestore(&n->list_lock, flags);
> +
> + /* Now we can free objects in the slabs on the zaplist */
> + list_for_each_entry_safe(page, page2, &zaplist, lru) {
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + slab_lock(page);
> + __kmem_cache_vacate(s, page, flags,
> + (void **)slabs_by_inuse);
> + }
> }
>
> kfree(slabs_by_inuse);
> @@ -3229,6 +3459,20 @@ static ssize_t ops_show(struct kmem_cach
> x += sprint_symbol(buf + x, (unsigned long)s->ctor);
> x += sprintf(buf + x, "\n");
> }
> +
> + if (s->ops->get) {
> + x += sprintf(buf + x, "get : ");
> + x += sprint_symbol(buf + x,
> + (unsigned long)s->ops->get);
> + x += sprintf(buf + x, "\n");
> + }
> +
> + if (s->ops->kick) {
> + x += sprintf(buf + x, "kick : ");
> + x += sprint_symbol(buf + x,
> + (unsigned long)s->ops->kick);
> + x += sprintf(buf + x, "\n");
> + }
> return x;
> }
> SLAB_ATTR_RO(ops);
> Index: slub/mm/slab.c
> ===================================================================
> --- slub.orig/mm/slab.c 2007-05-18 00:13:39.000000000 -0700
> +++ slub/mm/slab.c 2007-05-18 00:13:40.000000000 -0700
> @@ -2516,6 +2516,15 @@ int kmem_cache_shrink(struct kmem_cache
> }
> EXPORT_SYMBOL(kmem_cache_shrink);
>
> +/*
> + * SLAB does not support slab defragmentation
> + */
> +int kmem_cache_vacate(struct page *page)
> +{
> + return 0;
> +}
> +EXPORT_SYMBOL(kmem_cache_vacate);
> +
> /**
> * kmem_cache_destroy - delete a cache
> * @cachep: the cache to destroy
> Index: slub/mm/slob.c
> ===================================================================
> --- slub.orig/mm/slob.c 2007-05-18 00:13:39.000000000 -0700
> +++ slub/mm/slob.c 2007-05-18 00:13:40.000000000 -0700
> @@ -394,6 +394,15 @@ int kmem_cache_shrink(struct kmem_cache
> }
> EXPORT_SYMBOL(kmem_cache_shrink);
>
> +/*
> + * SLOB does not support slab defragmentation
> + */
> +int kmem_cache_vacate(struct page *page)
> +{
> + return 0;
> +}
> +EXPORT_SYMBOL(kmem_cache_vacate);
> +
> int kmem_ptr_validate(struct kmem_cache *a, const void *b)
> {
> return 0;
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
* Re: [patch 02/10] SLUB: slab defragmentation and kmem_cache_vacate
2007-05-21 14:10 ` Mel Gorman
@ 2007-05-21 17:01 ` Christoph Lameter
0 siblings, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2007-05-21 17:01 UTC (permalink / raw)
To: Mel Gorman; +Cc: akpm, linux-kernel, linux-mm, dgc, Hugh Dickins
On Mon, 21 May 2007, Mel Gorman wrote:
> I know I brought up this "less than a quarter" thing before and I
> haven't thought of a better alternative. However, it occurs to be that
> shrink_slab() is called when there is awareness of a reclaim priority.
> It may be worth passing that down so that the fraction of candidates
> pages is calculated based on priority.
Hmmmm.. Yes I am thinking about that one too. Right now I have a system
that triggers reclaim every 10 seconds or after more than 100 objects have
been reclaimed.
> That said..... where is kmem_cache_shrink() ever called? The freeing of
> slab pages seems to be indirect these days. Way back,
> kmem_cache_shrink() used to be called directly but I'm not sure where it
> happens now.
Well, this one only allows manual triggering. To my embarrassment I found
that the kmem_cache_shrink calls for icache and dentries are only in 2.4.
They have been removed in 2.6.X. I need to add them back to 2.6.
> > /*
> > + * Order the freelist so that addresses increase as object are allocated.
> > + * This is useful to trigger the cpu cacheline prefetching logic.
> > + */
>
> makes sense. However, it occurs to me that maybe this should be a
> separate patch so it can be measured to be sure. It makes sense though.
Ok.
> > +/*
> > + * Vacate all objects in the given slab.
> > + *
> > + * Slab must be locked and frozen. Interrupts are disabled (flags must
> > + * be passed).
> > + *
>
> It may not hurt to have a VM_BUG_ON() if interrupts are still enabled when
> this is called
Well flags need to be passed and those flags are obtained via disabling
interrupts.
> > + /*
> > + * Got references. Now we can drop the slab lock. The slab
> > + * is frozen so it cannot vanish from under us nor will
> > + * allocations be performed on the slab. However, unlocking the
> > + * slab will allow concurrent slab_frees to proceed.
> > + */
> > + slab_unlock(page);
> > + local_irq_restore(flags);
>
> I recognise that you want to restore interrupts as early as possible but
> it should be noted somewhere that kmem_cache_vacate() disables
> interrupts and __kmem_cache_vacate() enabled them again. I had to go
> searching to see where interrupts are enabled again.
>
> Maybe even a slab_lock_irq() and slab_unlock_irq() would clarify things
> a little.
Hmmmm... Okay but this is a rare situation in SLUB. Regular slab
operations always run with interrupts disabled.
> > +
> > + vector = kmalloc(s->objects * sizeof(void *), GFP_KERNEL);
> > + if (!vector)
> > + return 0;
>
> Is it worth logging this event, returning -ENOMEM or something so that
> callers are aware of why kmem_cache_vacate() failed in this instance?
>
> Also.. we have called get_page_unless_zero() but if we are out of memory
> here, where have we called put_page()? Maybe we should be "goto out"
> here with a
Ahh. Thanks. Will fix that.
> > + * We are holding a lock on a slab page and all operations on the
> > + * slab are blocking.
> > + */
> > + if (!s->ops->get || !s->ops->kick)
> > + goto out_locked;
> > + freeze_from_list(s, page);
> > + vacated = __kmem_cache_vacate(s, page, flags, vector) == 0;
>
> That is a little funky looking. This may be nicer;
>
> vacated = __kmem_cache_vacate(s, page, flags, vector);
> out:
> ...
> return vacated == 0;
>
Right. Done.
* [patch 03/10] Dentry defragmentation
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
2007-05-18 18:10 ` [patch 01/10] SLUB: add support for kmem_cache_ops clameter
2007-05-18 18:10 ` [patch 02/10] SLUB: slab defragmentation and kmem_cache_vacate clameter
@ 2007-05-18 18:10 ` clameter
2007-05-18 18:10 ` [patch 04/10] Generic inode defragmentation clameter
` (8 subsequent siblings)
11 siblings, 0 replies; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: dentry_targeted_reclaim --]
[-- Type: text/plain, Size: 4720 bytes --]
This patch allows the removal of unused or negative dentries from a
partially populated slab page.
get() takes the dcache lock and then uses dget_locked to obtain a
reference to each dentry. An additional complication is that a dentry
may be in the process of being freed or may just have been allocated.
We add an additional flag to d_flags to be able to determine the
status of an object.
kick() is called after get() has been used and after the slab has dropped
all of its own locks. The pruning of unused dentries works in a
straightforward way.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/dcache.c | 100 +++++++++++++++++++++++++++++++++++++++++++++----
include/linux/dcache.h | 4 +
2 files changed, 96 insertions(+), 8 deletions(-)
Index: slub/fs/dcache.c
===================================================================
--- slub.orig/fs/dcache.c 2007-05-18 10:53:01.000000000 -0700
+++ slub/fs/dcache.c 2007-05-18 10:58:38.000000000 -0700
@@ -136,6 +136,7 @@ static struct dentry *d_kill(struct dent
list_del(&dentry->d_u.d_child);
dentry_stat.nr_dentry--; /* For d_free, below */
+ dentry->d_flags &= ~DCACHE_ENTRY_VALID;
/*drops the locks, at that point nobody can reach this dentry */
dentry_iput(dentry);
parent = dentry->d_parent;
@@ -952,6 +953,7 @@ struct dentry *d_alloc(struct dentry * p
if (parent)
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
dentry_stat.nr_dentry++;
+ dentry->d_flags |= DCACHE_ENTRY_VALID;
spin_unlock(&dcache_lock);
return dentry;
@@ -2114,18 +2116,100 @@ static void __init dcache_init_early(voi
INIT_HLIST_HEAD(&dentry_hashtable[loop]);
}
+/*
+ * The slab is holding off frees. Thus we can safely examine
+ * the object without the danger of it vanishing from under us.
+ */
+static void *get_dentries(struct kmem_cache *s, int nr, void **v)
+{
+ struct dentry *dentry;
+ unsigned long abort = 0;
+ int i;
+
+ spin_lock(&dcache_lock);
+ for (i = 0; i < nr; i++) {
+ dentry = v[i];
+ /*
+ * if DCACHE_ENTRY_VALID is not set then the dentry
+ * may be already in the process of being freed.
+ */
+ if (abort || !(dentry->d_flags & DCACHE_ENTRY_VALID))
+ v[i] = NULL;
+ else {
+ dget_locked(dentry);
+ abort = atomic_read(&dentry->d_count) > 1;
+ }
+ }
+ spin_unlock(&dcache_lock);
+ return (void *)abort;
+}
+
+/*
+ * Slab has dropped all the locks. Get rid of the
+ * refcount we obtained earlier and also rid of the
+ * object.
+ */
+static void kick_dentries(struct kmem_cache *s, int nr, void **v, void *private)
+{
+ struct dentry *dentry;
+ unsigned long abort = (unsigned long)private;
+ int i;
+
+ spin_lock(&dcache_lock);
+ for (i = 0; i < nr; i++) {
+ dentry = v[i];
+ if (!dentry)
+ continue;
+
+ if (abort)
+ goto put_dentry;
+
+ spin_lock(&dentry->d_lock);
+ if (atomic_read(&dentry->d_count) > 1) {
+ /*
+ * Reference count was increased.
+ * We need to abandon the freeing of
+ * objects.
+ */
+ abort = 1;
+ spin_unlock(&dentry->d_lock);
+put_dentry:
+ spin_unlock(&dcache_lock);
+ dput(dentry);
+ spin_lock(&dcache_lock);
+ continue;
+ }
+
+ /* Remove from LRU */
+ if (!list_empty(&dentry->d_lru)) {
+ dentry_stat.nr_unused--;
+ list_del_init(&dentry->d_lru);
+ }
+ /* Drop the entry */
+ prune_one_dentry(dentry, 1);
+ }
+ spin_unlock(&dcache_lock);
+ /*
+ * dentries are freed using RCU so we need to wait until RCU
+ * operations arei complete
+ */
+ if (!abort)
+ synchronize_rcu();
+}
+
+static struct kmem_cache_ops dentry_kmem_cache_ops = {
+ .get = get_dentries,
+ .kick = kick_dentries,
+};
+
static void __init dcache_init(unsigned long mempages)
{
int loop;
- /*
- * A constructor could be added for stable state like the lists,
- * but it is probably not worth it because of the cache nature
- * of the dcache.
- */
- dentry_cache = KMEM_CACHE(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
-
+ dentry_cache = KMEM_CACHE_OPS(dentry,
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD,
+ &dentry_kmem_cache_ops);
+
register_shrinker(&dcache_shrinker);
/* Hash may have been set up in dcache_init_early */
Index: slub/include/linux/dcache.h
===================================================================
--- slub.orig/include/linux/dcache.h 2007-05-18 10:53:01.000000000 -0700
+++ slub/include/linux/dcache.h 2007-05-18 10:58:07.000000000 -0700
@@ -177,6 +177,10 @@ d_iput: no no no yes
#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */
+#define DCACHE_ENTRY_VALID 0x0040 /*
+ * Entry is valid and not in the process of
+ * being created or destroyed
+ */
extern spinlock_t dcache_lock;
/**
--
* [patch 04/10] Generic inode defragmentation
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
` (2 preceding siblings ...)
2007-05-18 18:10 ` [patch 03/10] Dentry defragmentation clameter
@ 2007-05-18 18:10 ` clameter
2007-05-18 18:10 ` [patch 05/10] reiserfs: inode defragmentation support clameter
` (7 subsequent siblings)
11 siblings, 0 replies; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: inode_targeted_reclaim --]
[-- Type: text/plain, Size: 3910 bytes --]
This implements the ability to remove a list of inodes from the inode
cache. In order to remove an inode we may have to write out the inode's
pages and the inode itself, and remove the dentries referring to the
inode.
Generic functionality is provided so that filesystems that have their own
inode caches can also tie into the defragmentation functions made
available here.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/inode.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
include/linux/fs.h | 5 ++
2 files changed, 96 insertions(+), 1 deletion(-)
Index: slub/fs/inode.c
===================================================================
--- slub.orig/fs/inode.c 2007-05-18 00:50:36.000000000 -0700
+++ slub/fs/inode.c 2007-05-18 00:55:40.000000000 -0700
@@ -1361,6 +1361,96 @@ static int __init set_ihash_entries(char
}
__setup("ihash_entries=", set_ihash_entries);
+static void *get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ int i;
+
+ spin_lock(&inode_lock);
+ for (i = 0; i < nr; i++) {
+ struct inode *inode = v[i];
+
+ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+ v[i] = NULL;
+ else
+ __iget(inode);
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/*
+ * Function for filesystems that embedd struct inode into their own
+ * structures. The offset is the offset of the struct inode in the fs inode.
+ */
+void *fs_get_inodes(struct kmem_cache *s, int nr, void **v, unsigned long offset)
+{
+ int i;
+
+ for (i = 0; i < nr; i++)
+ v[i] += offset;
+
+ return get_inodes(s, nr, v);
+}
+EXPORT_SYMBOL(fs_get_inodes);
+
+void kick_inodes(struct kmem_cache *s, int nr, void **v, void *private)
+{
+ struct inode *inode;
+ int i;
+ int abort = 0;
+ LIST_HEAD(freeable);
+ struct super_block *sb;
+
+ for (i = 0; i < nr; i++) {
+ inode = v[i];
+ if (!inode)
+ continue;
+
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ if (remove_inode_buffers(inode))
+ invalidate_mapping_pages(&inode->i_data,
+ 0, -1);
+ }
+
+ if (inode->i_state & I_DIRTY)
+ write_inode_now(inode, 1);
+
+ if (atomic_read(&inode->i_count) > 1)
+ d_prune_aliases(inode);
+ }
+
+ mutex_lock(&iprune_mutex);
+ for (i = 0; i < nr; i++) {
+ inode = v[i];
+ if (!inode)
+ continue;
+
+ sb = inode->i_sb;
+ iput(inode);
+ if (abort || !(sb->s_flags & MS_ACTIVE))
+ continue;
+
+ spin_lock(&inode_lock);
+ if (!can_unuse(inode)) {
+ abort = 1;
+ spin_unlock(&inode_lock);
+ continue;
+ }
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ inodes_stat.nr_unused--;
+ spin_unlock(&inode_lock);
+ }
+ dispose_list(&freeable);
+ mutex_unlock(&iprune_mutex);
+}
+EXPORT_SYMBOL(kick_inodes);
+
+static struct kmem_cache_ops inode_kmem_cache_ops = {
+ .get = get_inodes,
+ .kick = kick_inodes
+};
+
/*
* Initialize the waitqueues and inode hash table.
*/
@@ -1399,7 +1489,7 @@ void __init inode_init(unsigned long mem
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
SLAB_MEM_SPREAD),
init_once,
- NULL);
+ &inode_kmem_cache_ops);
register_shrinker(&icache_shrinker);
/* Hash may have been set up in inode_init_early */
Index: slub/include/linux/fs.h
===================================================================
--- slub.orig/include/linux/fs.h 2007-05-18 00:50:36.000000000 -0700
+++ slub/include/linux/fs.h 2007-05-18 00:54:33.000000000 -0700
@@ -1608,6 +1608,11 @@ static inline void insert_inode_hash(str
__insert_inode_hash(inode, inode->i_ino);
}
+/* Helpers to realize inode defrag support in filesystems */
+extern void kick_inodes(struct kmem_cache *, int, void **, void *);
+extern void *fs_get_inodes(struct kmem_cache *, int nr, void **,
+ unsigned long offset);
+
extern struct file * get_empty_filp(void);
extern void file_move(struct file *f, struct list_head *list);
extern void file_kill(struct file *f);
--
* [patch 05/10] reiserfs: inode defragmentation support
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
` (3 preceding siblings ...)
2007-05-18 18:10 ` [patch 04/10] Generic inode defragmentation clameter
@ 2007-05-18 18:10 ` clameter
2007-05-18 18:10 ` [patch 06/10] xfs: " clameter
` (6 subsequent siblings)
11 siblings, 0 replies; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: fs_reiser --]
[-- Type: text/plain, Size: 1168 bytes --]
Add inode defragmentation support.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/reiserfs/super.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
Index: slub/fs/reiserfs/super.c
===================================================================
--- slub.orig/fs/reiserfs/super.c 2007-05-18 00:54:30.000000000 -0700
+++ slub/fs/reiserfs/super.c 2007-05-18 00:57:12.000000000 -0700
@@ -520,6 +520,17 @@ static void init_once(void *foo, struct
#endif
}
+static void *reiserfs_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct reiserfs_inode_info, vfs_inode));
+}
+
+struct kmem_cache_ops reiserfs_kmem_cache_ops = {
+ .get = reiserfs_get_inodes,
+ .kick = kick_inodes
+};
+
static int init_inodecache(void)
{
reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
@@ -527,7 +538,8 @@ static int init_inodecache(void)
reiserfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
- init_once, NULL);
+ init_once,
+ &reiserfs_kmem_cache_ops);
if (reiserfs_inode_cachep == NULL)
return -ENOMEM;
return 0;
--
* [patch 06/10] xfs: inode defragmentation support
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
` (4 preceding siblings ...)
2007-05-18 18:10 ` [patch 05/10] reiserfs: inode defragmentation support clameter
@ 2007-05-18 18:10 ` clameter
2007-05-18 18:26 ` Christoph Lameter
2007-05-18 18:10 ` [patch 07/10] procfs: " clameter
` (5 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: fs_xfs --]
[-- Type: text/plain, Size: 2438 bytes --]
Add slab defrag support.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/xfs/linux-2.6/kmem.h | 5 +++--
fs/xfs/linux-2.6/xfs_buf.c | 2 +-
fs/xfs/linux-2.6/xfs_super.c | 13 ++++++++++++-
3 files changed, 16 insertions(+), 4 deletions(-)
Index: slub/fs/xfs/linux-2.6/kmem.h
===================================================================
--- slub.orig/fs/xfs/linux-2.6/kmem.h 2007-05-18 00:54:30.000000000 -0700
+++ slub/fs/xfs/linux-2.6/kmem.h 2007-05-18 00:58:38.000000000 -0700
@@ -79,9 +79,10 @@ kmem_zone_init(int size, char *zone_name
static inline kmem_zone_t *
kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
- void (*construct)(void *, kmem_zone_t *, unsigned long))
+ void (*construct)(void *, kmem_zone_t *, unsigned long),
+ const struct kmem_cache_ops *ops)
{
- return kmem_cache_create(zone_name, size, 0, flags, construct, NULL);
+ return kmem_cache_create(zone_name, size, 0, flags, construct, ops);
}
static inline void
Index: slub/fs/xfs/linux-2.6/xfs_buf.c
===================================================================
--- slub.orig/fs/xfs/linux-2.6/xfs_buf.c 2007-05-18 00:54:30.000000000 -0700
+++ slub/fs/xfs/linux-2.6/xfs_buf.c 2007-05-18 00:58:38.000000000 -0700
@@ -1832,7 +1832,7 @@ xfs_buf_init(void)
#endif
xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
- KM_ZONE_HWALIGN, NULL);
+ KM_ZONE_HWALIGN, NULL, NULL);
if (!xfs_buf_zone)
goto out_free_trace_buf;
Index: slub/fs/xfs/linux-2.6/xfs_super.c
===================================================================
--- slub.orig/fs/xfs/linux-2.6/xfs_super.c 2007-05-18 00:54:30.000000000 -0700
+++ slub/fs/xfs/linux-2.6/xfs_super.c 2007-05-18 00:58:38.000000000 -0700
@@ -355,13 +355,24 @@ xfs_fs_inode_init_once(
inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
}
+static void *xfs_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v, offsetof(bhv_vnode_t, v_inode));
+}
+
+static struct kmem_cache_ops xfs_kmem_cache_ops = {
+ .get = xfs_get_inodes,
+ .kick = kick_inodes
+};
+
STATIC int
xfs_init_zones(void)
{
xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode",
KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
KM_ZONE_SPREAD,
- xfs_fs_inode_init_once);
+ xfs_fs_inode_init_once,
+ &xfs_kmem_cache_ops);
if (!xfs_vnode_zone)
goto out;
--
* Re: [patch 06/10] xfs: inode defragmentation support
2007-05-18 18:10 ` [patch 06/10] xfs: " clameter
@ 2007-05-18 18:26 ` Christoph Lameter
0 siblings, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2007-05-18 18:26 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
Rats. Missing a piece due to the need to change the parameters of
kmem_zone_init_flags (isn't it possible to use kmem_cache_create
directly?).
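(For reference, a rough and untested sketch of what calling
kmem_cache_create() directly would look like here; it should be
equivalent since, per patch 06, kmem_zone_init_flags() only forwards
its arguments with align == 0, and the KM_ZONE_* flags are XFS aliases
for the corresponding SLAB_* flags:)

	/* Untested sketch: same initialization without the
	 * kmem_zone_init_flags() wrapper.
	 */
	xfs_inode_zone =
		kmem_cache_create("xfs_inode", sizeof(xfs_inode_t), 0,
				  KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
				  KM_ZONE_SPREAD,
				  NULL, NULL);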
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: slub/fs/xfs/xfs_vfsops.c
===================================================================
--- slub.orig/fs/xfs/xfs_vfsops.c 2007-05-18 11:23:27.000000000 -0700
+++ slub/fs/xfs/xfs_vfsops.c 2007-05-17 22:14:34.000000000 -0700
@@ -109,13 +109,13 @@ xfs_init(void)
xfs_inode_zone =
kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
- KM_ZONE_SPREAD, NULL);
+ KM_ZONE_SPREAD, NULL, NULL);
xfs_ili_zone =
kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
- KM_ZONE_SPREAD, NULL);
+ KM_ZONE_SPREAD, NULL, NULL);
xfs_chashlist_zone =
kmem_zone_init_flags(sizeof(xfs_chashlist_t), "xfs_chashlist",
- KM_ZONE_SPREAD, NULL);
+ KM_ZONE_SPREAD, NULL, NULL);
/*
* Allocate global trace buffers.
--
* [patch 07/10] procfs: inode defragmentation support
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
` (5 preceding siblings ...)
2007-05-18 18:10 ` [patch 06/10] xfs: " clameter
@ 2007-05-18 18:10 ` clameter
2007-05-18 18:10 ` [patch 08/10] shmem: " clameter
` (4 subsequent siblings)
11 siblings, 0 replies; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: fs_proc --]
[-- Type: text/plain, Size: 1122 bytes --]
Hmmm... Do we really need this?
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/proc/inode.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
Index: slub/fs/proc/inode.c
===================================================================
--- slub.orig/fs/proc/inode.c 2007-05-18 00:54:30.000000000 -0700
+++ slub/fs/proc/inode.c 2007-05-18 01:00:36.000000000 -0700
@@ -111,14 +111,25 @@ static void init_once(void * foo, struct
inode_init_once(&ei->vfs_inode);
}
-
+
+static void *proc_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct proc_inode, vfs_inode));
+}
+
+static struct kmem_cache_ops proc_kmem_cache_ops = {
+ .get = proc_get_inodes,
+ .kick = kick_inodes
+};
+
int __init proc_init_inodecache(void)
{
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
- init_once, NULL);
+ init_once, &proc_kmem_cache_ops);
if (proc_inode_cachep == NULL)
return -ENOMEM;
return 0;
--
* [patch 08/10] shmem: inode defragmentation support
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
` (6 preceding siblings ...)
2007-05-18 18:10 ` [patch 07/10] procfs: " clameter
@ 2007-05-18 18:10 ` clameter
2007-05-18 20:34 ` Jan Engelhardt
2007-05-18 18:10 ` [patch 09/10] sockets: " clameter
` (3 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: fs_shmem --]
[-- Type: text/plain, Size: 981 bytes --]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
mm/shmem.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
Index: slub/mm/shmem.c
===================================================================
--- slub.orig/mm/shmem.c 2007-05-18 00:54:30.000000000 -0700
+++ slub/mm/shmem.c 2007-05-18 01:02:26.000000000 -0700
@@ -2337,11 +2337,22 @@ static void init_once(void *foo, struct
#endif
}
+static void *shmem_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct shmem_inode_info, vfs_inode));
+}
+
+static struct kmem_cache_ops shmem_kmem_cache_ops = {
+ .get = shmem_get_inodes,
+ .kick = kick_inodes
+};
+
static int init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
- 0, 0, init_once, NULL);
+ 0, 0, init_once, &shmem_kmem_cache_ops);
if (shmem_inode_cachep == NULL)
return -ENOMEM;
return 0;
--
* Re: [patch 08/10] shmem: inode defragmentation support
2007-05-18 18:10 ` [patch 08/10] shmem: " clameter
@ 2007-05-18 20:34 ` Jan Engelhardt
2007-05-18 21:04 ` Christoph Lameter
0 siblings, 1 reply; 24+ messages in thread
From: Jan Engelhardt @ 2007-05-18 20:34 UTC (permalink / raw)
To: clameter; +Cc: akpm, linux-kernel, linux-mm, dgc, Hugh Dickins
On May 18 2007 11:10, clameter@sgi.com wrote:
>
>Index: slub/mm/shmem.c
>===================================================================
>--- slub.orig/mm/shmem.c 2007-05-18 00:54:30.000000000 -0700
>+++ slub/mm/shmem.c 2007-05-18 01:02:26.000000000 -0700
Do we need *this*? (compare procfs)
I believe that shmfs's inodes remain "more" in memory than those of
procfs. That is, procfs inodes can find their way out (we can regenerate
them), while shmfs/tmpfs/ramfs/etc. inodes should not (we'd lose the
file).
>@@ -2337,11 +2337,22 @@ static void init_once(void *foo, struct
> #endif
> }
>
>+static void *shmem_get_inodes(struct kmem_cache *s, int nr, void **v)
>+{
>+ return fs_get_inodes(s, nr, v,
>+ offsetof(struct shmem_inode_info, vfs_inode));
>+}
>+
>+static struct kmem_cache_ops shmem_kmem_cache_ops = {
>+ .get = shmem_get_inodes,
>+ .kick = kick_inodes
>+};
>+
> static int init_inodecache(void)
> {
> shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
> sizeof(struct shmem_inode_info),
>- 0, 0, init_once, NULL);
>+ 0, 0, init_once, &shmem_kmem_cache_ops);
> if (shmem_inode_cachep == NULL)
> return -ENOMEM;
> return 0;
>
>--
Jan
--
* Re: [patch 08/10] shmem: inode defragmentation support
2007-05-18 20:34 ` Jan Engelhardt
@ 2007-05-18 21:04 ` Christoph Lameter
0 siblings, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2007-05-18 21:04 UTC (permalink / raw)
To: Jan Engelhardt; +Cc: akpm, linux-kernel, linux-mm, dgc, Hugh Dickins
On Fri, 18 May 2007, Jan Engelhardt wrote:
> Do we need *this*? (compare procfs)
>
> I believe that shmfs's inodes remain "more" in memory than those of
> procfs. That is, procfs ones can find their way out (we can regenerate
> it), while shmfs/tmpfs/ramfs/etc. should not do that (we'd lose the
> file).
Ahh... Okay, so shmem inodes are not defraggable.
--
* [patch 09/10] sockets: inode defragmentation support
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
` (7 preceding siblings ...)
2007-05-18 18:10 ` [patch 08/10] shmem: " clameter
@ 2007-05-18 18:10 ` clameter
2007-05-18 18:10 ` [patch 10/10] ext2 ext3 ext4: support inode slab defragmentation clameter
` (2 subsequent siblings)
11 siblings, 0 replies; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: fs_socket --]
[-- Type: text/plain, Size: 1086 bytes --]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
net/socket.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
Index: slub/net/socket.c
===================================================================
--- slub.orig/net/socket.c 2007-05-18 00:54:30.000000000 -0700
+++ slub/net/socket.c 2007-05-18 01:03:31.000000000 -0700
@@ -264,6 +264,17 @@ static void init_once(void *foo, struct
inode_init_once(&ei->vfs_inode);
}
+static void *sock_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct socket_alloc, vfs_inode));
+}
+
+static struct kmem_cache_ops sock_kmem_cache_ops = {
+ .get = sock_get_inodes,
+ .kick = kick_inodes
+};
+
static int init_inodecache(void)
{
sock_inode_cachep = kmem_cache_create("sock_inode_cache",
@@ -273,7 +284,7 @@ static int init_inodecache(void)
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD),
init_once,
- NULL);
+ &sock_kmem_cache_ops);
if (sock_inode_cachep == NULL)
return -ENOMEM;
return 0;
--
* [patch 10/10] ext2 ext3 ext4: support inode slab defragmentation
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
` (8 preceding siblings ...)
2007-05-18 18:10 ` [patch 09/10] sockets: " clameter
@ 2007-05-18 18:10 ` clameter
2007-05-18 20:32 ` Jan Engelhardt
2007-05-18 18:29 ` [patch 00/10] Slab defragmentation V2 Christoph Lameter
2007-05-21 12:52 ` Hugh Dickins
11 siblings, 1 reply; 24+ messages in thread
From: clameter @ 2007-05-18 18:10 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, linux-mm, dgc, Hugh Dickins
[-- Attachment #1: fs_ext234 --]
[-- Type: text/plain, Size: 3064 bytes --]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/ext2/super.c | 16 ++++++++++++++--
fs/ext3/super.c | 14 +++++++++++++-
fs/ext4/super.c | 15 ++++++++++++++-
3 files changed, 41 insertions(+), 4 deletions(-)
Index: slub/fs/ext2/super.c
===================================================================
--- slub.orig/fs/ext2/super.c 2007-05-18 10:19:12.000000000 -0700
+++ slub/fs/ext2/super.c 2007-05-18 10:24:03.000000000 -0700
@@ -168,14 +168,26 @@ static void init_once(void * foo, struct
mutex_init(&ei->truncate_mutex);
inode_init_once(&ei->vfs_inode);
}
-
+
+static void *ext2_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct ext2_inode_info, vfs_inode));
+}
+
+static struct kmem_cache_ops ext2_kmem_cache_ops = {
+ ext2_get_inodes,
+ kick_inodes
+};
+
static int init_inodecache(void)
{
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
sizeof(struct ext2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
- init_once, NULL);
+ init_once,
+ &ext2_kmem_cache_ops);
if (ext2_inode_cachep == NULL)
return -ENOMEM;
return 0;
Index: slub/fs/ext3/super.c
===================================================================
--- slub.orig/fs/ext3/super.c 2007-05-18 10:22:01.000000000 -0700
+++ slub/fs/ext3/super.c 2007-05-18 10:23:04.000000000 -0700
@@ -475,13 +475,25 @@ static void init_once(void * foo, struct
inode_init_once(&ei->vfs_inode);
}
+static void *ext3_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct ext3_inode_info, vfs_inode));
+}
+
+static struct kmem_cache_ops ext3_kmem_cache_ops = {
+ ext3_get_inodes,
+ kick_inodes
+};
+
static int init_inodecache(void)
{
ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
sizeof(struct ext3_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
- init_once, NULL);
+ init_once,
+ &ext3_kmem_cache_ops);
if (ext3_inode_cachep == NULL)
return -ENOMEM;
return 0;
Index: slub/fs/ext4/super.c
===================================================================
--- slub.orig/fs/ext4/super.c 2007-05-18 10:23:15.000000000 -0700
+++ slub/fs/ext4/super.c 2007-05-18 10:23:48.000000000 -0700
@@ -535,13 +535,26 @@ static void init_once(void * foo, struct
inode_init_once(&ei->vfs_inode);
}
+static void *ext4_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct ext4_inode_info, vfs_inode));
+}
+
+static struct kmem_cache_ops ext4_kmem_cache_ops = {
+ ext4_get_inodes,
+ kick_inodes
+};
+
+
static int init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
- init_once, NULL);
+ init_once,
+ &ext4_kmem_cache_ops);
if (ext4_inode_cachep == NULL)
return -ENOMEM;
return 0;
--
* Re: [patch 10/10] ext2 ext3 ext4: support inode slab defragmentation
2007-05-18 18:10 ` [patch 10/10] ext2 ext3 ext4: support inode slab defragmentation clameter
@ 2007-05-18 20:32 ` Jan Engelhardt
2007-05-18 21:03 ` Christoph Lameter
0 siblings, 1 reply; 24+ messages in thread
From: Jan Engelhardt @ 2007-05-18 20:32 UTC (permalink / raw)
To: clameter; +Cc: akpm, linux-kernel, linux-mm, dgc, Hugh Dickins
On May 18 2007 11:10, clameter@sgi.com wrote:
>+
>+static struct kmem_cache_ops ext2_kmem_cache_ops = {
>+ ext2_get_inodes,
>+ kick_inodes
>+};
>+
We love C99 names:
static struct kmem_cache_ops ext2_kmem_cache_ops = {
.get = ext2_get_inodes,
.kick = kick_inodes,
};
Jan
--
* Re: [patch 10/10] ext2 ext3 ext4: support inode slab defragmentation
2007-05-18 20:32 ` Jan Engelhardt
@ 2007-05-18 21:03 ` Christoph Lameter
0 siblings, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2007-05-18 21:03 UTC (permalink / raw)
To: Jan Engelhardt; +Cc: akpm, linux-kernel, linux-mm, dgc, Hugh Dickins
On Fri, 18 May 2007, Jan Engelhardt wrote:
>
> On May 18 2007 11:10, clameter@sgi.com wrote:
> >+
> >+static struct kmem_cache_ops ext2_kmem_cache_ops = {
> >+ ext2_get_inodes,
> >+ kick_inodes
> >+};
> >+
>
> We love C99 names:
>
> static struct kmem_cache_ops ext2_kmem_cache_ops = {
> .get = ext2_get_inodes,
> .kick = kick_inodes,
> };
>
Right. The other patches all have C99 names in kmem_cache_ops. The mass
handling of extxx filesystems must have made me lose sight of that. Next
rev will have it.
--
* Re: [patch 00/10] Slab defragmentation V2
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
` (9 preceding siblings ...)
2007-05-18 18:10 ` [patch 10/10] ext2 ext3 ext4: support inode slab defragmentation clameter
@ 2007-05-18 18:29 ` Christoph Lameter
2007-05-18 18:54 ` Michal Piotrowski
2007-05-21 12:52 ` Hugh Dickins
11 siblings, 1 reply; 24+ messages in thread
From: Christoph Lameter @ 2007-05-18 18:29 UTC (permalink / raw)
To: dgc; +Cc: linux-kernel, linux-mm, Hugh Dickins
For Dave: You can also find the patchset at
http://ftp.kernel.org/pub/linux/kernel/people/christoph/slub-defrag
--
* Re: [patch 00/10] Slab defragmentation V2
2007-05-18 18:10 [patch 00/10] Slab defragmentation V2 clameter
` (10 preceding siblings ...)
2007-05-18 18:29 ` [patch 00/10] Slab defragmentation V2 Christoph Lameter
@ 2007-05-21 12:52 ` Hugh Dickins
11 siblings, 0 replies; 24+ messages in thread
From: Hugh Dickins @ 2007-05-21 12:52 UTC (permalink / raw)
To: clameter; +Cc: akpm, linux-kernel, linux-mm, dgc
On Fri, 18 May 2007, clameter@sgi.com wrote:
> Hugh: Could you have a look at this? There is lots of critical locking
> here....
Sorry, Christoph, no: I've far too many bugs to chase, and unfulfilled
promises outstanding: this is not something I can spend time on - sorry.
Hugh
> Support for Slab defragmentation and targeted reclaim. The current
> state of affairs is that a large portion of inode and dcache slab caches
> can be effectively reclaimed. The remaining problems are:
--