* [PATCH 0/2 v5] Livelock avoidance for data integrity writes
@ 2010-06-24 13:57 Jan Kara
2010-06-24 13:57 ` [PATCH 1/2] radix-tree: Implement function radix_tree_range_tag_if_tagged Jan Kara
2010-06-24 13:57 ` [PATCH 2/2] mm: Implement writeback livelock avoidance using page tagging Jan Kara
0 siblings, 2 replies; 3+ messages in thread
From: Jan Kara @ 2010-06-24 13:57 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-fsdevel, linux-mm, npiggin, david
Hi,
this is an update of my patches to implement livelock avoidance for data
integrity writes using page tagging. There are some minor changes against
the previous version:
* fixed some whitespace problems spotted by checkpatch
* added a WARN_ON_ONCE to catch the case where radix_tree_range_tag_if_tagged
  tags more pages than we asked for
* fixed radix_tree_range_tag_if_tagged to tag at most as many pages as we
  asked for (it could previously tag one page more).

The patches now also pass XFSQA on XFS (well, several tests failed - like
192, 195, 228, ... - but they do not seem to be related; they are an atime
test, a ctime test, a file alignment test, ...). The radix tree code also
passed 10000 iterations of the following test I've implemented in Andrew's
rtth (radix tree test harness):
void copy_tag_check(void)
{
        RADIX_TREE(tree, GFP_KERNEL);
        unsigned long idx[ITEMS];
        unsigned long start, end, count = 0, tagged, cur, tmp;
        int i;

//      printf("generating radix tree indices...\n");
        start = rand();
        end = rand();
        if (start > end && (rand() % 10)) {
                cur = start;
                start = end;
                end = cur;
        }
        /* Specifically create items around the start and the end of the range
         * with high probability to check for off by one errors */
        cur = rand();
        if (cur & 1) {
                item_insert(&tree, start);
                if (cur & 2) {
                        if (start <= end)
                                count++;
                        item_tag_set(&tree, start, 0);
                }
        }
        if (cur & 4) {
                item_insert(&tree, start-1);
                if (cur & 8)
                        item_tag_set(&tree, start-1, 0);
        }
        if (cur & 16) {
                item_insert(&tree, end);
                if (cur & 32) {
                        if (start <= end)
                                count++;
                        item_tag_set(&tree, end, 0);
                }
        }
        if (cur & 64) {
                item_insert(&tree, end+1);
                if (cur & 128)
                        item_tag_set(&tree, end+1, 0);
        }

        for (i = 0; i < ITEMS; i++) {
                do {
                        idx[i] = rand();
                } while (item_lookup(&tree, idx[i]));
                item_insert(&tree, idx[i]);
                if (rand() & 1) {
                        item_tag_set(&tree, idx[i], 0);
                        if (idx[i] >= start && idx[i] <= end)
                                count++;
                }
/*              if (i % 1000 == 0)
                        putchar('.'); */
        }

//      printf("\ncopying tags...\n");
        cur = start;
        tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, ITEMS, 0, 1);

//      printf("checking copied tags\n");
        assert(tagged == count);
        check_copied_tags(&tree, start, end, idx, ITEMS, 0, 1);

        /* Copy tags in several rounds */
//      printf("\ncopying tags...\n");
        cur = start;
        do {
                tmp = rand() % (count/10+2);
                tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, tmp, 0, 2);
        } while (tmp == tagged);
//      printf("%lu %lu %lu\n", tagged, tmp, count);

//      printf("checking copied tags\n");
        check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2);
        assert(tagged < tmp);
//      printf("\n");

        item_kill_tree(&tree);
}
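check_copied_tags() is an rtth helper that is not reproduced above. A minimal
sketch of the kind of check it performs - an illustrative reconstruction, not
the actual harness code, assuming rtth's item_tag_get() helper:

/* Illustrative sketch only - the real check_copied_tags() lives in rtth.
 * For every randomly inserted index, verify that totag was copied from
 * fromtag exactly for items inside [start, end] and nowhere else. */
void check_copied_tags(struct radix_tree_root *tree, unsigned long start,
                unsigned long end, unsigned long *idx, int count,
                int fromtag, int totag)
{
        int i;

        for (i = 0; i < count; i++) {
                if (idx[i] < start || idx[i] > end) {
                        /* Outside the range, totag must never be set */
                        assert(!item_tag_get(tree, idx[i], totag));
                } else if (item_tag_get(tree, idx[i], fromtag)) {
                        /* Inside the range, fromtag must have been copied */
                        assert(item_tag_get(tree, idx[i], totag));
                } else {
                        /* Untagged items inside the range stay untagged */
                        assert(!item_tag_get(tree, idx[i], totag));
                }
        }
}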
Honza
* [PATCH 1/2] radix-tree: Implement function radix_tree_range_tag_if_tagged
2010-06-24 13:57 [PATCH 0/2 v5] Livelock avoidance for data integrity writes Jan Kara
@ 2010-06-24 13:57 ` Jan Kara
2010-06-24 13:57 ` [PATCH 2/2] mm: Implement writeback livelock avoidance using page tagging Jan Kara
1 sibling, 0 replies; 3+ messages in thread
From: Jan Kara @ 2010-06-24 13:57 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-fsdevel, linux-mm, npiggin, david, Jan Kara
Implement a function that, for each item in a given range, sets one tag
if another tag is already set on that item.
Signed-off-by: Jan Kara <jack@suse.cz>
---
include/linux/radix-tree.h | 4 ++
lib/radix-tree.c | 94 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 98 insertions(+), 0 deletions(-)
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 55ca73c..a4b00e9 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -192,6 +192,10 @@ unsigned int
radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
 				unsigned long first_index, unsigned int max_items,
 				unsigned int tag);
+unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
+		unsigned long *first_indexp, unsigned long last_index,
+		unsigned long nr_to_tag,
+		unsigned int fromtag, unsigned int totag);
int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
static inline void radix_tree_preload_end(void)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 05da38b..e907858 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -609,6 +609,100 @@ int radix_tree_tag_get(struct radix_tree_root *root,
EXPORT_SYMBOL(radix_tree_tag_get);
/**
+ * radix_tree_range_tag_if_tagged - for each item in given range set given
+ *				     tag if item has another tag set
+ * @root:		radix tree root
+ * @first_indexp:	pointer to the starting index of the range to scan
+ * @last_index:		last index of the range to scan
+ * @nr_to_tag:		maximum number of items to tag
+ * @iftag:		tag index to test
+ * @settag:		tag index to set if tested tag is set
+ *
+ * This function scans the range of the radix tree from first_index to
+ * last_index (inclusive). For each item in the range on which iftag is set,
+ * the function also sets settag. The function stops either after tagging
+ * nr_to_tag items or after reaching last_index.
+ *
+ * The function returns the number of leaves where the tag was set and sets
+ * *first_indexp to the first unscanned index.
+ */
+unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
+		unsigned long *first_indexp, unsigned long last_index,
+		unsigned long nr_to_tag,
+		unsigned int iftag, unsigned int settag)
+{
+	unsigned int height = root->height, shift;
+	unsigned long tagged = 0, index = *first_indexp;
+	struct radix_tree_node *open_slots[height], *slot;
+
+	last_index = min(last_index, radix_tree_maxindex(height));
+	if (index > last_index)
+		return 0;
+	if (!nr_to_tag)
+		return 0;
+	if (!root_tag_get(root, iftag)) {
+		*first_indexp = last_index + 1;
+		return 0;
+	}
+	if (height == 0) {
+		*first_indexp = last_index + 1;
+		root_tag_set(root, settag);
+		return 1;
+	}
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = radix_tree_indirect_to_ptr(root->rnode);
+
+	for (;;) {
+		int offset;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		if (!slot->slots[offset])
+			goto next;
+		if (!tag_get(slot, iftag, offset))
+			goto next;
+		tag_set(slot, settag, offset);
+		if (height == 1) {
+			tagged++;
+			goto next;
+		}
+		/* Go down one level */
+		height--;
+		shift -= RADIX_TREE_MAP_SHIFT;
+		open_slots[height] = slot;
+		slot = slot->slots[offset];
+		continue;
+next:
+		/* Go to next item at level determined by 'shift' */
+		index = ((index >> shift) + 1) << shift;
+		if (index > last_index)
+			break;
+		if (tagged >= nr_to_tag)
+			break;
+		while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) {
+			/*
+			 * We've fully scanned this node. Go up. Because
+			 * last_index is guaranteed to be in the tree, what
+			 * we do below cannot wander astray.
+			 */
+			slot = open_slots[height];
+			height++;
+			shift += RADIX_TREE_MAP_SHIFT;
+		}
+	}
+	/*
+	 * The iftag must have been set somewhere because otherwise
+	 * we would return immediately at the beginning of the function.
+	 */
+	root_tag_set(root, settag);
+	*first_indexp = index;
+
+	return tagged;
+}
+EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
+
+
+/**
* radix_tree_next_hole - find the next hole (not-present entry)
* @root: tree root
* @index: index key
--
1.6.4.2
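As the kerneldoc above describes, the function is designed to be called
repeatedly in bounded batches, with *first_indexp carrying the resume point
between calls. A minimal caller-side sketch of that pattern (names here are
illustrative, not part of the patch; the real in-kernel user is
tag_pages_for_writeback() in patch 2/2):

/* Illustrative sketch only: copy tag 0 to tag 1 over [first, last] in
 * batches of at most BATCH items, dropping the lock between batches. */
#define BATCH	128UL

static void copy_tag_in_batches(struct radix_tree_root *tree, spinlock_t *lock,
                                unsigned long first, unsigned long last)
{
        unsigned long tagged;

        do {
                spin_lock(lock);
                tagged = radix_tree_range_tag_if_tagged(tree, &first, last,
                                                        BATCH, 0, 1);
                spin_unlock(lock);
                /* 'first' now points at the first unscanned index */
                cond_resched();
        } while (tagged >= BATCH);
}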
* [PATCH 2/2] mm: Implement writeback livelock avoidance using page tagging
2010-06-24 13:57 [PATCH 0/2 v5] Livelock avoidance for data integrity writes Jan Kara
2010-06-24 13:57 ` [PATCH 1/2] radix-tree: Implement function radix_tree_range_tag_if_tagged Jan Kara
@ 2010-06-24 13:57 ` Jan Kara
1 sibling, 0 replies; 3+ messages in thread
From: Jan Kara @ 2010-06-24 13:57 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-fsdevel, linux-mm, npiggin, david, Jan Kara
We try to avoid livelocks of writeback when someone steadily creates
dirty pages in a mapping we are writing out. For memory-cleaning
writeback, using nr_to_write works reasonably well but we cannot
really use it for data integrity writeback. This patch tries to
solve the problem.

The idea is simple: Tag all pages that should be written back
with a special tag (TOWRITE) in the radix tree. This can be done
rather quickly and thus livelocks should not happen in practice.
Then we start doing the hard work of locking pages and sending
them to disk only for those pages that have the TOWRITE tag set.

Note: Adding the new radix tree tag grows the radix tree node from 288
to 296 bytes for 32-bit archs and from 552 to 560 bytes for 64-bit
archs. However, the number of slab/slub items per page remains the
same (13 and 7 respectively).
Signed-off-by: Jan Kara <jack@suse.cz>
---
include/linux/fs.h | 1 +
include/linux/radix-tree.h | 2 +-
mm/page-writeback.c | 70 +++++++++++++++++++++++++++++++++----------
3 files changed, 55 insertions(+), 18 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 471e1ff..664674e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -685,6 +685,7 @@ struct block_device {
*/
#define PAGECACHE_TAG_DIRTY 0
#define PAGECACHE_TAG_WRITEBACK 1
+#define PAGECACHE_TAG_TOWRITE 2
int mapping_tagged(struct address_space *mapping, int tag);
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index a4b00e9..634b8e6 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -55,7 +55,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
/*** radix-tree API starts here ***/
-#define RADIX_TREE_MAX_TAGS 2
+#define RADIX_TREE_MAX_TAGS 3
/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
struct radix_tree_root {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bbd396a..d265ef9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -807,6 +807,41 @@ void __init page_writeback_init(void)
}
/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback. This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+/*
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ */
+#define WRITEBACK_TAG_BATCH 4096
+void tag_pages_for_writeback(struct address_space *mapping,
+			     pgoff_t start, pgoff_t end)
+{
+	unsigned long tagged;
+
+	do {
+		spin_lock_irq(&mapping->tree_lock);
+		tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
+				&start, end, WRITEBACK_TAG_BATCH,
+				PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+		spin_unlock_irq(&mapping->tree_lock);
+		WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
+		cond_resched();
+	} while (tagged >= WRITEBACK_TAG_BATCH);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
+/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -820,6 +855,13 @@ void __init page_writeback_init(void)
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
+ *
+ * To avoid livelocks (when some other process dirties new pages), we first
+ * tag pages which should be written back with the TOWRITE tag and only then
+ * start writing them. For data-integrity sync we have to be careful not to
+ * miss some pages (e.g., because some other process has cleared the TOWRITE
+ * tag we set). The rule we follow is that the TOWRITE tag can be cleared
+ * only by the process clearing the DIRTY tag (and submitting the page for IO).
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -835,6 +877,7 @@ int write_cache_pages(struct address_space *mapping,
 	pgoff_t done_index;
 	int cycled;
 	int range_whole = 0;
+	int tag;
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
@@ -851,29 +894,19 @@ int write_cache_pages(struct address_space *mapping,
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
-
-		/*
-		 * If this is a data integrity sync, cap the writeback to the
-		 * current end of file. Any extension to the file that occurs
-		 * after this is a new write and we don't need to write those
-		 * pages out to fulfil our data integrity requirements. If we
-		 * try to write them out, we can get stuck in this scan until
-		 * the concurrent writer stops adding dirty pages and extending
-		 * EOF.
-		 */
-		if (wbc->sync_mode == WB_SYNC_ALL &&
-		    wbc->range_end == LLONG_MAX) {
-			end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
-		}
 	}
-
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
 retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && (index <= end)) {
 		int i;
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			      PAGECACHE_TAG_DIRTY,
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
 			break;
@@ -1329,6 +1362,9 @@ int test_set_page_writeback(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+		radix_tree_tag_clear(&mapping->page_tree,
+					page_index(page),
+					PAGECACHE_TAG_TOWRITE);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
--
1.6.4.2
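Since tag_pages_for_writeback() is exported and PAGECACHE_TAG_TOWRITE lives
in fs.h, a filesystem that walks its dirty pages itself (instead of going
through write_cache_pages) could apply the same two-phase scheme. A hedged
sketch of such a loop under that assumption - my_sync_range() and
my_writepage() are illustrative placeholders, and my_writepage() is assumed
to start the IO and unlock the page, like ->writepage:

/*
 * Illustrative sketch only (not from the patch): a data-integrity writeout
 * loop over [index, end]. Pages dirtied after the tagging pass are not
 * tagged TOWRITE and therefore cannot livelock this loop.
 */
static int my_sync_range(struct address_space *mapping, pgoff_t index,
                         pgoff_t end)
{
        struct pagevec pvec;
        unsigned nr, i;
        int ret = 0;

        /* Phase 1: quickly mark every page that is dirty right now. */
        tag_pages_for_writeback(mapping, index, end);

        /* Phase 2: lock and write only the pages tagged above. */
        pagevec_init(&pvec, 0);
        while (!ret && index <= end) {
                nr = pagevec_lookup_tag(&pvec, mapping, &index,
                                PAGECACHE_TAG_TOWRITE,
                                min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
                if (!nr)
                        break;
                for (i = 0; i < nr; i++) {
                        struct page *page = pvec.pages[i];

                        if (page->index > end)
                                break;
                        lock_page(page);
                        if (page->mapping != mapping || !PageDirty(page)) {
                                unlock_page(page);
                                continue;
                        }
                        wait_on_page_writeback(page);
                        /*
                         * Clearing DIRTY is what allows TOWRITE to be cleared
                         * (see the rule above write_cache_pages). my_writepage()
                         * is a placeholder and is expected to unlock the page.
                         */
                        if (clear_page_dirty_for_io(page))
                                ret = my_writepage(page);
                        else
                                unlock_page(page);
                        if (ret)
                                break;
                }
                pagevec_release(&pvec);
                cond_resched();
        }
        return ret;
}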