From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: kosaki.motohiro@jp.fujitsu.com, Rik van Riel <riel@redhat.com>,
Andrew Morton <akpm@linux-foundation.org>
Subject: [RFC] mmaped copy too slow?
Date: Tue, 15 Jan 2008 10:45:47 +0900 [thread overview]
Message-ID: <20080115100450.1180.KOSAKI.MOTOHIRO@jp.fujitsu.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 4479 bytes --]
Hi
At one point, I found that the speed of copying a large file differs depending on
the copy method.
I compared the methods below:
- read(2) and write(2).
- mmap(2) x2 and memcpy.
- mmap(2) and write(2).
In addition, the effects of fadvise(2) and madvise(2) were checked.
Strangely,
- the fastest method is read + write + fadvise.
- the slowest method is mmap + memcpy.
Some famous books (e.g. Advanced Programming in the UNIX Environment
by W. Richard Stevens) say that an mmap copy is 2x faster than read-write,
but on Linux it is not.
I found that the bottleneck is page reclaim.
For comparison, I changed the page reclaim function a bit and tested again.
test machine:
CPU: Pentium4 with HT 2.8GHz
memory: 512M
Disk I/O: about 20MB/s transfer rate.
(in other words, a 1GB transfer needs about 50s in the ideal case)
Time spent copying a 1GB file (unit: seconds):

                    2.6.24-rc6    2.6.24-rc6    ratio
                                  + my patch    (smaller is faster)
------------------------------------------------------------
rw_cp                 59.32         58.60        98.79%
rw_fadv_cp            57.96         57.96       100.0%
mm_sync_cp            69.97         61.68        88.15%
mm_sync_madv_cp       69.41         62.54        90.10%
mw_cp                 61.69         63.11       102.30%
mw_madv_cp            61.35         61.31        99.93%
This patch is premature and ugly,
but I think it provides enough information to start a discussion about
improving page reclaim.
The problem is that when almost all pages are mapped and their PTE accessed bit is set,
page reclaim goes through the steps below:
1) the page is moved from the inactive list to the active list
2) the page is moved from the active list back to the inactive list
3) the page is really paged out
This is too roundabout and causes unnecessary memory pressure.
If you don't mind, please discuss.
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
mm/vmscan.c | 46 +++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 43 insertions(+), 3 deletions(-)
Index: linux-2.6.24-rc6-cp3/mm/vmscan.c
===================================================================
--- linux-2.6.24-rc6-cp3.orig/mm/vmscan.c 2008-01-13 21:58:03.000000000 +0900
+++ linux-2.6.24-rc6-cp3/mm/vmscan.c 2008-01-13 22:30:27.000000000 +0900
@@ -446,13 +446,18 @@ static unsigned long shrink_page_list(st
struct pagevec freed_pvec;
int pgactivate = 0;
unsigned long nr_reclaimed = 0;
+ unsigned long nr_scanned = 0;
+ LIST_HEAD(l_mapped_pages);
+ unsigned long nr_mapped_page_activate = 0;
+ struct page *page;
+ int reference_checked = 0;
cond_resched();
pagevec_init(&freed_pvec, 1);
+retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
- struct page *page;
int may_enter_fs;
int referenced;
@@ -466,6 +471,7 @@ static unsigned long shrink_page_list(st
VM_BUG_ON(PageActive(page));
+ nr_scanned++;
sc->nr_scanned++;
if (!sc->may_swap && page_mapped(page))
@@ -493,11 +499,17 @@ static unsigned long shrink_page_list(st
goto keep_locked;
}
- referenced = page_referenced(page, 1);
- /* In active use or really unfreeable? Activate it. */
- if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
- referenced && page_mapping_inuse(page))
- goto activate_locked;
+ if (!reference_checked) {
+ referenced = page_referenced(page, 1);
+ /* In active use or really unfreeable? Activate it. */
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+ referenced && page_mapping_inuse(page)) {
+ nr_mapped_page_activate++;
+ unlock_page(page);
+ list_add(&page->lru, &l_mapped_pages);
+ continue;
+ }
+ }
#ifdef CONFIG_SWAP
/*
@@ -604,7 +616,31 @@ keep:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page));
}
+
+ if (nr_scanned == nr_mapped_page_activate) {
+ /* may be under copy by mmap.
+ ignore reference flag. */
+ reference_checked = 1;
+ list_splice(&l_mapped_pages, page_list);
+ goto retry;
+ } else {
+ /* move active list just now */
+ while (!list_empty(&l_mapped_pages)) {
+ page = lru_to_page(&l_mapped_pages);
+ list_del(&page->lru);
+ prefetchw_prev_lru_page(page, &l_mapped_pages, flags);
+
+ if (!TestSetPageLocked(page)) {
+ SetPageActive(page);
+ pgactivate++;
+ unlock_page(page);
+ }
+ list_add(&page->lru, &ret_pages);
+ }
+ }
+
list_splice(&ret_pages, page_list);
+
if (pagevec_count(&freed_pvec))
__pagevec_release_nonlru(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
[-- Attachment #2: mmap-write.c --]
[-- Type: application/octet-stream, Size: 1066 bytes --]
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>
/* Print the failed operation with perror(3) and exit with a failure status. */
static void die(const char *what)
{
	perror(what);
	exit(EXIT_FAILURE);
}

/*
 * mmap + write copy benchmark: map from_file (argv[1]) read-only and push
 * the whole mapping to to_file (argv[2]) with write(2), then fsync(2) it.
 *
 * Build with -DUSE_MADVISE to apply MADV_SEQUENTIAL to the source mapping.
 *
 * Returns 0 on success.  A usage error or any failed system call is
 * reported on stderr and terminates the program with EXIT_FAILURE.
 */
int main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s from_file to_file\n", argv[0]);
		exit(EXIT_FAILURE);
	}

	/* from */
	int from = open(argv[1], O_RDONLY);
	if (from < 0)
		die("open(from_file)");

	/*
	 * Required calls are checked explicitly rather than wrapped in
	 * assert(): with -DNDEBUG the assert() argument is not evaluated
	 * and the call itself would disappear.
	 */
	struct stat st_buf;
	if (fstat(from, &st_buf) < 0)
		die("fstat(from_file)");
	size_t size = st_buf.st_size;

	/*
	 * mmap(2) rejects a zero length, so an empty source is copied
	 * without creating a mapping at all.
	 */
	const char *from_mmap = NULL;
	if (size > 0) {
		void *map = mmap(NULL, size, PROT_READ, MAP_SHARED, from, 0);

		/* mmap reports failure as MAP_FAILED, not as a negative value. */
		if (map == MAP_FAILED)
			die("mmap(from_file)");
#if USE_MADVISE
		if (madvise(map, size, MADV_SEQUENTIAL) < 0)
			die("madvise(from_file)");
#endif
		from_mmap = map;
	}

	/* to */
	int to = open(argv[2], O_CREAT | O_WRONLY, 0666);
	if (to < 0)
		die("open(to_file)");

	/* copy: write(2) may be partial, so continue from where it stopped */
	size_t copied = 0;
	while (copied < size) {
		ssize_t num_bytes = write(to, from_mmap + copied, size - copied);

		/* Without this check a -1 return would move the cursor backwards. */
		if (num_bytes < 0)
			die("write(to_file)");
		copied += (size_t)num_bytes;
	}

	if (fsync(to) < 0)
		die("fsync(to_file)");
	if (close(to) < 0)
		die("close(to_file)");
	close(from);
	return 0;
}
[-- Attachment #3: read-write.c --]
[-- Type: application/octet-stream, Size: 1225 bytes --]
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <linux/fadvise.h>
#define BUF_SIZE 8192
/* Print the failed operation with perror(3) and exit with a failure status. */
static void die(const char *what)
{
	perror(what);
	exit(EXIT_FAILURE);
}

/*
 * read + write copy benchmark: copy src (argv[1]) to out (argv[2]) through a
 * BUF_SIZE stack buffer, then fsync(2) the destination.  The destination is
 * created with the source file's st_mode.
 *
 * Build with -DUSE_FADVISE to issue POSIX_FADV_SEQUENTIAL and
 * POSIX_FADV_NOREUSE hints on the source descriptor.
 *
 * Returns EXIT_SUCCESS on success.  A usage error, identical src/out names,
 * or any failed system call is reported on stderr and terminates the
 * program with EXIT_FAILURE.
 */
int main(int argc, char **argv)
{
	char buf[BUF_SIZE];
	struct stat st_buf;

	if (argc < 3) {
		fprintf(stderr, "usage: %s src out\n", argv[0]);
		exit(EXIT_FAILURE);
	}

	char *src = argv[1];
	char *dest = argv[2];

	/* Only a textual comparison, as before: the same spelling is refused. */
	if (strcmp(src, dest) == 0) {
		fprintf(stderr, "%s: src and out must differ\n", argv[0]);
		exit(EXIT_FAILURE);
	}

	int srcfd = open(src, O_RDONLY);
	if (srcfd < 0)
		die("open(src)");

#if USE_FADVISE
	/* Advisory hints only; their result does not affect the copy. */
	posix_fadvise(srcfd, 0, 0, POSIX_FADV_SEQUENTIAL);
	posix_fadvise(srcfd, 0, 0, POSIX_FADV_NOREUSE);
#endif

	/*
	 * get permission
	 *
	 * Checked explicitly instead of inside assert(): with -DNDEBUG the
	 * assert() argument is not evaluated and st_buf would stay
	 * uninitialized.
	 */
	if (fstat(srcfd, &st_buf) < 0)
		die("fstat(src)");

	int destfd = open(dest, O_WRONLY | O_CREAT, st_buf.st_mode);
	if (destfd < 0)
		die("open(out)");

	ssize_t n;
	while ((n = read(srcfd, buf, sizeof(buf))) > 0) {
		const char *p = buf;
		const char *const endp = buf + n;

		/* write(2) may be partial, so continue from where it stopped */
		while (p < endp) {
			ssize_t num_bytes = write(destfd, p, (size_t)(endp - p));

			/* Without this check a -1 return would move p backwards. */
			if (num_bytes < 0)
				die("write(out)");
			p += num_bytes;
		}
	}
	if (n < 0)
		die("read(src)");

	if (fsync(destfd) < 0)
		die("fsync(out)");
	if (close(destfd) < 0)
		die("close(out)");
	close(srcfd);

	exit(EXIT_SUCCESS);
}
[-- Attachment #4: test.sh --]
[-- Type: application/octet-stream, Size: 568 bytes --]
#!/bin/zsh -x
# Benchmark driver: times each copy binary copying $SRC to $DST.
# Uses the zsh `repeat` construct, so it must be run with zsh.

# Source file to copy and the destination each binary writes.
SRC=testfile1G
DST=testfile1G2
# External time(1) used to measure each run.
TIMEX=/usr/bin/time
# Evaluated before every timed run: remove the previous destination, sync
# three times, write 3 to /proc/sys/vm/drop_caches as root, then sleep 1s.
# NOTE(review): presumably this is meant to start each run with cold caches.
PREPARE='rm $DST;sync;sync;sync;sudo sh -c "echo 3 > /proc/sys/vm/drop_caches";sleep 1'
# Number of timed runs per binary.
REPEAT=1
# read(2) + write(2), without and with fadvise.
(repeat $REPEAT (eval $PREPARE; $TIMEX ./rw_cp ${SRC} ${DST}))
(repeat $REPEAT (eval $PREPARE; $TIMEX ./rw_fadv_cp ${SRC} ${DST}))
# mmap(2) x2 + memcpy with msync, without and with madvise.
(repeat $REPEAT (eval $PREPARE; $TIMEX ./mm_sync_cp ${SRC} ${DST}))
(repeat $REPEAT (eval $PREPARE; $TIMEX ./mm_sync_madv_cp ${SRC} ${DST}))
# mmap(2) + write(2), without and with madvise.
(repeat $REPEAT (eval $PREPARE; $TIMEX ./mw_cp ${SRC} ${DST}))
(repeat $REPEAT (eval $PREPARE; $TIMEX ./mw_madv_cp ${SRC} ${DST}))
[-- Attachment #5: Makefile --]
[-- Type: application/octet-stream, Size: 1047 bytes --]
# Builds every variant of the copy benchmarks from three sources:
#   read-write.c  -> rw_*   (read + write)
#   mmap-mmap.c   -> mm_*   (mmap x2 + memcpy)
#   mmap-write.c  -> mw_*   (mmap + write)
# Variants differ only in the -D flags passed to the compiler.
CFLAGS = -Wall -O2 --static
TARGET = rw_cp rw_fadv_cp mm_sync_cp mm_mun_cp mm_sync_madv_cp mm_mun_madv_cp mw_cp mw_madv_cp mm_sync_nocache_cp mm_sync_madv_nocache_cp

.PHONY: all clean

all: $(TARGET)

rw_cp: read-write.c
	gcc $(CFLAGS) -o rw_cp read-write.c

rw_fadv_cp: read-write.c
	gcc $(CFLAGS) -DUSE_FADVISE -o rw_fadv_cp read-write.c

mm_sync_cp: mmap-mmap.c
	gcc $(CFLAGS) -DWITH_MSYNC -o mm_sync_cp mmap-mmap.c

# NOTE(review): the mmap-mmap.c shipped alongside this Makefile does not test
# USE_NOCACHE_MEMCPY, so the *_nocache_* targets may build the same code as
# their plain counterparts -- confirm against the intended source.
mm_sync_nocache_cp: mmap-mmap.c
	gcc $(CFLAGS) -DWITH_MSYNC -DUSE_NOCACHE_MEMCPY -o $@ $<

mm_mun_cp: mmap-mmap.c
	gcc $(CFLAGS) -DWITH_MUNMAP -o mm_mun_cp mmap-mmap.c

mm_sync_madv_cp: mmap-mmap.c
	gcc $(CFLAGS) -DUSE_MADVISE -DWITH_MSYNC -o mm_sync_madv_cp mmap-mmap.c

# The "madv" variant needs -DUSE_MADVISE; without it this target was
# identical to mm_sync_nocache_cp.
mm_sync_madv_nocache_cp: mmap-mmap.c
	gcc $(CFLAGS) -DUSE_MADVISE -DWITH_MSYNC -DUSE_NOCACHE_MEMCPY -o $@ $<

mm_mun_madv_cp: mmap-mmap.c
	gcc $(CFLAGS) -DUSE_MADVISE -DWITH_MUNMAP -o mm_mun_madv_cp mmap-mmap.c

mw_cp: mmap-write.c
	gcc $(CFLAGS) -o mw_cp mmap-write.c

mw_madv_cp: mmap-write.c
	gcc $(CFLAGS) -DUSE_MADVISE -o mw_madv_cp mmap-write.c

# rm -f: stay quiet when there is nothing to remove.
clean:
	rm -f *.o
	rm -f $(TARGET)
[-- Attachment #6: mmap-mmap.c --]
[-- Type: application/octet-stream, Size: 1457 bytes --]
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/errno.h>
/* Print the failed operation with perror(3) and exit with a failure status. */
static void die(const char *what)
{
	perror(what);
	exit(EXIT_FAILURE);
}

/*
 * mmap + mmap + memcpy copy benchmark: map from_file (argv[1]) read-only,
 * map to_file (argv[2]) writable at the same length, and memcpy(3) between
 * the two mappings.  The destination is created with the source's st_mode.
 *
 * Compile-time switches:
 *   -DUSE_MADVISE  apply MADV_SEQUENTIAL to both mappings
 *   -DWITH_MSYNC   msync(MS_SYNC) the destination mapping after the copy
 *   -DWITH_MUNMAP  munmap the destination mapping after the copy
 *
 * Returns 0 on success.  A usage error or any failed system call is
 * reported on stderr and terminates the program with EXIT_FAILURE.
 */
int main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s from_file to_file\n", argv[0]);
		exit(EXIT_FAILURE);
	}

	/*
	 * from
	 *
	 * Required calls are checked explicitly rather than wrapped in
	 * assert(): with -DNDEBUG the assert() argument is not evaluated
	 * and the call itself would disappear.
	 */
	int from = open(argv[1], O_RDONLY);
	if (from < 0)
		die("open(from_file)");

	struct stat st_buf;
	if (fstat(from, &st_buf) < 0)
		die("fstat(from_file)");
	size_t size = st_buf.st_size;

	/* to */
	int to = open(argv[2], O_CREAT | O_RDWR, st_buf.st_mode);
	if (to < 0)
		die("open(to_file)");

	/*
	 * Give the destination its final length before mapping it.  This
	 * replaces the lseek-to-(size - sizeof(int)) + write trick, whose
	 * offset underflowed for sources shorter than sizeof(int), and it
	 * also trims a pre-existing, longer destination.
	 */
	if (ftruncate(to, st_buf.st_size) < 0)
		die("ftruncate(to_file)");

	/* mmap(2) rejects a zero length; an empty source is already copied. */
	if (size == 0)
		return 0;

	/* mmap reports failure as MAP_FAILED, not as a negative value. */
	void *from_mmap = mmap(NULL, size, PROT_READ, MAP_SHARED, from, 0);
	if (from_mmap == MAP_FAILED)
		die("mmap(from_file)");
#if USE_MADVISE
	if (madvise(from_mmap, size, MADV_SEQUENTIAL) < 0)
		die("madvise(from_file)");
#endif

	void *to_mmap = mmap(NULL, size, PROT_WRITE, MAP_SHARED, to, 0);
	if (to_mmap == MAP_FAILED)
		die("mmap(to_file)");
#if USE_MADVISE
	if (madvise(to_mmap, size, MADV_SEQUENTIAL) < 0)
		die("madvise(to_file)");
#endif

	/* copy */
	memcpy(to_mmap, from_mmap, size);

#if WITH_MSYNC
	if (msync(to_mmap, size, MS_SYNC) < 0)
		die("msync(to_file)");
#endif
#if WITH_MUNMAP
	if (munmap(to_mmap, size) < 0)
		die("munmap(to_file)");
#endif

	return 0;
}
next reply other threads:[~2008-01-15 1:45 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-01-15 1:45 KOSAKI Motohiro [this message]
2008-01-15 2:15 ` Rik van Riel
2008-01-15 3:20 ` KOSAKI Motohiro
2008-01-15 8:57 ` Peter Zijlstra
2008-01-15 9:03 ` KOSAKI Motohiro
2008-01-15 9:08 ` Peter Zijlstra
2008-01-15 12:46 ` Paulo Marques
2008-01-16 2:05 ` KOSAKI Motohiro
2008-01-17 3:23 ` KOSAKI Motohiro
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080115100450.1180.KOSAKI.MOTOHIRO@jp.fujitsu.com \
--to=kosaki.motohiro@jp.fujitsu.com \
--cc=akpm@linux-foundation.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=riel@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox