linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: kosaki.motohiro@jp.fujitsu.com, Rik van Riel <riel@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>
Subject: [RFC] mmaped copy too slow?
Date: Tue, 15 Jan 2008 10:45:47 +0900	[thread overview]
Message-ID: <20080115100450.1180.KOSAKI.MOTOHIRO@jp.fujitsu.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 4479 bytes --]

Hi

At one point, I found that the copy speed of a large file differs depending on
the copy method.

I compared the following methods:
 - read(2) and write(2).
 - mmap(2) x2 and memcpy.
 - mmap(2) and write(2).

In addition, I checked the effect of posix_fadvise(2) and madvise(2).

Strangely,
   - the fastest method is read + write + fadvise.
   - the slowest method is mmap + memcpy.

Some well-known books (e.g. Advanced Programming in the UNIX Environment
by W. Richard Stevens) say that an mmap copy is about 2x faster than read-write,
but on Linux it is not.

I found that the bottleneck is page reclaim.
For comparison, I changed the page reclaim function a bit and tested again.


test machine:
   CPU:      Pentium4 with HT 2.8GHz
   memory:   512M
   Disk I/O: about 20MB/s transfer rate.
             (in other words, a 1GB transfer needs about 50s in the ideal case)


Time spent copying a 1GB file (in seconds):

                 2.6.24-rc6    2.6.24-rc6       ratio
                               +my patch        (small is faster)
    ------------------------------------------------------------
    rw_cp             59.32       58.60          98.79%
    rw_fadv_cp        57.96       57.96          100.0%
    mm_sync_cp        69.97       61.68          88.15%
    mm_sync_madv_cp   69.41       62.54          90.10%
    mw_cp             61.69       63.11         102.30%
    mw_madv_cp        61.35       61.31          99.93%

This patch is still premature and ugly,
but I think it gives enough information to start a discussion about
page reclaim improvements.

The problem is that when almost all pages are mapped and their PTE accessed bits are set,
page reclaim goes through the following steps:

  1) page move to inactive list -> active list
  2) page move to active list   -> inactive list
  3) really pageout

This is too roundabout, and unnecessary memory pressure happens.
If you don't mind, please discuss.




Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>

---
 mm/vmscan.c |   46 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

Index: linux-2.6.24-rc6-cp3/mm/vmscan.c
===================================================================
--- linux-2.6.24-rc6-cp3.orig/mm/vmscan.c	2008-01-13 21:58:03.000000000 +0900
+++ linux-2.6.24-rc6-cp3/mm/vmscan.c	2008-01-13 22:30:27.000000000 +0900
@@ -446,13 +446,18 @@ static unsigned long shrink_page_list(st
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
+	unsigned long nr_scanned = 0;
+	LIST_HEAD(l_mapped_pages);
+	unsigned long nr_mapped_page_activate = 0;
+	struct page *page;
+	int reference_checked = 0;
 
 	cond_resched();
 
 	pagevec_init(&freed_pvec, 1);
+retry:
 	while (!list_empty(page_list)) {
 		struct address_space *mapping;
-		struct page *page;
 		int may_enter_fs;
 		int referenced;
 
@@ -466,6 +471,7 @@ static unsigned long shrink_page_list(st
 
 		VM_BUG_ON(PageActive(page));
 
+		nr_scanned++;
 		sc->nr_scanned++;
 
 		if (!sc->may_swap && page_mapped(page))
@@ -493,11 +499,17 @@ static unsigned long shrink_page_list(st
 				goto keep_locked;
 		}
 
-		referenced = page_referenced(page, 1);
-		/* In active use or really unfreeable?  Activate it. */
-		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
-					referenced && page_mapping_inuse(page))
-			goto activate_locked;
+		if (!reference_checked) {
+			referenced = page_referenced(page, 1);
+			/* In active use or really unfreeable?  Activate it. */
+			if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+			    referenced && page_mapping_inuse(page)) {
+				nr_mapped_page_activate++;
+				unlock_page(page);
+				list_add(&page->lru, &l_mapped_pages);
+				continue;
+			}
+		}
 
 #ifdef CONFIG_SWAP
 		/*
@@ -604,7 +616,31 @@ keep:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page));
 	}
+
+	if (nr_scanned == nr_mapped_page_activate) {
+		/* may be under copy by mmap.
+		   ignore reference flag. */
+		reference_checked = 1;
+		list_splice(&l_mapped_pages, page_list);
+		goto retry;
+	} else {
+		/* move active list just now */
+		while (!list_empty(&l_mapped_pages)) {
+			page = lru_to_page(&l_mapped_pages);
+			list_del(&page->lru);
+			prefetchw_prev_lru_page(page, &l_mapped_pages, flags);
+
+			if (!TestSetPageLocked(page)) {
+				SetPageActive(page);
+				pgactivate++;
+				unlock_page(page);
+			}
+			list_add(&page->lru, &ret_pages);
+		}
+	}
+
 	list_splice(&ret_pages, page_list);
+
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
 	count_vm_events(PGACTIVATE, pgactivate);


[-- Attachment #2: mmap-write.c --]
[-- Type: application/octet-stream, Size: 1066 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>

/* Print "<what>: <system error text>" and terminate with a failure status. */
static void mw_die(const char *what)
{
    perror(what);
    exit(EXIT_FAILURE);
}

/*
 * Copy argv[1] to argv[2]: the source file is mapped read-only with mmap(2)
 * and the mapping is handed to write(2) on the destination.
 *
 * When built with -DUSE_MADVISE the source mapping is additionally marked
 * MADV_SEQUENTIAL.
 *
 * Every system call is checked explicitly instead of through assert(), so
 * the program keeps working when compiled with -DNDEBUG.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on a usage error or when a
 * system call fails.
 */
int main(int argc, char *argv[])
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s from_file to_file\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* from */
    int from = open(argv[1], O_RDONLY);
    if (from < 0) {
        mw_die(argv[1]);
    }

    struct stat st_buf;
    if (fstat(from, &st_buf) < 0) {
        mw_die("fstat");
    }
    size_t size = st_buf.st_size;

    /* mmap(2) rejects a zero length, so an empty source is never mapped. */
    char *from_mmap = NULL;
    if (size > 0) {
        void *map = mmap(NULL, size, PROT_READ, MAP_SHARED, from, 0);
        /* Failure is reported as MAP_FAILED, not as NULL or a negative value. */
        if (map == MAP_FAILED) {
            mw_die("mmap");
        }
        from_mmap = map;

#if USE_MADVISE
        if (madvise(from_mmap, size, MADV_SEQUENTIAL) < 0) {
            mw_die("madvise");
        }
#endif
    }

    /* to: O_TRUNC so that a longer pre-existing file leaves no stale tail */
    int to = open(argv[2], O_CREAT | O_WRONLY | O_TRUNC, 0666);
    if (to < 0) {
        mw_die(argv[2]);
    }

    /* copy: write(2) may be short, so continue until everything is out */
    const char *p = from_mmap;
    size_t left = size;
    while (left > 0) {
        ssize_t num_bytes = write(to, p, left);
        if (num_bytes < 0) {
            mw_die("write");
        }
        p += num_bytes;
        left -= (size_t)num_bytes;
    }

    if (fsync(to) < 0) {
        mw_die("fsync");
    }
    if (close(to) < 0) {
        mw_die("close");
    }

    if (from_mmap != NULL && munmap(from_mmap, size) < 0) {
        mw_die("munmap");
    }
    close(from);

    return 0;
}

[-- Attachment #3: read-write.c --]
[-- Type: application/octet-stream, Size: 1225 bytes --]

#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <linux/fadvise.h>

#define BUF_SIZE 8192

/* Print "<what>: <system error text>" and terminate with a failure status. */
static void rw_die(const char *what)
{
    perror(what);
    exit(EXIT_FAILURE);
}

/*
 * Copy argv[1] to argv[2] with a plain read(2)/write(2) loop over a
 * BUF_SIZE-byte buffer. The destination is created with the source's mode.
 *
 * When built with -DUSE_FADVISE the source descriptor is additionally
 * hinted with POSIX_FADV_SEQUENTIAL and POSIX_FADV_NOREUSE.
 *
 * Every system call is checked explicitly instead of through assert(), so
 * the program keeps working when compiled with -DNDEBUG.
 *
 * Exits with EXIT_SUCCESS on success and EXIT_FAILURE on a usage error,
 * identical source/destination paths, or a failing system call.
 */
int main(int argc, char **argv)
{
    char buf[BUF_SIZE];
    struct stat st_buf;

    if (argc < 3) {
        fprintf(stderr, "usage: %s src out\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    char *src  = argv[1];
    char *dest = argv[2];
    if (strcmp(src, dest) == 0) {
        fprintf(stderr, "%s: src and out must be different paths\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    int srcfd = open(src, O_RDONLY);
    if (srcfd < 0) {
        rw_die(src);
    }

#if USE_FADVISE
    /* Purely advisory hints: a failure here does not affect the copy. */
    posix_fadvise(srcfd, 0, 0, POSIX_FADV_SEQUENTIAL);
    posix_fadvise(srcfd, 0, 0, POSIX_FADV_NOREUSE);
#endif

    /* get permission */
    if (fstat(srcfd, &st_buf) < 0) {
        rw_die("fstat");
    }

    /* O_TRUNC so that a longer pre-existing file leaves no stale tail */
    int destfd = open(dest, O_WRONLY | O_CREAT | O_TRUNC, st_buf.st_mode);
    if (destfd < 0) {
        rw_die(dest);
    }

    ssize_t n;
    while ((n = read(srcfd, buf, sizeof(buf))) > 0) {
        const char *p = buf;
        size_t left = (size_t)n;
        /* write(2) may be short, so continue until the chunk is out */
        while (left > 0) {
            ssize_t num_bytes = write(destfd, p, left);
            if (num_bytes < 0) {
                rw_die("write");
            }
            p += num_bytes;
            left -= (size_t)num_bytes;
        }
    }
    if (n < 0) {
        rw_die("read");
    }

    if (fsync(destfd) < 0) {
        rw_die("fsync");
    }

    if (close(destfd) < 0) {
        rw_die("close");
    }
    close(srcfd);

    exit(EXIT_SUCCESS);
}

[-- Attachment #4: test.sh --]
[-- Type: application/octet-stream, Size: 568 bytes --]

#!/bin/zsh -x
# Time every copy-program variant once per REPEAT, running PREPARE before
# each measurement.

SRC=testfile1G
DST=testfile1G2
TIMEX=/usr/bin/time
# Executed via eval before each run: remove the previous output (-f: it may
# not exist yet), sync, write 3 to /proc/sys/vm/drop_caches, then pause.
PREPARE='rm -f $DST;sync;sync;sync;sudo sh -c "echo 3 > /proc/sys/vm/drop_caches";sleep 1'
REPEAT=1

for prog in rw_cp rw_fadv_cp mm_sync_cp mm_sync_madv_cp mw_cp mw_madv_cp; do
    (repeat $REPEAT (eval $PREPARE; $TIMEX ./$prog ${SRC} ${DST}))
done

[-- Attachment #5: Makefile --]
[-- Type: application/octet-stream, Size: 1047 bytes --]

# Builds every variant of the copy benchmarks. Variants of one source file
# differ only in the -D feature macros passed to the compiler.
CC = gcc
CFLAGS = -Wall -O2 --static
TARGET = rw_cp rw_fadv_cp mm_sync_cp mm_mun_cp mm_sync_madv_cp mm_mun_madv_cp mw_cp mw_madv_cp mm_sync_nocache_cp mm_sync_madv_nocache_cp

.PHONY: all clean

all: $(TARGET)

rw_cp: read-write.c
	$(CC) $(CFLAGS) -o $@ $<

rw_fadv_cp: read-write.c
	$(CC) $(CFLAGS) -DUSE_FADVISE -o $@ $<

mm_sync_cp: mmap-mmap.c
	$(CC) $(CFLAGS) -DWITH_MSYNC -o $@ $<

mm_sync_nocache_cp: mmap-mmap.c
	$(CC) $(CFLAGS) -DWITH_MSYNC -DUSE_NOCACHE_MEMCPY -o $@ $<

mm_mun_cp: mmap-mmap.c
	$(CC) $(CFLAGS) -DWITH_MUNMAP -o $@ $<

mm_sync_madv_cp: mmap-mmap.c
	$(CC) $(CFLAGS) -DUSE_MADVISE -DWITH_MSYNC -o $@ $<

# The "madv" variant needs -DUSE_MADVISE; without it this target would be
# identical to mm_sync_nocache_cp.
mm_sync_madv_nocache_cp: mmap-mmap.c
	$(CC) $(CFLAGS) -DUSE_MADVISE -DWITH_MSYNC -DUSE_NOCACHE_MEMCPY -o $@ $<

mm_mun_madv_cp: mmap-mmap.c
	$(CC) $(CFLAGS) -DUSE_MADVISE -DWITH_MUNMAP -o $@ $<

mw_cp: mmap-write.c
	$(CC) $(CFLAGS) -o $@ $<

mw_madv_cp: mmap-write.c
	$(CC) $(CFLAGS) -DUSE_MADVISE -o $@ $<

clean:
	rm -f *.o $(TARGET)


[-- Attachment #6: mmap-mmap.c --]
[-- Type: application/octet-stream, Size: 1457 bytes --]

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/errno.h>

/* Print "<what>: <system error text>" and terminate with a failure status. */
static void mm_die(const char *what)
{
    perror(what);
    exit(EXIT_FAILURE);
}

/*
 * Copy argv[1] to argv[2] by mapping both files with mmap(2) and calling
 * memcpy() between the two mappings. The destination is created with the
 * source's mode.
 *
 * Build-time switches:
 *   -DUSE_MADVISE  mark both mappings MADV_SEQUENTIAL
 *   -DWITH_MSYNC   msync(MS_SYNC) the destination mapping after the copy
 *   -DWITH_MUNMAP  munmap() the destination mapping after the copy
 *
 * Every system call is checked explicitly instead of through assert(), so
 * the program keeps working when compiled with -DNDEBUG.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on a usage error or when a
 * system call fails.
 */
int main(int argc, char *argv[])
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s from_file to_file\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* from */
    int from = open(argv[1], O_RDONLY);
    if (from < 0) {
        mm_die(argv[1]);
    }

    struct stat st_buf;
    if (fstat(from, &st_buf) < 0) {
        mm_die("fstat");
    }
    size_t size = st_buf.st_size;

    /* to */
    int to = open(argv[2], O_CREAT|O_RDWR, st_buf.st_mode);
    if (to < 0) {
        mm_die(argv[2]);
    }

    /*
     * Give the destination its final length before it is mapped, so that
     * the whole mapping is backed by the file. This also trims a longer
     * pre-existing file and, unlike seeking to size - sizeof(int), is safe
     * for sources shorter than an int.
     */
    if (ftruncate(to, st_buf.st_size) < 0) {
        mm_die("ftruncate");
    }

    /* mmap(2) rejects a zero length, so an empty source needs no copy. */
    if (size > 0) {
        void *from_mmap = mmap(NULL, size, PROT_READ, MAP_SHARED, from, 0);
        /* Failure is reported as MAP_FAILED, not as NULL or a negative value. */
        if (from_mmap == MAP_FAILED) {
            mm_die("mmap (source)");
        }

#if USE_MADVISE
        if (madvise(from_mmap, size, MADV_SEQUENTIAL) < 0) {
            mm_die("madvise (source)");
        }
#endif

        void *to_mmap = mmap(NULL, size, PROT_WRITE, MAP_SHARED, to, 0);
        if (to_mmap == MAP_FAILED) {
            mm_die("mmap (destination)");
        }

#if USE_MADVISE
        if (madvise(to_mmap, size, MADV_SEQUENTIAL) < 0) {
            mm_die("madvise (destination)");
        }
#endif

        /* copy */
        memcpy(to_mmap, from_mmap, size);

#if WITH_MSYNC
        if (msync(to_mmap, size, MS_SYNC) < 0) {
            mm_die("msync");
        }
#endif
#if WITH_MUNMAP
        if (munmap(to_mmap, size) < 0) {
            mm_die("munmap");
        }
#endif
    }

    close(to);
    close(from);

    return 0;
}

             reply	other threads:[~2008-01-15  1:45 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-01-15  1:45 KOSAKI Motohiro [this message]
2008-01-15  2:15 ` Rik van Riel
2008-01-15  3:20   ` KOSAKI Motohiro
2008-01-15  8:57     ` Peter Zijlstra
2008-01-15  9:03       ` KOSAKI Motohiro
2008-01-15  9:08         ` Peter Zijlstra
2008-01-15 12:46 ` Paulo Marques
2008-01-16  2:05   ` KOSAKI Motohiro
2008-01-17  3:23     ` KOSAKI Motohiro

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080115100450.1180.KOSAKI.MOTOHIRO@jp.fujitsu.com \
    --to=kosaki.motohiro@jp.fujitsu.com \
    --cc=akpm@linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox