* [RFC] [RFT] Shared /dev/zero mmaping feature
@ 2000-02-25 23:08 Kanoj Sarcar
2000-02-26 16:38 ` Linus Torvalds
2000-02-29 10:54 ` Christoph Rohland
0 siblings, 2 replies; 19+ messages in thread
From: Kanoj Sarcar @ 2000-02-25 23:08 UTC (permalink / raw)
To: linux-mm, linux-kernel; +Cc: torvalds, Kanoj Sarcar
Hi all,
This is a patch against 2.3.47 that tries to implement shared /dev/zero
mappings. This is just a first cut attempt, I am hoping I will find a
few people to apply the patch and throw some real life programs at it
(preferably on low memory machines so that swapping is induced).
Currently, you will also need to turn on CONFIG_SYSVIPC, but most of
the shm.c code can be split into a new ipc/shm_core.c file that is
always compiled in, irrespective of CONFIG_SYSVIPC. Linus, do you
think this is the proper direction to follow?
Thanks. Comments and feedback welcome ...
Kanoj
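
(For context: a MAP_SHARED mapping of /dev/zero behaves like anonymous
shared memory that is inherited across fork(); without this patch such
an mmap() is rejected with -EINVAL. The user-space fragment below is
only an illustration of the intended semantics, not part of the patch.)

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		volatile int *p;
		int fd = open("/dev/zero", O_RDWR);

		if (fd < 0)
			return 1;
		/* Fails with EINVAL on kernels without shared /dev/zero support. */
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;

		if (fork() == 0) {
			p[0] = 42;	/* child writes through the shared mapping */
			_exit(0);
		}
		wait(NULL);
		printf("parent sees %d\n", p[0]);	/* expected: 42, not 0 */
		return 0;
	}
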
--- drivers/char/mem.c Fri Feb 25 13:03:46 2000
+++ drivers/char/mem.c Fri Feb 25 14:26:44 2000
@@ -433,8 +433,12 @@
static int mmap_zero(struct file * file, struct vm_area_struct * vma)
{
+ int ret;
+ extern int map_zero_setup(struct vm_area_struct *);
+
if (vma->vm_flags & VM_SHARED)
- return -EINVAL;
+ if (ret = map_zero_setup(vma))
+ return ret;
if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
return -EAGAIN;
return 0;
--- ipc/shm.c Fri Feb 25 13:04:14 2000
+++ ipc/shm.c Fri Feb 25 14:56:39 2000
@@ -11,6 +11,7 @@
* HIGHMEM support, Ingo Molnar <mingo@redhat.com>
* avoid vmalloc and make shmmax, shmall, shmmni sysctl'able,
* Christoph Rohland <hans-christoph.rohland@sap.com>
+ * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
*/
#include <linux/config.h>
@@ -70,6 +71,13 @@
static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
#endif
+static void zshm_swap (int prio, int gfp_mask, zone_t *zone);
+static void zmap_unuse(swp_entry_t entry, struct page *page);
+static void shmzero_open(struct vm_area_struct *shmd);
+static void shmzero_close(struct vm_area_struct *shmd);
+static int zero_id;
+static struct kern_ipc_perm zero_perm;
+
size_t shm_ctlmax = SHMMAX;
int shm_ctlall = SHMALL;
int shm_ctlmni = SHMMNI;
@@ -103,6 +111,8 @@
#ifdef CONFIG_PROC_FS
create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL);
#endif
+ zero_id = ipc_addid(&shm_ids, &zero_perm, shm_ctlmni);
+ shm_unlock(zero_id);
return;
}
@@ -179,6 +189,26 @@
return 0;
}
+static inline struct shmid_kernel *newseg_alloc(int numpages)
+{
+ struct shmid_kernel *shp;
+
+ shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
+ if (!shp)
+ return 0;
+
+ shp->shm_dir = shm_alloc (numpages);
+ if (!shp->shm_dir) {
+ kfree(shp);
+ return 0;
+ }
+ shp->shm_npages = numpages;
+ shp->attaches = NULL;
+ shp->shm_nattch = 0;
+ init_MUTEX(&shp->sem);
+ return(shp);
+}
+
static int newseg (key_t key, int shmflg, size_t size)
{
struct shmid_kernel *shp;
@@ -193,15 +223,8 @@
if (shm_tot + numpages >= shm_ctlall)
return -ENOSPC;
- shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
- if (!shp)
- return -ENOMEM;
-
- shp->shm_dir = shm_alloc (numpages);
- if (!shp->shm_dir) {
- kfree(shp);
+ if (!(shp = newseg_alloc(numpages)))
return -ENOMEM;
- }
id = ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni);
if(id == -1) {
shm_free(shp->shm_dir,numpages);
@@ -212,13 +235,10 @@
shp->shm_perm.mode = (shmflg & S_IRWXUGO);
shp->shm_segsz = size;
shp->shm_cpid = current->pid;
- shp->attaches = NULL;
- shp->shm_lpid = shp->shm_nattch = 0;
+ shp->shm_lpid = 0;
shp->shm_atime = shp->shm_dtime = 0;
shp->shm_ctime = CURRENT_TIME;
- shp->shm_npages = numpages;
shp->id = shm_buildid(id,shp->shm_perm.seq);
- init_MUTEX(&shp->sem);
shm_tot += numpages;
shm_unlock(id);
@@ -255,6 +275,35 @@
return err;
}
+static void killseg_core(struct shmid_kernel *shp, int doacc)
+{
+ int i, numpages, rss, swp;
+
+ numpages = shp->shm_npages;
+ for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
+ pte_t pte;
+ pte = SHM_ENTRY (shp,i);
+ if (pte_none(pte))
+ continue;
+ if (pte_present(pte)) {
+ __free_page (pte_page(pte));
+ rss++;
+ } else {
+ swap_free(pte_to_swp_entry(pte));
+ swp++;
+ }
+ }
+ shm_free (shp->shm_dir, numpages);
+ kfree(shp);
+ if (doacc) {
+ shm_lockall();
+ shm_rss -= rss;
+ shm_swp -= swp;
+ shm_tot -= numpages;
+ shm_unlockall();
+ }
+}
+
/*
* Only called after testing nattch and SHM_DEST.
* Here pages, pgtable and shmid_kernel are freed.
@@ -262,8 +311,6 @@
static void killseg (int shmid)
{
struct shmid_kernel *shp;
- int i, numpages;
- int rss, swp;
down(&shm_ids.sem);
shp = shm_lock(shmid);
@@ -284,28 +331,8 @@
BUG();
shm_unlock(shmid);
up(&shm_ids.sem);
+ killseg_core(shp, 1);
- numpages = shp->shm_npages;
- for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
- pte_t pte;
- pte = SHM_ENTRY (shp,i);
- if (pte_none(pte))
- continue;
- if (pte_present(pte)) {
- __free_page (pte_page(pte));
- rss++;
- } else {
- swap_free(pte_to_swp_entry(pte));
- swp++;
- }
- }
- shm_free (shp->shm_dir, numpages);
- kfree(shp);
- shm_lockall();
- shm_rss -= rss;
- shm_swp -= swp;
- shm_tot -= numpages;
- shm_unlockall();
return;
}
@@ -835,10 +862,12 @@
struct shmid_kernel *shp;
unsigned int idx;
struct page * page;
+ int is_shmzero;
shp = (struct shmid_kernel *) shmd->vm_private_data;
idx = (address - shmd->vm_start) >> PAGE_SHIFT;
idx += shmd->vm_pgoff;
+ is_shmzero = (shp->id == zero_id);
/*
* A shared mapping past the last page of the file is an error
@@ -850,7 +879,7 @@
return NULL;
}
down(&shp->sem);
- if(shp != shm_lock(shp->id))
+ if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
BUG();
pte = SHM_ENTRY(shp,idx);
@@ -864,7 +893,7 @@
if (!page)
goto oom;
clear_highpage(page);
- if(shp != shm_lock(shp->id))
+ if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
BUG();
} else {
swp_entry_t entry = pte_to_swp_entry(pte);
@@ -882,11 +911,11 @@
delete_from_swap_cache(page);
page = replace_with_highmem(page);
swap_free(entry);
- if(shp != shm_lock(shp->id))
+ if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
BUG();
- shm_swp--;
+ if (is_shmzero) shm_swp--;
}
- shm_rss++;
+ if (is_shmzero) shm_rss++;
pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
SHM_ENTRY(shp, idx) = pte;
} else
@@ -904,6 +933,65 @@
return NOPAGE_OOM;
}
+#define OKAY 0
+#define RETRY 1
+#define FAILED 2
+
+static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage)
+{
+ pte_t page;
+ struct page *page_map;
+
+ page = SHM_ENTRY(shp, idx);
+ if (!pte_present(page))
+ return RETRY;
+ page_map = pte_page(page);
+ if (zone && (!memclass(page_map->zone, zone)))
+ return RETRY;
+ if (shp->id != zero_id) swap_attempts++;
+
+ if (--counter < 0) /* failed */
+ return FAILED;
+ if (page_count(page_map) != 1)
+ return RETRY;
+
+ if (!(page_map = prepare_highmem_swapout(page_map)))
+ return FAILED;
+ SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
+
+ /* add the locked page to the swap cache before allowing
+ the swapin path to run lookup_swap_cache(). This avoids
+ reading a not yet uptodate block from disk.
+ NOTE: we just accounted the swap space reference for this
+ swap cache page at __get_swap_page() time. */
+ add_to_swap_cache(*outpage = page_map, swap_entry);
+ return OKAY;
+}
+
+static void shm_swap_postop(struct page *page)
+{
+ lock_kernel();
+ rw_swap_page(WRITE, page, 0);
+ unlock_kernel();
+ __free_page(page);
+}
+
+static int shm_swap_preop(swp_entry_t *swap_entry)
+{
+ lock_kernel();
+ /* subtle: preload the swap count for the swap cache. We can't
+ increase the count inside the critical section as we can't release
+ the shm_lock there. And we can't acquire the big lock with the
+ shm_lock held (otherwise we would deadlock too easily). */
+ *swap_entry = __get_swap_page(2);
+ if (!(*swap_entry).val) {
+ unlock_kernel();
+ return 1;
+ }
+ unlock_kernel();
+ return 0;
+}
+
/*
* Goes through counter = (shm_rss >> prio) present shm pages.
*/
@@ -912,7 +1000,6 @@
int shm_swap (int prio, int gfp_mask, zone_t *zone)
{
- pte_t page;
struct shmid_kernel *shp;
swp_entry_t swap_entry;
unsigned long id, idx;
@@ -919,21 +1006,13 @@
int loop = 0;
int counter;
struct page * page_map;
-
+
+ zshm_swap(prio, gfp_mask, zone);
counter = shm_rss >> prio;
if (!counter)
return 0;
- lock_kernel();
- /* subtle: preload the swap count for the swap cache. We can't
- increase the count inside the critical section as we can't release
- the shm_lock there. And we can't acquire the big lock with the
- shm_lock held (otherwise we would deadlock too easily). */
- swap_entry = __get_swap_page(2);
- if (!swap_entry.val) {
- unlock_kernel();
+ if (shm_swap_preop(&swap_entry))
return 0;
- }
- unlock_kernel();
shm_lockall();
check_id:
@@ -943,8 +1022,12 @@
swap_idx = 0;
if (++swap_id > shm_ids.max_id) {
swap_id = 0;
- if (loop)
- goto failed;
+ if (loop) {
+failed:
+ shm_unlockall();
+ __swap_free(swap_entry, 2);
+ return 0;
+ }
loop = 1;
}
goto check_id;
@@ -956,43 +1039,16 @@
if (idx >= shp->shm_npages)
goto next_id;
- page = SHM_ENTRY(shp, idx);
- if (!pte_present(page))
- goto check_table;
- page_map = pte_page(page);
- if (zone && (!memclass(page_map->zone, zone)))
- goto check_table;
- swap_attempts++;
-
- if (--counter < 0) { /* failed */
-failed:
- shm_unlockall();
- __swap_free(swap_entry, 2);
- return 0;
+ switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
+ case RETRY: goto check_table;
+ case FAILED: goto failed;
}
- if (page_count(page_map) != 1)
- goto check_table;
-
- if (!(page_map = prepare_highmem_swapout(page_map)))
- goto failed;
- SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
swap_successes++;
shm_swp++;
shm_rss--;
-
- /* add the locked page to the swap cache before allowing
- the swapin path to run lookup_swap_cache(). This avoids
- reading a not yet uptodate block from disk.
- NOTE: we just accounted the swap space reference for this
- swap cache page at __get_swap_page() time. */
- add_to_swap_cache(page_map, swap_entry);
shm_unlockall();
- lock_kernel();
- rw_swap_page(WRITE, page_map, 0);
- unlock_kernel();
-
- __free_page(page_map);
+ shm_swap_postop(page_map);
return 1;
}
@@ -1014,12 +1070,29 @@
swap_free(entry);
}
+static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page)
+{
+ int n;
+
+ for (n = 0; n < shp->shm_npages; n++) {
+ if (pte_none(SHM_ENTRY(shp,n)))
+ continue;
+ if (pte_present(SHM_ENTRY(shp,n)))
+ continue;
+ if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
+ shm_unuse_page(shp, n, entry, page);
+ return 1;
+ }
+ }
+ return 0;
+}
+
/*
* unuse_shm() search for an eventually swapped out shm page.
*/
void shm_unuse(swp_entry_t entry, struct page *page)
{
- int i, n;
+ int i;
shm_lockall();
for (i = 0; i <= shm_ids.max_id; i++) {
@@ -1026,19 +1099,12 @@
struct shmid_kernel *shp = shm_get(i);
if(shp==NULL)
continue;
- for (n = 0; n < shp->shm_npages; n++) {
- if (pte_none(SHM_ENTRY(shp,n)))
- continue;
- if (pte_present(SHM_ENTRY(shp,n)))
- continue;
- if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
- shm_unuse_page(shp, n, entry, page);
- goto out;
- }
- }
+ if (shm_unuse_core(shp, entry, page))
+ goto out;
}
out:
shm_unlockall();
+ zmap_unuse(entry, page);
}
#ifdef CONFIG_PROC_FS
@@ -1100,3 +1166,138 @@
return len;
}
#endif
+
+static struct shmid_kernel *zmap_list = 0;
+static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED;
+static unsigned long zswap_idx = 0; /* next to swap */
+static struct shmid_kernel *zswap_shp = 0;
+
+static struct vm_operations_struct shmzero_vm_ops = {
+ open: shmzero_open,
+ close: shmzero_close,
+ nopage: shm_nopage,
+ swapout: shm_swapout,
+};
+
+int map_zero_setup(struct vm_area_struct *vma)
+{
+ struct shmid_kernel *shp;
+
+ if (!(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE)))
+ return -ENOMEM;
+ shp->id = zero_id; /* hack for shm_lock et al */
+ vma->vm_private_data = shp;
+ vma->vm_ops = &shmzero_vm_ops;
+ shmzero_open(vma);
+ spin_lock(&zmap_list_lock);
+ shp->attaches = (struct vm_area_struct *)zmap_list;
+ zmap_list = shp;
+ spin_unlock(&zmap_list_lock);
+ return 0;
+}
+
+static void shmzero_open(struct vm_area_struct *shmd)
+{
+ struct shmid_kernel *shp;
+
+ shp = (struct shmid_kernel *) shmd->vm_private_data;
+ down(&shp->sem);
+ shp->shm_nattch++;
+ up(&shp->sem);
+}
+
+static void shmzero_close(struct vm_area_struct *shmd)
+{
+ int done = 0;
+ struct shmid_kernel *shp, *prev, *cur;
+
+ shp = (struct shmid_kernel *) shmd->vm_private_data;
+ down(&shp->sem);
+ if (--shp->shm_nattch == 0)
+ done = 1;
+ up(&shp->sem);
+ if (done) {
+ spin_lock(&zmap_list_lock);
+ if (shp == zswap_shp)
+ zswap_shp = (struct shmid_kernel *)(shp->attaches);
+ if (shp == zmap_list)
+ zmap_list = (struct shmid_kernel *)(shp->attaches);
+ else {
+ prev = zmap_list;
+ cur = (struct shmid_kernel *)(prev->attaches);
+ while (cur != shp) {
+ prev = cur;
+ cur = (struct shmid_kernel *)(prev->attaches);
+ }
+ prev->attaches = (struct vm_area_struct *)(shp->attaches);
+ }
+ spin_unlock(&zmap_list_lock);
+ killseg_core(shp, 0);
+ }
+}
+
+static void zmap_unuse(swp_entry_t entry, struct page *page)
+{
+ struct shmid_kernel *shp;
+
+ spin_lock(&zmap_list_lock);
+ shp = zmap_list;
+ while (shp) {
+ if (shm_unuse_core(shp, entry, page))
+ break;
+ shp = (struct shmid_kernel *)shp->attaches;
+ }
+ spin_unlock(&zmap_list_lock);
+}
+
+static void zshm_swap (int prio, int gfp_mask, zone_t *zone)
+{
+ struct shmid_kernel *shp;
+ swp_entry_t swap_entry;
+ unsigned long idx;
+ int loop = 0;
+ int counter;
+ struct page * page_map;
+
+ counter = 10; /* maybe we should use zshm_rss */
+ if (!counter)
+ return;
+next:
+ if (shm_swap_preop(&swap_entry))
+ return;
+
+ spin_lock(&zmap_list_lock);
+ if (zmap_list == 0)
+ goto failed;
+next_id:
+ if ((shp = zswap_shp) == 0) {
+ if (loop) {
+failed:
+ spin_unlock(&zmap_list_lock);
+ __swap_free(swap_entry, 2);
+ return;
+ }
+ zswap_shp = shp = zmap_list;
+ zswap_idx = 0;
+ loop = 1;
+ }
+
+check_table:
+ idx = zswap_idx++;
+ if (idx >= shp->shm_npages) {
+ zswap_shp = (struct shmid_kernel *)(zswap_shp->attaches);
+ zswap_idx = 0;
+ goto next_id;
+ }
+
+ switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
+ case RETRY: goto check_table;
+ case FAILED: goto failed;
+ }
+ spin_unlock(&zmap_list_lock);
+
+ shm_swap_postop(page_map);
+ if (counter)
+ goto next;
+ return;
+}
--- ipc/util.c Fri Feb 25 13:04:14 2000
+++ ipc/util.c Fri Feb 25 14:54:39 2000
@@ -317,4 +317,9 @@
{
}
+int map_zero_setup(struct vm_area_struct *vma)
+{
+ return -EINVAL;
+}
+
#endif /* CONFIG_SYSVIPC */
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/
^ permalink raw reply [flat|nested] 19+ messages in thread

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-02-25 23:08 [RFC] [RFT] Shared /dev/zero mmaping feature Kanoj Sarcar
@ 2000-02-26 16:38 ` Linus Torvalds
  2000-02-26 21:47 ` Kanoj Sarcar
  2000-02-29 10:54 ` Christoph Rohland
  1 sibling, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2000-02-26 16:38 UTC (permalink / raw)
To: Kanoj Sarcar; +Cc: linux-mm, linux-kernel

On Fri, 25 Feb 2000, Kanoj Sarcar wrote:
>
> This is a patch against 2.3.47 that tries to implement shared /dev/zero
> mappings. This is just a first cut attempt, I am hoping I will find a
> few people to apply the patch and throw some real life programs at it
> (preferably on low memory machines so that swapping is induced).

I'd like to see the shm code use the page cache exclusively to track
pages, and this is going to mess with that, I think. We'll need an inode
(or at least a "struct address_space") to act as an "anchor" for the
pages, and it mustn't be the "/dev/zero" inode.

Before that is done I'd be nervous about letting external things call
into the shm subsystem, but maybe this is ok as-is (the only thing you
really export is "map_zero_setup()", and I guess it can just be updated
as required..)

However, whether that will be changed or not, the following looks really
wrong:

>	if (vma->vm_flags & VM_SHARED)
>		if (ret = map_zero_setup(vma))
>			return ret;
>	if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
>		return -EAGAIN;

I do NOT believe that it can ever be right to do a "zeromap_page_range()"
on a shared memory area. I'll tell you why:

	mmap(/dev/zero, MAP_SHARED)
	child = fork();
	if (child) {
		..modify mapping..
	} else {
		..wait for modifications to show up..
	}

the "zeromap_page_range()" thing mapped the area with private NULL
pages. The fork() copied the page tables. The modification of the
private NULL page causes the child to get a real shared page, but the
mother will never see that shared page if it only reads from the mapping
because the mother will still see the private one.

So I suspect at the very least it would have to be

	if (vma->vm_flags & VM_SHARED)
		return map_zero_setup(vma);

and let the whole area be populated with shared pages by "nopage".

Comments?

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply [flat|nested] 19+ messages in thread
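
In other words, the shared and private cases should be kept completely
separate: a shared mapping goes through the shm machinery and gets its
pages lazily from the nopage handler, while only private mappings keep
the zeromap_page_range() pre-population. A sketch of mmap_zero()
restructured along those lines (the revised patch later in this thread
does essentially this):

	static int mmap_zero(struct file * file, struct vm_area_struct * vma)
	{
		extern int map_zero_setup(struct vm_area_struct *);

		if (vma->vm_flags & VM_SHARED)
			return map_zero_setup(vma);	/* pages come in via nopage */
		if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start,
				       vma->vm_page_prot))
			return -EAGAIN;
		return 0;
	}
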
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-02-26 16:38 ` Linus Torvalds @ 2000-02-26 21:47 ` Kanoj Sarcar 0 siblings, 0 replies; 19+ messages in thread From: Kanoj Sarcar @ 2000-02-26 21:47 UTC (permalink / raw) To: Linus Torvalds; +Cc: linux-mm, linux-kernel > > I'd like to see the shm code use the page cache exclusively to track > pages, and this is going to mess with that, I think. We'll need an inode > (or at least a "struct address_space") to act as a "anchor" for the pages, > and it mustn't be the "/dev/zero" inode. > > Before that is done I'd be nervous about letting external things call into > the shm subsystem, but maybe this is ok as-is (the only thing you really > export is "map_zero_setup()", and I guess it can just be updated as > required..) > Yes, as you point out, we can put in this code now. Whatever changes are made to the underlying routines that manage shm pages, should also be able to handle /dev/zero pages with minor tweaks. The chances that the /dev/zero code will be a big hindrance to the page cache effort are minimal ... > However, whether that will be changed or not, the following looks really > wrong: > > > if (vma->vm_flags & VM_SHARED) > > if (ret = map_zero_setup(vma)) > > return ret; > > if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) > > return -EAGAIN; Yes, you are right. I have made the change in the attached patch. I also added in some checks so that users can not shmctl/shmat to the shmid being used for /dev/zero. Does this look ready for 2.3.48 then? Kanoj --- drivers/char/mem.c Fri Feb 25 13:03:46 2000 +++ drivers/char/mem.c Sat Feb 26 11:43:35 2000 @@ -5,6 +5,7 @@ * * Added devfs support. * Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu> + * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com> */ #include <linux/config.h> @@ -433,8 +434,10 @@ static int mmap_zero(struct file * file, struct vm_area_struct * vma) { + extern int map_zero_setup(struct vm_area_struct *); + if (vma->vm_flags & VM_SHARED) - return -EINVAL; + return(map_zero_setup(vma)); if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) return -EAGAIN; return 0; --- ipc/shm.c Fri Feb 25 13:04:14 2000 +++ ipc/shm.c Sat Feb 26 13:10:21 2000 @@ -11,6 +11,7 @@ * HIGHMEM support, Ingo Molnar <mingo@redhat.com> * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able, * Christoph Rohland <hans-christoph.rohland@sap.com> + * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> */ #include <linux/config.h> @@ -70,6 +71,13 @@ static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data); #endif +static void zshm_swap (int prio, int gfp_mask, zone_t *zone); +static void zmap_unuse(swp_entry_t entry, struct page *page); +static void shmzero_open(struct vm_area_struct *shmd); +static void shmzero_close(struct vm_area_struct *shmd); +static int zero_id; +static struct shmid_kernel zshmid_kernel; + size_t shm_ctlmax = SHMMAX; int shm_ctlall = SHMALL; int shm_ctlmni = SHMMNI; @@ -103,6 +111,8 @@ #ifdef CONFIG_PROC_FS create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL); #endif + zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, shm_ctlmni); + shm_unlock(zero_id); return; } @@ -179,6 +189,26 @@ return 0; } +static inline struct shmid_kernel *newseg_alloc(int numpages) +{ + struct shmid_kernel *shp; + + shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL); + if (!shp) + return 0; + + shp->shm_dir = 
shm_alloc (numpages); + if (!shp->shm_dir) { + kfree(shp); + return 0; + } + shp->shm_npages = numpages; + shp->attaches = NULL; + shp->shm_nattch = 0; + init_MUTEX(&shp->sem); + return(shp); +} + static int newseg (key_t key, int shmflg, size_t size) { struct shmid_kernel *shp; @@ -193,15 +223,8 @@ if (shm_tot + numpages >= shm_ctlall) return -ENOSPC; - shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL); - if (!shp) - return -ENOMEM; - - shp->shm_dir = shm_alloc (numpages); - if (!shp->shm_dir) { - kfree(shp); + if (!(shp = newseg_alloc(numpages))) return -ENOMEM; - } id = ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni); if(id == -1) { shm_free(shp->shm_dir,numpages); @@ -212,13 +235,10 @@ shp->shm_perm.mode = (shmflg & S_IRWXUGO); shp->shm_segsz = size; shp->shm_cpid = current->pid; - shp->attaches = NULL; - shp->shm_lpid = shp->shm_nattch = 0; + shp->shm_lpid = 0; shp->shm_atime = shp->shm_dtime = 0; shp->shm_ctime = CURRENT_TIME; - shp->shm_npages = numpages; shp->id = shm_buildid(id,shp->shm_perm.seq); - init_MUTEX(&shp->sem); shm_tot += numpages; shm_unlock(id); @@ -255,6 +275,35 @@ return err; } +static void killseg_core(struct shmid_kernel *shp, int doacc) +{ + int i, numpages, rss, swp; + + numpages = shp->shm_npages; + for (i = 0, rss = 0, swp = 0; i < numpages ; i++) { + pte_t pte; + pte = SHM_ENTRY (shp,i); + if (pte_none(pte)) + continue; + if (pte_present(pte)) { + __free_page (pte_page(pte)); + rss++; + } else { + swap_free(pte_to_swp_entry(pte)); + swp++; + } + } + shm_free (shp->shm_dir, numpages); + kfree(shp); + if (doacc) { + shm_lockall(); + shm_rss -= rss; + shm_swp -= swp; + shm_tot -= numpages; + shm_unlockall(); + } +} + /* * Only called after testing nattch and SHM_DEST. * Here pages, pgtable and shmid_kernel are freed. 
@@ -262,8 +311,6 @@ static void killseg (int shmid) { struct shmid_kernel *shp; - int i, numpages; - int rss, swp; down(&shm_ids.sem); shp = shm_lock(shmid); @@ -284,28 +331,8 @@ BUG(); shm_unlock(shmid); up(&shm_ids.sem); + killseg_core(shp, 1); - numpages = shp->shm_npages; - for (i = 0, rss = 0, swp = 0; i < numpages ; i++) { - pte_t pte; - pte = SHM_ENTRY (shp,i); - if (pte_none(pte)) - continue; - if (pte_present(pte)) { - __free_page (pte_page(pte)); - rss++; - } else { - swap_free(pte_to_swp_entry(pte)); - swp++; - } - } - shm_free (shp->shm_dir, numpages); - kfree(shp); - shm_lockall(); - shm_rss -= rss; - shm_swp -= swp; - shm_tot -= numpages; - shm_unlockall(); return; } @@ -458,6 +485,10 @@ shp = shm_lock(shmid); if(shp==NULL) return -EINVAL; + if (shp == &zshmid_kernel) { + shm_unlock(shmid); + return -EINVAL; + } if(cmd==SHM_STAT) { err = -EINVAL; if (shmid > shm_ids.max_id) @@ -498,6 +529,10 @@ shp = shm_lock(shmid); if(shp==NULL) return -EINVAL; + if (shp == &zshmid_kernel) { + shm_unlock(shmid); + return -EINVAL; + } err=-EIDRM; if(shm_checkid(shp,shmid)) goto out_unlock; @@ -532,6 +567,8 @@ err=-EINVAL; if(shp==NULL) goto out_up; + if (shp == &zshmid_kernel) + goto out_unlock_up; err=-EIDRM; if(shm_checkid(shp,shmid)) goto out_unlock_up; @@ -653,6 +690,8 @@ shp = shm_lock(shmid); if (!shp) goto out_up; + if (shp == &zshmid_kernel) + goto out_unlock_up; err = -EACCES; if (ipcperms(&shp->shm_perm, flg)) @@ -835,10 +874,12 @@ struct shmid_kernel *shp; unsigned int idx; struct page * page; + int is_shmzero; shp = (struct shmid_kernel *) shmd->vm_private_data; idx = (address - shmd->vm_start) >> PAGE_SHIFT; idx += shmd->vm_pgoff; + is_shmzero = (shp->id == zero_id); /* * A shared mapping past the last page of the file is an error @@ -850,7 +891,7 @@ return NULL; } down(&shp->sem); - if(shp != shm_lock(shp->id)) + if ((shp != shm_lock(shp->id)) && (is_shmzero == 0)) BUG(); pte = SHM_ENTRY(shp,idx); @@ -864,7 +905,7 @@ if (!page) goto oom; clear_highpage(page); - if(shp != shm_lock(shp->id)) + if ((shp != shm_lock(shp->id)) && (is_shmzero == 0)) BUG(); } else { swp_entry_t entry = pte_to_swp_entry(pte); @@ -882,11 +923,11 @@ delete_from_swap_cache(page); page = replace_with_highmem(page); swap_free(entry); - if(shp != shm_lock(shp->id)) + if ((shp != shm_lock(shp->id)) && (is_shmzero == 0)) BUG(); - shm_swp--; + if (is_shmzero) shm_swp--; } - shm_rss++; + if (is_shmzero) shm_rss++; pte = pte_mkdirty(mk_pte(page, PAGE_SHARED)); SHM_ENTRY(shp, idx) = pte; } else @@ -904,6 +945,65 @@ return NOPAGE_OOM; } +#define OKAY 0 +#define RETRY 1 +#define FAILED 2 + +static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage) +{ + pte_t page; + struct page *page_map; + + page = SHM_ENTRY(shp, idx); + if (!pte_present(page)) + return RETRY; + page_map = pte_page(page); + if (zone && (!memclass(page_map->zone, zone))) + return RETRY; + if (shp->id != zero_id) swap_attempts++; + + if (--counter < 0) /* failed */ + return FAILED; + if (page_count(page_map) != 1) + return RETRY; + + if (!(page_map = prepare_highmem_swapout(page_map))) + return FAILED; + SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry); + + /* add the locked page to the swap cache before allowing + the swapin path to run lookup_swap_cache(). This avoids + reading a not yet uptodate block from disk. + NOTE: we just accounted the swap space reference for this + swap cache page at __get_swap_page() time. 
*/ + add_to_swap_cache(*outpage = page_map, swap_entry); + return OKAY; +} + +static void shm_swap_postop(struct page *page) +{ + lock_kernel(); + rw_swap_page(WRITE, page, 0); + unlock_kernel(); + __free_page(page); +} + +static int shm_swap_preop(swp_entry_t *swap_entry) +{ + lock_kernel(); + /* subtle: preload the swap count for the swap cache. We can't + increase the count inside the critical section as we can't release + the shm_lock there. And we can't acquire the big lock with the + shm_lock held (otherwise we would deadlock too easily). */ + *swap_entry = __get_swap_page(2); + if (!(*swap_entry).val) { + unlock_kernel(); + return 1; + } + unlock_kernel(); + return 0; +} + /* * Goes through counter = (shm_rss >> prio) present shm pages. */ @@ -912,7 +1012,6 @@ int shm_swap (int prio, int gfp_mask, zone_t *zone) { - pte_t page; struct shmid_kernel *shp; swp_entry_t swap_entry; unsigned long id, idx; @@ -919,21 +1018,13 @@ int loop = 0; int counter; struct page * page_map; - + + zshm_swap(prio, gfp_mask, zone); counter = shm_rss >> prio; if (!counter) return 0; - lock_kernel(); - /* subtle: preload the swap count for the swap cache. We can't - increase the count inside the critical section as we can't release - the shm_lock there. And we can't acquire the big lock with the - shm_lock held (otherwise we would deadlock too easily). */ - swap_entry = __get_swap_page(2); - if (!swap_entry.val) { - unlock_kernel(); + if (shm_swap_preop(&swap_entry)) return 0; - } - unlock_kernel(); shm_lockall(); check_id: @@ -943,8 +1034,12 @@ swap_idx = 0; if (++swap_id > shm_ids.max_id) { swap_id = 0; - if (loop) - goto failed; + if (loop) { +failed: + shm_unlockall(); + __swap_free(swap_entry, 2); + return 0; + } loop = 1; } goto check_id; @@ -956,43 +1051,16 @@ if (idx >= shp->shm_npages) goto next_id; - page = SHM_ENTRY(shp, idx); - if (!pte_present(page)) - goto check_table; - page_map = pte_page(page); - if (zone && (!memclass(page_map->zone, zone))) - goto check_table; - swap_attempts++; - - if (--counter < 0) { /* failed */ -failed: - shm_unlockall(); - __swap_free(swap_entry, 2); - return 0; + switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { + case RETRY: goto check_table; + case FAILED: goto failed; } - if (page_count(page_map) != 1) - goto check_table; - - if (!(page_map = prepare_highmem_swapout(page_map))) - goto failed; - SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry); swap_successes++; shm_swp++; shm_rss--; - - /* add the locked page to the swap cache before allowing - the swapin path to run lookup_swap_cache(). This avoids - reading a not yet uptodate block from disk. - NOTE: we just accounted the swap space reference for this - swap cache page at __get_swap_page() time. */ - add_to_swap_cache(page_map, swap_entry); shm_unlockall(); - lock_kernel(); - rw_swap_page(WRITE, page_map, 0); - unlock_kernel(); - - __free_page(page_map); + shm_swap_postop(page_map); return 1; } @@ -1014,12 +1082,29 @@ swap_free(entry); } +static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page) +{ + int n; + + for (n = 0; n < shp->shm_npages; n++) { + if (pte_none(SHM_ENTRY(shp,n))) + continue; + if (pte_present(SHM_ENTRY(shp,n))) + continue; + if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) { + shm_unuse_page(shp, n, entry, page); + return 1; + } + } + return 0; +} + /* * unuse_shm() search for an eventually swapped out shm page. 
*/ void shm_unuse(swp_entry_t entry, struct page *page) { - int i, n; + int i; shm_lockall(); for (i = 0; i <= shm_ids.max_id; i++) { @@ -1026,19 +1111,12 @@ struct shmid_kernel *shp = shm_get(i); if(shp==NULL) continue; - for (n = 0; n < shp->shm_npages; n++) { - if (pte_none(SHM_ENTRY(shp,n))) - continue; - if (pte_present(SHM_ENTRY(shp,n))) - continue; - if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) { - shm_unuse_page(shp, n, entry, page); - goto out; - } - } + if (shm_unuse_core(shp, entry, page)) + goto out; } out: shm_unlockall(); + zmap_unuse(entry, page); } #ifdef CONFIG_PROC_FS @@ -1053,6 +1131,10 @@ for(i = 0; i <= shm_ids.max_id; i++) { struct shmid_kernel* shp = shm_lock(i); + if (shp == &zshmid_kernel) { + shm_unlock(i); + continue; + } if(shp!=NULL) { #define SMALL_STRING "%10d %10d %4o %10u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n" #define BIG_STRING "%10d %10d %4o %21u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n" @@ -1100,3 +1182,138 @@ return len; } #endif + +static struct shmid_kernel *zmap_list = 0; +static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED; +static unsigned long zswap_idx = 0; /* next to swap */ +static struct shmid_kernel *zswap_shp = 0; + +static struct vm_operations_struct shmzero_vm_ops = { + open: shmzero_open, + close: shmzero_close, + nopage: shm_nopage, + swapout: shm_swapout, +}; + +int map_zero_setup(struct vm_area_struct *vma) +{ + struct shmid_kernel *shp; + + if (!(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE))) + return -ENOMEM; + shp->id = zero_id; /* hack for shm_lock et al */ + vma->vm_private_data = shp; + vma->vm_ops = &shmzero_vm_ops; + shmzero_open(vma); + spin_lock(&zmap_list_lock); + shp->attaches = (struct vm_area_struct *)zmap_list; + zmap_list = shp; + spin_unlock(&zmap_list_lock); + return 0; +} + +static void shmzero_open(struct vm_area_struct *shmd) +{ + struct shmid_kernel *shp; + + shp = (struct shmid_kernel *) shmd->vm_private_data; + down(&shp->sem); + shp->shm_nattch++; + up(&shp->sem); +} + +static void shmzero_close(struct vm_area_struct *shmd) +{ + int done = 0; + struct shmid_kernel *shp, *prev, *cur; + + shp = (struct shmid_kernel *) shmd->vm_private_data; + down(&shp->sem); + if (--shp->shm_nattch == 0) + done = 1; + up(&shp->sem); + if (done) { + spin_lock(&zmap_list_lock); + if (shp == zswap_shp) + zswap_shp = (struct shmid_kernel *)(shp->attaches); + if (shp == zmap_list) + zmap_list = (struct shmid_kernel *)(shp->attaches); + else { + prev = zmap_list; + cur = (struct shmid_kernel *)(prev->attaches); + while (cur != shp) { + prev = cur; + cur = (struct shmid_kernel *)(prev->attaches); + } + prev->attaches = (struct vm_area_struct *)(shp->attaches); + } + spin_unlock(&zmap_list_lock); + killseg_core(shp, 0); + } +} + +static void zmap_unuse(swp_entry_t entry, struct page *page) +{ + struct shmid_kernel *shp; + + spin_lock(&zmap_list_lock); + shp = zmap_list; + while (shp) { + if (shm_unuse_core(shp, entry, page)) + break; + shp = (struct shmid_kernel *)shp->attaches; + } + spin_unlock(&zmap_list_lock); +} + +static void zshm_swap (int prio, int gfp_mask, zone_t *zone) +{ + struct shmid_kernel *shp; + swp_entry_t swap_entry; + unsigned long idx; + int loop = 0; + int counter; + struct page * page_map; + + counter = 10; /* maybe we should use zshm_rss */ + if (!counter) + return; +next: + if (shm_swap_preop(&swap_entry)) + return; + + spin_lock(&zmap_list_lock); + if (zmap_list == 0) + goto failed; +next_id: + if ((shp = zswap_shp) == 0) { + if (loop) { +failed: + 
spin_unlock(&zmap_list_lock); + __swap_free(swap_entry, 2); + return; + } + zswap_shp = shp = zmap_list; + zswap_idx = 0; + loop = 1; + } + +check_table: + idx = zswap_idx++; + if (idx >= shp->shm_npages) { + zswap_shp = (struct shmid_kernel *)(zswap_shp->attaches); + zswap_idx = 0; + goto next_id; + } + + switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { + case RETRY: goto check_table; + case FAILED: goto failed; + } + spin_unlock(&zmap_list_lock); + + shm_swap_postop(page_map); + if (counter) + goto next; + return; +} --- ipc/util.c Fri Feb 25 13:04:14 2000 +++ ipc/util.c Fri Feb 25 14:54:39 2000 @@ -317,4 +317,9 @@ { } +int map_zero_setup(struct vm_area_struct *vma) +{ + return -EINVAL; +} + #endif /* CONFIG_SYSVIPC */ -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-02-25 23:08 [RFC] [RFT] Shared /dev/zero mmaping feature Kanoj Sarcar
  2000-02-26 16:38 ` Linus Torvalds
@ 2000-02-29 10:54 ` Christoph Rohland
  2000-02-29 18:30 ` Kanoj Sarcar
  1 sibling, 1 reply; 19+ messages in thread
From: Christoph Rohland @ 2000-02-29 10:54 UTC (permalink / raw)
To: Kanoj Sarcar; +Cc: linux-mm, linux-kernel, torvalds

Hi Kanoj,

kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:

> This is a patch against 2.3.47 that tries to implement shared /dev/zero
> mappings. This is just a first cut attempt, I am hoping I will find a
> few people to apply the patch and throw some real life programs at it
> (preferably on low memory machines so that swapping is induced).
>
> Currently, you will also need to turn on CONFIG_SYSVIPC, but most of
> the shm.c code can be split into a new ipc/shm_core.c file that is
> always compiled in, irrespective of CONFIG_SYSVIPC. Linus, do you
> think this is the proper direction to follow?
>
> Thanks. Comments and feedback welcome ...

Why do you use this special zero_id stuff? It clutters up the whole
code.

If you would simply open a normal shm segment with key IPC_PRIVATE and
directly remove it, nobody can attach to it and it will be released on
exit and everything. No special handling needed any more. BTW that's
exactly what we do in user space to circumvent the missing MAP_ANON |
MAP_SHARED.

I would also prefer to be able to see the allocated segments with the
ipc* commands.

Greetings
		Christoph

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply [flat|nested] 19+ messages in thread
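
The user-space trick Christoph is referring to is roughly the following
(a sketch using the standard SysV shm calls, not taken from his code):
create an IPC_PRIVATE segment, attach it, and immediately mark it for
removal, so nobody else can ever attach it and the kernel tears it down
once the last attach (including attaches inherited across fork()) goes
away.

	#include <stddef.h>
	#include <sys/ipc.h>
	#include <sys/shm.h>

	/* Poor man's MAP_ANON | MAP_SHARED on top of SysV shm. */
	static void *shared_anon(size_t size)
	{
		void *p;
		int id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0600);

		if (id < 0)
			return NULL;
		p = shmat(id, NULL, 0);
		/* Mark for deletion right away: the pages stay around until
		   the last attach is gone, and the id is never published. */
		shmctl(id, IPC_RMID, NULL);
		return (p == (void *) -1) ? NULL : p;
	}
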
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-02-29 10:54 ` Christoph Rohland
@ 2000-02-29 18:30 ` Kanoj Sarcar
  2000-03-01 12:08 ` Christoph Rohland
  0 siblings, 1 reply; 19+ messages in thread
From: Kanoj Sarcar @ 2000-02-29 18:30 UTC (permalink / raw)
To: Christoph Rohland; +Cc: linux-mm, linux-kernel, torvalds, Kanoj Sarcar

> Why do you use this special zero_id stuff? It clutters up the whole
> code.

The zero_id stuff is required to differentiate between the struct
shmid_kernel of a "real" shared memory segment and one that represents
a /dev/zero mapping. This is used in shm_swap_core(), for accounting
purposes, but that can be changed by adding a new flag to
shm_swap_core. The more important use is in shm_nopage().

>
> If you would simply open a normal shm segment with key IPC_PRIVATE and
> directly remove it, nobody can attach to it and it will be released on
> exit and everything. No special handling needed any more. BTW that's
> exactly what we do in user space to circumvent the missing MAP_ANON |
> MAP_SHARED.

Would this need to be done for each /dev/zero mapping? If so, then I
prefer the way the code is right now, since the interference with
"real" shared memory is minimal. I was also trying to look for a way to
prevent the zshmid_kernel hacks in shmat/shmctl (including setting
strict permissions), but couldn't come up with one ... Tell me in more
detail what you are suggesting here.

>
> I would also prefer to be able to see the allocated segments with the
> ipc* commands.
>

I do not believe there is any good reason to expose the special shared
memory segment used as a place holder for all /dev/zero mappings to
users via ipc* commands. This special segment exists only because we
want to reduce kernel code duplication, and all the zshmid_kernel/
zero_id checks just make sure that regular shared memory works pretty
much the way it did before. (One thing I am unhappy about is that this
special segment eats up a shm id, but that's probably not too bad).

Thanks.

Kanoj

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-02-29 18:30 ` Kanoj Sarcar @ 2000-03-01 12:08 ` Christoph Rohland 2000-03-01 17:34 ` Kanoj Sarcar 0 siblings, 1 reply; 19+ messages in thread From: Christoph Rohland @ 2000-03-01 12:08 UTC (permalink / raw) To: Kanoj Sarcar, Linus Torvalds; +Cc: linux-mm, linux-kernel [-- Attachment #1: Type: text/plain, Size: 840 bytes --] Hi Kanoj and Linus, kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > [snip] > > I do not believe there is any good reason to expose the special > shared memory segment used as a place holder for all /dev/zero > mappings to users via ipc* commands. This special segment exists > only because we want to reduce kernel code duplication, and all the > zshmid_kernel/ zero_id checks just make sure that regular shared > memory works pretty much the way it did before. (One thing I am > unhappy about is that this special segment eats up a shm id, but > that's probably not too bad). The appended proposal reduces code duplication and complexity a lot. (The diff47 needs your patches against other files added.) I would vote to apply diff48 to the standard kernel. For me the whole solution is still a workaround. Greetings Christoph [-- Attachment #2: patch shm.c against 2.3.47 --] [-- Type: text/plain, Size: 951 bytes --] --- 2.3.47/ipc/shm.c Wed Mar 1 10:33:26 2000 +++ make48/ipc/shm.c Wed Mar 1 12:54:49 2000 @@ -11,6 +11,7 @@ * HIGHMEM support, Ingo Molnar <mingo@redhat.com> * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able, * Christoph Rohland <hans-christoph.rohland@sap.com> + * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> */ #include <linux/config.h> @@ -1100,3 +1101,24 @@ return len; } #endif + +int map_zero_setup(struct vm_area_struct *vma) +{ + int shmid; + struct shmid_kernel *shp; + + down(&shm_ids.sem); + shmid = newseg(IPC_PRIVATE, S_IRWXUGO, vma->vm_end - vma->vm_start); + up(&shm_ids.sem); + if (shmid < 0) + return shmid; + + if (!(shp = shm_lock (shmid))) + BUG(); + shp->shm_perm.mode |= SHM_DEST; + shm_unlock (shmid); + vma->vm_private_data = shp; + vma->vm_ops = &shm_vm_ops; + shm_open (vma); + return 0; +} [-- Attachment #3: patch shm.c against 2.3.48 --] [-- Type: text/plain, Size: 14877 bytes --] --- make48/ipc/shm.c.48 Wed Mar 1 10:34:39 2000 +++ make48/ipc/shm.c Wed Mar 1 12:54:49 2000 @@ -71,13 +71,6 @@ static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data); #endif -static void zshm_swap (int prio, int gfp_mask, zone_t *zone); -static void zmap_unuse(swp_entry_t entry, struct page *page); -static void shmzero_open(struct vm_area_struct *shmd); -static void shmzero_close(struct vm_area_struct *shmd); -static int zero_id; -static struct shmid_kernel zshmid_kernel; - size_t shm_ctlmax = SHMMAX; int shm_ctlall = SHMALL; int shm_ctlmni = SHMMNI; @@ -111,8 +104,6 @@ #ifdef CONFIG_PROC_FS create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL); #endif - zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, shm_ctlmni); - shm_unlock(zero_id); return; } @@ -189,26 +180,6 @@ return 0; } -static inline struct shmid_kernel *newseg_alloc(int numpages) -{ - struct shmid_kernel *shp; - - shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL); - if (!shp) - return 0; - - shp->shm_dir = shm_alloc (numpages); - if (!shp->shm_dir) { - kfree(shp); - return 0; - } - shp->shm_npages = numpages; - shp->attaches = NULL; - shp->shm_nattch = 0; - init_MUTEX(&shp->sem); - return(shp); -} - static int newseg (key_t key, int shmflg, 
size_t size) { struct shmid_kernel *shp; @@ -223,8 +194,15 @@ if (shm_tot + numpages >= shm_ctlall) return -ENOSPC; - if (!(shp = newseg_alloc(numpages))) + shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL); + if (!shp) + return -ENOMEM; + + shp->shm_dir = shm_alloc (numpages); + if (!shp->shm_dir) { + kfree(shp); return -ENOMEM; + } id = ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni); if(id == -1) { shm_free(shp->shm_dir,numpages); @@ -235,10 +213,13 @@ shp->shm_perm.mode = (shmflg & S_IRWXUGO); shp->shm_segsz = size; shp->shm_cpid = current->pid; - shp->shm_lpid = 0; + shp->attaches = NULL; + shp->shm_lpid = shp->shm_nattch = 0; shp->shm_atime = shp->shm_dtime = 0; shp->shm_ctime = CURRENT_TIME; + shp->shm_npages = numpages; shp->id = shm_buildid(id,shp->shm_perm.seq); + init_MUTEX(&shp->sem); shm_tot += numpages; shm_unlock(id); @@ -275,35 +256,6 @@ return err; } -static void killseg_core(struct shmid_kernel *shp, int doacc) -{ - int i, numpages, rss, swp; - - numpages = shp->shm_npages; - for (i = 0, rss = 0, swp = 0; i < numpages ; i++) { - pte_t pte; - pte = SHM_ENTRY (shp,i); - if (pte_none(pte)) - continue; - if (pte_present(pte)) { - __free_page (pte_page(pte)); - rss++; - } else { - swap_free(pte_to_swp_entry(pte)); - swp++; - } - } - shm_free (shp->shm_dir, numpages); - kfree(shp); - if (doacc) { - shm_lockall(); - shm_rss -= rss; - shm_swp -= swp; - shm_tot -= numpages; - shm_unlockall(); - } -} - /* * Only called after testing nattch and SHM_DEST. * Here pages, pgtable and shmid_kernel are freed. @@ -311,6 +263,8 @@ static void killseg (int shmid) { struct shmid_kernel *shp; + int i, numpages; + int rss, swp; down(&shm_ids.sem); shp = shm_lock(shmid); @@ -331,8 +285,28 @@ BUG(); shm_unlock(shmid); up(&shm_ids.sem); - killseg_core(shp, 1); + numpages = shp->shm_npages; + for (i = 0, rss = 0, swp = 0; i < numpages ; i++) { + pte_t pte; + pte = SHM_ENTRY (shp,i); + if (pte_none(pte)) + continue; + if (pte_present(pte)) { + __free_page (pte_page(pte)); + rss++; + } else { + swap_free(pte_to_swp_entry(pte)); + swp++; + } + } + shm_free (shp->shm_dir, numpages); + kfree(shp); + shm_lockall(); + shm_rss -= rss; + shm_swp -= swp; + shm_tot -= numpages; + shm_unlockall(); return; } @@ -485,10 +459,6 @@ shp = shm_lock(shmid); if(shp==NULL) return -EINVAL; - if (shp == &zshmid_kernel) { - shm_unlock(shmid); - return -EINVAL; - } if(cmd==SHM_STAT) { err = -EINVAL; if (shmid > shm_ids.max_id) @@ -529,10 +499,6 @@ shp = shm_lock(shmid); if(shp==NULL) return -EINVAL; - if (shp == &zshmid_kernel) { - shm_unlock(shmid); - return -EINVAL; - } err=-EIDRM; if(shm_checkid(shp,shmid)) goto out_unlock; @@ -567,8 +533,6 @@ err=-EINVAL; if(shp==NULL) goto out_up; - if (shp == &zshmid_kernel) - goto out_unlock_up; err=-EIDRM; if(shm_checkid(shp,shmid)) goto out_unlock_up; @@ -690,8 +654,6 @@ shp = shm_lock(shmid); if (!shp) goto out_up; - if (shp == &zshmid_kernel) - goto out_unlock_up; err = -EACCES; if (ipcperms(&shp->shm_perm, flg)) @@ -874,12 +836,10 @@ struct shmid_kernel *shp; unsigned int idx; struct page * page; - int is_shmzero; shp = (struct shmid_kernel *) shmd->vm_private_data; idx = (address - shmd->vm_start) >> PAGE_SHIFT; idx += shmd->vm_pgoff; - is_shmzero = (shp->id == zero_id); /* * A shared mapping past the last page of the file is an error @@ -891,7 +851,7 @@ return NULL; } down(&shp->sem); - if ((shp != shm_lock(shp->id)) && (is_shmzero == 0)) + if(shp != shm_lock(shp->id)) BUG(); pte = SHM_ENTRY(shp,idx); @@ -905,7 +865,7 @@ if (!page) goto oom; 
clear_highpage(page); - if ((shp != shm_lock(shp->id)) && (is_shmzero == 0)) + if(shp != shm_lock(shp->id)) BUG(); } else { swp_entry_t entry = pte_to_swp_entry(pte); @@ -923,11 +883,11 @@ delete_from_swap_cache(page); page = replace_with_highmem(page); swap_free(entry); - if ((shp != shm_lock(shp->id)) && (is_shmzero == 0)) + if(shp != shm_lock(shp->id)) BUG(); - if (is_shmzero) shm_swp--; + shm_swp--; } - if (is_shmzero) shm_rss++; + shm_rss++; pte = pte_mkdirty(mk_pte(page, PAGE_SHARED)); SHM_ENTRY(shp, idx) = pte; } else @@ -945,65 +905,6 @@ return NOPAGE_OOM; } -#define OKAY 0 -#define RETRY 1 -#define FAILED 2 - -static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage) -{ - pte_t page; - struct page *page_map; - - page = SHM_ENTRY(shp, idx); - if (!pte_present(page)) - return RETRY; - page_map = pte_page(page); - if (zone && (!memclass(page_map->zone, zone))) - return RETRY; - if (shp->id != zero_id) swap_attempts++; - - if (--counter < 0) /* failed */ - return FAILED; - if (page_count(page_map) != 1) - return RETRY; - - if (!(page_map = prepare_highmem_swapout(page_map))) - return FAILED; - SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry); - - /* add the locked page to the swap cache before allowing - the swapin path to run lookup_swap_cache(). This avoids - reading a not yet uptodate block from disk. - NOTE: we just accounted the swap space reference for this - swap cache page at __get_swap_page() time. */ - add_to_swap_cache(*outpage = page_map, swap_entry); - return OKAY; -} - -static void shm_swap_postop(struct page *page) -{ - lock_kernel(); - rw_swap_page(WRITE, page, 0); - unlock_kernel(); - __free_page(page); -} - -static int shm_swap_preop(swp_entry_t *swap_entry) -{ - lock_kernel(); - /* subtle: preload the swap count for the swap cache. We can't - increase the count inside the critical section as we can't release - the shm_lock there. And we can't acquire the big lock with the - shm_lock held (otherwise we would deadlock too easily). */ - *swap_entry = __get_swap_page(2); - if (!(*swap_entry).val) { - unlock_kernel(); - return 1; - } - unlock_kernel(); - return 0; -} - /* * Goes through counter = (shm_rss >> prio) present shm pages. */ @@ -1012,19 +913,28 @@ int shm_swap (int prio, int gfp_mask, zone_t *zone) { + pte_t page; struct shmid_kernel *shp; swp_entry_t swap_entry; unsigned long id, idx; int loop = 0; int counter; struct page * page_map; - - zshm_swap(prio, gfp_mask, zone); + counter = shm_rss >> prio; if (!counter) return 0; - if (shm_swap_preop(&swap_entry)) + lock_kernel(); + /* subtle: preload the swap count for the swap cache. We can't + increase the count inside the critical section as we can't release + the shm_lock there. And we can't acquire the big lock with the + shm_lock held (otherwise we would deadlock too easily). 
*/ + swap_entry = __get_swap_page(2); + if (!swap_entry.val) { + unlock_kernel(); return 0; + } + unlock_kernel(); shm_lockall(); check_id: @@ -1034,12 +944,8 @@ swap_idx = 0; if (++swap_id > shm_ids.max_id) { swap_id = 0; - if (loop) { -failed: - shm_unlockall(); - __swap_free(swap_entry, 2); - return 0; - } + if (loop) + goto failed; loop = 1; } goto check_id; @@ -1051,16 +957,43 @@ if (idx >= shp->shm_npages) goto next_id; - switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { - case RETRY: goto check_table; - case FAILED: goto failed; + page = SHM_ENTRY(shp, idx); + if (!pte_present(page)) + goto check_table; + page_map = pte_page(page); + if (zone && (!memclass(page_map->zone, zone))) + goto check_table; + swap_attempts++; + + if (--counter < 0) { /* failed */ +failed: + shm_unlockall(); + __swap_free(swap_entry, 2); + return 0; } + if (page_count(page_map) != 1) + goto check_table; + + if (!(page_map = prepare_highmem_swapout(page_map))) + goto failed; + SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry); swap_successes++; shm_swp++; shm_rss--; + + /* add the locked page to the swap cache before allowing + the swapin path to run lookup_swap_cache(). This avoids + reading a not yet uptodate block from disk. + NOTE: we just accounted the swap space reference for this + swap cache page at __get_swap_page() time. */ + add_to_swap_cache(page_map, swap_entry); shm_unlockall(); - shm_swap_postop(page_map); + lock_kernel(); + rw_swap_page(WRITE, page_map, 0); + unlock_kernel(); + + __free_page(page_map); return 1; } @@ -1082,41 +1015,31 @@ swap_free(entry); } -static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page) -{ - int n; - - for (n = 0; n < shp->shm_npages; n++) { - if (pte_none(SHM_ENTRY(shp,n))) - continue; - if (pte_present(SHM_ENTRY(shp,n))) - continue; - if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) { - shm_unuse_page(shp, n, entry, page); - return 1; - } - } - return 0; -} - /* * unuse_shm() search for an eventually swapped out shm page. 
*/ void shm_unuse(swp_entry_t entry, struct page *page) { - int i; + int i, n; shm_lockall(); for (i = 0; i <= shm_ids.max_id; i++) { struct shmid_kernel *shp = shm_get(i); if(shp==NULL) continue; - if (shm_unuse_core(shp, entry, page)) - goto out; + for (n = 0; n < shp->shm_npages; n++) { + if (pte_none(SHM_ENTRY(shp,n))) + continue; + if (pte_present(SHM_ENTRY(shp,n))) + continue; + if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) { + shm_unuse_page(shp, n, entry, page); + goto out; + } + } } out: shm_unlockall(); - zmap_unuse(entry, page); } #ifdef CONFIG_PROC_FS @@ -1131,10 +1054,6 @@ for(i = 0; i <= shm_ids.max_id; i++) { struct shmid_kernel* shp = shm_lock(i); - if (shp == &zshmid_kernel) { - shm_unlock(i); - continue; - } if(shp!=NULL) { #define SMALL_STRING "%10d %10d %4o %10u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n" #define BIG_STRING "%10d %10d %4o %21u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n" @@ -1183,137 +1102,23 @@ } #endif -static struct shmid_kernel *zmap_list = 0; -static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED; -static unsigned long zswap_idx = 0; /* next to swap */ -static struct shmid_kernel *zswap_shp = 0; - -static struct vm_operations_struct shmzero_vm_ops = { - open: shmzero_open, - close: shmzero_close, - nopage: shm_nopage, - swapout: shm_swapout, -}; - int map_zero_setup(struct vm_area_struct *vma) { + int shmid; struct shmid_kernel *shp; - if (!(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE))) - return -ENOMEM; - shp->id = zero_id; /* hack for shm_lock et al */ + down(&shm_ids.sem); + shmid = newseg(IPC_PRIVATE, S_IRWXUGO, vma->vm_end - vma->vm_start); + up(&shm_ids.sem); + if (shmid < 0) + return shmid; + + if (!(shp = shm_lock (shmid))) + BUG(); + shp->shm_perm.mode |= SHM_DEST; + shm_unlock (shmid); vma->vm_private_data = shp; - vma->vm_ops = &shmzero_vm_ops; - shmzero_open(vma); - spin_lock(&zmap_list_lock); - shp->attaches = (struct vm_area_struct *)zmap_list; - zmap_list = shp; - spin_unlock(&zmap_list_lock); + vma->vm_ops = &shm_vm_ops; + shm_open (vma); return 0; -} - -static void shmzero_open(struct vm_area_struct *shmd) -{ - struct shmid_kernel *shp; - - shp = (struct shmid_kernel *) shmd->vm_private_data; - down(&shp->sem); - shp->shm_nattch++; - up(&shp->sem); -} - -static void shmzero_close(struct vm_area_struct *shmd) -{ - int done = 0; - struct shmid_kernel *shp, *prev, *cur; - - shp = (struct shmid_kernel *) shmd->vm_private_data; - down(&shp->sem); - if (--shp->shm_nattch == 0) - done = 1; - up(&shp->sem); - if (done) { - spin_lock(&zmap_list_lock); - if (shp == zswap_shp) - zswap_shp = (struct shmid_kernel *)(shp->attaches); - if (shp == zmap_list) - zmap_list = (struct shmid_kernel *)(shp->attaches); - else { - prev = zmap_list; - cur = (struct shmid_kernel *)(prev->attaches); - while (cur != shp) { - prev = cur; - cur = (struct shmid_kernel *)(prev->attaches); - } - prev->attaches = (struct vm_area_struct *)(shp->attaches); - } - spin_unlock(&zmap_list_lock); - killseg_core(shp, 0); - } -} - -static void zmap_unuse(swp_entry_t entry, struct page *page) -{ - struct shmid_kernel *shp; - - spin_lock(&zmap_list_lock); - shp = zmap_list; - while (shp) { - if (shm_unuse_core(shp, entry, page)) - break; - shp = (struct shmid_kernel *)shp->attaches; - } - spin_unlock(&zmap_list_lock); -} - -static void zshm_swap (int prio, int gfp_mask, zone_t *zone) -{ - struct shmid_kernel *shp; - swp_entry_t swap_entry; - unsigned long idx; - int loop = 0; - int counter; - struct page * page_map; - - counter = 10; /* 
maybe we should use zshm_rss */ - if (!counter) - return; -next: - if (shm_swap_preop(&swap_entry)) - return; - - spin_lock(&zmap_list_lock); - if (zmap_list == 0) - goto failed; -next_id: - if ((shp = zswap_shp) == 0) { - if (loop) { -failed: - spin_unlock(&zmap_list_lock); - __swap_free(swap_entry, 2); - return; - } - zswap_shp = shp = zmap_list; - zswap_idx = 0; - loop = 1; - } - -check_table: - idx = zswap_idx++; - if (idx >= shp->shm_npages) { - zswap_shp = (struct shmid_kernel *)(zswap_shp->attaches); - zswap_idx = 0; - goto next_id; - } - - switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { - case RETRY: goto check_table; - case FAILED: goto failed; - } - spin_unlock(&zmap_list_lock); - - shm_swap_postop(page_map); - if (counter) - goto next; - return; } ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-01 12:08 ` Christoph Rohland
@ 2000-03-01 17:34 ` Kanoj Sarcar
  2000-03-01 17:55 ` Christoph Rohland
  0 siblings, 1 reply; 19+ messages in thread
From: Kanoj Sarcar @ 2000-03-01 17:34 UTC (permalink / raw)
To: Christoph Rohland; +Cc: Linus Torvalds, linux-mm, linux-kernel

>
> Hi Kanoj and Linus,
>
> kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
>
> > [snip]
> >
> > I do not believe there is any good reason to expose the special
> > shared memory segment used as a place holder for all /dev/zero
> > mappings to users via ipc* commands. This special segment exists
> > only because we want to reduce kernel code duplication, and all the
> > zshmid_kernel/ zero_id checks just make sure that regular shared
> > memory works pretty much the way it did before. (One thing I am
> > unhappy about is that this special segment eats up a shm id, but
> > that's probably not too bad).
>
> The appended proposal reduces code duplication and complexity a
> lot. (The diff47 needs your patches against other files added.)
>

What you have sent is what I used as a first draft for the
implementation. The good part of it is that it reduces code
duplication. The _really_ bad part is that it penalizes users in terms
of numbers of shared memory segments, max size of /dev/zero mappings,
and limitations imposed by shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do
not think taking up a shmid for each /dev/zero mapping is a good
idea ...

Furthermore, I did not want to change behavior of information returned
by ipc* and various procfs commands, as well as swapout behavior, thus
the creation of the zmap_list. I decided a few lines of special case
checking in a handful of places was a much better option.

If the current /dev/zero stuff hampers any plans you have with shm code
(eg page cachification), I would be willing to talk about it ...

Kanoj

> I would vote to apply diff48 to the standard kernel. For me the whole
> solution is still a workaround.
>
> Greetings
> 		Christoph

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-01 17:34 ` Kanoj Sarcar
@ 2000-03-01 17:55 ` Christoph Rohland
  2000-03-01 18:18 ` Kanoj Sarcar
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Rohland @ 2000-03-01 17:55 UTC (permalink / raw)
To: Kanoj Sarcar; +Cc: Linus Torvalds, linux-mm, linux-kernel

kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:

> What you have sent is what I used as a first draft for the implementation.
> The good part of it is that it reduces code duplication. The _really_ bad
> part is that it penalizes users in terms of numbers of shared memory
> segments, max size of /dev/zero mappings, and limitations imposed by
> shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do not think taking up a
> shmid for each /dev/zero mapping is a good idea ...

We can tune all these parameters at runtime. This should not be a
reason.

> Furthermore, I did not want to change behavior of information returned
> by ipc* and various procfs commands, as well as swapout behavior, thus
> the creation of the zmap_list. I decided a few lines of special case
> checking in a handful of places was a much better option.

IMHO all this SYSV ipc stuff is a totally broken API and many agree
with me. I do not care to clutter up the output of it a little bit for
this feature.

Nobody can know who is creating private IPC segments. So nobody should
be irritated by some more segments displayed/used.

On the contrary: I like the ability to restrict the usage of these
segments with the ipc parameters. Keep in mind you can stack a lot of
segments for a DOS attack, and all the segments will use the whole
memory.

> If the current /dev/zero stuff hampers any plans you have with shm code
> (eg page cachification), I would be willing to talk about it ...

It makes shm fs a lot more work. And the special handling slows down
shm handling.

Greetings
		Christoph

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply [flat|nested] 19+ messages in thread
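
For reference, the limits in question (SHMMAX/SHMALL/SHMMNI, i.e. the
shm_ctlmax/shm_ctlall/shm_ctlmni variables in shm.c) are runtime
sysctls, as the changelog at the top of the patched shm.c notes. A
rough sketch of bumping them from user space, assuming the usual
/proc/sys/kernel/ locations (the values here are arbitrary examples):

	#include <stdio.h>

	static int set_sysctl(const char *path, unsigned long value)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%lu\n", value);
		return fclose(f);
	}

	int main(void)
	{
		set_sysctl("/proc/sys/kernel/shmmax", 256UL << 20); /* bytes per segment */
		set_sysctl("/proc/sys/kernel/shmall", 1UL << 19);   /* system-wide, in pages */
		set_sysctl("/proc/sys/kernel/shmmni", 4096);        /* number of segment ids */
		return 0;
	}
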
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-01 17:55 ` Christoph Rohland @ 2000-03-01 18:18 ` Kanoj Sarcar 2000-03-01 19:42 ` Christoph Rohland 0 siblings, 1 reply; 19+ messages in thread From: Kanoj Sarcar @ 2000-03-01 18:18 UTC (permalink / raw) To: Christoph Rohland; +Cc: Linus Torvalds, linux-mm, linux-kernel > > kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > > > What you have sent is what I used as a first draft for the implementation. > > The good part of it is that it reduces code duplication. The _really_ bad > > part is that it penalizes users in terms of numbers of shared memory > > segments, max size of /dev/zero mappings, and limitations imposed by > > shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do not think taking up a > > shmid for each /dev/zero mapping is a good idea ... > > We can tune all these parameters at runtime. This should not be a > reason. Show me the patch ... by the time you are done, you _probably_ would have complicated the code more than the current /dev/zero tweaks. > > > Furthermore, I did not want to change behavior of information returned > > by ipc* and various procfs commands, as well as swapout behavior, thus > > the creation of the zmap_list. I decided a few lines of special case > > checking in a handful of places was a much better option. > > IMHO all this SYSV ipc stuff is a totally broken API and many agree > with me. I do not care to clutter up the output of it a little bit for > this feature. In reality, /dev/zero should have nothing to do with SYSV. Currently, its that way because I wanted to minimize code duplication. Most of the *_core() routines can be taken into ipc/shm_core.c, and together with util.c, will let /dev/zero be decoupled from SYSV. > > Nobody can know who is creating private IPC segments. So nobody should > be irritated by some more segments displayed/used. The problem is more with the limits, much less with the output ... > > In the contrary: I like the ability to restrict the usage of these > segments with the ipc parameters. Keep in mind you can stack a lot of > segments for a DOS attack. and all the segments will use the whole > memory. Not sure what you are talking about here ... /dev/zero mmaps are subject to the same vm-resource checking as other mmaps, and this checking is a little different for "real" shm creation. > > > If the current /dev/zero stuff hampers any plans you have with shm code > > (eg page cachification), I would be willing to talk about it ... > > It makes shm fs a lot more work. And the special handling slows down > shm handling. The shm handling slow down is minimal. Most of this is an extra check in shm_nopage(), but that _only_ happens for /dev/zero segments, not for "real" shm segments. As to why shm fs is a lot more work, we can talk. Linus/Ingo did bring this up, at a general level, we think that adding an extra inode or other data structure in map_zero_setup() would be able to handle this. If a small amount of special case code is the problem, I would suggest keep the code as it is for now. Once you have the shm fs work done for "real" shm segments, I can look at how to handle /dev/zero segments. Kanoj > > Greetings > Christoph > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux.eu.org/Linux-MM/ > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. 
For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-01 18:18 ` Kanoj Sarcar @ 2000-03-01 19:42 ` Christoph Rohland 2000-03-01 20:09 ` Kanoj Sarcar 0 siblings, 1 reply; 19+ messages in thread From: Christoph Rohland @ 2000-03-01 19:42 UTC (permalink / raw) To: Kanoj Sarcar; +Cc: Linus Torvalds, linux-mm, linux-kernel Hi Kanoj, kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > > kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > > > > > What you have sent is what I used as a first draft for the > > > implementation. The good part of it is that it reduces code > > > duplication. The _really_ bad part is that it penalizes users in > > > terms of numbers of shared memory segments, max size of > > > /dev/zero mappings, and limitations imposed by > > > shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do not think taking up a > > > shmid for each /dev/zero mapping is a good idea ... > > > > We can tune all these parameters at runtime. This should not be a > > reason. > > Show me the patch ... by the time you are done, you _probably_ would > have complicated the code more than the current /dev/zero tweaks. It _is_ tunable in the current code. > > > Furthermore, I did not want to change behavior of information > > > returned by ipc* and various procfs commands, as well as swapout > > > behavior, thus the creation of the zmap_list. I decided a few > > > lines of special case checking in a handful of places was a much > > > better option. > > > > IMHO all this SYSV ipc stuff is a totally broken API and many > > agree with me. I do not care to clutter up the output of it a > > little bit for this feature. > > In reality, /dev/zero should have nothing to do with > SYSV. Currently, its that way because I wanted to minimize code > duplication. Most of the *_core() routines can be taken into > ipc/shm_core.c, and together with util.c, will let /dev/zero be > decoupled from SYSV. That would be the best case and thus your proposal is a workaround until the pagecache can handle it. > > Nobody can know who is creating private IPC segments. So nobody > > should be irritated by some more segments displayed/used. > > The problem is more with the limits, much less with the output ... And they are tunable... > > In the contrary: I like the ability to restrict the usage of these > > segments with the ipc parameters. Keep in mind you can stack a lot > > of segments for a DOS attack. and all the segments will use the > > whole memory. > > Not sure what you are talking about here ... /dev/zero mmaps are subject > to the same vm-resource checking as other mmaps, and this checking is > a little different for "real" shm creation. Think about first mmaping an anonymous shared segment, touching all the pages, unmapping most of it. Then start over again. You end up with loads of pages used and not freed and no longer accessible. > > > If the current /dev/zero stuff hampers any plans you have with shm code > > > (eg page cachification), I would be willing to talk about it ... > > > > It makes shm fs a lot more work. And the special handling slows down > > shm handling. > > The shm handling slow down is minimal. Most of this is an extra > check in shm_nopage(), but that _only_ happens for /dev/zero > segments, not for "real" shm segments. The check happens for _all_ segments and shm_nopage can be called very often on big machines under heavy load. > As to why shm fs is a lot more work, we can talk. 
Linus/Ingo did > bring this up, at a general level, we think that adding an extra > inode or other data structure in map_zero_setup() would be able to > handle this. > > If a small amount of special case code is the problem, I would suggest > keep the code as it is for now. Once you have the shm fs work done for > "real" shm segments, I can look at how to handle /dev/zero segments. shm fs is working. I will send a patch against 2.3.48 soon. Greetings Christoph -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
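The pattern Christoph describes above, spelled out as a user program: map a shared anonymous segment, touch every page, unmap most of it, and start over. This is only an illustration of the described scenario; the 64 MB rounds and the 4 KB page size are arbitrary assumptions, and whether the unmapped pages really stay pinned depends on the /dev/zero implementation being debated.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;		/* 64 MB per round */
	int i, fd = open("/dev/zero", O_RDWR);

	if (fd < 0) { perror("open"); return 1; }
	for (i = 0; i < 100; i++) {
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) { perror("mmap"); break; }
		memset(p, 1, len);		/* touch all the pages */
		/* unmap all but the first page; the page that stays mapped
		   keeps the backing segment attached, then start over */
		munmap(p + 4096, len - 4096);
	}
	pause();				/* hold on to whatever is left */
	return 0;
}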
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-01 19:42 ` Christoph Rohland @ 2000-03-01 20:09 ` Kanoj Sarcar 2000-03-06 22:43 ` Stephen C. Tweedie 0 siblings, 1 reply; 19+ messages in thread From: Kanoj Sarcar @ 2000-03-01 20:09 UTC (permalink / raw) To: Christoph Rohland; +Cc: Linus Torvalds, linux-mm, linux-kernel > > Hi Kanoj, > > kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > > > > kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > > > > > > > What you have sent is what I used as a first draft for the > > > > implementation. The good part of it is that it reduces code > > > > duplication. The _really_ bad part is that it penalizes users in > > > > terms of numbers of shared memory segments, max size of > > > > /dev/zero mappings, and limitations imposed by > > > > shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do not think taking up a > > > > shmid for each /dev/zero mapping is a good idea ... > > > > > > We can tune all these parameters at runtime. This should not be a > > > reason. > > > > Show me the patch ... by the time you are done, you _probably_ would > > have complicated the code more than the current /dev/zero tweaks. > > It _is_ tunable in the current code. Oh, you are talking about asking administrators to do this tuning, which is unfair. I was talking about automatic in-kernel tuning whenever a new /dev/zero segment is created/destroyed etc ... > > > > > Furthermore, I did not want to change behavior of information > > > > returned by ipc* and various procfs commands, as well as swapout > > > > behavior, thus the creation of the zmap_list. I decided a few > > > > lines of special case checking in a handful of places was a much > > > > better option. > > > > > > IMHO all this SYSV ipc stuff is a totally broken API and many > > > agree with me. I do not care to clutter up the output of it a > > > little bit for this feature. > > > > In reality, /dev/zero should have nothing to do with > > SYSV. Currently, its that way because I wanted to minimize code > > duplication. Most of the *_core() routines can be taken into > > ipc/shm_core.c, and together with util.c, will let /dev/zero be > > decoupled from SYSV. > > That would be the best case and thus your proposal is a workaround > until the pagecache can handle it. > > > > Nobody can know who is creating private IPC segments. So nobody > > > should be irritated by some more segments displayed/used. > > > > The problem is more with the limits, much less with the output ... > > And they are tunable... See above ... > > > > In the contrary: I like the ability to restrict the usage of these > > > segments with the ipc parameters. Keep in mind you can stack a lot > > > of segments for a DOS attack. and all the segments will use the > > > whole memory. > > > > Not sure what you are talking about here ... /dev/zero mmaps are subject > > to the same vm-resource checking as other mmaps, and this checking is > > a little different for "real" shm creation. > > Think about first mmaping an anonymous shared segment, touching all > the pages, unmapping most of it. Then start over again. You end up > with loads of pages used and not freed and no longer accessible. True ... and this can be independently handled via a shmzero_unmap() type operation in shmzero_vm_ops (I never claimed the /dev/zero stuff is complete :-)) Even in the absence of that, vm_enough_memory() should in theory be able to prevent deadlocks, with all its known caveats ... 
> > > > > If the current /dev/zero stuff hampers any plans you have with shm code > > > > (eg page cachification), I would be willing to talk about it ... > > > > > > It makes shm fs a lot more work. And the special handling slows down > > > shm handling. > > > > The shm handling slow down is minimal. Most of this is an extra > > check in shm_nopage(), but that _only_ happens for /dev/zero > > segments, not for "real" shm segments. > > The check happens for _all_ segments and shm_nopage can be called very > often on big machines under heavy load. I was talking about the if ((shp != shm_lock(shp->id)) && (is_shmzero == 0)) checks in shm_nopage. > > > As to why shm fs is a lot more work, we can talk. Linus/Ingo did > > bring this up, at a general level, we think that adding an extra > > inode or other data structure in map_zero_setup() would be able to > > handle this. > > > > If a small amount of special case code is the problem, I would suggest > > keep the code as it is for now. Once you have the shm fs work done for > > "real" shm segments, I can look at how to handle /dev/zero segments. > > shm fs is working. I will send a patch against 2.3.48 soon. Great, we can see what makes sense for /dev/zero wrt shmfs ... Kanoj > > Greetings > Christoph > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-01 20:09 ` Kanoj Sarcar @ 2000-03-06 22:43 ` Stephen C. Tweedie 2000-03-06 23:01 ` Kanoj Sarcar 0 siblings, 1 reply; 19+ messages in thread From: Stephen C. Tweedie @ 2000-03-06 22:43 UTC (permalink / raw) To: Kanoj Sarcar; +Cc: Christoph Rohland, Linus Torvalds, linux-mm, linux-kernel Hi, On Wed, 1 Mar 2000 12:09:11 -0800 (PST), kanoj@google.engr.sgi.com (Kanoj Sarcar) said: > Great, we can see what makes sense for /dev/zero wrt shmfs ... In principle, there is no reason why we need any special support for this sort of thing. The swap cache already does very nearly all we need. The swap cache maintains links between swap locations on disk and (potentially shared) anonymous pages in memory. It was designed so that even if you fork, as long as the resulting COW pages remain unchanged, the two processes will share the same pages of physical memory or swap no matter what. Basically, as soon as you try to swap out an anonymous page from one process, a swap entry is allocated and a swap cache entry is set up. Only once the last process swaps out does the physical page in memory get released. Until that happens, any lookup of the swap entry from a swapped-out process will find the page already in memory. To make this work for shared anonymous pages, we need two changes to the swap cache. We need to teach the swap cache about writable anonymous pages, and we need to be able to defer the physical writing of the page to swap until the last reference to the swap cache frees up the page. Do that, and shared /dev/zero maps will Just Work. The mechanics of this are probably a little too subtle to get right for 2.4 at this stage (in particular you need to know when to write the page back to disk: we don't have the necessary PAGE_DIRTY infrastructure in place for anonymous pages), but I definitely think this is the right way to do things in the long term. --Stephen -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
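A toy model of the life cycle Stephen describes: the page stays resident while any mapper still references it, and the write to swap is deferred until the last reference is dropped. Everything below is invented for illustration -- it is a stand-alone user-space sketch, not kernel code, and none of the names correspond to real kernel symbols.

#include <stdio.h>
#include <string.h>

struct zpage {
	int refcount;		/* mappings that still reference the page */
	int dirty;		/* must be written out before it can be freed */
	int on_swap;		/* data now lives only in the swap file */
	char data[4096];
};

static void zpage_release(struct zpage *pg)
{
	if (--pg->refcount > 0) {
		printf("%d reference(s) left, page stays in memory\n",
		       pg->refcount);
		return;
	}
	if (pg->dirty) {
		pg->on_swap = 1;	/* deferred write-out happens only here */
		printf("last reference gone, page written to swap\n");
	}
	printf("physical page freed\n");
}

int main(void)
{
	struct zpage pg;

	pg.refcount = 2;	/* two processes share the anonymous page */
	pg.dirty = 1;
	pg.on_swap = 0;
	strcpy(pg.data, "shared /dev/zero data");

	zpage_release(&pg);	/* first process lets go: page stays resident */
	zpage_release(&pg);	/* last reference: write-out, then free */
	return 0;
}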
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-06 22:43 ` Stephen C. Tweedie @ 2000-03-06 23:01 ` Kanoj Sarcar 2000-03-08 12:02 ` Christoph Rohland 0 siblings, 1 reply; 19+ messages in thread From: Kanoj Sarcar @ 2000-03-06 23:01 UTC (permalink / raw) To: Stephen C. Tweedie Cc: Christoph Rohland, Linus Torvalds, linux-mm, linux-kernel > > To make this work for shared anonymous pages, we need two changes to the > swap cache. We need to teach the swap cache about writable anonymous > pages, and we need to be able to defer the physical writing of the page > to swap until the last reference to the swap cache frees up the page. > Do that, and shared /dev/zero maps will Just Work. > The current implementation of /dev/zero shared memory is to treat the mapping as similarly as possible to a shared memory segment. The common code handles the swap cache interactions, and both cases qualify as shared anonymous mappings. While its not well tested, in theory it should work. We are currently agonizing over how to integrate the /dev/zero code with shmfs patch. Kanoj -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-06 23:01 ` Kanoj Sarcar @ 2000-03-08 12:02 ` Christoph Rohland 2000-03-08 17:51 ` Kanoj Sarcar 0 siblings, 1 reply; 19+ messages in thread From: Christoph Rohland @ 2000-03-08 12:02 UTC (permalink / raw) To: Kanoj Sarcar; +Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar Hi Kanoj, kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > > To make this work for shared anonymous pages, we need two changes > > to the swap cache. We need to teach the swap cache about writable > > anonymous pages, and we need to be able to defer the physical > > writing of the page to swap until the last reference to the swap > > cache frees up the page. Do that, and shared /dev/zero maps will > > Just Work. > > The current implementation of /dev/zero shared memory is to treat > the mapping as similarly as possible to a shared memory segment. The > common code handles the swap cache interactions, and both cases > qualify as shared anonymous mappings. While its not well tested, in > theory it should work. We are currently agonizing over how to > integrate the /dev/zero code with shmfs patch. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Since this is not as easy as you thought, wouldn't it be better to do the /dev/zero shared maps in the swap cache instead of this workaround over shm? Thus we would get the mechanisms to redo all shm stuff wrt swap cache. At the same time we would not hinder the development of normal shm code to use file semantics (aka shm fs) which will give us posix shm. Greetings Christoph -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-08 12:02 ` Christoph Rohland @ 2000-03-08 17:51 ` Kanoj Sarcar 2000-03-08 18:35 ` Christoph Rohland 0 siblings, 1 reply; 19+ messages in thread From: Kanoj Sarcar @ 2000-03-08 17:51 UTC (permalink / raw) To: Christoph Rohland Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar > > Hi Kanoj, > > kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > > > To make this work for shared anonymous pages, we need two changes > > > to the swap cache. We need to teach the swap cache about writable > > > anonymous pages, and we need to be able to defer the physical > > > writing of the page to swap until the last reference to the swap > > > cache frees up the page. Do that, and shared /dev/zero maps will > > > Just Work. > > > > The current implementation of /dev/zero shared memory is to treat > > the mapping as similarly as possible to a shared memory segment. The > > common code handles the swap cache interactions, and both cases > > qualify as shared anonymous mappings. While its not well tested, in > > theory it should work. We are currently agonizing over how to > > integrate the /dev/zero code with shmfs patch. > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > > Since this is not as easy as you thought, wouldn't it be better to do > the /dev/zero shared maps in the swap cache instead of this workaround > over shm? Thus we would get the mechanisms to redo all shm stuff wrt > swap cache. > > At the same time we would not hinder the development of normal shm > code to use file semantics (aka shm fs) which will give us posix shm. > > Greetings > Christoph > I am not sure why you think the /dev/zero code is a workaround on top of shm. A lot of code and mechanisms are easily sharable between shm and /dev/zero, since they are, as I pointed out, anonymous shared pages. The only differences are when the data structures are torn down, and which processes may attach to the segments. Btw, implementing /dev/zero using shm code mostly is _quite_ easy, that's how the code has been since 2.3.48. Even integrating with shmfs has been pretty easy, as you have seen in the patches I have CCed you on. The harder part is to look towards the future and do what Linus suggested, namely associate each mapping with an inode so in the future the inodecache might possibly be used to manage the shm pages. As you know, I sent out a patch for that yesterday. Its completely okay by me to take in a dev-zero/shmfs integration patch that is not perfect wrt /dev/zero, as I have indicated to you and Linus, just so that the shmfs work gets in. I can fix minor problems with the /dev/zero code as they come up. What sct suggests is quite involved, as he himself mentions. Just implementing /dev/zero is probably not a good reason to undertake it. Hope this makes sense. Kanoj -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-08 17:51 ` Kanoj Sarcar @ 2000-03-08 18:35 ` Christoph Rohland 2000-03-08 18:48 ` Linus Torvalds 2000-03-08 18:57 ` Kanoj Sarcar 0 siblings, 2 replies; 19+ messages in thread From: Christoph Rohland @ 2000-03-08 18:35 UTC (permalink / raw) To: Kanoj Sarcar; +Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > I am not sure why you think the /dev/zero code is a workaround on > top of shm. A lot of code and mechanisms are easily sharable between > shm and /dev/zero, since they are, as I pointed out, anonymous > shared pages. The only differences are when the data structures are > torn down, and which processes may attach to the segments. Because I think the current shm code should be redone in a way that shared anonymous pages live in the swap cache. You could say the shm code is a workaround :-) > Btw, implementing /dev/zero using shm code mostly is _quite_ easy, > that's how the code has been since 2.3.48. Even integrating with > shmfs has been pretty easy, as you have seen in the patches I have > CCed you on. The harder part is to look towards the future and do > what Linus suggested, namely associate each mapping with an inode so > in the future the inodecache might possibly be used to manage the > shm pages. As you know, I sent out a patch for that yesterday. In my opinion this is one of two orthogonal steps. shm fs targets the better integration in the file system semantics. > Its completely okay by me to take in a dev-zero/shmfs integration > patch that is not perfect wrt /dev/zero, as I have indicated to > you and Linus, just so that the shmfs work gets in. I can fix > minor problems with the /dev/zero code as they come up. > > What sct suggests is quite involved, as he himself mentions. Just > implementing /dev/zero is probably not a good reason to undertake > it. But IMHO reworking shm based on the /dev/zero stuff would be a good reason to do the /dev/zero stuff right. That's all I wanted to say in my last mail. Perhaps I am a little bit too opposed to these changes because I have seen too many patches thrown on the shm code during the 2.3 cycle which were plain buggy and nobody cared. Most of my kernel work since some time is doing quite stupid tests on the shm code. BTW: I am just running these tests on your patch and it seems to work quite well. (I will let it run over night) If it survives that I will also throw some quite complicated /dev/zero tests on it later. Greetings Christoph -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-08 18:35 ` Christoph Rohland @ 2000-03-08 18:48 ` Linus Torvalds 2000-03-08 18:57 ` Kanoj Sarcar 1 sibling, 0 replies; 19+ messages in thread From: Linus Torvalds @ 2000-03-08 18:48 UTC (permalink / raw) To: Christoph Rohland; +Cc: Kanoj Sarcar, Stephen C. Tweedie, linux-mm, Ingo Molnar On 8 Mar 2000, Christoph Rohland wrote: > > Because I think the current shm code should be redone in a way that > shared anonymous pages live in the swap cache. You could say the shm > code is a workaround :-) Note that this is true of both shm AND /dev/zero. Whether it is done in the current really clunky manner (ugly special shm tables) or in the RightWay(tm) (page cache), shm and /dev/zero should always basically be the same. The only difference between shm and /dev/zero is how you access and set up mappings, not how the actual mapping then works. Linus -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
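Linus' point is visible from user space: the two interfaces differ only in how the mapping is set up, and both end up as shared anonymous memory. A small illustrative program with minimal error handling:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	size_t len = 4096;

	/* Path 1: SYSV shm - shmget() + shmat() */
	int id = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
	char *a = shmat(id, NULL, 0);

	/* Path 2: shared /dev/zero - open() + mmap(MAP_SHARED) */
	int fd = open("/dev/zero", O_RDWR);
	char *b = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (id == -1 || a == (char *) -1 || fd < 0 || b == MAP_FAILED) {
		perror("setup");
		exit(1);
	}

	/* From here on the two mappings behave the same way:
	   anonymous, shared with children after fork(), swappable. */
	a[0] = 'x';
	b[0] = 'y';
	printf("sysv page at %p, /dev/zero page at %p\n", (void *) a, (void *) b);

	shmdt(a);
	shmctl(id, IPC_RMID, NULL);
	munmap(b, len);
	return 0;
}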
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature 2000-03-08 18:35 ` Christoph Rohland 2000-03-08 18:48 ` Linus Torvalds @ 2000-03-08 18:57 ` Kanoj Sarcar 2000-03-09 18:15 ` Christoph Rohland 1 sibling, 1 reply; 19+ messages in thread From: Kanoj Sarcar @ 2000-03-08 18:57 UTC (permalink / raw) To: Christoph Rohland Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar > > kanoj@google.engr.sgi.com (Kanoj Sarcar) writes: > > > I am not sure why you think the /dev/zero code is a workaround on > > top of shm. A lot of code and mechanisms are easily sharable between > > shm and /dev/zero, since they are, as I pointed out, anonymous > > shared pages. The only differences are when the data structures are > > torn down, and which processes may attach to the segments. > > Because I think the current shm code should be redone in a way that > shared anonymous pages live in the swap cache. You could say the shm > code is a workaround :-) > No arguments there :-) But it seems a little ambitious to me to implement shmfs, dev-zero and rework the swapcache at the same time. We are probably on the right track, having done /dev/zero, getting done with shmfs. Then, if we want to take the risk, we can improve the core shm/swapcache interactions in 2.3/2.4/2.5. > > Btw, implementing /dev/zero using shm code mostly is _quite_ easy, > > that's how the code has been since 2.3.48. Even integrating with > > shmfs has been pretty easy, as you have seen in the patches I have > > CCed you on. The harder part is to look towards the future and do > > what Linus suggested, namely associate each mapping with an inode so > > in the future the inodecache might possibly be used to manage the > > shm pages. As you know, I sent out a patch for that yesterday. > > In my opinion this is one of two orthogonal steps. shm fs targets the > better integration in the file system semantics. > > > Its completely okay by me to take in a dev-zero/shmfs integration > > patch that is not perfect wrt /dev/zero, as I have indicated to > > you and Linus, just so that the shmfs work gets in. I can fix > > minor problems with the /dev/zero code as they come up. > > > > What sct suggests is quite involved, as he himself mentions. Just > > implementing /dev/zero is probably not a good reason to undertake > > it. > > But IMHO reworking shm based on the /dev/zero stuff would be a good > reason to do the /dev/zero stuff right. That's all I wanted to say in > my last mail. > > Perhaps I am a little bit too opposed to these changes because I have > seen too many patches thrown on the shm code during the 2.3 cycle > which were plain buggy and nobody cared. Most of my kernel work since > some time is doing quite stupid tests on the shm code. > > BTW: I am just running these tests on your patch and it seems to work > quite well. (I will let it run over night) If it survives that I will > also throw some quite complicated /dev/zero tests on it later. Great! Is there a way to capture tests written by individual developers and testers and have them be shared, so everyone can use them (when there are no licensing/copyright issues)? I would definitely like to get your test programs and throw them at the kernel myself. Lets talk offline if you are willing to share your tests. Kanoj > > Greetings > Christoph > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. 
For more info on Linux MM, > see: http://www.linux.eu.org/Linux-MM/ > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux.eu.org/Linux-MM/ ^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
2000-03-08 18:57 ` Kanoj Sarcar
@ 2000-03-09 18:15 ` Christoph Rohland
0 siblings, 0 replies; 19+ messages in thread
From: Christoph Rohland @ 2000-03-09 18:15 UTC (permalink / raw)
To: Kanoj Sarcar; +Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 3822 bytes --]

kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> > Because I think the current shm code should be redone in a way that
> > shared anonymous pages live in the swap cache. You could say the shm
> > code is a workaround :-)
>
> No arguments there :-) But it seems a little ambitious to me to
> implement shmfs, dev-zero and rework the swapcache at the same time.
> We are probably on the right track, having done /dev/zero, getting
> done with shmfs. Then, if we want to take the risk, we can improve
> the core shm/swapcache interactions in 2.3/2.4/2.5.

O.K. I would really like to work on that further.

> > BTW: I am just running these tests on your patch and it seems to work
> > quite well. (I will let it run over night) If it survives that I will
> > also throw some quite complicated /dev/zero tests on it later.
>
> Great! Is there a way to capture tests written by individual
> developers and testers and have them be shared, so everyone can use
> them (when there are no licensing/copyright issues)? I would
> definitely like to get your test programs and throw them at the
> kernel myself. Lets talk offline if you are willing to share your
> tests.

Unfortunately I do not have web/ftp space to put the programs on, but
they are in the attachments. I normally run them in two steps:

1) Many processes running against a number of segments which fit into
   main memory.
2) Two processes which access more segments, so they get it swapping.

All processes run with a 20% probability for deletion.

My tests with your patch went mostly well so far. The machine locks up
after a while or kills init, but this is AFAICS stock behaviour.

One bug was introduced by your patch. On swapoff we get an oops:

ksymoops 0.7c on i686 2.3.50.
Options used -V (default) -K (specified) -l /proc/modules (default)
             -o /lib/modules/2.3.50/ (default) -m /boot/System.map-2.3.50 (specified)

No modules in ksyms, skipping objects
No ksyms, skipping lsmod
Unable to handle kernel NULL pointer dereference at virtual address 00000000
*pde = 21118001
Oops: 0000
CPU:    7
EIP:    0010:[<c0185d17>]
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010246
eax: 00000000   ebx: 00000000   ecx: 00000000   edx: 00000000
esi: 00000000   edi: c029ac74   ebp: c0185474   esp: d1c63f24
ds: 0018   es: 0018   ss: 0018
Process swapoff (pid: 624, stackpage=d1c63000)
Stack: c029ac74 c032cb74 c19daa94 02944c00 00000000 c0186889 c029ac74 02944c00
       c19daa94 00000001 c19daa94 02944c00 0002944c c0185e62 02944c00 c19daa94
       c02aa000 c19daa94 02944c00 c013baa7 02944c00 c19daa94 d1c62000 00000000
Call Trace: [<c0186889>] [<c0185e62>] [<c013baa7>] [<c013bec9>] [<c010b49c>]
Code: 8b 14 91 8b 04 da 89 44 24 10 0b 44 da 04 74 3e 8b 04 da 8b

>>EIP; c0185d17 <shm_unuse_core+33/90>   <=====
Trace; c0186889 <zmap_unuse+121/1d8>
Trace; c0185e62 <shm_unuse+ee/f8>
Trace; c013baa7 <try_to_unuse+1fb/3a0>
Trace; c013bec9 <sys_swapoff+27d/4b8>
Trace; c010b49c <system_call+34/38>
Code;  c0185d17 <shm_unuse_core+33/90>
00000000 <_EIP>:
Code;  c0185d17 <shm_unuse_core+33/90>   <=====
   0:   8b 14 91        movl   (%ecx,%edx,4),%edx   <=====
Code;  c0185d1a <shm_unuse_core+36/90>
   3:   8b 04 da        movl   (%edx,%ebx,8),%eax
Code;  c0185d1d <shm_unuse_core+39/90>
   6:   89 44 24 10     movl   %eax,0x10(%esp,1)
Code;  c0185d21 <shm_unuse_core+3d/90>
   a:   0b 44 da 04     orl    0x4(%edx,%ebx,8),%eax
Code;  c0185d25 <shm_unuse_core+41/90>
   e:   74 3e           je     4e <_EIP+0x4e> c0185d65 <shm_unuse_core+81/90>
Code;  c0185d27 <shm_unuse_core+43/90>
  10:   8b 04 da        movl   (%edx,%ebx,8),%eax
Code;  c0185d2a <shm_unuse_core+46/90>
  13:   8b 00           movl   (%eax),%eax

Greetings
Christoph
--

[-- Attachment #2: ipctst.c --]
[-- Type: text/plain, Size: 1695 bytes --]

#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main (int ac, char **av)
{
	int segs, size, proc, rmpr;
	unsigned long long iter;
	struct shmid_ds buf;
	pid_t pid;

	if (ac < 6) {
		printf ("usage: shmtst segs size proc iter rm%%\n");
		exit (1);
	}
	segs = atoi (av[1]);
	size = atoi (av[2]);
	proc = atoi (av[3]);
	iter = atoi (av[4]);
	rmpr = atoi (av[5]);
	iter = 1 << iter;
	printf ("using %d segs of size %d (%llu iterations)\n", segs, size, iter);

	while (-- proc) {
		if ((pid = fork()) > 0) {
			printf ("started process %d\n", (int) pid);
		} else {
			break;
		}
	}

	srandom (getpid());
	while (iter--) {
		key_t key;
		int seg, i;
		unsigned char c, *ptr;
		volatile unsigned char *p;

		key = random() % segs +1;
		if ((seg = shmget (key, size, IPC_CREAT| 0600)) == -1) {
			perror("shmget");
			if (errno != EIDRM)
				exit (1);
			continue;
		}
		if (0)
			sched_yield();
		if ((ptr = shmat (seg, 0, 0)) == (unsigned char *) -1) {
			perror ("shmat");
			continue;
		}
		for (p = ptr; p < ptr + size; p += 4097)
			*p = (unsigned char) (p - ptr);
		for (p = ptr; p < ptr + size; p += 4097) {
			c = *p;
			if (c == (unsigned char)(p-ptr))
				continue;
			shmctl (seg, IPC_STAT, &buf);
			printf ("n=%i, m = %i: %i != %i",
				(int) buf.shm_nattch, (int)buf.shm_perm.mode,
				(int)(unsigned char)(p-ptr), (int) c);
			for (i = 0 ; i < 5; i++) {
				printf (", %i", (int)*p);
				sched_yield();
			}
			printf ("\n");
		}
		if (shmdt (ptr) != 0) {
			perror("shmdt");
			exit (1);
		}
		if (random () % 100 < rmpr &&
		    shmctl (seg, IPC_RMID, NULL) == -1)
			perror("shmctl IPC_RMID");
	}
}

[-- Attachment #3: shmtst.c --]
[-- Type: text/plain, Size: 1540 bytes --]

#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>

int main (int ac, char **av)
{
	int segs, size, proc, rmpr;
	unsigned long long iter;
	pid_t pid;
	struct stat buf;

	if (ac < 6) {
		printf ("usage: shmtst segs size proc iter rm%%\n");
		exit (1);
	}
	segs = atoi (av[1]);
	size = atoi (av[2]);
	proc = atoi (av[3]);
	iter = atoi (av[4]);
	rmpr = atoi (av[5]);
	iter = 1 << iter;
	printf ("using %d segs of size %d (%llu iterations)\n", segs, size, iter);

	while (-- proc) {
		if ((pid = fork()) > 0) {
			printf ("started process %d\n", (int) pid);
		} else {
			break;
		}
	}

	srandom (getpid());
	while (iter--) {
		int seg, key;
		char name[20];
		char *ptr, *p;

		key = random() % segs;
		sprintf (name, "my%d", key);
		if ((seg = shm_open (name, O_CREAT| O_RDWR , 0666)) == -1) {
			perror("shm_open");
			exit (1);
		}
		if (ftruncate (seg, size) == -1) {
			perror ("ftruncate");
			exit (2);
		}
		if ((ptr = mmap (0, size, PROT_READ|PROT_WRITE,MAP_SHARED, seg, 0)) == MAP_FAILED) {
			perror ("mmap");
			exit (3);
		}
		for (p = ptr; p < ptr + size; p += 4097)
			*p = (char) p;
		for (p = ptr; p < ptr + size; p += 4097)
			if (*p != (char)p)
				printf ("*p(%i) != p(%i)\n", (int)*p, (int)p & 0xff);
		if (munmap (ptr, size) == -1)
			perror ("munmap");
		if (close(seg) == -1)
			perror ("close");
		if (random () % 100 < rmpr && shm_unlink (name) == -1)
			perror("shm_unlink");
	}
}

[-- Attachment #4: libposix4.c --]
[-- Type: text/plain, Size: 408 bytes --]

#include <asm/unistd.h>
#include <errno.h>

#define PREFIX "/var/shm"

int shm_open (const char *pathname, int flags, int mode)
{
	char name[strlen(pathname) + sizeof(PREFIX)];

	sprintf (name, PREFIX "/%s", pathname);
	return open (name, flags, mode);
}

int shm_unlink(const char *pathname)
{
	char name[strlen(pathname) + sizeof(PREFIX)];

	sprintf (name, PREFIX "/%s", pathname);
	return unlink (name);
}

^ permalink raw reply [flat|nested] 19+ messages in thread