linux-mm.kvack.org archive mirror
* [RFC] [RFT] Shared /dev/zero mmaping feature
@ 2000-02-25 23:08 Kanoj Sarcar
  2000-02-26 16:38 ` Linus Torvalds
  2000-02-29 10:54 ` Christoph Rohland
  0 siblings, 2 replies; 19+ messages in thread
From: Kanoj Sarcar @ 2000-02-25 23:08 UTC (permalink / raw)
  To: linux-mm, linux-kernel; +Cc: torvalds, Kanoj Sarcar

Hi all,

This is a patch against 2.3.47 that tries to implement shared /dev/zero
mappings. This is just a first cut attempt, I am hoping I will find a
few people to apply the patch and throw some real life programs at it
(preferably on low memory machines so that swapping is induced). 
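
A minimal test of the intended semantics might look something like the
following (an untested sketch; bump the size up, or touch the mapping in
a loop, to push it into swap):

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/mman.h>
	#include <sys/wait.h>

	int main(void)
	{
		size_t len = 4 * 1024 * 1024;	/* grow this to force swapping */
		int fd = open("/dev/zero", O_RDWR);
		char *p;
		pid_t pid;

		if (fd < 0)
			return 1;
		p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)		/* unpatched kernels fail with EINVAL */
			return 1;

		pid = fork();
		if (pid == 0) {
			memset(p, 0xaa, len);	/* child writes through the mapping */
			_exit(0);
		}
		waitpid(pid, NULL, 0);
		/* the parent should see the child's writes if the mapping is shared */
		printf("%s\n", (p[0] & 0xff) == 0xaa ? "shared OK" : "NOT shared");
		return 0;
	}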

Currently, you will also need to turn on CONFIG_SYSVIPC, but most of
the shm.c code can be split into a new ipc/shm_core.c file that is
always compiled in, irrespective of CONFIG_SYSVIPC. Linus, do you 
think this is the proper direction to follow?

Thanks. Comments and feedback welcome ...

Kanoj


--- drivers/char/mem.c	Fri Feb 25 13:03:46 2000
+++ drivers/char/mem.c	Fri Feb 25 14:26:44 2000
@@ -433,8 +433,12 @@
 
 static int mmap_zero(struct file * file, struct vm_area_struct * vma)
 {
+	int ret;
+	extern int map_zero_setup(struct vm_area_struct *);
+
 	if (vma->vm_flags & VM_SHARED)
-		return -EINVAL;
+		if (ret = map_zero_setup(vma))
+			return ret;
 	if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
 		return -EAGAIN;
 	return 0;
--- ipc/shm.c	Fri Feb 25 13:04:14 2000
+++ ipc/shm.c	Fri Feb 25 14:56:39 2000
@@ -11,6 +11,7 @@
  * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
  * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able,
  *                         Christoph Rohland <hans-christoph.rohland@sap.com>
+ * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
  */
 
 #include <linux/config.h>
@@ -70,6 +71,13 @@
 static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
 #endif
 
+static void zshm_swap (int prio, int gfp_mask, zone_t *zone);
+static void zmap_unuse(swp_entry_t entry, struct page *page);
+static void shmzero_open(struct vm_area_struct *shmd);
+static void shmzero_close(struct vm_area_struct *shmd);
+static int zero_id;
+static struct kern_ipc_perm zero_perm;
+
 size_t shm_ctlmax = SHMMAX;
 int shm_ctlall = SHMALL;
 int shm_ctlmni = SHMMNI;
@@ -103,6 +111,8 @@
 #ifdef CONFIG_PROC_FS
 	create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL);
 #endif
+	zero_id = ipc_addid(&shm_ids, &zero_perm, shm_ctlmni);
+	shm_unlock(zero_id);
 	return;
 }
 
@@ -179,6 +189,26 @@
 	return 0;
 }
 
+static inline struct shmid_kernel *newseg_alloc(int numpages)
+{
+	struct shmid_kernel *shp;
+
+	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
+	if (!shp)
+		return 0;
+
+	shp->shm_dir = shm_alloc (numpages);
+	if (!shp->shm_dir) {
+		kfree(shp);
+		return 0;
+	}
+	shp->shm_npages = numpages;
+	shp->attaches = NULL;
+	shp->shm_nattch = 0;
+	init_MUTEX(&shp->sem);
+	return(shp);
+}
+
 static int newseg (key_t key, int shmflg, size_t size)
 {
 	struct shmid_kernel *shp;
@@ -193,15 +223,8 @@
 	if (shm_tot + numpages >= shm_ctlall)
 		return -ENOSPC;
 
-	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
-	if (!shp)
-		return -ENOMEM;
-
-	shp->shm_dir = shm_alloc (numpages);
-	if (!shp->shm_dir) {
-		kfree(shp);
+	if (!(shp = newseg_alloc(numpages)))
 		return -ENOMEM;
-	}
 	id = ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni);
 	if(id == -1) {
 		shm_free(shp->shm_dir,numpages);
@@ -212,13 +235,10 @@
 	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
 	shp->shm_segsz = size;
 	shp->shm_cpid = current->pid;
-	shp->attaches = NULL;
-	shp->shm_lpid = shp->shm_nattch = 0;
+	shp->shm_lpid = 0;
 	shp->shm_atime = shp->shm_dtime = 0;
 	shp->shm_ctime = CURRENT_TIME;
-	shp->shm_npages = numpages;
 	shp->id = shm_buildid(id,shp->shm_perm.seq);
-	init_MUTEX(&shp->sem);
 
 	shm_tot += numpages;
 	shm_unlock(id);
@@ -255,6 +275,35 @@
 	return err;
 }
 
+static void killseg_core(struct shmid_kernel *shp, int doacc)
+{
+	int i, numpages, rss, swp;
+
+	numpages = shp->shm_npages;
+	for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
+		pte_t pte;
+		pte = SHM_ENTRY (shp,i);
+		if (pte_none(pte))
+			continue;
+		if (pte_present(pte)) {
+			__free_page (pte_page(pte));
+			rss++;
+		} else {
+			swap_free(pte_to_swp_entry(pte));
+			swp++;
+		}
+	}
+	shm_free (shp->shm_dir, numpages);
+	kfree(shp);
+	if (doacc) {
+		shm_lockall();
+		shm_rss -= rss;
+		shm_swp -= swp;
+		shm_tot -= numpages;
+		shm_unlockall();
+	}
+}
+
 /*
  * Only called after testing nattch and SHM_DEST.
  * Here pages, pgtable and shmid_kernel are freed.
@@ -262,8 +311,6 @@
 static void killseg (int shmid)
 {
 	struct shmid_kernel *shp;
-	int i, numpages;
-	int rss, swp;
 
 	down(&shm_ids.sem);
 	shp = shm_lock(shmid);
@@ -284,28 +331,8 @@
 		BUG();
 	shm_unlock(shmid);
 	up(&shm_ids.sem);
+	killseg_core(shp, 1);
 
-	numpages = shp->shm_npages;
-	for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
-		pte_t pte;
-		pte = SHM_ENTRY (shp,i);
-		if (pte_none(pte))
-			continue;
-		if (pte_present(pte)) {
-			__free_page (pte_page(pte));
-			rss++;
-		} else {
-			swap_free(pte_to_swp_entry(pte));
-			swp++;
-		}
-	}
-	shm_free (shp->shm_dir, numpages);
-	kfree(shp);
-	shm_lockall();
-	shm_rss -= rss;
-	shm_swp -= swp;
-	shm_tot -= numpages;
-	shm_unlockall();
 	return;
 }
 
@@ -835,10 +862,12 @@
 	struct shmid_kernel *shp;
 	unsigned int idx;
 	struct page * page;
+	int is_shmzero;
 
 	shp = (struct shmid_kernel *) shmd->vm_private_data;
 	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
 	idx += shmd->vm_pgoff;
+	is_shmzero = (shp->id == zero_id);
 
 	/*
 	 * A shared mapping past the last page of the file is an error
@@ -850,7 +879,7 @@
 		return NULL;
 	}
 	down(&shp->sem);
-	if(shp != shm_lock(shp->id))
+	if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
 		BUG();
 
 	pte = SHM_ENTRY(shp,idx);
@@ -864,7 +893,7 @@
 			if (!page)
 				goto oom;
 			clear_highpage(page);
-			if(shp != shm_lock(shp->id))
+			if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
 				BUG();
 		} else {
 			swp_entry_t entry = pte_to_swp_entry(pte);
@@ -882,11 +911,11 @@
 			delete_from_swap_cache(page);
 			page = replace_with_highmem(page);
 			swap_free(entry);
-			if(shp != shm_lock(shp->id))
+			if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
 				BUG();
-			shm_swp--;
+			if (is_shmzero) shm_swp--;
 		}
-		shm_rss++;
+		if (is_shmzero) shm_rss++;
 		pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
 		SHM_ENTRY(shp, idx) = pte;
 	} else
@@ -904,6 +933,65 @@
 	return NOPAGE_OOM;
 }
 
+#define OKAY	0
+#define RETRY	1
+#define FAILED	2
+
+static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage)
+{
+	pte_t page;
+	struct page *page_map;
+
+	page = SHM_ENTRY(shp, idx);
+	if (!pte_present(page))
+		return RETRY;
+	page_map = pte_page(page);
+	if (zone && (!memclass(page_map->zone, zone)))
+		return RETRY;
+	if (shp->id != zero_id) swap_attempts++;
+
+	if (--counter < 0) /* failed */
+		return FAILED;
+	if (page_count(page_map) != 1)
+		return RETRY;
+
+	if (!(page_map = prepare_highmem_swapout(page_map)))
+		return FAILED;
+	SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
+
+	/* add the locked page to the swap cache before allowing
+	   the swapin path to run lookup_swap_cache(). This avoids
+	   reading a not yet uptodate block from disk.
+	   NOTE: we just accounted the swap space reference for this
+	   swap cache page at __get_swap_page() time. */
+	add_to_swap_cache(*outpage = page_map, swap_entry);
+	return OKAY;
+}
+
+static void shm_swap_postop(struct page *page)
+{
+	lock_kernel();
+	rw_swap_page(WRITE, page, 0);
+	unlock_kernel();
+	__free_page(page);
+}
+
+static int shm_swap_preop(swp_entry_t *swap_entry)
+{
+	lock_kernel();
+	/* subtle: preload the swap count for the swap cache. We can't
+	   increase the count inside the critical section as we can't release
+	   the shm_lock there. And we can't acquire the big lock with the
+	   shm_lock held (otherwise we would deadlock too easily). */
+	*swap_entry = __get_swap_page(2);
+	if (!(*swap_entry).val) {
+		unlock_kernel();
+		return 1;
+	}
+	unlock_kernel();
+	return 0;
+}
+
 /*
  * Goes through counter = (shm_rss >> prio) present shm pages.
  */
@@ -912,7 +1000,6 @@
 
 int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
-	pte_t page;
 	struct shmid_kernel *shp;
 	swp_entry_t swap_entry;
 	unsigned long id, idx;
@@ -919,21 +1006,13 @@
 	int loop = 0;
 	int counter;
 	struct page * page_map;
-	
+
+	zshm_swap(prio, gfp_mask, zone);
 	counter = shm_rss >> prio;
 	if (!counter)
 		return 0;
-	lock_kernel();
-	/* subtle: preload the swap count for the swap cache. We can't
-	   increase the count inside the critical section as we can't release
-	   the shm_lock there. And we can't acquire the big lock with the
-	   shm_lock held (otherwise we would deadlock too easily). */
-	swap_entry = __get_swap_page(2);
-	if (!swap_entry.val) {
-		unlock_kernel();
+	if (shm_swap_preop(&swap_entry))
 		return 0;
-	}
-	unlock_kernel();
 
 	shm_lockall();
 check_id:
@@ -943,8 +1022,12 @@
 		swap_idx = 0;
 		if (++swap_id > shm_ids.max_id) {
 			swap_id = 0;
-			if (loop)
-				goto failed;
+			if (loop) {
+failed:
+				shm_unlockall();
+				__swap_free(swap_entry, 2);
+				return 0;
+			}
 			loop = 1;
 		}
 		goto check_id;
@@ -956,43 +1039,16 @@
 	if (idx >= shp->shm_npages)
 		goto next_id;
 
-	page = SHM_ENTRY(shp, idx);
-	if (!pte_present(page))
-		goto check_table;
-	page_map = pte_page(page);
-	if (zone && (!memclass(page_map->zone, zone)))
-		goto check_table;
-	swap_attempts++;
-
-	if (--counter < 0) { /* failed */
-failed:
-		shm_unlockall();
-		__swap_free(swap_entry, 2);
-		return 0;
+	switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
+		case RETRY: goto check_table;
+		case FAILED: goto failed;
 	}
-	if (page_count(page_map) != 1)
-		goto check_table;
-
-	if (!(page_map = prepare_highmem_swapout(page_map)))
-		goto failed;
-	SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
 	swap_successes++;
 	shm_swp++;
 	shm_rss--;
-
-	/* add the locked page to the swap cache before allowing
-	   the swapin path to run lookup_swap_cache(). This avoids
-	   reading a not yet uptodate block from disk.
-	   NOTE: we just accounted the swap space reference for this
-	   swap cache page at __get_swap_page() time. */
-	add_to_swap_cache(page_map, swap_entry);
 	shm_unlockall();
 
-	lock_kernel();
-	rw_swap_page(WRITE, page_map, 0);
-	unlock_kernel();
-
-	__free_page(page_map);
+	shm_swap_postop(page_map);
 	return 1;
 }
 
@@ -1014,12 +1070,29 @@
 	swap_free(entry);
 }
 
+static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page)
+{
+	int n;
+
+	for (n = 0; n < shp->shm_npages; n++) {
+		if (pte_none(SHM_ENTRY(shp,n)))
+			continue;
+		if (pte_present(SHM_ENTRY(shp,n)))
+			continue;
+		if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
+			shm_unuse_page(shp, n, entry, page);
+			return 1;
+		}
+	}
+	return 0;
+}
+
 /*
  * unuse_shm() search for an eventually swapped out shm page.
  */
 void shm_unuse(swp_entry_t entry, struct page *page)
 {
-	int i, n;
+	int i;
 
 	shm_lockall();
 	for (i = 0; i <= shm_ids.max_id; i++) {
@@ -1026,19 +1099,12 @@
 		struct shmid_kernel *shp = shm_get(i);
 		if(shp==NULL)
 			continue;
-		for (n = 0; n < shp->shm_npages; n++) {
-			if (pte_none(SHM_ENTRY(shp,n)))
-				continue;
-			if (pte_present(SHM_ENTRY(shp,n)))
-				continue;
-			if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
-				shm_unuse_page(shp, n, entry, page);
-				goto out;
-			}
-		}
+		if (shm_unuse_core(shp, entry, page))
+			goto out;
 	}
 out:
 	shm_unlockall();
+	zmap_unuse(entry, page);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1100,3 +1166,138 @@
 	return len;
 }
 #endif
+
+static struct shmid_kernel *zmap_list = 0;
+static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED;
+static unsigned long zswap_idx = 0; /* next to swap */
+static struct shmid_kernel *zswap_shp = 0;
+
+static struct vm_operations_struct shmzero_vm_ops = {
+	open:		shmzero_open,
+	close:		shmzero_close,
+	nopage:		shm_nopage,
+	swapout:	shm_swapout,
+};
+
+int map_zero_setup(struct vm_area_struct *vma)
+{
+	struct shmid_kernel *shp;
+
+	if (!(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE)))
+		return -ENOMEM;
+	shp->id = zero_id;	/* hack for shm_lock et al */
+	vma->vm_private_data = shp;
+	vma->vm_ops = &shmzero_vm_ops;
+	shmzero_open(vma);
+	spin_lock(&zmap_list_lock);
+	shp->attaches = (struct vm_area_struct *)zmap_list;
+	zmap_list = shp;
+	spin_unlock(&zmap_list_lock);
+	return 0;
+}
+
+static void shmzero_open(struct vm_area_struct *shmd)
+{
+	struct shmid_kernel *shp;
+
+	shp = (struct shmid_kernel *) shmd->vm_private_data;
+	down(&shp->sem);
+	shp->shm_nattch++;
+	up(&shp->sem);
+}
+
+static void shmzero_close(struct vm_area_struct *shmd)
+{
+	int done = 0;
+	struct shmid_kernel *shp, *prev, *cur;
+
+	shp = (struct shmid_kernel *) shmd->vm_private_data;
+	down(&shp->sem);
+	if (--shp->shm_nattch == 0)
+		done = 1;
+	up(&shp->sem);
+	if (done) {
+		spin_lock(&zmap_list_lock);
+		if (shp == zswap_shp)
+			zswap_shp = (struct shmid_kernel *)(shp->attaches);
+		if (shp == zmap_list)
+			zmap_list = (struct shmid_kernel *)(shp->attaches);
+		else {
+			prev = zmap_list;
+			cur = (struct shmid_kernel *)(prev->attaches);
+			while (cur != shp) {
+				prev = cur;
+				cur = (struct shmid_kernel *)(prev->attaches);
+			}
+			prev->attaches = (struct vm_area_struct *)(shp->attaches);
+		}
+		spin_unlock(&zmap_list_lock);
+		killseg_core(shp, 0);
+	}
+}
+
+static void zmap_unuse(swp_entry_t entry, struct page *page)
+{
+	struct shmid_kernel *shp;
+
+	spin_lock(&zmap_list_lock);
+	shp = zmap_list;
+	while (shp) {
+		if (shm_unuse_core(shp, entry, page))
+			break;
+		shp = (struct shmid_kernel *)shp->attaches;
+	}
+	spin_unlock(&zmap_list_lock);
+}
+
+static void zshm_swap (int prio, int gfp_mask, zone_t *zone)
+{
+	struct shmid_kernel *shp;
+	swp_entry_t swap_entry;
+	unsigned long idx;
+	int loop = 0;
+	int counter;
+	struct page * page_map;
+
+	counter = 10;	/* maybe we should use zshm_rss */
+	if (!counter)
+		return;
+next:
+	if (shm_swap_preop(&swap_entry))
+		return;
+
+	spin_lock(&zmap_list_lock);
+	if (zmap_list == 0)
+		goto failed;
+next_id:
+	if ((shp = zswap_shp) == 0) {
+		if (loop) {
+failed:
+			spin_unlock(&zmap_list_lock);
+			__swap_free(swap_entry, 2);
+			return;
+		}
+		zswap_shp = shp = zmap_list;
+		zswap_idx = 0;
+		loop = 1;
+	}
+
+check_table:
+	idx = zswap_idx++;
+	if (idx >= shp->shm_npages) {
+		zswap_shp = (struct shmid_kernel *)(zswap_shp->attaches);
+		zswap_idx = 0;
+		goto next_id;
+	}
+
+	switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
+		case RETRY: goto check_table;
+		case FAILED: goto failed;
+	}
+	spin_unlock(&zmap_list_lock);
+
+	shm_swap_postop(page_map);
+	if (counter)
+		goto next;
+	return;
+}
--- ipc/util.c	Fri Feb 25 13:04:14 2000
+++ ipc/util.c	Fri Feb 25 14:54:39 2000
@@ -317,4 +317,9 @@
 {
 }
 
+int map_zero_setup(struct vm_area_struct *vma)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_SYSVIPC */

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-02-25 23:08 [RFC] [RFT] Shared /dev/zero mmaping feature Kanoj Sarcar
@ 2000-02-26 16:38 ` Linus Torvalds
  2000-02-26 21:47   ` Kanoj Sarcar
  2000-02-29 10:54 ` Christoph Rohland
  1 sibling, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2000-02-26 16:38 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: linux-mm, linux-kernel


On Fri, 25 Feb 2000, Kanoj Sarcar wrote:
> 
> This is a patch against 2.3.47 that tries to implement shared /dev/zero
> mappings. This is just a first cut attempt, I am hoping I will find a
> few people to apply the patch and throw some real life programs at it
> (preferably on low memory machines so that swapping is induced). 

I'd like to see the shm code use the page cache exclusively to track
pages, and this is going to mess with that, I think. We'll need an inode
(or at least a "struct address_space") to act as an "anchor" for the pages,
and it mustn't be the "/dev/zero" inode.

Before that is done I'd be nervous about letting external things call into
the shm subsystem, but maybe this is ok as-is (the only thing you really
export is "map_zero_setup()", and I guess it can just be updated as
required..)

However, whether that will be changed or not, the following looks really
wrong:

>  	if (vma->vm_flags & VM_SHARED)
> 		if (ret = map_zero_setup(vma))
> 			return ret;
>  	if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
>  		return -EAGAIN;

I do NOT believe that it can ever be right to do a "zeromap_page_range()"
on a shared memory area.  I'll tell you why:

	mmap(/dev/zero, MAP_SHARED)
	child = fork();
	if (child) {
		..modify mapping..
	} else {
		..wait for modifications to show up..
	}

the "zeromap_page_range()" thing mapped the area with private NULL pages.
The fork() copied the page tables. The modification of the private NULL
page causes the child to get a real shared page, but the mother will never
see that shared page if it only reads from the mapping because the mother
will still see the private one.


So I suspect at the very least it would have to be

	if (vma->vm_flags & VM_SHARED)
		return map_zero_setup(vma);

and let the whole area be populated with shared pages by "nopage".
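
Spelled out, that would make mmap_zero() look roughly like this (a sketch
of the suggested fix, nothing more):

	static int mmap_zero(struct file * file, struct vm_area_struct * vma)
	{
		extern int map_zero_setup(struct vm_area_struct *);

		if (vma->vm_flags & VM_SHARED)
			return map_zero_setup(vma);	/* pages arrive via nopage */
		if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
			return -EAGAIN;
		return 0;
	}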

Comments?

			Linus


* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-02-26 16:38 ` Linus Torvalds
@ 2000-02-26 21:47   ` Kanoj Sarcar
  0 siblings, 0 replies; 19+ messages in thread
From: Kanoj Sarcar @ 2000-02-26 21:47 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-mm, linux-kernel

> 
> I'd like to see the shm code use the page cache exclusively to track
> pages, and this is going to mess with that, I think. We'll need an inode
> (or at least a "struct address_space") to act as an "anchor" for the pages,
> and it mustn't be the "/dev/zero" inode.
> 
> Before that is done I'd be nervous about letting external things call into
> the shm subsystem, but maybe this is ok as-is (the only thing you really
> export is "map_zero_setup()", and I guess it can just be updated as
> required..)
>

Yes, as you point out, we can put in this code now. Whatever
changes are made to the underlying routines that manage shm pages should
also be able to handle /dev/zero pages with minor tweaks. The chances that
the /dev/zero code will be a big hindrance to the page cache effort are
minimal ...
 
> However, whether that will be changed or not, the following looks really
> wrong:
> 
> >  	if (vma->vm_flags & VM_SHARED)
> > 		if (ret = map_zero_setup(vma))
> > 			return ret;
> >  	if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
> >  		return -EAGAIN;

Yes, you are right. I have made the change in the attached patch. I also
added some checks so that users cannot shmctl/shmat to the shmid being
used for /dev/zero.

Does this look ready for 2.3.48 then?

Kanoj

--- drivers/char/mem.c	Fri Feb 25 13:03:46 2000
+++ drivers/char/mem.c	Sat Feb 26 11:43:35 2000
@@ -5,6 +5,7 @@
  *
  *  Added devfs support. 
  *    Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
+ *  Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
  */
 
 #include <linux/config.h>
@@ -433,8 +434,10 @@
 
 static int mmap_zero(struct file * file, struct vm_area_struct * vma)
 {
+	extern int map_zero_setup(struct vm_area_struct *);
+
 	if (vma->vm_flags & VM_SHARED)
-		return -EINVAL;
+		return(map_zero_setup(vma));
 	if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot))
 		return -EAGAIN;
 	return 0;
--- ipc/shm.c	Fri Feb 25 13:04:14 2000
+++ ipc/shm.c	Sat Feb 26 13:10:21 2000
@@ -11,6 +11,7 @@
  * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
  * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able,
  *                         Christoph Rohland <hans-christoph.rohland@sap.com>
+ * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
  */
 
 #include <linux/config.h>
@@ -70,6 +71,13 @@
 static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
 #endif
 
+static void zshm_swap (int prio, int gfp_mask, zone_t *zone);
+static void zmap_unuse(swp_entry_t entry, struct page *page);
+static void shmzero_open(struct vm_area_struct *shmd);
+static void shmzero_close(struct vm_area_struct *shmd);
+static int zero_id;
+static struct shmid_kernel zshmid_kernel;
+
 size_t shm_ctlmax = SHMMAX;
 int shm_ctlall = SHMALL;
 int shm_ctlmni = SHMMNI;
@@ -103,6 +111,8 @@
 #ifdef CONFIG_PROC_FS
 	create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL);
 #endif
+	zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, shm_ctlmni);
+	shm_unlock(zero_id);
 	return;
 }
 
@@ -179,6 +189,26 @@
 	return 0;
 }
 
+static inline struct shmid_kernel *newseg_alloc(int numpages)
+{
+	struct shmid_kernel *shp;
+
+	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
+	if (!shp)
+		return 0;
+
+	shp->shm_dir = shm_alloc (numpages);
+	if (!shp->shm_dir) {
+		kfree(shp);
+		return 0;
+	}
+	shp->shm_npages = numpages;
+	shp->attaches = NULL;
+	shp->shm_nattch = 0;
+	init_MUTEX(&shp->sem);
+	return(shp);
+}
+
 static int newseg (key_t key, int shmflg, size_t size)
 {
 	struct shmid_kernel *shp;
@@ -193,15 +223,8 @@
 	if (shm_tot + numpages >= shm_ctlall)
 		return -ENOSPC;
 
-	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
-	if (!shp)
-		return -ENOMEM;
-
-	shp->shm_dir = shm_alloc (numpages);
-	if (!shp->shm_dir) {
-		kfree(shp);
+	if (!(shp = newseg_alloc(numpages)))
 		return -ENOMEM;
-	}
 	id = ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni);
 	if(id == -1) {
 		shm_free(shp->shm_dir,numpages);
@@ -212,13 +235,10 @@
 	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
 	shp->shm_segsz = size;
 	shp->shm_cpid = current->pid;
-	shp->attaches = NULL;
-	shp->shm_lpid = shp->shm_nattch = 0;
+	shp->shm_lpid = 0;
 	shp->shm_atime = shp->shm_dtime = 0;
 	shp->shm_ctime = CURRENT_TIME;
-	shp->shm_npages = numpages;
 	shp->id = shm_buildid(id,shp->shm_perm.seq);
-	init_MUTEX(&shp->sem);
 
 	shm_tot += numpages;
 	shm_unlock(id);
@@ -255,6 +275,35 @@
 	return err;
 }
 
+static void killseg_core(struct shmid_kernel *shp, int doacc)
+{
+	int i, numpages, rss, swp;
+
+	numpages = shp->shm_npages;
+	for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
+		pte_t pte;
+		pte = SHM_ENTRY (shp,i);
+		if (pte_none(pte))
+			continue;
+		if (pte_present(pte)) {
+			__free_page (pte_page(pte));
+			rss++;
+		} else {
+			swap_free(pte_to_swp_entry(pte));
+			swp++;
+		}
+	}
+	shm_free (shp->shm_dir, numpages);
+	kfree(shp);
+	if (doacc) {
+		shm_lockall();
+		shm_rss -= rss;
+		shm_swp -= swp;
+		shm_tot -= numpages;
+		shm_unlockall();
+	}
+}
+
 /*
  * Only called after testing nattch and SHM_DEST.
  * Here pages, pgtable and shmid_kernel are freed.
@@ -262,8 +311,6 @@
 static void killseg (int shmid)
 {
 	struct shmid_kernel *shp;
-	int i, numpages;
-	int rss, swp;
 
 	down(&shm_ids.sem);
 	shp = shm_lock(shmid);
@@ -284,28 +331,8 @@
 		BUG();
 	shm_unlock(shmid);
 	up(&shm_ids.sem);
+	killseg_core(shp, 1);
 
-	numpages = shp->shm_npages;
-	for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
-		pte_t pte;
-		pte = SHM_ENTRY (shp,i);
-		if (pte_none(pte))
-			continue;
-		if (pte_present(pte)) {
-			__free_page (pte_page(pte));
-			rss++;
-		} else {
-			swap_free(pte_to_swp_entry(pte));
-			swp++;
-		}
-	}
-	shm_free (shp->shm_dir, numpages);
-	kfree(shp);
-	shm_lockall();
-	shm_rss -= rss;
-	shm_swp -= swp;
-	shm_tot -= numpages;
-	shm_unlockall();
 	return;
 }
 
@@ -458,6 +485,10 @@
 		shp = shm_lock(shmid);
 		if(shp==NULL)
 			return -EINVAL;
+		if (shp == &zshmid_kernel) {
+			shm_unlock(shmid);
+			return -EINVAL;
+		}
 		if(cmd==SHM_STAT) {
 			err = -EINVAL;
 			if (shmid > shm_ids.max_id)
@@ -498,6 +529,10 @@
 		shp = shm_lock(shmid);
 		if(shp==NULL)
 			return -EINVAL;
+		if (shp == &zshmid_kernel) {
+			shm_unlock(shmid);
+			return -EINVAL;
+		}
 		err=-EIDRM;
 		if(shm_checkid(shp,shmid))
 			goto out_unlock;
@@ -532,6 +567,8 @@
 	err=-EINVAL;
 	if(shp==NULL)
 		goto out_up;
+	if (shp == &zshmid_kernel)
+		goto out_unlock_up;
 	err=-EIDRM;
 	if(shm_checkid(shp,shmid))
 		goto out_unlock_up;
@@ -653,6 +690,8 @@
 	shp = shm_lock(shmid);
 	if (!shp)
 		goto out_up;
+	if (shp == &zshmid_kernel)
+		goto out_unlock_up;
 
 	err = -EACCES;
 	if (ipcperms(&shp->shm_perm, flg))
@@ -835,10 +874,12 @@
 	struct shmid_kernel *shp;
 	unsigned int idx;
 	struct page * page;
+	int is_shmzero;
 
 	shp = (struct shmid_kernel *) shmd->vm_private_data;
 	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
 	idx += shmd->vm_pgoff;
+	is_shmzero = (shp->id == zero_id);
 
 	/*
 	 * A shared mapping past the last page of the file is an error
@@ -850,7 +891,7 @@
 		return NULL;
 	}
 	down(&shp->sem);
-	if(shp != shm_lock(shp->id))
+	if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
 		BUG();
 
 	pte = SHM_ENTRY(shp,idx);
@@ -864,7 +905,7 @@
 			if (!page)
 				goto oom;
 			clear_highpage(page);
-			if(shp != shm_lock(shp->id))
+			if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
 				BUG();
 		} else {
 			swp_entry_t entry = pte_to_swp_entry(pte);
@@ -882,11 +923,11 @@
 			delete_from_swap_cache(page);
 			page = replace_with_highmem(page);
 			swap_free(entry);
-			if(shp != shm_lock(shp->id))
+			if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
 				BUG();
-			shm_swp--;
+			if (is_shmzero) shm_swp--;
 		}
-		shm_rss++;
+		if (is_shmzero) shm_rss++;
 		pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
 		SHM_ENTRY(shp, idx) = pte;
 	} else
@@ -904,6 +945,65 @@
 	return NOPAGE_OOM;
 }
 
+#define OKAY	0
+#define RETRY	1
+#define FAILED	2
+
+static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage)
+{
+	pte_t page;
+	struct page *page_map;
+
+	page = SHM_ENTRY(shp, idx);
+	if (!pte_present(page))
+		return RETRY;
+	page_map = pte_page(page);
+	if (zone && (!memclass(page_map->zone, zone)))
+		return RETRY;
+	if (shp->id != zero_id) swap_attempts++;
+
+	if (--counter < 0) /* failed */
+		return FAILED;
+	if (page_count(page_map) != 1)
+		return RETRY;
+
+	if (!(page_map = prepare_highmem_swapout(page_map)))
+		return FAILED;
+	SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
+
+	/* add the locked page to the swap cache before allowing
+	   the swapin path to run lookup_swap_cache(). This avoids
+	   reading a not yet uptodate block from disk.
+	   NOTE: we just accounted the swap space reference for this
+	   swap cache page at __get_swap_page() time. */
+	add_to_swap_cache(*outpage = page_map, swap_entry);
+	return OKAY;
+}
+
+static void shm_swap_postop(struct page *page)
+{
+	lock_kernel();
+	rw_swap_page(WRITE, page, 0);
+	unlock_kernel();
+	__free_page(page);
+}
+
+static int shm_swap_preop(swp_entry_t *swap_entry)
+{
+	lock_kernel();
+	/* subtle: preload the swap count for the swap cache. We can't
+	   increase the count inside the critical section as we can't release
+	   the shm_lock there. And we can't acquire the big lock with the
+	   shm_lock held (otherwise we would deadlock too easily). */
+	*swap_entry = __get_swap_page(2);
+	if (!(*swap_entry).val) {
+		unlock_kernel();
+		return 1;
+	}
+	unlock_kernel();
+	return 0;
+}
+
 /*
  * Goes through counter = (shm_rss >> prio) present shm pages.
  */
@@ -912,7 +1012,6 @@
 
 int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
-	pte_t page;
 	struct shmid_kernel *shp;
 	swp_entry_t swap_entry;
 	unsigned long id, idx;
@@ -919,21 +1018,13 @@
 	int loop = 0;
 	int counter;
 	struct page * page_map;
-	
+
+	zshm_swap(prio, gfp_mask, zone);
 	counter = shm_rss >> prio;
 	if (!counter)
 		return 0;
-	lock_kernel();
-	/* subtle: preload the swap count for the swap cache. We can't
-	   increase the count inside the critical section as we can't release
-	   the shm_lock there. And we can't acquire the big lock with the
-	   shm_lock held (otherwise we would deadlock too easily). */
-	swap_entry = __get_swap_page(2);
-	if (!swap_entry.val) {
-		unlock_kernel();
+	if (shm_swap_preop(&swap_entry))
 		return 0;
-	}
-	unlock_kernel();
 
 	shm_lockall();
 check_id:
@@ -943,8 +1034,12 @@
 		swap_idx = 0;
 		if (++swap_id > shm_ids.max_id) {
 			swap_id = 0;
-			if (loop)
-				goto failed;
+			if (loop) {
+failed:
+				shm_unlockall();
+				__swap_free(swap_entry, 2);
+				return 0;
+			}
 			loop = 1;
 		}
 		goto check_id;
@@ -956,43 +1051,16 @@
 	if (idx >= shp->shm_npages)
 		goto next_id;
 
-	page = SHM_ENTRY(shp, idx);
-	if (!pte_present(page))
-		goto check_table;
-	page_map = pte_page(page);
-	if (zone && (!memclass(page_map->zone, zone)))
-		goto check_table;
-	swap_attempts++;
-
-	if (--counter < 0) { /* failed */
-failed:
-		shm_unlockall();
-		__swap_free(swap_entry, 2);
-		return 0;
+	switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
+		case RETRY: goto check_table;
+		case FAILED: goto failed;
 	}
-	if (page_count(page_map) != 1)
-		goto check_table;
-
-	if (!(page_map = prepare_highmem_swapout(page_map)))
-		goto failed;
-	SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
 	swap_successes++;
 	shm_swp++;
 	shm_rss--;
-
-	/* add the locked page to the swap cache before allowing
-	   the swapin path to run lookup_swap_cache(). This avoids
-	   reading a not yet uptodate block from disk.
-	   NOTE: we just accounted the swap space reference for this
-	   swap cache page at __get_swap_page() time. */
-	add_to_swap_cache(page_map, swap_entry);
 	shm_unlockall();
 
-	lock_kernel();
-	rw_swap_page(WRITE, page_map, 0);
-	unlock_kernel();
-
-	__free_page(page_map);
+	shm_swap_postop(page_map);
 	return 1;
 }
 
@@ -1014,12 +1082,29 @@
 	swap_free(entry);
 }
 
+static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page)
+{
+	int n;
+
+	for (n = 0; n < shp->shm_npages; n++) {
+		if (pte_none(SHM_ENTRY(shp,n)))
+			continue;
+		if (pte_present(SHM_ENTRY(shp,n)))
+			continue;
+		if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
+			shm_unuse_page(shp, n, entry, page);
+			return 1;
+		}
+	}
+	return 0;
+}
+
 /*
  * unuse_shm() search for an eventually swapped out shm page.
  */
 void shm_unuse(swp_entry_t entry, struct page *page)
 {
-	int i, n;
+	int i;
 
 	shm_lockall();
 	for (i = 0; i <= shm_ids.max_id; i++) {
@@ -1026,19 +1111,12 @@
 		struct shmid_kernel *shp = shm_get(i);
 		if(shp==NULL)
 			continue;
-		for (n = 0; n < shp->shm_npages; n++) {
-			if (pte_none(SHM_ENTRY(shp,n)))
-				continue;
-			if (pte_present(SHM_ENTRY(shp,n)))
-				continue;
-			if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
-				shm_unuse_page(shp, n, entry, page);
-				goto out;
-			}
-		}
+		if (shm_unuse_core(shp, entry, page))
+			goto out;
 	}
 out:
 	shm_unlockall();
+	zmap_unuse(entry, page);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1053,6 +1131,10 @@
 
     	for(i = 0; i <= shm_ids.max_id; i++) {
 		struct shmid_kernel* shp = shm_lock(i);
+		if (shp == &zshmid_kernel) {
+			shm_unlock(i);
+			continue;
+		}
 		if(shp!=NULL) {
 #define SMALL_STRING "%10d %10d  %4o %10u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
 #define BIG_STRING   "%10d %10d  %4o %21u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
@@ -1100,3 +1182,138 @@
 	return len;
 }
 #endif
+
+static struct shmid_kernel *zmap_list = 0;
+static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED;
+static unsigned long zswap_idx = 0; /* next to swap */
+static struct shmid_kernel *zswap_shp = 0;
+
+static struct vm_operations_struct shmzero_vm_ops = {
+	open:		shmzero_open,
+	close:		shmzero_close,
+	nopage:		shm_nopage,
+	swapout:	shm_swapout,
+};
+
+int map_zero_setup(struct vm_area_struct *vma)
+{
+	struct shmid_kernel *shp;
+
+	if (!(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE)))
+		return -ENOMEM;
+	shp->id = zero_id;	/* hack for shm_lock et al */
+	vma->vm_private_data = shp;
+	vma->vm_ops = &shmzero_vm_ops;
+	shmzero_open(vma);
+	spin_lock(&zmap_list_lock);
+	shp->attaches = (struct vm_area_struct *)zmap_list;
+	zmap_list = shp;
+	spin_unlock(&zmap_list_lock);
+	return 0;
+}
+
+static void shmzero_open(struct vm_area_struct *shmd)
+{
+	struct shmid_kernel *shp;
+
+	shp = (struct shmid_kernel *) shmd->vm_private_data;
+	down(&shp->sem);
+	shp->shm_nattch++;
+	up(&shp->sem);
+}
+
+static void shmzero_close(struct vm_area_struct *shmd)
+{
+	int done = 0;
+	struct shmid_kernel *shp, *prev, *cur;
+
+	shp = (struct shmid_kernel *) shmd->vm_private_data;
+	down(&shp->sem);
+	if (--shp->shm_nattch == 0)
+		done = 1;
+	up(&shp->sem);
+	if (done) {
+		spin_lock(&zmap_list_lock);
+		if (shp == zswap_shp)
+			zswap_shp = (struct shmid_kernel *)(shp->attaches);
+		if (shp == zmap_list)
+			zmap_list = (struct shmid_kernel *)(shp->attaches);
+		else {
+			prev = zmap_list;
+			cur = (struct shmid_kernel *)(prev->attaches);
+			while (cur != shp) {
+				prev = cur;
+				cur = (struct shmid_kernel *)(prev->attaches);
+			}
+			prev->attaches = (struct vm_area_struct *)(shp->attaches);
+		}
+		spin_unlock(&zmap_list_lock);
+		killseg_core(shp, 0);
+	}
+}
+
+static void zmap_unuse(swp_entry_t entry, struct page *page)
+{
+	struct shmid_kernel *shp;
+
+	spin_lock(&zmap_list_lock);
+	shp = zmap_list;
+	while (shp) {
+		if (shm_unuse_core(shp, entry, page))
+			break;
+		shp = (struct shmid_kernel *)shp->attaches;
+	}
+	spin_unlock(&zmap_list_lock);
+}
+
+static void zshm_swap (int prio, int gfp_mask, zone_t *zone)
+{
+	struct shmid_kernel *shp;
+	swp_entry_t swap_entry;
+	unsigned long idx;
+	int loop = 0;
+	int counter;
+	struct page * page_map;
+
+	counter = 10;	/* maybe we should use zshm_rss */
+	if (!counter)
+		return;
+next:
+	if (shm_swap_preop(&swap_entry))
+		return;
+
+	spin_lock(&zmap_list_lock);
+	if (zmap_list == 0)
+		goto failed;
+next_id:
+	if ((shp = zswap_shp) == 0) {
+		if (loop) {
+failed:
+			spin_unlock(&zmap_list_lock);
+			__swap_free(swap_entry, 2);
+			return;
+		}
+		zswap_shp = shp = zmap_list;
+		zswap_idx = 0;
+		loop = 1;
+	}
+
+check_table:
+	idx = zswap_idx++;
+	if (idx >= shp->shm_npages) {
+		zswap_shp = (struct shmid_kernel *)(zswap_shp->attaches);
+		zswap_idx = 0;
+		goto next_id;
+	}
+
+	switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
+		case RETRY: goto check_table;
+		case FAILED: goto failed;
+	}
+	spin_unlock(&zmap_list_lock);
+
+	shm_swap_postop(page_map);
+	if (counter)
+		goto next;
+	return;
+}
--- ipc/util.c	Fri Feb 25 13:04:14 2000
+++ ipc/util.c	Fri Feb 25 14:54:39 2000
@@ -317,4 +317,9 @@
 {
 }
 
+int map_zero_setup(struct vm_area_struct *vma)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_SYSVIPC */

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-02-25 23:08 [RFC] [RFT] Shared /dev/zero mmaping feature Kanoj Sarcar
  2000-02-26 16:38 ` Linus Torvalds
@ 2000-02-29 10:54 ` Christoph Rohland
  2000-02-29 18:30   ` Kanoj Sarcar
  1 sibling, 1 reply; 19+ messages in thread
From: Christoph Rohland @ 2000-02-29 10:54 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: linux-mm, linux-kernel, torvalds

Hi Kanoj,


kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> This is a patch against 2.3.47 that tries to implement shared /dev/zero
> mappings. This is just a first cut attempt, I am hoping I will find a
> few people to apply the patch and throw some real life programs at it
> (preferably on low memory machines so that swapping is induced). 
> 
> Currently, you will also need to turn on CONFIG_SYSVIPC, but most of
> the shm.c code can be split into a new ipc/shm_core.c file that is
> always compiled in, irrespective of CONFIG_SYSVIPC. Linus, do you 
> think this is the proper direction to follow?
> 
> Thanks. Comments and feedback welcome ...

Why do you use this special zero_id stuff? It clutters up the whole
code.

If you would simply open a normal shm segment with key IPC_PRIVATE and
directly remove it, nobody can attach to it and it will be released on
exit and everything. No special handling needed any more. BTW that's
exactly what we do in user space to circumvent the missing MAP_ANON |
MAP_SHARED.
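
Roughly, that user space trick looks like this (a sketch, not our exact
code; the helper name anon_shared is made up):

	#include <stddef.h>
	#include <sys/types.h>
	#include <sys/ipc.h>
	#include <sys/shm.h>

	static void *anon_shared(size_t size)
	{
		int id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0700);
		void *p;

		if (id < 0)
			return (void *) -1;
		p = shmat(id, NULL, 0);
		/* mark it for destruction right away: nobody else can attach,
		   and the segment goes away when the last attach is gone */
		shmctl(id, IPC_RMID, NULL);
		return p;
	}

The attached region survives fork(), so parent and child end up sharing
the pages, which is the MAP_ANON | MAP_SHARED behavior we are after.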

I would also prefer to be able to see the allocated segments with the
ipc* commands.

Greetings
          Christoph

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-02-29 10:54 ` Christoph Rohland
@ 2000-02-29 18:30   ` Kanoj Sarcar
  2000-03-01 12:08     ` Christoph Rohland
  0 siblings, 1 reply; 19+ messages in thread
From: Kanoj Sarcar @ 2000-02-29 18:30 UTC (permalink / raw)
  To: Christoph Rohland; +Cc: linux-mm, linux-kernel, torvalds, Kanoj Sarcar

> Why do you use this special zero_id stuff? It clutters up the whole
> code.

The zero_id stuff is required to differentiate between the struct 
shmid_kernel of a "real" shared memory segment and one that
represents a /dev/zero mapping. This is used in shm_swap_core(),
for accounting purposes, but that can be changed by adding a 
new flag to shm_swap_core. The more important use is in
shm_nopage().

> 
> If you would simply open a normal shm segment with key IPC_PRIVATE and
> directly remove it, nobody can attach to it and it will be released on
> exit and everything. No special handling needed any more. BTW that's
> exactly what we do in user space to circumvent the missing MAP_ANON |
> MAP_SHARED.

Would this need to be done for each /dev/zero mapping? If so, then 
I prefer the way the code is right now, since the interference with
"real" shared memory is minimal. I was also trying to look for a way 
to prevent the zshmid_kernel hacks in shmat/shmctl (including setting
strict permissions), but couldn't come up with one ... Tell me in more
detail what you are suggesting here.

> 
> I would also prefer to be able to see the allocated segments with the
> ipc* commands.
>
I do not believe there is any good reason to expose the special shared
memory segment used as a place holder for all /dev/zero mappings to users
via ipc* commands. This special segment exists only because we 
want to reduce kernel code duplication, and all the zshmid_kernel/
zero_id checks just make sure that regular shared memory works
pretty much the way it did before. (One thing I am unhappy about
is that this special segment eats up a shm id, but that's probably
not too bad). 
 
Thanks.

Kanoj

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-02-29 18:30   ` Kanoj Sarcar
@ 2000-03-01 12:08     ` Christoph Rohland
  2000-03-01 17:34       ` Kanoj Sarcar
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Rohland @ 2000-03-01 12:08 UTC (permalink / raw)
  To: Kanoj Sarcar, Linus Torvalds; +Cc: linux-mm, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 840 bytes --]

Hi Kanoj and Linus,

kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:

> [snip] 
>
> I do not believe there is any good reason to expose the special
> shared memory segment used as a place holder for all /dev/zero
> mappings to users via ipc* commands. This special segment exists
> only because we want to reduce kernel code duplication, and all the
> zshmid_kernel/ zero_id checks just make sure that regular shared
> memory works pretty much the way it did before. (One thing I am
> unhappy about is that this special segment eats up a shm id, but
> that's probably not too bad).

The appended proposal reduces code duplication and complexity a
lot. (The diff against 2.3.47 still needs your patches to the other
files added on top.)

I would vote to apply the diff against 2.3.48 to the standard kernel.
For me the whole solution is still a workaround.

Greetings
		Christoph


[-- Attachment #2: patch shm.c against 2.3.47 --]
[-- Type: text/plain, Size: 951 bytes --]

--- 2.3.47/ipc/shm.c	Wed Mar  1 10:33:26 2000
+++ make48/ipc/shm.c	Wed Mar  1 12:54:49 2000
@@ -11,6 +11,7 @@
  * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
  * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able,
  *                         Christoph Rohland <hans-christoph.rohland@sap.com>
+ * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
  */
 
 #include <linux/config.h>
@@ -1100,3 +1101,24 @@
 	return len;
 }
 #endif
+
+int map_zero_setup(struct vm_area_struct *vma)
+{
+        int shmid;
+	struct shmid_kernel *shp;
+
+	down(&shm_ids.sem);
+	shmid = newseg(IPC_PRIVATE, S_IRWXUGO, vma->vm_end - vma->vm_start);
+        up(&shm_ids.sem);
+        if (shmid < 0)
+                return shmid;
+
+        if (!(shp = shm_lock (shmid)))
+                BUG();
+        shp->shm_perm.mode |= SHM_DEST;
+        shm_unlock (shmid);
+	vma->vm_private_data = shp;
+	vma->vm_ops = &shm_vm_ops;
+	shm_open (vma);
+	return 0;
+}

[-- Attachment #3: patch shm.c against 2.3.48 --]
[-- Type: text/plain, Size: 14877 bytes --]

--- make48/ipc/shm.c.48	Wed Mar  1 10:34:39 2000
+++ make48/ipc/shm.c	Wed Mar  1 12:54:49 2000
@@ -71,13 +71,6 @@
 static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
 #endif
 
-static void zshm_swap (int prio, int gfp_mask, zone_t *zone);
-static void zmap_unuse(swp_entry_t entry, struct page *page);
-static void shmzero_open(struct vm_area_struct *shmd);
-static void shmzero_close(struct vm_area_struct *shmd);
-static int zero_id;
-static struct shmid_kernel zshmid_kernel;
-
 size_t shm_ctlmax = SHMMAX;
 int shm_ctlall = SHMALL;
 int shm_ctlmni = SHMMNI;
@@ -111,8 +104,6 @@
 #ifdef CONFIG_PROC_FS
 	create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL);
 #endif
-	zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, shm_ctlmni);
-	shm_unlock(zero_id);
 	return;
 }
 
@@ -189,26 +180,6 @@
 	return 0;
 }
 
-static inline struct shmid_kernel *newseg_alloc(int numpages)
-{
-	struct shmid_kernel *shp;
-
-	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
-	if (!shp)
-		return 0;
-
-	shp->shm_dir = shm_alloc (numpages);
-	if (!shp->shm_dir) {
-		kfree(shp);
-		return 0;
-	}
-	shp->shm_npages = numpages;
-	shp->attaches = NULL;
-	shp->shm_nattch = 0;
-	init_MUTEX(&shp->sem);
-	return(shp);
-}
-
 static int newseg (key_t key, int shmflg, size_t size)
 {
 	struct shmid_kernel *shp;
@@ -223,8 +194,15 @@
 	if (shm_tot + numpages >= shm_ctlall)
 		return -ENOSPC;
 
-	if (!(shp = newseg_alloc(numpages)))
+	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_KERNEL);
+	if (!shp)
+		return -ENOMEM;
+
+	shp->shm_dir = shm_alloc (numpages);
+	if (!shp->shm_dir) {
+		kfree(shp);
 		return -ENOMEM;
+	}
 	id = ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni);
 	if(id == -1) {
 		shm_free(shp->shm_dir,numpages);
@@ -235,10 +213,13 @@
 	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
 	shp->shm_segsz = size;
 	shp->shm_cpid = current->pid;
-	shp->shm_lpid = 0;
+	shp->attaches = NULL;
+	shp->shm_lpid = shp->shm_nattch = 0;
 	shp->shm_atime = shp->shm_dtime = 0;
 	shp->shm_ctime = CURRENT_TIME;
+	shp->shm_npages = numpages;
 	shp->id = shm_buildid(id,shp->shm_perm.seq);
+	init_MUTEX(&shp->sem);
 
 	shm_tot += numpages;
 	shm_unlock(id);
@@ -275,35 +256,6 @@
 	return err;
 }
 
-static void killseg_core(struct shmid_kernel *shp, int doacc)
-{
-	int i, numpages, rss, swp;
-
-	numpages = shp->shm_npages;
-	for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
-		pte_t pte;
-		pte = SHM_ENTRY (shp,i);
-		if (pte_none(pte))
-			continue;
-		if (pte_present(pte)) {
-			__free_page (pte_page(pte));
-			rss++;
-		} else {
-			swap_free(pte_to_swp_entry(pte));
-			swp++;
-		}
-	}
-	shm_free (shp->shm_dir, numpages);
-	kfree(shp);
-	if (doacc) {
-		shm_lockall();
-		shm_rss -= rss;
-		shm_swp -= swp;
-		shm_tot -= numpages;
-		shm_unlockall();
-	}
-}
-
 /*
  * Only called after testing nattch and SHM_DEST.
  * Here pages, pgtable and shmid_kernel are freed.
@@ -311,6 +263,8 @@
 static void killseg (int shmid)
 {
 	struct shmid_kernel *shp;
+	int i, numpages;
+	int rss, swp;
 
 	down(&shm_ids.sem);
 	shp = shm_lock(shmid);
@@ -331,8 +285,28 @@
 		BUG();
 	shm_unlock(shmid);
 	up(&shm_ids.sem);
-	killseg_core(shp, 1);
 
+	numpages = shp->shm_npages;
+	for (i = 0, rss = 0, swp = 0; i < numpages ; i++) {
+		pte_t pte;
+		pte = SHM_ENTRY (shp,i);
+		if (pte_none(pte))
+			continue;
+		if (pte_present(pte)) {
+			__free_page (pte_page(pte));
+			rss++;
+		} else {
+			swap_free(pte_to_swp_entry(pte));
+			swp++;
+		}
+	}
+	shm_free (shp->shm_dir, numpages);
+	kfree(shp);
+	shm_lockall();
+	shm_rss -= rss;
+	shm_swp -= swp;
+	shm_tot -= numpages;
+	shm_unlockall();
 	return;
 }
 
@@ -485,10 +459,6 @@
 		shp = shm_lock(shmid);
 		if(shp==NULL)
 			return -EINVAL;
-		if (shp == &zshmid_kernel) {
-			shm_unlock(shmid);
-			return -EINVAL;
-		}
 		if(cmd==SHM_STAT) {
 			err = -EINVAL;
 			if (shmid > shm_ids.max_id)
@@ -529,10 +499,6 @@
 		shp = shm_lock(shmid);
 		if(shp==NULL)
 			return -EINVAL;
-		if (shp == &zshmid_kernel) {
-			shm_unlock(shmid);
-			return -EINVAL;
-		}
 		err=-EIDRM;
 		if(shm_checkid(shp,shmid))
 			goto out_unlock;
@@ -567,8 +533,6 @@
 	err=-EINVAL;
 	if(shp==NULL)
 		goto out_up;
-	if (shp == &zshmid_kernel)
-		goto out_unlock_up;
 	err=-EIDRM;
 	if(shm_checkid(shp,shmid))
 		goto out_unlock_up;
@@ -690,8 +654,6 @@
 	shp = shm_lock(shmid);
 	if (!shp)
 		goto out_up;
-	if (shp == &zshmid_kernel)
-		goto out_unlock_up;
 
 	err = -EACCES;
 	if (ipcperms(&shp->shm_perm, flg))
@@ -874,12 +836,10 @@
 	struct shmid_kernel *shp;
 	unsigned int idx;
 	struct page * page;
-	int is_shmzero;
 
 	shp = (struct shmid_kernel *) shmd->vm_private_data;
 	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
 	idx += shmd->vm_pgoff;
-	is_shmzero = (shp->id == zero_id);
 
 	/*
 	 * A shared mapping past the last page of the file is an error
@@ -891,7 +851,7 @@
 		return NULL;
 	}
 	down(&shp->sem);
-	if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
+	if(shp != shm_lock(shp->id))
 		BUG();
 
 	pte = SHM_ENTRY(shp,idx);
@@ -905,7 +865,7 @@
 			if (!page)
 				goto oom;
 			clear_highpage(page);
-			if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
+			if(shp != shm_lock(shp->id))
 				BUG();
 		} else {
 			swp_entry_t entry = pte_to_swp_entry(pte);
@@ -923,11 +883,11 @@
 			delete_from_swap_cache(page);
 			page = replace_with_highmem(page);
 			swap_free(entry);
-			if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))
+			if(shp != shm_lock(shp->id))
 				BUG();
-			if (is_shmzero) shm_swp--;
+			shm_swp--;
 		}
-		if (is_shmzero) shm_rss++;
+		shm_rss++;
 		pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
 		SHM_ENTRY(shp, idx) = pte;
 	} else
@@ -945,65 +905,6 @@
 	return NOPAGE_OOM;
 }
 
-#define OKAY	0
-#define RETRY	1
-#define FAILED	2
-
-static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage)
-{
-	pte_t page;
-	struct page *page_map;
-
-	page = SHM_ENTRY(shp, idx);
-	if (!pte_present(page))
-		return RETRY;
-	page_map = pte_page(page);
-	if (zone && (!memclass(page_map->zone, zone)))
-		return RETRY;
-	if (shp->id != zero_id) swap_attempts++;
-
-	if (--counter < 0) /* failed */
-		return FAILED;
-	if (page_count(page_map) != 1)
-		return RETRY;
-
-	if (!(page_map = prepare_highmem_swapout(page_map)))
-		return FAILED;
-	SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
-
-	/* add the locked page to the swap cache before allowing
-	   the swapin path to run lookup_swap_cache(). This avoids
-	   reading a not yet uptodate block from disk.
-	   NOTE: we just accounted the swap space reference for this
-	   swap cache page at __get_swap_page() time. */
-	add_to_swap_cache(*outpage = page_map, swap_entry);
-	return OKAY;
-}
-
-static void shm_swap_postop(struct page *page)
-{
-	lock_kernel();
-	rw_swap_page(WRITE, page, 0);
-	unlock_kernel();
-	__free_page(page);
-}
-
-static int shm_swap_preop(swp_entry_t *swap_entry)
-{
-	lock_kernel();
-	/* subtle: preload the swap count for the swap cache. We can't
-	   increase the count inside the critical section as we can't release
-	   the shm_lock there. And we can't acquire the big lock with the
-	   shm_lock held (otherwise we would deadlock too easily). */
-	*swap_entry = __get_swap_page(2);
-	if (!(*swap_entry).val) {
-		unlock_kernel();
-		return 1;
-	}
-	unlock_kernel();
-	return 0;
-}
-
 /*
  * Goes through counter = (shm_rss >> prio) present shm pages.
  */
@@ -1012,19 +913,28 @@
 
 int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
+	pte_t page;
 	struct shmid_kernel *shp;
 	swp_entry_t swap_entry;
 	unsigned long id, idx;
 	int loop = 0;
 	int counter;
 	struct page * page_map;
-
-	zshm_swap(prio, gfp_mask, zone);
+	
 	counter = shm_rss >> prio;
 	if (!counter)
 		return 0;
-	if (shm_swap_preop(&swap_entry))
+	lock_kernel();
+	/* subtle: preload the swap count for the swap cache. We can't
+	   increase the count inside the critical section as we can't release
+	   the shm_lock there. And we can't acquire the big lock with the
+	   shm_lock held (otherwise we would deadlock too easily). */
+	swap_entry = __get_swap_page(2);
+	if (!swap_entry.val) {
+		unlock_kernel();
 		return 0;
+	}
+	unlock_kernel();
 
 	shm_lockall();
 check_id:
@@ -1034,12 +944,8 @@
 		swap_idx = 0;
 		if (++swap_id > shm_ids.max_id) {
 			swap_id = 0;
-			if (loop) {
-failed:
-				shm_unlockall();
-				__swap_free(swap_entry, 2);
-				return 0;
-			}
+			if (loop)
+				goto failed;
 			loop = 1;
 		}
 		goto check_id;
@@ -1051,16 +957,43 @@
 	if (idx >= shp->shm_npages)
 		goto next_id;
 
-	switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
-		case RETRY: goto check_table;
-		case FAILED: goto failed;
+	page = SHM_ENTRY(shp, idx);
+	if (!pte_present(page))
+		goto check_table;
+	page_map = pte_page(page);
+	if (zone && (!memclass(page_map->zone, zone)))
+		goto check_table;
+	swap_attempts++;
+
+	if (--counter < 0) { /* failed */
+failed:
+		shm_unlockall();
+		__swap_free(swap_entry, 2);
+		return 0;
 	}
+	if (page_count(page_map) != 1)
+		goto check_table;
+
+	if (!(page_map = prepare_highmem_swapout(page_map)))
+		goto failed;
+	SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);
 	swap_successes++;
 	shm_swp++;
 	shm_rss--;
+
+	/* add the locked page to the swap cache before allowing
+	   the swapin path to run lookup_swap_cache(). This avoids
+	   reading a not yet uptodate block from disk.
+	   NOTE: we just accounted the swap space reference for this
+	   swap cache page at __get_swap_page() time. */
+	add_to_swap_cache(page_map, swap_entry);
 	shm_unlockall();
 
-	shm_swap_postop(page_map);
+	lock_kernel();
+	rw_swap_page(WRITE, page_map, 0);
+	unlock_kernel();
+
+	__free_page(page_map);
 	return 1;
 }
 
@@ -1082,41 +1015,31 @@
 	swap_free(entry);
 }
 
-static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page)
-{
-	int n;
-
-	for (n = 0; n < shp->shm_npages; n++) {
-		if (pte_none(SHM_ENTRY(shp,n)))
-			continue;
-		if (pte_present(SHM_ENTRY(shp,n)))
-			continue;
-		if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
-			shm_unuse_page(shp, n, entry, page);
-			return 1;
-		}
-	}
-	return 0;
-}
-
 /*
  * unuse_shm() search for an eventually swapped out shm page.
  */
 void shm_unuse(swp_entry_t entry, struct page *page)
 {
-	int i;
+	int i, n;
 
 	shm_lockall();
 	for (i = 0; i <= shm_ids.max_id; i++) {
 		struct shmid_kernel *shp = shm_get(i);
 		if(shp==NULL)
 			continue;
-		if (shm_unuse_core(shp, entry, page))
-			goto out;
+		for (n = 0; n < shp->shm_npages; n++) {
+			if (pte_none(SHM_ENTRY(shp,n)))
+				continue;
+			if (pte_present(SHM_ENTRY(shp,n)))
+				continue;
+			if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
+				shm_unuse_page(shp, n, entry, page);
+				goto out;
+			}
+		}
 	}
 out:
 	shm_unlockall();
-	zmap_unuse(entry, page);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1131,10 +1054,6 @@
 
     	for(i = 0; i <= shm_ids.max_id; i++) {
 		struct shmid_kernel* shp = shm_lock(i);
-		if (shp == &zshmid_kernel) {
-			shm_unlock(i);
-			continue;
-		}
 		if(shp!=NULL) {
 #define SMALL_STRING "%10d %10d  %4o %10u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
 #define BIG_STRING   "%10d %10d  %4o %21u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
@@ -1183,137 +1102,23 @@
 }
 #endif
 
-static struct shmid_kernel *zmap_list = 0;
-static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED;
-static unsigned long zswap_idx = 0; /* next to swap */
-static struct shmid_kernel *zswap_shp = 0;
-
-static struct vm_operations_struct shmzero_vm_ops = {
-	open:		shmzero_open,
-	close:		shmzero_close,
-	nopage:		shm_nopage,
-	swapout:	shm_swapout,
-};
-
 int map_zero_setup(struct vm_area_struct *vma)
 {
+        int shmid;
 	struct shmid_kernel *shp;
 
-	if (!(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE)))
-		return -ENOMEM;
-	shp->id = zero_id;	/* hack for shm_lock et al */
+	down(&shm_ids.sem);
+	shmid = newseg(IPC_PRIVATE, S_IRWXUGO, vma->vm_end - vma->vm_start);
+        up(&shm_ids.sem);
+        if (shmid < 0)
+                return shmid;
+
+        if (!(shp = shm_lock (shmid)))
+                BUG();
+        shp->shm_perm.mode |= SHM_DEST;
+        shm_unlock (shmid);
 	vma->vm_private_data = shp;
-	vma->vm_ops = &shmzero_vm_ops;
-	shmzero_open(vma);
-	spin_lock(&zmap_list_lock);
-	shp->attaches = (struct vm_area_struct *)zmap_list;
-	zmap_list = shp;
-	spin_unlock(&zmap_list_lock);
+	vma->vm_ops = &shm_vm_ops;
+	shm_open (vma);
 	return 0;
-}
-
-static void shmzero_open(struct vm_area_struct *shmd)
-{
-	struct shmid_kernel *shp;
-
-	shp = (struct shmid_kernel *) shmd->vm_private_data;
-	down(&shp->sem);
-	shp->shm_nattch++;
-	up(&shp->sem);
-}
-
-static void shmzero_close(struct vm_area_struct *shmd)
-{
-	int done = 0;
-	struct shmid_kernel *shp, *prev, *cur;
-
-	shp = (struct shmid_kernel *) shmd->vm_private_data;
-	down(&shp->sem);
-	if (--shp->shm_nattch == 0)
-		done = 1;
-	up(&shp->sem);
-	if (done) {
-		spin_lock(&zmap_list_lock);
-		if (shp == zswap_shp)
-			zswap_shp = (struct shmid_kernel *)(shp->attaches);
-		if (shp == zmap_list)
-			zmap_list = (struct shmid_kernel *)(shp->attaches);
-		else {
-			prev = zmap_list;
-			cur = (struct shmid_kernel *)(prev->attaches);
-			while (cur != shp) {
-				prev = cur;
-				cur = (struct shmid_kernel *)(prev->attaches);
-			}
-			prev->attaches = (struct vm_area_struct *)(shp->attaches);
-		}
-		spin_unlock(&zmap_list_lock);
-		killseg_core(shp, 0);
-	}
-}
-
-static void zmap_unuse(swp_entry_t entry, struct page *page)
-{
-	struct shmid_kernel *shp;
-
-	spin_lock(&zmap_list_lock);
-	shp = zmap_list;
-	while (shp) {
-		if (shm_unuse_core(shp, entry, page))
-			break;
-		shp = (struct shmid_kernel *)shp->attaches;
-	}
-	spin_unlock(&zmap_list_lock);
-}
-
-static void zshm_swap (int prio, int gfp_mask, zone_t *zone)
-{
-	struct shmid_kernel *shp;
-	swp_entry_t swap_entry;
-	unsigned long idx;
-	int loop = 0;
-	int counter;
-	struct page * page_map;
-
-	counter = 10;	/* maybe we should use zshm_rss */
-	if (!counter)
-		return;
-next:
-	if (shm_swap_preop(&swap_entry))
-		return;
-
-	spin_lock(&zmap_list_lock);
-	if (zmap_list == 0)
-		goto failed;
-next_id:
-	if ((shp = zswap_shp) == 0) {
-		if (loop) {
-failed:
-			spin_unlock(&zmap_list_lock);
-			__swap_free(swap_entry, 2);
-			return;
-		}
-		zswap_shp = shp = zmap_list;
-		zswap_idx = 0;
-		loop = 1;
-	}
-
-check_table:
-	idx = zswap_idx++;
-	if (idx >= shp->shm_npages) {
-		zswap_shp = (struct shmid_kernel *)(zswap_shp->attaches);
-		zswap_idx = 0;
-		goto next_id;
-	}
-
-	switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) {
-		case RETRY: goto check_table;
-		case FAILED: goto failed;
-	}
-	spin_unlock(&zmap_list_lock);
-
-	shm_swap_postop(page_map);
-	if (counter)
-		goto next;
-	return;
 }


* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-01 12:08     ` Christoph Rohland
@ 2000-03-01 17:34       ` Kanoj Sarcar
  2000-03-01 17:55         ` Christoph Rohland
  0 siblings, 1 reply; 19+ messages in thread
From: Kanoj Sarcar @ 2000-03-01 17:34 UTC (permalink / raw)
  To: Christoph Rohland; +Cc: Linus Torvalds, linux-mm, linux-kernel

> 
> Hi Kanoj and Linus,
> 
> kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> 
> > [snip] 
> >
> > I do not believe there is any good reason to expose the special
> > shared memory segment used as a place holder for all /dev/zero
> > mappings to users via ipc* commands. This special segment exists
> > only because we want to reduce kernel code duplication, and all the
> > zshmid_kernel/ zero_id checks just make sure that regular shared
> > memory works pretty much the way it did before. (One thing I am
> > unhappy about is that this special segment eats up a shm id, but
> > that's probably not too bad).
> 
> The appended proposal reduces code duplication and complexity a
> lot. (The diff47 needs your patches against other files added.)
>

What you have sent is what I used as a first draft for the implementation.
The good part of it is that it reduces code duplication. The _really_ bad
part is that it penalizes users in terms of numbers of shared memory 
segments, max size of /dev/zero mappings, and limitations imposed by
shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do not think taking up a 
shmid for each /dev/zero mapping is a good idea ...

Furthermore, I did not want to change behavior of information returned
by ipc* and various procfs commands, as well as swapout behavior, thus
the creation of the zmap_list. I decided a few lines of special case
checking in a handful of places was a much better option.

If the current /dev/zero stuff hampers any plans you have with shm code 
(eg page cachification), I would be willing to talk about it ...

Kanoj

> I would vote to apply diff48 to the standard kernel. For me the whole
> solution is still a workaround. 
> 
> Greetings
> 		Christoph

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-01 17:34       ` Kanoj Sarcar
@ 2000-03-01 17:55         ` Christoph Rohland
  2000-03-01 18:18           ` Kanoj Sarcar
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Rohland @ 2000-03-01 17:55 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Linus Torvalds, linux-mm, linux-kernel

kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:

> What you have sent is what I used as a first draft for the implementation.
> The good part of it is that it reduces code duplication. The _really_ bad
> part is that it penalizes users in terms of numbers of shared memory 
> segments, max size of /dev/zero mappings, and limitations imposed by
> shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do not think taking up a 
> shmid for each /dev/zero mapping is a good idea ...

We can tune all these parameters at runtime. This should not be a
reason.

> Furthermore, I did not want to change behavior of information returned
> by ipc* and various procfs commands, as well as swapout behavior, thus
> the creation of the zmap_list. I decided a few lines of special case
> checking in a handful of places was a much better option.

IMHO all this SYSV ipc stuff is a totally broken API and many agree
with me. I do not mind cluttering up its output a little bit for
this feature.

Nobody can know who is creating private IPC segments. So nobody should
be irritated by some more segments displayed/used.

On the contrary: I like the ability to restrict the usage of these
segments with the ipc parameters. Keep in mind that you can stack up a lot
of segments for a DoS attack, and all those segments will consume the whole
memory.

> If the current /dev/zero stuff hampers any plans you have with shm code 
> (eg page cachification), I would be willing to talk about it ...

It makes shm fs a lot more work. And the special handling slows down
shm handling.

Greetings
		Christoph

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-01 17:55         ` Christoph Rohland
@ 2000-03-01 18:18           ` Kanoj Sarcar
  2000-03-01 19:42             ` Christoph Rohland
  0 siblings, 1 reply; 19+ messages in thread
From: Kanoj Sarcar @ 2000-03-01 18:18 UTC (permalink / raw)
  To: Christoph Rohland; +Cc: Linus Torvalds, linux-mm, linux-kernel

> 
> kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> 
> > What you have sent is what I used as a first draft for the implementation.
> > The good part of it is that it reduces code duplication. The _really_ bad
> > part is that it penalizes users in terms of numbers of shared memory 
> > segments, max size of /dev/zero mappings, and limitations imposed by
> > shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do not think taking up a 
> > shmid for each /dev/zero mapping is a good idea ...
> 
> We can tune all these parameters at runtime. This should not be a
> reason.

Show me the patch ... by the time you are done, you _probably_ would
have complicated the code more than the current /dev/zero tweaks.

> 
> > Furthermore, I did not want to change behavior of information returned
> > by ipc* and various procfs commands, as well as swapout behavior, thus
> > the creation of the zmap_list. I decided a few lines of special case
> > checking in a handful of places was a much better option.
> 
> IMHO all this SYSV ipc stuff is a totally broken API and many agree
> with me. I do not care to clutter up the output of it a little bit for
> this feature.

In reality, /dev/zero should have nothing to do with SYSV. Currently,
it's that way because I wanted to minimize code duplication. Most of
the *_core() routines can be moved into ipc/shm_core.c, which, together
with util.c, will let /dev/zero be decoupled from SYSV.

> 
> Nobody can know who is creating private IPC segments. So nobody should
> be irritated by some more segments displayed/used.

The problem is more with the limits, much less with the output ...

> 
> In the contrary: I like the ability to restrict the usage of these
> segments with the ipc parameters. Keep in mind you can stack a lot of
> segments for a DOS attack. and all the segments will use the whole
> memory.

Not sure what you are talking about here ... /dev/zero mmaps are subject
to the same vm-resource checking as other mmaps, and this checking is
a little different for "real" shm creation.

> 
> > If the current /dev/zero stuff hampers any plans you have with shm code 
> > (eg page cachification), I would be willing to talk about it ...
> 
> It makes shm fs a lot more work. And the special handling slows down
> shm handling.

The shm handling slowdown is minimal. Most of this is an extra check in
shm_nopage(), but that _only_ happens for /dev/zero segments, not for
"real" shm segments.

As to why shm fs is a lot more work, we can talk. Linus/Ingo did bring this
up; at a general level, we think that adding an extra inode or other
data structure in map_zero_setup() would be able to handle this.

If a small amount of special case code is the problem, I would suggest
keeping the code as it is for now. Once you have the shm fs work done for
"real" shm segments, I can look at how to handle /dev/zero segments.

Kanoj

> 
> Greetings
> 		Christoph

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-01 18:18           ` Kanoj Sarcar
@ 2000-03-01 19:42             ` Christoph Rohland
  2000-03-01 20:09               ` Kanoj Sarcar
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Rohland @ 2000-03-01 19:42 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Linus Torvalds, linux-mm, linux-kernel

Hi Kanoj,

kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:

> > kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> > 
> > > What you have sent is what I used as a first draft for the
> > > implementation.  The good part of it is that it reduces code
> > > duplication. The _really_ bad part is that it penalizes users in
> > > terms of numbers of shared memory segments, max size of
> > > /dev/zero mappings, and limitations imposed by
> > > shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do not think taking up a
> > > shmid for each /dev/zero mapping is a good idea ...
> > 
> > We can tune all these parameters at runtime. This should not be a
> > reason.
> 
> Show me the patch ... by the time you are done, you _probably_ would
> have complicated the code more than the current /dev/zero tweaks.

It _is_ tunable in the current code.

> > > Furthermore, I did not want to change behavior of information
> > > returned by ipc* and various procfs commands, as well as swapout
> > > behavior, thus the creation of the zmap_list. I decided a few
> > > lines of special case checking in a handful of places was a much
> > > better option.
> > 
> > IMHO all this SYSV ipc stuff is a totally broken API and many
> > agree with me. I do not care to clutter up the output of it a
> > little bit for this feature.
> 
> In reality, /dev/zero should have nothing to do with
> SYSV. Currently, its that way because I wanted to minimize code
> duplication. Most of the *_core() routines can be taken into
> ipc/shm_core.c, and together with util.c, will let /dev/zero be
> decoupled from SYSV.

That would be the best case and thus your proposal is a workaround
until the pagecache can handle it.

> > Nobody can know who is creating private IPC segments. So nobody
> > should be irritated by some more segments displayed/used.
> 
> The problem is more with the limits, much less with the output ...

And they are tunable...
 
> > In the contrary: I like the ability to restrict the usage of these
> > segments with the ipc parameters. Keep in mind you can stack a lot
> > of segments for a DOS attack. and all the segments will use the
> > whole memory.
> 
> Not sure what you are talking about here ... /dev/zero mmaps are subject
> to the same vm-resource checking as other mmaps, and this checking is
> a little different for "real" shm creation.

Think about first mmapping an anonymous shared segment, touching all
the pages, and then unmapping most of it. Then start over again. You end up
with loads of pages that are used, not freed, and no longer accessible.
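
As a rough sketch of that pattern (illustrative only; the sizes and iteration
count are made up), under the shm-backed design discussed in this thread the
dirtied pages stay allocated as long as any part of each mapping remains
attached:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define SEG_SIZE (1024 * 4096)	/* 4 MB per iteration, an arbitrary choice */

int main(void)
{
	int fd = open("/dev/zero", O_RDWR);
	long pagesize = sysconf(_SC_PAGESIZE);
	long off;
	int i;

	for (i = 0; i < 100; i++) {
		char *p = mmap(NULL, SEG_SIZE, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		for (off = 0; off < SEG_SIZE; off += pagesize)
			p[off] = 1;	/* dirty every page of the segment */
		/* Unmap all but the first page: the still-attached page keeps
		   the backing segment, and hence all its pages, alive. */
		munmap(p + pagesize, SEG_SIZE - pagesize);
	}
	printf("dirtied %d MB but kept only one page per mapping mapped\n",
	       100 * (SEG_SIZE / (1024 * 1024)));
	return 0;
}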

> > > If the current /dev/zero stuff hampers any plans you have with shm code 
> > > (eg page cachification), I would be willing to talk about it ...
> > 
> > It makes shm fs a lot more work. And the special handling slows down
> > shm handling.
> 
> The shm handling slow down is minimal. Most of this is an extra
> check in shm_nopage(), but that _only_ happens for /dev/zero
> segments, not for "real" shm segments.

The check happens for _all_ segments and shm_nopage can be called very
often on big machines under heavy load.

> As to why shm fs is a lot more work, we can talk. Linus/Ingo did
> bring this up, at a general level, we think that adding an extra
> inode or other data structure in map_zero_setup() would be able to
> handle this.
> 
> If a small amount of special case code is the problem, I would suggest 
> keep the code as it is for now. Once you have the shm fs work done for 
> "real" shm segments, I can look at how to handle /dev/zero segments.

shm fs is working. I will send a patch against 2.3.48 soon.

Greetings
		Christoph

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-01 19:42             ` Christoph Rohland
@ 2000-03-01 20:09               ` Kanoj Sarcar
  2000-03-06 22:43                 ` Stephen C. Tweedie
  0 siblings, 1 reply; 19+ messages in thread
From: Kanoj Sarcar @ 2000-03-01 20:09 UTC (permalink / raw)
  To: Christoph Rohland; +Cc: Linus Torvalds, linux-mm, linux-kernel

> 
> Hi Kanoj,
> 
> kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> 
> > > kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> > > 
> > > > What you have sent is what I used as a first draft for the
> > > > implementation.  The good part of it is that it reduces code
> > > > duplication. The _really_ bad part is that it penalizes users in
> > > > terms of numbers of shared memory segments, max size of
> > > > /dev/zero mappings, and limitations imposed by
> > > > shm_ctlmax/shm_ctlall/shm_ctlmni etc. I do not think taking up a
> > > > shmid for each /dev/zero mapping is a good idea ...
> > > 
> > > We can tune all these parameters at runtime. This should not be a
> > > reason.
> > 
> > Show me the patch ... by the time you are done, you _probably_ would
> > have complicated the code more than the current /dev/zero tweaks.
> 
> It _is_ tunable in the current code.

Oh, you are talking about asking administrators to do this tuning, which
is unfair. I was talking about automatic in-kernel tuning whenever a new
/dev/zero segment is created/destroyed etc ...

> 
> > > > Furthermore, I did not want to change behavior of information
> > > > returned by ipc* and various procfs commands, as well as swapout
> > > > behavior, thus the creation of the zmap_list. I decided a few
> > > > lines of special case checking in a handful of places was a much
> > > > better option.
> > > 
> > > IMHO all this SYSV ipc stuff is a totally broken API and many
> > > agree with me. I do not care to clutter up the output of it a
> > > little bit for this feature.
> > 
> > In reality, /dev/zero should have nothing to do with
> > SYSV. Currently, its that way because I wanted to minimize code
> > duplication. Most of the *_core() routines can be taken into
> > ipc/shm_core.c, and together with util.c, will let /dev/zero be
> > decoupled from SYSV.
> 
> That would be the best case and thus your proposal is a workaround
> until the pagecache can handle it.
> 
> > > Nobody can know who is creating private IPC segments. So nobody
> > > should be irritated by some more segments displayed/used.
> > 
> > The problem is more with the limits, much less with the output ...
> 
> And they are tunable...

See above ...

>  
> > > In the contrary: I like the ability to restrict the usage of these
> > > segments with the ipc parameters. Keep in mind you can stack a lot
> > > of segments for a DOS attack. and all the segments will use the
> > > whole memory.
> > 
> > Not sure what you are talking about here ... /dev/zero mmaps are subject
> > to the same vm-resource checking as other mmaps, and this checking is
> > a little different for "real" shm creation.
> 
> Think about first mmaping an anonymous shared segment, touching all
> the pages, unmapping most of it. Then start over again. You end up
> with loads of pages used and not freed and no longer accessible.

True ... and this can be independently handled via a shmzero_unmap()
type operation in shmzero_vm_ops (I never claimed the /dev/zero stuff
is complete :-)). Even in the absence of that, vm_enough_memory() should
in theory be able to prevent deadlocks, with all its known caveats ...

> 
> > > > If the current /dev/zero stuff hampers any plans you have with shm code 
> > > > (eg page cachification), I would be willing to talk about it ...
> > > 
> > > It makes shm fs a lot more work. And the special handling slows down
> > > shm handling.
> > 
> > The shm handling slow down is minimal. Most of this is an extra
> > check in shm_nopage(), but that _only_ happens for /dev/zero
> > segments, not for "real" shm segments.
> 
> The check happens for _all_ segments and shm_nopage can be called very
> often on big machines under heavy load.

I was talking about the 

	if ((shp != shm_lock(shp->id)) && (is_shmzero == 0))

checks in shm_nopage.

> 
> > As to why shm fs is a lot more work, we can talk. Linus/Ingo did
> > bring this up, at a general level, we think that adding an extra
> > inode or other data structure in map_zero_setup() would be able to
> > handle this.
> > 
> > If a small amount of special case code is the problem, I would suggest 
> > keep the code as it is for now. Once you have the shm fs work done for 
> > "real" shm segments, I can look at how to handle /dev/zero segments.
> 
> shm fs is working. I will send a patch against 2.3.48 soon.

Great, we can see what makes sense for /dev/zero wrt shmfs ...

Kanoj

> 
> Greetings
> 		Christoph
> 


* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-01 20:09               ` Kanoj Sarcar
@ 2000-03-06 22:43                 ` Stephen C. Tweedie
  2000-03-06 23:01                   ` Kanoj Sarcar
  0 siblings, 1 reply; 19+ messages in thread
From: Stephen C. Tweedie @ 2000-03-06 22:43 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Christoph Rohland, Linus Torvalds, linux-mm, linux-kernel

Hi,

On Wed, 1 Mar 2000 12:09:11 -0800 (PST), kanoj@google.engr.sgi.com
(Kanoj Sarcar) said:

> Great, we can see what makes sense for /dev/zero wrt shmfs ...

In principle, there is no reason why we need any special support for
this sort of thing.  The swap cache already does very nearly all we
need.

The swap cache maintains links between swap locations on disk and
(potentially shared) anonymous pages in memory.  It was designed so that
even if you fork, as long as the resulting COW pages remain unchanged,
the two processes will share the same pages of physical memory or swap
no matter what.

Basically, as soon as you try to swap out an anonymous page from one
process, a swap entry is allocated and a swap cache entry is set up.
Only once the last process swaps out does the physical page in memory
get released.  Until that happens, any lookup of the swap entry from a
swapped-out process will find the page already in memory.

To make this work for shared anonymous pages, we need two changes to the
swap cache.  We need to teach the swap cache about writable anonymous
pages, and we need to be able to defer the physical writing of the page
to swap until the last reference to the swap cache frees up the page.
Do that, and shared /dev/zero maps will Just Work.
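
A toy user-space model of that lifetime rule (purely illustrative, not kernel
code, and all names here are made up) would pair a swap slot with an
in-memory page and only write the page out, and release it, once the last
mapper drops its reference:

#include <stdio.h>
#include <stdlib.h>

struct swap_cache_entry {
	int swap_slot;	/* location reserved on swap */
	char *page;	/* in-memory copy, released after the last unmap */
	int refs;	/* mappers still referencing the page */
	int dirty;	/* modified since it was last written out */
};

static void drop_ref(struct swap_cache_entry *e)
{
	if (--e->refs > 0)
		return;		/* someone still shares the in-memory page */
	if (e->dirty)
		printf("deferred write of slot %d to swap\n", e->swap_slot);
	free(e->page);
	e->page = NULL;
}

int main(void)
{
	struct swap_cache_entry e;

	e.swap_slot = 42;
	e.page = malloc(4096);
	e.refs = 2;		/* two processes map the shared anonymous page */
	e.dirty = 1;

	drop_ref(&e);		/* first unmap: the page stays in memory */
	drop_ref(&e);		/* last unmap: only now does it hit the disk */
	return 0;
}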

The mechanics of this are probably a little too subtle to get right for
2.4 at this stage (in particular you need to know when to write the page
back to disk: we don't have the necessary PAGE_DIRTY infrastructure in
place for anonymous pages), but I definitely think this is the right way
to do things in the long term.

--Stephen

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-06 22:43                 ` Stephen C. Tweedie
@ 2000-03-06 23:01                   ` Kanoj Sarcar
  2000-03-08 12:02                     ` Christoph Rohland
  0 siblings, 1 reply; 19+ messages in thread
From: Kanoj Sarcar @ 2000-03-06 23:01 UTC (permalink / raw)
  To: Stephen C. Tweedie
  Cc: Christoph Rohland, Linus Torvalds, linux-mm, linux-kernel

> 
> To make this work for shared anonymous pages, we need two changes to the
> swap cache.  We need to teach the swap cache about writable anonymous
> pages, and we need to be able to defer the physical writing of the page
> to swap until the last reference to the swap cache frees up the page.
> Do that, and shared /dev/zero maps will Just Work.
>

The current implementation of /dev/zero shared memory is to treat the
mapping as similarly as possible to a shared memory segment. The common
code handles the swap cache interactions, and both cases qualify as shared
anonymous mappings. While it's not well tested, in theory it should work.
We are currently agonizing over how to integrate the /dev/zero code with
the shmfs patch.

Kanoj

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-06 23:01                   ` Kanoj Sarcar
@ 2000-03-08 12:02                     ` Christoph Rohland
  2000-03-08 17:51                       ` Kanoj Sarcar
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Rohland @ 2000-03-08 12:02 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar

Hi Kanoj,

kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> > To make this work for shared anonymous pages, we need two changes
> > to the swap cache.  We need to teach the swap cache about writable
> > anonymous pages, and we need to be able to defer the physical
> > writing of the page to swap until the last reference to the swap
> > cache frees up the page.  Do that, and shared /dev/zero maps will
> > Just Work.
> 
> The current implementation of /dev/zero shared memory is to treat
> the mapping as similarly as possible to a shared memory segment. The
> common code handles the swap cache interactions, and both cases
> qualify as shared anonymous mappings. While its not well tested, in
> theory it should work. We are currently agonizing over how to
> integrate the /dev/zero code with shmfs patch.
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Since this is not as easy as you thought, wouldn't it be better to do
the /dev/zero shared maps in the swap cache instead of this workaround
over shm? That way we would get the mechanisms to redo all the shm stuff
wrt the swap cache.

At the same time we would not hinder the development of the normal shm
code towards file semantics (aka shm fs), which will give us POSIX shm.

Greetings
		Christoph

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-08 12:02                     ` Christoph Rohland
@ 2000-03-08 17:51                       ` Kanoj Sarcar
  2000-03-08 18:35                         ` Christoph Rohland
  0 siblings, 1 reply; 19+ messages in thread
From: Kanoj Sarcar @ 2000-03-08 17:51 UTC (permalink / raw)
  To: Christoph Rohland
  Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar

> 
> Hi Kanoj,
> 
> kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> > > To make this work for shared anonymous pages, we need two changes
> > > to the swap cache.  We need to teach the swap cache about writable
> > > anonymous pages, and we need to be able to defer the physical
> > > writing of the page to swap until the last reference to the swap
> > > cache frees up the page.  Do that, and shared /dev/zero maps will
> > > Just Work.
> > 
> > The current implementation of /dev/zero shared memory is to treat
> > the mapping as similarly as possible to a shared memory segment. The
> > common code handles the swap cache interactions, and both cases
> > qualify as shared anonymous mappings. While its not well tested, in
> > theory it should work. We are currently agonizing over how to
> > integrate the /dev/zero code with shmfs patch.
>   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> Since this is not as easy as you thought, wouldn't it be better to do 
> the /dev/zero shared maps in the swap cache instead of this workaround
> over shm? Thus we would get the mechanisms to redo all shm stuff wrt
> swap cache.
> 
> At the same time we would not hinder the development of normal shm
> code to use file semantics (aka shm fs) which will give us posix shm.
> 
> Greetings
> 		Christoph
> 

I am not sure why you think the /dev/zero code is a workaround on top
of shm. A lot of code and mechanisms are easily sharable between shm  
and /dev/zero, since they are, as I pointed out, anonymous shared
pages. The only differences are when the data structures are torn 
down, and which processes may attach to the segments. 

Btw, implementing /dev/zero mostly using shm code is _quite_ easy;
that's how the code has been since 2.3.48. Even integrating with
shmfs has been pretty easy, as you have seen in the patches I have
CCed you on. The harder part is to look towards the future and do
what Linus suggested, namely associate each mapping with an inode
so that in the future the inodecache might possibly be used to manage
the shm pages. As you know, I sent out a patch for that yesterday.

It's completely okay by me to take in a dev-zero/shmfs integration
patch that is not perfect wrt /dev/zero, as I have indicated to
you and Linus, just so that the shmfs work gets in. I can fix
minor problems with the /dev/zero code as they come up.

What sct suggests is quite involved, as he himself mentions. Just
implementing /dev/zero is probably not a good reason to undertake
it.

Hope this makes sense.

Kanoj

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-08 17:51                       ` Kanoj Sarcar
@ 2000-03-08 18:35                         ` Christoph Rohland
  2000-03-08 18:48                           ` Linus Torvalds
  2000-03-08 18:57                           ` Kanoj Sarcar
  0 siblings, 2 replies; 19+ messages in thread
From: Christoph Rohland @ 2000-03-08 18:35 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar

kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:

> I am not sure why you think the /dev/zero code is a workaround on
> top of shm. A lot of code and mechanisms are easily sharable between
> shm and /dev/zero, since they are, as I pointed out, anonymous
> shared pages. The only differences are when the data structures are
> torn down, and which processes may attach to the segments.

Because I think the current shm code should be redone in a way that
shared anonymous pages live in the swap cache. You could say the shm
code is a workaround :-)

> Btw, implementing /dev/zero using shm code mostly is _quite_ easy,
> that's how the code has been since 2.3.48. Even integrating with
> shmfs has been pretty easy, as you have seen in the patches I have
> CCed you on. The harder part is to look towards the future and do
> what Linus suggested, namely associate each mapping with an inode so
> in the future the inodecache might possibly be used to manage the
> shm pages. As you know, I sent out a patch for that yesterday.

In my opinion this is one of two orthogonal steps; shm fs targets
better integration with the file system semantics.

> Its completely okay by me to take in a dev-zero/shmfs integration
> patch that is not perfect wrt /dev/zero, as I have indicated to 
> you and Linus, just so that the shmfs work gets in. I can fix
> minor problems with the /dev/zero code as they come up.
> 
> What sct suggests is quite involved, as he himself mentions. Just
> implementing /dev/zero is probably not a good reason to undertake
> it.

But IMHO reworking shm based on the /dev/zero stuff would be a good
reason to do the /dev/zero stuff right. That's all I wanted to say in
my last mail.

Perhaps I am a little bit too opposed to these changes because I have
seen too many patches thrown at the shm code during the 2.3 cycle
which were plain buggy and nobody cared. Most of my kernel work for
some time now has been running quite stupid tests on the shm code.

BTW: I am currently running these tests on your patch and it seems to work
quite well. (I will let it run overnight.) If it survives that I will
also throw some quite complicated /dev/zero tests at it later.

Greetings
		Christoph

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-08 18:35                         ` Christoph Rohland
@ 2000-03-08 18:48                           ` Linus Torvalds
  2000-03-08 18:57                           ` Kanoj Sarcar
  1 sibling, 0 replies; 19+ messages in thread
From: Linus Torvalds @ 2000-03-08 18:48 UTC (permalink / raw)
  To: Christoph Rohland; +Cc: Kanoj Sarcar, Stephen C. Tweedie, linux-mm, Ingo Molnar


On 8 Mar 2000, Christoph Rohland wrote:
> 
> Because I think the current shm code should be redone in a way that
> shared anonymous pages live in the swap cache. You could say the shm
> code is a workaround :-)

Note that this is true of both shm AND /dev/zero.

Whether it is done in the current really clunky manner (ugly special shm
tables) or in the RightWay(tm) (page cache), shm and /dev/zero should
always basically be the same. The only difference between shm and
/dev/zero is how you access and set up mappings, not how the actual
mapping then works.

		Linus


* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-08 18:35                         ` Christoph Rohland
  2000-03-08 18:48                           ` Linus Torvalds
@ 2000-03-08 18:57                           ` Kanoj Sarcar
  2000-03-09 18:15                             ` Christoph Rohland
  1 sibling, 1 reply; 19+ messages in thread
From: Kanoj Sarcar @ 2000-03-08 18:57 UTC (permalink / raw)
  To: Christoph Rohland
  Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar

> 
> kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> 
> > I am not sure why you think the /dev/zero code is a workaround on
> > top of shm. A lot of code and mechanisms are easily sharable between
> > shm and /dev/zero, since they are, as I pointed out, anonymous
> > shared pages. The only differences are when the data structures are
> > torn down, and which processes may attach to the segments.
> 
> Because I think the current shm code should be redone in a way that
> shared anonymous pages live in the swap cache. You could say the shm
> code is a workaround :-)
>

No arguments there :-) But it seems a little ambitious to me to
implement shmfs, dev-zero and rework the swapcache all at the same time.
We are probably on the right track, having done /dev/zero and now getting
shmfs done. Then, if we want to take the risk, we can improve
the core shm/swapcache interactions in 2.3/2.4/2.5.

> > Btw, implementing /dev/zero using shm code mostly is _quite_ easy,
> > that's how the code has been since 2.3.48. Even integrating with
> > shmfs has been pretty easy, as you have seen in the patches I have
> > CCed you on. The harder part is to look towards the future and do
> > what Linus suggested, namely associate each mapping with an inode so
> > in the future the inodecache might possibly be used to manage the
> > shm pages. As you know, I sent out a patch for that yesterday.
> 
> In my opinion this is one of two orthogonal steps. shm fs targets the
> better integration in the file system semantics.
> 
> > Its completely okay by me to take in a dev-zero/shmfs integration
> > patch that is not perfect wrt /dev/zero, as I have indicated to 
> > you and Linus, just so that the shmfs work gets in. I can fix
> > minor problems with the /dev/zero code as they come up.
> > 
> > What sct suggests is quite involved, as he himself mentions. Just
> > implementing /dev/zero is probably not a good reason to undertake
> > it.
> 
> But IMHO reworking shm based on the /dev/zero stuff would be a good
> reason to do the /dev/zero stuff right. That's all I wanted to say in
> my last mail.
> 
> Perhaps I am a little bit too opposed to these changes because I have
> seen too many patches thrown on the shm code during the 2.3 cycle
> which were plain buggy and nobody cared. Most of my kernel work since
> some time is doing quite stupid tests on the shm code.
> 
> BTW: I am just running these tests on your patch and it seems to work
> quite well. (I will let it run over night) If it survives that I will
> also throw some quite complicated /dev/zero tests on it later.

Great! Is there a way to capture tests written by individual developers
and testers and have them be shared, so everyone can use them (when there
are no licensing/copyright issues)? I would definitely like to get your
test programs and throw them at the kernel myself. Let's talk offline if you
are willing to share your tests.

Kanoj

> 
> Greetings
> 		Christoph

* Re: [RFC] [RFT] Shared /dev/zero mmaping feature
  2000-03-08 18:57                           ` Kanoj Sarcar
@ 2000-03-09 18:15                             ` Christoph Rohland
  0 siblings, 0 replies; 19+ messages in thread
From: Christoph Rohland @ 2000-03-09 18:15 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Stephen C. Tweedie, Linus Torvalds, linux-mm, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 3822 bytes --]


kanoj@google.engr.sgi.com (Kanoj Sarcar) writes:
> > Because I think the current shm code should be redone in a way that
> > shared anonymous pages live in the swap cache. You could say the shm
> > code is a workaround :-)
> 
> No arguments there :-) But it seems a little ambitious to me to
> implement shmfs, dev-zero and rework the swapcache at the same time.
> We are probably on the right track, having done /dev/zero, getting 
> done with shmfs. Then, if we want to take the risk, we can improve
> the core shm/swapcache interactions in 2.3/2.4/2.5.

O.K. I would really like to work on that further.

> > BTW: I am just running these tests on your patch and it seems to work
> > quite well. (I will let it run over night) If it survives that I will
> > also throw some quite complicated /dev/zero tests on it later.
> 
> Great! Is there a way to capture tests written by individual
> developers and testers and have them be shared, so everyone can use
> them (when there are no licensing/copyright issues)? I would
> definitely like to get your test programs and throw them at the
> kernel myself. Lets talk offline if you are willing to share your
> tests.

Unfortunately I do not have web/ftp space to put the programs on, but
they are in the attachments. I normally run them in two steps:
1) Many processes running against a number of segments which fit into
   the main memory.
2) Two processes which access more segments, so that the machine starts
   swapping.
All processes run with a 20% probability of deletion.
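
Illustrative invocations (the numbers are made up; the argument order is
segs size proc iter rm%, where iter is the log2 of the iteration count, and
shmtst additionally needs something mounted on /var/shm, see PREFIX in
libposix4.c):

$ gcc -o ipctst ipctst.c
$ gcc -o shmtst shmtst.c libposix4.c

# step 1: many processes against SysV segments that fit in main memory
$ ./ipctst 64 1048576 16 20 20

# step 2: two processes over enough memory to force swapping
$ ./ipctst 256 4194304 2 22 20

# same pattern through the shm_open/shm_unlink wrappers (file-backed path)
$ ./shmtst 64 1048576 4 20 20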

My tests with your patch have so far gone mostly well. The machine locks up
after a while or kills init, but this is AFAICS stock behaviour.

One bug was introduced by your patch. On swapoff we get an oops:

ksymoops 0.7c on i686 2.3.50.  Options used
     -V (default)
     -K (specified)
     -l /proc/modules (default)
     -o /lib/modules/2.3.50/ (default)
     -m /boot/System.map-2.3.50 (specified)

No modules in ksyms, skipping objects
No ksyms, skipping lsmod
Unable to handle kernel NULL pointer dereference at virtual address 00000000
*pde = 21118001
Oops: 0000
CPU:    7
EIP:    0010:[<c0185d17>]
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010246
eax: 00000000   ebx: 00000000   ecx: 00000000   edx: 00000000
esi: 00000000   edi: c029ac74   ebp: c0185474   esp: d1c63f24
ds: 0018   es: 0018   ss: 0018
Process swapoff (pid: 624, stackpage=d1c63000)
Stack: c029ac74 c032cb74 c19daa94 02944c00 00000000 c0186889 c029ac74 02944c00 
       c19daa94 00000001 c19daa94 02944c00 0002944c c0185e62 02944c00 c19daa94 
       c02aa000 c19daa94 02944c00 c013baa7 02944c00 c19daa94 d1c62000 00000000 
Call Trace: [<c0186889>] [<c0185e62>] [<c013baa7>] [<c013bec9>] [<c010b49c>] 
Code: 8b 14 91 8b 04 da 89 44 24 10 0b 44 da 04 74 3e 8b 04 da 8b 

>>EIP; c0185d17 <shm_unuse_core+33/90>   <=====
Trace; c0186889 <zmap_unuse+121/1d8>
Trace; c0185e62 <shm_unuse+ee/f8>
Trace; c013baa7 <try_to_unuse+1fb/3a0>
Trace; c013bec9 <sys_swapoff+27d/4b8>
Trace; c010b49c <system_call+34/38>
Code;  c0185d17 <shm_unuse_core+33/90>
00000000 <_EIP>:
Code;  c0185d17 <shm_unuse_core+33/90>   <=====
   0:   8b 14 91                  movl   (%ecx,%edx,4),%edx   <=====
Code;  c0185d1a <shm_unuse_core+36/90>
   3:   8b 04 da                  movl   (%edx,%ebx,8),%eax
Code;  c0185d1d <shm_unuse_core+39/90>
   6:   89 44 24 10               movl   %eax,0x10(%esp,1)
Code;  c0185d21 <shm_unuse_core+3d/90>
   a:   0b 44 da 04               orl    0x4(%edx,%ebx,8),%eax
Code;  c0185d25 <shm_unuse_core+41/90>
   e:   74 3e                     je     4e <_EIP+0x4e> c0185d65 <shm_unuse_core+81/90>
Code;  c0185d27 <shm_unuse_core+43/90>
  10:   8b 04 da                  movl   (%edx,%ebx,8),%eax
Code;  c0185d2a <shm_unuse_core+46/90>
  13:   8b 00                     movl   (%eax),%eax

Greetings
		Christoph

-- 

[-- Attachment #2: ipctst.c --]
[-- Type: text/plain, Size: 1695 bytes --]

/* SysV shm stress test: several processes repeatedly shmget/shmat random
   segments, write and verify a per-page pattern, and remove segments with
   the given rm% probability. */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>	/* fork, getpid */
#include <sched.h>	/* sched_yield */

#include <errno.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main (int ac, char **av) {
	int segs, size, proc, rmpr;
	unsigned long long iter;
	struct shmid_ds buf;
	pid_t pid;

	if (ac < 6) {
		printf ("usage: shmtst segs size proc iter rm%%\n");
		exit (1);
	}
	segs = atoi (av[1]);
	size = atoi (av[2]);
	proc = atoi (av[3]);
	iter = atoi (av[4]);
	rmpr = atoi (av[5]);

	iter = 1ULL << iter;	/* av[4] is the log2 of the iteration count */
	printf ("using %d segs of size %d (%llu iterations)\n", 
		segs, size, iter);
	while (-- proc) {
		if ((pid = fork()) > 0) {
			printf ("started process %d\n", (int) pid);
		} else {
			break;
		}
	}
	srandom (getpid());
	while (iter--) {
		key_t key;
		int seg, i;
		unsigned char c, *ptr;
		volatile unsigned char *p;

		key = random() % segs +1;
		if ((seg = shmget (key, size, IPC_CREAT| 0600)) == -1) {
			perror("shmget");
			if (errno != EIDRM)
				exit (1);
			continue;
		}
		if (0) sched_yield();
		if ((ptr = shmat (seg, 0, 0)) == (unsigned char *) -1) {
			perror ("shmat");
			continue;
		}
		for (p = ptr; p < ptr + size; p += 4097)
			*p = (unsigned char) (p - ptr);
		for (p = ptr; p < ptr + size; p += 4097) {
			c = *p;
			if (c == (unsigned char)(p-ptr)) 
				continue;
			shmctl (seg, IPC_STAT, &buf);
			printf ("n=%i, m = %i: %i != %i", (int) buf.shm_nattch,
				(int)buf.shm_perm.mode,
				(int)(unsigned char)(p-ptr), (int) c);
			for (i = 0 ; i < 5; i++) {
				printf (", %i", (int)*p);
				sched_yield();
			}
			printf ("\n");
		}

		if (shmdt (ptr) != 0) {
			perror("shmdt");
			exit (1);
		}
		if (random () % 100 < rmpr &&
		    shmctl (seg, IPC_RMID, NULL) == -1) 
			perror("shmctl IPC_RMID");
	}
}	

[-- Attachment #3: shmtst.c --]
[-- Type: text/plain, Size: 1540 bytes --]

/* Same stress pattern as ipctst.c, but through the shm_open/shm_unlink
   wrappers in libposix4.c, i.e. file-backed shared mappings. */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>	/* fork, ftruncate, close */

#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>

int main (int ac, char **av) {
	int segs, size, proc, rmpr;
	unsigned long long iter;
	pid_t pid;
	struct stat buf;

	if (ac < 6) {
		printf ("usage: shmtst segs size proc iter rm%%\n");
		exit (1);
	}
	segs = atoi (av[1]);
	size = atoi (av[2]);
	proc = atoi (av[3]);
	iter = atoi (av[4]);
	rmpr = atoi (av[5]);

	iter = 1ULL << iter;	/* av[4] is the log2 of the iteration count */
	printf ("using %d segs of size %d (%llu iterations)\n", 
		segs, size, iter);
	while (-- proc) {
		if ((pid = fork()) > 0) {
			printf ("started process %d\n", (int) pid);
		} else {
			break;
		}
	}
	srandom (getpid());
	while (iter--) {
		int seg, key;
		char name[20];
		char *ptr, *p;

		key = random() % segs;
		sprintf (name, "my%d", key);
		if ((seg = shm_open (name, O_CREAT| O_RDWR , 0666)) == -1) {
			perror("shm_open");
			exit (1);
		}
	        if (ftruncate (seg, size) == -1) {
			perror ("ftruncate");
			exit (2);
		}
	        if ((ptr = mmap (0, size, PROT_READ|PROT_WRITE,MAP_SHARED,
				 seg, 0)) == MAP_FAILED) {	
			perror ("mmap");
			exit (3);
		}
		for (p = ptr; p < ptr + size; p += 4097)
			*p = (char) p;
		for (p = ptr; p < ptr + size; p += 4097)
			if (*p != (char)p)
				printf ("*p(%i) != p(%i)\n", (int)*p, (int)p & 0xff);

	    	if (munmap (ptr, size) == -1)
			perror ("munmap");

	    	if (close(seg) == -1)
			perror ("close");

		if (random () % 100 < rmpr &&
		    shm_unlink (name) == -1) 
			perror("shm_unlink");
	}
}	

[-- Attachment #4: libposix4.c --]
[-- Type: text/plain, Size: 408 bytes --]

/* Minimal shm_open/shm_unlink shims that map POSIX shm names onto files
   under /var/shm. */
#include <stdio.h>	/* sprintf */
#include <string.h>	/* strlen */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>	/* open */
#include <unistd.h>	/* unlink */

#define PREFIX "/var/shm"
int shm_open (const char *pathname, int flags, int mode) {
	char name[strlen(pathname) + sizeof(PREFIX) + 1];	/* room for the '/' separator */
	sprintf (name, PREFIX "/%s", pathname);
	return open (name, flags, mode);
}

int shm_unlink(const char *pathname){
	char name[strlen(pathname) + sizeof(PREFIX) + 1];	/* room for the '/' separator */
	sprintf (name, PREFIX "/%s", pathname);
	return unlink (name);
}





Thread overview: 19+ messages
2000-02-25 23:08 [RFC] [RFT] Shared /dev/zero mmaping feature Kanoj Sarcar
2000-02-26 16:38 ` Linus Torvalds
2000-02-26 21:47   ` Kanoj Sarcar
2000-02-29 10:54 ` Christoph Rohland
2000-02-29 18:30   ` Kanoj Sarcar
2000-03-01 12:08     ` Christoph Rohland
2000-03-01 17:34       ` Kanoj Sarcar
2000-03-01 17:55         ` Christoph Rohland
2000-03-01 18:18           ` Kanoj Sarcar
2000-03-01 19:42             ` Christoph Rohland
2000-03-01 20:09               ` Kanoj Sarcar
2000-03-06 22:43                 ` Stephen C. Tweedie
2000-03-06 23:01                   ` Kanoj Sarcar
2000-03-08 12:02                     ` Christoph Rohland
2000-03-08 17:51                       ` Kanoj Sarcar
2000-03-08 18:35                         ` Christoph Rohland
2000-03-08 18:48                           ` Linus Torvalds
2000-03-08 18:57                           ` Kanoj Sarcar
2000-03-09 18:15                             ` Christoph Rohland
