* [RFC][PATCH 3/5] ksm source code
@ 2008-01-21 16:12 Izik Eidus
2008-01-24 7:24 ` [kvm-devel] " Chris Wright
0 siblings, 1 reply; 9+ messages in thread
From: Izik Eidus @ 2008-01-21 16:12 UTC (permalink / raw)
To: kvm-devel, andrea, avi, dor.laor, linux-mm, yaniv
[-- Attachment #1: Type: text/plain, Size: 12 bytes --]
--
woof.
[-- Attachment #2: ksm.c --]
[-- Type: text/x-csrc, Size: 20454 bytes --]
/*
* Memory merging driver for Linux
*
* This module enables dynamic sharing of identical pages found in different
* memory areas, even if they are not shared by fork()
*
* Copyright (C) 2008 Qumranet, Inc.
*
* This work is licensed under the terms of the GNU GPL, version 2.
*
*/
#include "ksm.h"
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/anon_inodes.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <asm-x86/kvm_host.h>
#include <linux/jhash.h>
#include "linux/ksm.h"
#include <asm/tlbflush.h>
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
struct ksm *ksm;
static int page_hash_size = 0;
module_param(page_hash_size, int, 0);
MODULE_PARM_DESC(page_hash_size, "Hash table size for the pages checksum");
static int rmap_hash_size = 0;
module_param(rmap_hash_size, int, 0);
MODULE_PARM_DESC(rmap_hash_size, "Hash table size for the reverse mapping");
int ksm_slab_init(void)
{
int ret = 1;
ksm->page_item_cache = kmem_cache_create("ksm_page_item",
sizeof(struct page_item), 0,
0, NULL);
if (!ksm->page_item_cache)
goto out;
ksm->rmap_item_cache = kmem_cache_create("ksm_rmap_item",
sizeof(struct rmap_item), 0,
0, NULL);
if (!ksm->rmap_item_cache)
goto out_free;
return 0;
out_free:
kmem_cache_destroy(ksm->page_item_cache);
out:
return ret;
}
void ksm_slab_free(void)
{
kmem_cache_destroy(ksm->rmap_item_cache);
kmem_cache_destroy(ksm->page_item_cache);
}
static struct page_item *alloc_page_item(void)
{
void *obj;
obj = kmem_cache_zalloc(ksm->page_item_cache, GFP_KERNEL);
return (struct page_item *)obj;
}
static void free_page_item(struct page_item *page_item)
{
kfree(page_item);
}
static struct rmap_item *alloc_rmap_item(void)
{
void *obj;
obj = kmem_cache_zalloc(ksm->rmap_item_cache, GFP_KERNEL);
return (struct rmap_item *)obj;
}
static void free_rmap_item(struct rmap_item *rmap_item)
{
kfree(rmap_item);
}
static int inline PageKsm(struct page *page)
{
return !PageAnon(page);
}
static int page_hash_init(void)
{
if (!page_hash_size) {
struct sysinfo sinfo;
si_meminfo(&sinfo);
page_hash_size = sinfo.totalram;
}
ksm->npages_hash = page_hash_size;
ksm->page_hash = vmalloc(ksm->npages_hash *
sizeof(struct hlist_head *));
if (IS_ERR(ksm->page_hash))
return PTR_ERR(ksm->page_hash);
memset(ksm->page_hash, 0,
ksm->npages_hash * sizeof(struct hlist_head *));
return 0;
}
static void page_hash_free(void)
{
int i;
struct hlist_head *bucket;
struct hlist_node *node, *n;
struct page_item *page_item;
for (i = 0; i < ksm->npages_hash; ++i) {
bucket = &ksm->page_hash[i];
hlist_for_each_entry_safe(page_item, node, n, bucket, link) {
hlist_del(&page_item->link);
free_page_item(page_item);
}
}
vfree(ksm->page_hash);
}
static int rmap_hash_init(void)
{
if (!rmap_hash_size) {
struct sysinfo sinfo;
si_meminfo(&sinfo);
rmap_hash_size = sinfo.totalram;
}
ksm->nrmaps_hash = rmap_hash_size;
ksm->rmap_hash = vmalloc(ksm->nrmaps_hash *
sizeof(struct hlist_head *));
if (IS_ERR(ksm->rmap_hash))
return PTR_ERR(ksm->rmap_hash);
memset(ksm->rmap_hash, 0,
ksm->nrmaps_hash * sizeof(struct hlist_head *));
return 0;
}
static void rmap_hash_free(void)
{
int i;
struct hlist_head *bucket;
struct hlist_node *node, *n;
struct rmap_item *rmap_item;
for (i = 0; i < ksm->nrmaps_hash; ++i) {
bucket = &ksm->rmap_hash[i];
hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
hlist_del(&rmap_item->link);
free_rmap_item(rmap_item);
}
}
vfree(ksm->rmap_hash);
}
static inline u32 calc_hash_index(void *addr)
{
return jhash(addr, PAGE_SIZE, 17) % ksm->npages_hash;
}
static void remove_page_from_hash(struct mm_struct *mm, unsigned long addr)
{
struct rmap_item *rmap_item;
struct hlist_head *bucket;
struct hlist_node *node, *n;
bucket = &ksm->rmap_hash[addr % ksm->nrmaps_hash];
hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
if (mm == rmap_item->page_item->mm &&
rmap_item->page_item->addr == addr) {
hlist_del(&rmap_item->page_item->link);
free_page_item(rmap_item->page_item);
hlist_del(&rmap_item->link);
free_rmap_item(rmap_item);
return;
}
}
}
static int ksm_sma_ioctl_register_memory_region(struct ksm_sma *ksm_sma,
struct ksm_memory_region *mem)
{
struct ksm_mem_slot *slot;
int ret = 1;
if (!current->mm)
goto out;
slot = kzalloc(sizeof(struct ksm_mem_slot), GFP_KERNEL);
if (IS_ERR(slot)) {
ret = PTR_ERR(slot);
goto out;
}
slot->mm = get_task_mm(current);
slot->addr = mem->addr;
slot->npages = mem->npages;
list_add_tail(&slot->link, &ksm->slots);
list_add_tail(&slot->sma_link, &ksm_sma->sma_slots);
ret = 0;
out:
return ret;
}
static void remove_mm_from_hash(struct mm_struct *mm)
{
struct ksm_mem_slot *slot;
int pages_count = 0;
list_for_each_entry(slot, &ksm->slots, link)
if (slot->mm == mm)
break;
if (!slot)
BUG();
spin_lock(&ksm->hash_lock);
while (pages_count < slot->npages)
{
remove_page_from_hash(mm, slot->addr + pages_count * PAGE_SIZE);
pages_count++;
}
spin_unlock(&ksm->hash_lock);
list_del(&slot->link);
}
static int ksm_sma_ioctl_remove_memory_region(struct ksm_sma *ksm_sma)
{
struct ksm_mem_slot *slot, *node;
down_write(&ksm->slots_lock);
list_for_each_entry_safe(slot, node, &ksm_sma->sma_slots, sma_link) {
remove_mm_from_hash(slot->mm);
mmput(slot->mm);
list_del(&slot->sma_link);
kfree(slot);
}
up_write(&ksm->slots_lock);
return 0;
}
static int ksm_sma_release(struct inode *inode, struct file *filp)
{
struct ksm_sma *ksm_sma = filp->private_data;
int r;
r = ksm_sma_ioctl_remove_memory_region(ksm_sma);
return r;
}
static long ksm_sma_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct ksm_sma *sma = filp->private_data;
void __user *argp = (void __user *)arg;
int r = EINVAL;
switch (ioctl) {
case KSM_REGISTER_MEMORY_REGION: {
struct ksm_memory_region ksm_memory_region;
r = -EFAULT;
if (copy_from_user(&ksm_memory_region, argp,
sizeof ksm_memory_region))
goto out;
r = ksm_sma_ioctl_register_memory_region(sma,
&ksm_memory_region);
break;
}
case KSM_REMOVE_MEMORY_REGION:
r = ksm_sma_ioctl_remove_memory_region(sma);
break;
}
out:
return r;
}
static int insert_page_to_hash(struct ksm_scan *ksm_scan,
unsigned long hash_index,
struct page_item *page_item,
struct rmap_item *rmap_item)
{
struct ksm_mem_slot *slot;
struct hlist_head *bucket;
slot = ksm_scan->slot_index;
page_item->addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
page_item->mm = slot->mm;
bucket = &ksm->page_hash[hash_index];
hlist_add_head(&page_item->link, bucket);
rmap_item->page_item = page_item;
rmap_item->oldindex = hash_index;
bucket = &ksm->rmap_hash[page_item->addr % ksm->nrmaps_hash];
hlist_add_head(&rmap_item->link, bucket);
return 0;
}
static void update_hash(struct ksm_scan *ksm_scan,
unsigned long hash_index)
{
struct rmap_item *rmap_item;
struct ksm_mem_slot *slot;
struct hlist_head *bucket;
struct hlist_node *node, *n;
unsigned long addr;
slot = ksm_scan->slot_index;;
addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
bucket = &ksm->rmap_hash[addr % ksm->nrmaps_hash];
hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
if (slot->mm == rmap_item->page_item->mm &&
rmap_item->page_item->addr == addr) {
if (hash_index != rmap_item->oldindex) {
hlist_del(&rmap_item->page_item->link);
free_page_item(rmap_item->page_item);
hlist_del(&rmap_item->link);
free_rmap_item(rmap_item);
}
return;
}
}
}
static void lock_two_pages(struct page *page1, struct page *page2)
{
if (page1 < page2) {
lock_page(page1);
lock_page(page2);
}
else {
lock_page(page2);
lock_page(page1);
}
}
static void unlock_two_pages(struct page *page1, struct page *page2)
{
unlock_page(page1);
unlock_page(page2);
}
static unsigned long addr_in_vma(struct vm_area_struct *vma, struct page *page)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
unsigned long addr;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (unlikely(addr < vma->vm_start || addr >= vma->vm_end))
return -EFAULT;
return addr;
}
static int is_present_pte(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
spinlock_t *ptl;
int ret = 0;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
goto out;
pud = pud_offset(pgd, addr);
if (!pud_present(*pud))
goto out;
pmd = pmd_offset(pud, addr);
if (!pmd_present(*pmd))
goto out;
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!ptep)
goto out;
if (pte_present(*ptep))
ret = 1;
pte_unmap_unlock(ptep, ptl);
out:
return ret;
}
/*
* try_to_merge_one_page - take two pages and merge them into one
* note:
* oldpage should be anon page while newpage should be file mapped page
*/
static int try_to_merge_one_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct page *oldpage,
struct page *newpage,
pgprot_t newprot)
{
int ret = 0;
unsigned long page_addr_in_vma;
void *oldaddr, *newaddr;
get_page(newpage);
get_page(oldpage);
down_write(&mm->mmap_sem);
lock_two_pages(oldpage, newpage);
page_addr_in_vma = addr_in_vma(vma, oldpage);
if (page_addr_in_vma == -EFAULT)
goto out_unlock;
/*
* if the page is swapped or in swap cache, we cannot replace its pte
* we might want to run here swap_free in the future (it isnt exported)
*/
if (!is_present_pte(mm, page_addr_in_vma))
goto out_unlock;
if (!page_wrprotect(oldpage))
goto out_unlock;
oldaddr = kmap_atomic(oldpage, KM_USER0);
newaddr = kmap_atomic(newpage, KM_USER1);
ret = 1;
if (!memcmp(oldaddr, newaddr, PAGE_SIZE))
ret = replace_page(vma, oldpage, newpage, newprot);
kunmap_atomic(oldaddr, KM_USER0);
kunmap_atomic(newaddr, KM_USER1);
if (!ret)
ret = 1;
else
ret = 0;
out_unlock:
unlock_two_pages(oldpage, newpage);
up_write(&mm->mmap_sem);
put_page(oldpage);
put_page(newpage);
return ret;
}
static int try_to_merge_two_pages(struct mm_struct *mm1, struct page *page1,
struct mm_struct *mm2, struct page *page2,
unsigned long addr1, unsigned long addr2)
{
struct vm_area_struct *vma;
pgprot_t prot;
int ret = 0;
/*
* in case page2 isnt shared (it isnt PageKsm we have to allocate a new
* file mapped page and make the two ptes of mm1(page1) and mm2(page2)
* point to it, in case page is shared page, we can just make the pte of
* mm1(page1) point to page2
*/
if (PageKsm(page2)) {
vma = find_vma(mm1, addr1);
if (!vma)
return ret;
prot = vma->vm_page_prot;
pgprot_val(prot) &= ~VM_WRITE;
ret = try_to_merge_one_page(mm1, vma, page1, page2, prot);
}
else {
struct page *kpage;
kpage = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
if (!kpage)
return ret;
vma = find_vma(mm1, addr1);
if (!vma)
return ret;
prot = vma->vm_page_prot;
pgprot_val(prot) &= ~VM_WRITE;
copy_highpage(kpage, page1);
ret = try_to_merge_one_page(mm1, vma, page1, kpage, prot);
page_cache_release(kpage);
if (ret) {
vma = find_vma(mm2, addr2);
if (!vma)
return ret;
prot = vma->vm_page_prot;
pgprot_val(prot) &= ~VM_WRITE;
ret += try_to_merge_one_page(mm2, vma, page2, kpage,
prot);
}
}
return ret;
}
static int cmp_and_merge_page(struct ksm_scan *ksm_scan, struct page *page)
{
struct hlist_head *bucket;
struct hlist_node *node, *n;
struct page_item *page_item;
struct ksm_mem_slot *slot;
unsigned long hash_index;
unsigned long addr;
void *page_addr;
int ret = 0;
int used = 0;
page_addr = kmap(page);
hash_index = calc_hash_index(page_addr);
bucket = &ksm->page_hash[hash_index];
slot = ksm_scan->slot_index;
addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
spin_lock(&ksm->hash_lock);
/*
* update_hash must to be called everytime that there is a chance
* that the data inside the page was changed since the last time it
* was inserted into our hash table, (to avoid cases where the page
* will be inserted more than once to the hash table)
*/
update_hash(ksm_scan, hash_index);
spin_unlock(&ksm->hash_lock);
hlist_for_each_entry_safe(page_item, node, n, bucket, link) {
int npages;
struct page *hash_page[1];
void *hash_page_addr;
if (slot->mm == page_item->mm && addr == page_item->addr) {
used = 1;
continue;
}
down_read(&page_item->mm->mmap_sem);
/* if the page is swapped out or in the swap cache we dont want
* to scan it (it is just for performence)
*/
if (!is_present_pte(page_item->mm, page_item->addr)) {
up_read(&page_item->mm->mmap_sem);
continue;
}
npages = get_user_pages(current, page_item->mm, page_item->addr,
1, 0, 0, hash_page, NULL);
up_read(&page_item->mm->mmap_sem);
if (npages != 1)
break;
hash_page_addr = kmap_atomic(hash_page[0], KM_USER1);
/*
* we check if the pages are equal 2 times, one time when they
* arent write protected and again after we write protect them
*/
if (!memcmp(page_addr, hash_page_addr, PAGE_SIZE)) {
kunmap_atomic(hash_page_addr, KM_USER0);
kunmap(page_addr);
ret = try_to_merge_two_pages(slot->mm, page,
page_item->mm,
hash_page[0], addr,
page_item->addr);
page_cache_release(hash_page[0]);
return ret;
}
kunmap_atomic(hash_page_addr, KM_USER0);
page_cache_release(hash_page[0]);
}
kunmap(page_addr);
/* if node is NULL and used=0, the page is not inside the hash table */
if (!node && !used) {
struct page_item *page_item;
struct rmap_item *rmap_item;
page_item = alloc_page_item();
if (!page_item)
return 1;
rmap_item = alloc_rmap_item();
if (!rmap_item) {
kfree(page_item);
return ret;
}
spin_lock(&ksm->hash_lock);
update_hash(ksm_scan, hash_index);
insert_page_to_hash(ksm_scan, hash_index, page_item, rmap_item);
spin_unlock(&ksm->hash_lock);
}
return ret;
}
/* return 1 - no slots registered, nothing to be done */
static int scan_get_next_index(struct ksm_scan *ksm_scan, int nscan)
{
struct ksm_mem_slot *slot;
if (list_empty(&ksm->slots))
/* no slot avaible!, return 1 */
return 1;
slot = ksm_scan->slot_index;
/*
* first lets see if in this memory slot there are more pages that we
* can use as our scan bluck pages
*/
if ((slot->npages - ksm_scan->page_index - nscan) > 0) {
ksm_scan->page_index += nscan;
return 0;
}
list_for_each_entry_from(slot, &ksm->slots, link) {
if (slot == ksm_scan->slot_index)
continue;
ksm_scan->page_index = 0;
ksm_scan->slot_index = slot;
return 0;
}
/* look like we finished scanning the whole memory, starting again */
ksm_scan->page_index = 0;
ksm_scan->slot_index = list_first_entry(&ksm->slots,
struct ksm_mem_slot, link);
return 0;
}
/*
* update slot_index so it point to vaild data, it is possible that by
* the time we are here the data that ksm_scan was pointed to was released
* so we have to call this function every time after taking the slots_lock
*/
static void scan_update_old_index(struct ksm_scan *ksm_scan)
{
struct ksm_mem_slot *slot;
if (list_empty(&ksm->slots))
return;
list_for_each_entry(slot, &ksm->slots, link) {
if (ksm_scan->slot_index == slot)
return;
}
ksm_scan->slot_index = list_first_entry(&ksm->slots,
struct ksm_mem_slot, link);
}
static int ksm_scan_start(struct ksm_scan *ksm_scan, int scan_npages)
{
struct ksm_mem_slot *slot;
struct page *page[1];
int val;
int ret = 0;
down_read(&ksm->slots_lock);
scan_update_old_index(ksm_scan);
while (scan_npages > 0) {
if (scan_get_next_index(ksm_scan, 1)) {
/* we have no slots, another ret value should be used */
goto out;
}
slot = ksm_scan->slot_index;
down_read(&slot->mm->mmap_sem);
/* if the page is swappd out or in swap cache, we dont want to
* scan it (it is just for performence)
*/
if (is_present_pte(slot->mm, slot->addr +
ksm_scan->page_index * PAGE_SIZE)) {
val = get_user_pages(current, slot->mm, slot->addr +
ksm_scan->page_index * PAGE_SIZE ,
1, 0, 0, page, NULL);
up_read(&slot->mm->mmap_sem);
if (val == 1) {
if (!PageKsm(page[0]))
cmp_and_merge_page(ksm_scan, page[0]);
page_cache_release(page[0]);
}
} else
up_read(&slot->mm->mmap_sem);
scan_npages--;
}
scan_get_next_index(ksm_scan, 1);
out:
up_read(&ksm->slots_lock);
return ret;
}
static int ksm_scan_ioctl_start(struct ksm_scan *ksm_scan, int scan_npages)
{
return ksm_scan_start(ksm_scan, scan_npages);
}
static int ksm_scan_release(struct inode *inode, struct file *filp)
{
struct ksm_scan *ksm_scan = filp->private_data;
kfree(ksm_scan);
return 0;
}
static long ksm_scan_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct ksm_scan *ksm_scan = filp->private_data;
int r = EINVAL;
switch (ioctl) {
case KSM_SCAN:
r = ksm_scan_ioctl_start(ksm_scan, arg);
}
return r;
}
static struct file_operations ksm_sma_fops = {
.release = ksm_sma_release,
.unlocked_ioctl = ksm_sma_ioctl,
.compat_ioctl = ksm_sma_ioctl,
};
static int ksm_dev_ioctl_create_shared_memory_area(void)
{
int fd, r;
struct ksm_sma *ksm_sma;
struct inode *inode;
struct file *file;
ksm_sma = kmalloc(sizeof(struct ksm_sma), GFP_KERNEL);
if (IS_ERR(ksm_sma)) {
r = PTR_ERR(ksm_sma);
goto out;
}
INIT_LIST_HEAD(&ksm_sma->sma_slots);
r = anon_inode_getfd(&fd, &inode, &file, "ksm-sma", &ksm_sma_fops,
ksm_sma);
if (r)
goto out_free;
return fd;
out_free:
kfree(ksm_sma);
out:
return -1;
}
static struct file_operations ksm_scan_fops = {
.release = ksm_scan_release,
.unlocked_ioctl = ksm_scan_ioctl,
.compat_ioctl = ksm_scan_ioctl,
};
static struct ksm_scan *ksm_scan_create(void)
{
struct ksm_scan *ksm_scan;
ksm_scan = kzalloc(sizeof(struct ksm_scan), GFP_KERNEL);
return ksm_scan;
}
static int ksm_dev_ioctl_create_scan(void)
{
int fd, r;
struct inode *inode;
struct file *file;
struct ksm_scan *ksm_scan;
ksm_scan = ksm_scan_create();
if (IS_ERR(ksm_scan)) {
r = PTR_ERR(ksm_scan);
goto out;
}
r = anon_inode_getfd(&fd, &inode, &file, "ksm-scan", &ksm_scan_fops,
ksm_scan);
if (r)
goto out_free;
return fd;
out_free:
kfree(ksm_scan);
out:
return -1;
}
static long ksm_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
long r = -EINVAL;
switch(ioctl) {
case KSM_GET_API_VERSION:
r = KSM_API_VERSION;
break;
case KSM_CREATE_SHARED_MEMORY_AREA:
r = ksm_dev_ioctl_create_shared_memory_area();
break;
case KSM_CREATE_SCAN:
r = ksm_dev_ioctl_create_scan();
default:
return r;
}
return r;
}
static int ksm_dev_open(struct inode *inode, struct file *filp)
{
try_module_get(THIS_MODULE);
return 0;
}
static int ksm_dev_release(struct inode *inode, struct file *filp)
{
module_put(THIS_MODULE);
return 0;
}
static struct file_operations ksm_chardev_ops = {
.open = ksm_dev_open,
.release = ksm_dev_release,
.unlocked_ioctl = ksm_dev_ioctl,
.compat_ioctl = ksm_dev_ioctl,
};
static struct miscdevice ksm_dev = {
KSM_MINOR,
"ksm",
&ksm_chardev_ops,
};
static int __init ksm_init(void)
{
int r;
ksm = kzalloc(sizeof(struct ksm), GFP_KERNEL);
if (IS_ERR(ksm)) {
r = PTR_ERR(ksm);
goto out;
}
r = ksm_slab_init();
if (r)
goto out_free;
r = page_hash_init();
if (r)
goto out_free1;
r = rmap_hash_init();
if (r)
goto out_free2;
INIT_LIST_HEAD(&ksm->slots);
init_rwsem(&ksm->slots_lock);
spin_lock_init(&ksm->hash_lock);
r = misc_register(&ksm_dev);
if (r) {
printk(KERN_ERR "ksm: misc device register failed\n");
goto out_free3;
}
printk(KERN_WARNING "ksm loaded\n");
return 0;
out_free3:
rmap_hash_free();
out_free2:
page_hash_free();
out_free1:
ksm_slab_free();
out_free:
kfree(ksm);
out:
return r;
}
static void __exit ksm_exit(void)
{
misc_deregister(&ksm_dev);
rmap_hash_free();
page_hash_free();
ksm_slab_free();
kfree(ksm);
}
module_init(ksm_init)
module_exit(ksm_exit)
[-- Attachment #3: ksm.h --]
[-- Type: text/x-chdr, Size: 1344 bytes --]
#ifndef __KSM_H
#define __KSM_H
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/list.h>
/* this should really go to linux/miscdevice.h */
#define KSM_MINOR 233
#define KSM_MAX_SMAS 256
#define SharedPage 1
struct ksm_mem_slot {
struct list_head link;
struct list_head sma_link;
struct mm_struct *mm;
unsigned long addr; /* the begining of the virtual address */
unsigned int npages; /* number of pages to share */
};
/*
* sma - shared memory area, each process have its own sma that contain the
* information about the slots that it own
*/
struct ksm_sma {
struct list_head sma_slots;
};
struct ksm {
struct list_head slots;
struct rw_semaphore slots_lock;
int npages_hash;
struct hlist_head *page_hash;
int nrmaps_hash;
struct hlist_head *rmap_hash;
struct kmem_cache *page_item_cache;
struct kmem_cache *rmap_item_cache;
struct address_space address_space;
spinlock_t hash_lock;
};
struct ksm_scan {
struct ksm_mem_slot *slot_index; /* the slot we are scanning now */
int page_index; /* the page inside sma that is now being scanned */
};
struct page_item {
struct hlist_node link;
struct mm_struct *mm;
unsigned long addr;
};
struct rmap_item {
struct hlist_node link;
struct page_item *page_item;
unsigned long oldindex;
};
#endif
[-- Attachment #4: ksm.h --]
[-- Type: text/x-chdr, Size: 899 bytes --]
#ifndef __LINUX_KSM_H
#define __LINUX_KSM_H
/*
* Userspace interface for /dev/ksm - kvm shared memory
*/
#include <asm/types.h>
#include <linux/ioctl.h>
#define KSM_API_VERSION 1
/* for KSM_REGISTER_MEMORY_REGION */
struct ksm_memory_region {
__u32 npages; /* number of pages to share */
__u64 addr; /* the begining of the virtual address */
};
#define KSMIO 0xAB
/* ioctls for /dev/ksm */
#define KSM_GET_API_VERSION _IO(KSMIO, 0x00)
#define KSM_CREATE_SHARED_MEMORY_AREA _IO(KSMIO, 0x01) /* return SMA fd */
#define KSM_CREATE_SCAN _IO(KSMIO, 0x02) /* return SCAN fd */
/* ioctls for SMA fds */
#define KSM_REGISTER_MEMORY_REGION _IOW(KSMIO, 0x20,\
struct ksm_memory_region)
#define KSM_REMOVE_MEMORY_REGION _IO(KSMIO, 0x21)
/* ioctls for SCAN fds */
#define KSM_SCAN _IO(KSMIO, 0x40)
#endif
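For illustration only (not part of the patch), a minimal userspace consumer of the /dev/ksm interface above might look like the following sketch; it assumes the header is available as ksm.h and that the memory to be merged is anonymous, page-aligned mmap()ed memory:

#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include "ksm.h"		/* the userspace header above */

#define NPAGES	16

int main(void)
{
	struct ksm_memory_region region;
	int ksm_fd, sma_fd, scan_fd;
	void *buf;

	ksm_fd = open("/dev/ksm", O_RDWR);
	if (ksm_fd < 0 || ioctl(ksm_fd, KSM_GET_API_VERSION, 0) != KSM_API_VERSION)
		exit(1);

	/* anonymous, page-aligned memory we want scanned for identical pages */
	buf = mmap(NULL, NPAGES * 4096, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(buf, 0, NPAGES * 4096);

	/* one sma fd per process, holding the regions it registered */
	sma_fd = ioctl(ksm_fd, KSM_CREATE_SHARED_MEMORY_AREA, 0);
	memset(&region, 0, sizeof(region));
	region.addr = (unsigned long)buf;
	region.npages = NPAGES;
	ioctl(sma_fd, KSM_REGISTER_MEMORY_REGION, &region);

	/* a scan fd drives the merging, one KSM_SCAN call per batch of pages */
	scan_fd = ioctl(ksm_fd, KSM_CREATE_SCAN, 0);
	ioctl(scan_fd, KSM_SCAN, NPAGES);

	close(scan_fd);
	close(sma_fd);		/* releasing the sma fd unregisters its regions */
	close(ksm_fd);
	return 0;
}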
* Re: [kvm-devel] [RFC][PATCH 3/5] ksm source code
2008-01-21 16:12 [RFC][PATCH 3/5] ksm source code Izik Eidus
@ 2008-01-24 7:24 ` Chris Wright
2008-01-24 9:07 ` Izik Eidus
0 siblings, 1 reply; 9+ messages in thread
From: Chris Wright @ 2008-01-24 7:24 UTC (permalink / raw)
To: Izik Eidus; +Cc: kvm-devel, andrea, avi, dor.laor, linux-mm, yaniv
* Izik Eidus (izike@qumranet.com) wrote:
Just a bunch of nitpicks.
> struct ksm *ksm;
static
> static int page_hash_size = 0;
no need to initialize to zero
> module_param(page_hash_size, int, 0);
> MODULE_PARM_DESC(page_hash_size, "Hash table size for the pages checksum");
>
> static int rmap_hash_size = 0;
no need to initialize to zero
> module_param(rmap_hash_size, int, 0);
> MODULE_PARM_DESC(rmap_hash_size, "Hash table size for the reverse mapping");
>
> int ksm_slab_init(void)
static
> {
> int ret = 1;
>
> ksm->page_item_cache = kmem_cache_create("ksm_page_item",
> sizeof(struct page_item), 0,
> 0, NULL);
> if (!ksm->page_item_cache)
> goto out;
>
> ksm->rmap_item_cache = kmem_cache_create("ksm_rmap_item",
> sizeof(struct rmap_item), 0,
> 0, NULL);
> if (!ksm->rmap_item_cache)
> goto out_free;
> return 0;
>
> out_free:
> kmem_cache_destroy(ksm->page_item_cache);
> out:
> return ret;
> }
>
> void ksm_slab_free(void)
> {
> kmem_cache_destroy(ksm->rmap_item_cache);
> kmem_cache_destroy(ksm->page_item_cache);
> }
>
> static struct page_item *alloc_page_item(void)
> {
> void *obj;
>
> obj = kmem_cache_zalloc(ksm->page_item_cache, GFP_KERNEL);
> return (struct page_item *)obj;
> }
>
> static void free_page_item(struct page_item *page_item)
> {
> kfree(page_item);
kmem_cache_free
> }
>
> static struct rmap_item *alloc_rmap_item(void)
> {
> void *obj;
>
> obj = kmem_cache_zalloc(ksm->rmap_item_cache, GFP_KERNEL);
> return (struct rmap_item *)obj;
> }
>
> static void free_rmap_item(struct rmap_item *rmap_item)
> {
> kfree(rmap_item);
kmem_cache_free
> }
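i.e., a sketch of what the two free helpers would look like, handing the
objects back to the slab caches created in ksm_slab_init():

static void free_page_item(struct page_item *page_item)
{
	kmem_cache_free(ksm->page_item_cache, page_item);
}

static void free_rmap_item(struct rmap_item *rmap_item)
{
	kmem_cache_free(ksm->rmap_item_cache, rmap_item);
}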
>
> static int inline PageKsm(struct page *page)
> {
> return !PageAnon(page);
> }
>
> static int page_hash_init(void)
> {
> if (!page_hash_size) {
> struct sysinfo sinfo;
>
> si_meminfo(&sinfo);
> page_hash_size = sinfo.totalram;
> }
> ksm->npages_hash = page_hash_size;
> ksm->page_hash = vmalloc(ksm->npages_hash *
> sizeof(struct hlist_head *));
> if (IS_ERR(ksm->page_hash))
allocator returns NULL on failure (btw, this mistake is pervasive in the
patch)
> return PTR_ERR(ksm->page_hash);
> memset(ksm->page_hash, 0,
> ksm->npages_hash * sizeof(struct hlist_head *));
> return 0;
> }
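For instance, a sketch of the NULL-check fix for this allocation (the same
pattern applies to the other allocations flagged the same way):

	ksm->page_hash = vmalloc(ksm->npages_hash *
				 sizeof(struct hlist_head *));
	if (!ksm->page_hash)
		return -ENOMEM;
	memset(ksm->page_hash, 0,
	       ksm->npages_hash * sizeof(struct hlist_head *));
	return 0;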
>
> static void page_hash_free(void)
> {
> int i;
> struct hlist_head *bucket;
> struct hlist_node *node, *n;
> struct page_item *page_item;
>
> for (i = 0; i < ksm->npages_hash; ++i) {
> bucket = &ksm->page_hash[i];
> hlist_for_each_entry_safe(page_item, node, n, bucket, link) {
> hlist_del(&page_item->link);
> free_page_item(page_item);
> }
> }
> vfree(ksm->page_hash);
> }
>
> static int rmap_hash_init(void)
> {
> if (!rmap_hash_size) {
> struct sysinfo sinfo;
>
> si_meminfo(&sinfo);
> rmap_hash_size = sinfo.totalram;
> }
> ksm->nrmaps_hash = rmap_hash_size;
> ksm->rmap_hash = vmalloc(ksm->nrmaps_hash *
> sizeof(struct hlist_head *));
failure == NULL
> if (IS_ERR(ksm->rmap_hash))
> return PTR_ERR(ksm->rmap_hash);
> memset(ksm->rmap_hash, 0,
> ksm->nrmaps_hash * sizeof(struct hlist_head *));
> return 0;
> }
>
> static void rmap_hash_free(void)
> {
> int i;
> struct hlist_head *bucket;
> struct hlist_node *node, *n;
> struct rmap_item *rmap_item;
>
> for (i = 0; i < ksm->nrmaps_hash; ++i) {
> bucket = &ksm->rmap_hash[i];
> hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
> hlist_del(&rmap_item->link);
> free_rmap_item(rmap_item);
> }
> }
> vfree(ksm->rmap_hash);
> }
>
> static inline u32 calc_hash_index(void *addr)
> {
> return jhash(addr, PAGE_SIZE, 17) % ksm->npages_hash;
> }
>
> static void remove_page_from_hash(struct mm_struct *mm, unsigned long addr)
> {
> struct rmap_item *rmap_item;
> struct hlist_head *bucket;
> struct hlist_node *node, *n;
>
> bucket = &ksm->rmap_hash[addr % ksm->nrmaps_hash];
> hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
> if (mm == rmap_item->page_item->mm &&
> rmap_item->page_item->addr == addr) {
> hlist_del(&rmap_item->page_item->link);
> free_page_item(rmap_item->page_item);
> hlist_del(&rmap_item->link);
> free_rmap_item(rmap_item);
> return;
> }
> }
> }
>
> static int ksm_sma_ioctl_register_memory_region(struct ksm_sma *ksm_sma,
> struct ksm_memory_region *mem)
> {
> struct ksm_mem_slot *slot;
> int ret = 1;
ret = 1 doesn't make sense as a failure
> if (!current->mm)
> goto out;
>
> slot = kzalloc(sizeof(struct ksm_mem_slot), GFP_KERNEL);
> if (IS_ERR(slot)) {
failure == NULL
> ret = PTR_ERR(slot);
> goto out;
> }
>
> slot->mm = get_task_mm(current);
> slot->addr = mem->addr;
> slot->npages = mem->npages;
> list_add_tail(&slot->link, &ksm->slots);
slots_lock?
> list_add_tail(&slot->sma_link, &ksm_sma->sma_slots);
theoretical, but if threaded, registering program could do this
concurrently
> ret = 0;
> out:
> return ret;
> }
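Something like this would cover both locking notes above (a sketch, reusing
the existing ksm->slots_lock rwsem around the list updates):

	slot->mm = get_task_mm(current);
	slot->addr = mem->addr;
	slot->npages = mem->npages;

	down_write(&ksm->slots_lock);
	list_add_tail(&slot->link, &ksm->slots);
	list_add_tail(&slot->sma_link, &ksm_sma->sma_slots);
	up_write(&ksm->slots_lock);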
>
> static void remove_mm_from_hash(struct mm_struct *mm)
> {
> struct ksm_mem_slot *slot;
> int pages_count = 0;
>
> list_for_each_entry(slot, &ksm->slots, link)
> if (slot->mm == mm)
> break;
> if (!slot)
> BUG();
>
> spin_lock(&ksm->hash_lock);
> while (pages_count < slot->npages)
> {
> remove_page_from_hash(mm, slot->addr + pages_count * PAGE_SIZE);
> pages_count++;
> }
> spin_unlock(&ksm->hash_lock);
> list_del(&slot->link);
> }
>
> static int ksm_sma_ioctl_remove_memory_region(struct ksm_sma *ksm_sma)
> {
> struct ksm_mem_slot *slot, *node;
>
> down_write(&ksm->slots_lock);
> list_for_each_entry_safe(slot, node, &ksm_sma->sma_slots, sma_link) {
> remove_mm_from_hash(slot->mm);
> mmput(slot->mm);
> list_del(&slot->sma_link);
> kfree(slot);
> }
> up_write(&ksm->slots_lock);
> return 0;
> }
>
> static int ksm_sma_release(struct inode *inode, struct file *filp)
> {
> struct ksm_sma *ksm_sma = filp->private_data;
> int r;
>
> r = ksm_sma_ioctl_remove_memory_region(ksm_sma);
> return r;
leaked ksm_sma?
> }
>
> static long ksm_sma_ioctl(struct file *filp,
> unsigned int ioctl, unsigned long arg)
> {
> struct ksm_sma *sma = filp->private_data;
> void __user *argp = (void __user *)arg;
> int r = EINVAL;
>
> switch (ioctl) {
> case KSM_REGISTER_MEMORY_REGION: {
> struct ksm_memory_region ksm_memory_region;
>
> r = -EFAULT;
> if (copy_from_user(&ksm_memory_region, argp,
> sizeof ksm_memory_region))
this doesn't look compat safe:
struct ksm_memory_region {
__u32 npages; /* number of pages to share */
__u64 addr; /* the begining of the virtual address */
};
> goto out;
> r = ksm_sma_ioctl_register_memory_region(sma,
> &ksm_memory_region);
> break;
> }
> case KSM_REMOVE_MEMORY_REGION:
> r = ksm_sma_ioctl_remove_memory_region(sma);
> break;
> }
>
> out:
> return r;
> }
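To make the compat concern concrete: the struct has a different layout for
32-bit and 64-bit callers, so a 32-bit process on a 64-bit kernel hands the
ioctl a buffer the kernel misreads (and since _IOW() folds sizeof() into the
command number, the two builds don't even agree on the ioctl value). A small
userspace sketch, mirroring the struct from ksm.h:

	#include <stdio.h>
	#include <stddef.h>
	#include <stdint.h>

	struct ksm_memory_region {
		uint32_t npages;
		uint64_t addr;
	};

	int main(void)
	{
		/* i386 build:   size 12, addr at offset 4
		 * x86_64 build: size 16, addr at offset 8 */
		printf("size=%zu addr offset=%zu\n",
		       sizeof(struct ksm_memory_region),
		       offsetof(struct ksm_memory_region, addr));
		return 0;
	}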
>
> static int insert_page_to_hash(struct ksm_scan *ksm_scan,
> unsigned long hash_index,
> struct page_item *page_item,
> struct rmap_item *rmap_item)
> {
> struct ksm_mem_slot *slot;
> struct hlist_head *bucket;
>
> slot = ksm_scan->slot_index;
> page_item->addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
> page_item->mm = slot->mm;
> bucket = &ksm->page_hash[hash_index];
> hlist_add_head(&page_item->link, bucket);
>
> rmap_item->page_item = page_item;
> rmap_item->oldindex = hash_index;
> bucket = &ksm->rmap_hash[page_item->addr % ksm->nrmaps_hash];
> hlist_add_head(&rmap_item->link, bucket);
> return 0;
> }
>
> static void update_hash(struct ksm_scan *ksm_scan,
> unsigned long hash_index)
> {
> struct rmap_item *rmap_item;
> struct ksm_mem_slot *slot;
> struct hlist_head *bucket;
> struct hlist_node *node, *n;
> unsigned long addr;
>
> slot = ksm_scan->slot_index;;
> addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
> bucket = &ksm->rmap_hash[addr % ksm->nrmaps_hash];
> hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
> if (slot->mm == rmap_item->page_item->mm &&
> rmap_item->page_item->addr == addr) {
> if (hash_index != rmap_item->oldindex) {
> hlist_del(&rmap_item->page_item->link);
> free_page_item(rmap_item->page_item);
> hlist_del(&rmap_item->link);
> free_rmap_item(rmap_item);
> }
> return;
> }
> }
> }
>
> static void lock_two_pages(struct page *page1, struct page *page2)
> {
> if (page1 < page2) {
> lock_page(page1);
> lock_page(page2);
> }
> else {
> lock_page(page2);
> lock_page(page1);
> }
> }
>
> static void unlock_two_pages(struct page *page1, struct page *page2)
> {
> unlock_page(page1);
> unlock_page(page2);
> }
>
> static unsigned long addr_in_vma(struct vm_area_struct *vma, struct page *page)
> {
> pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> unsigned long addr;
>
> addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> if (unlikely(addr < vma->vm_start || addr >= vma->vm_end))
> return -EFAULT;
> return addr;
> }
>
> static int is_present_pte(struct mm_struct *mm, unsigned long addr)
> {
> pgd_t *pgd;
> pud_t *pud;
> pmd_t *pmd;
> pte_t *ptep;
> spinlock_t *ptl;
> int ret = 0;
>
> pgd = pgd_offset(mm, addr);
> if (!pgd_present(*pgd))
> goto out;
>
> pud = pud_offset(pgd, addr);
> if (!pud_present(*pud))
> goto out;
>
> pmd = pmd_offset(pud, addr);
> if (!pmd_present(*pmd))
> goto out;
>
> ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
> if (!ptep)
> goto out;
>
> if (pte_present(*ptep))
> ret = 1;
>
> pte_unmap_unlock(ptep, ptl);
> out:
> return ret;
> }
This is a generic helper.
> /*
> * try_to_merge_one_page - take two pages and merge them into one
> * note:
> * oldpage should be anon page while newpage should be file mapped page
> */
> static int try_to_merge_one_page(struct mm_struct *mm,
> struct vm_area_struct *vma,
> struct page *oldpage,
> struct page *newpage,
> pgprot_t newprot)
> {
> int ret = 0;
> unsigned long page_addr_in_vma;
> void *oldaddr, *newaddr;
>
> get_page(newpage);
> get_page(oldpage);
>
> down_write(&mm->mmap_sem);
>
> lock_two_pages(oldpage, newpage);
>
> page_addr_in_vma = addr_in_vma(vma, oldpage);
> if (page_addr_in_vma == -EFAULT)
> goto out_unlock;
>
> /*
> * if the page is swapped or in swap cache, we cannot replace its pte
> * we might want to run here swap_free in the future (it isnt exported)
> */
> if (!is_present_pte(mm, page_addr_in_vma))
> goto out_unlock;
>
> if (!page_wrprotect(oldpage))
> goto out_unlock;
>
> oldaddr = kmap_atomic(oldpage, KM_USER0);
> newaddr = kmap_atomic(newpage, KM_USER1);
>
> ret = 1;
> if (!memcmp(oldaddr, newaddr, PAGE_SIZE))
> ret = replace_page(vma, oldpage, newpage, newprot);
Does it make sense to leave oldpage r/o if replace_page fails?
> kunmap_atomic(oldaddr, KM_USER0);
> kunmap_atomic(newaddr, KM_USER1);
>
> if (!ret)
> ret = 1;
> else
> ret = 0;
>
> out_unlock:
> unlock_two_pages(oldpage, newpage);
> up_write(&mm->mmap_sem);
> put_page(oldpage);
> put_page(newpage);
> return ret;
> }
>
> static int try_to_merge_two_pages(struct mm_struct *mm1, struct page *page1,
> struct mm_struct *mm2, struct page *page2,
> unsigned long addr1, unsigned long addr2)
> {
> struct vm_area_struct *vma;
> pgprot_t prot;
> int ret = 0;
>
> /*
> * in case page2 isnt shared (it isnt PageKsm we have to allocate a new
> * file mapped page and make the two ptes of mm1(page1) and mm2(page2)
> * point to it, in case page is shared page, we can just make the pte of
> * mm1(page1) point to page2
> */
> if (PageKsm(page2)) {
> vma = find_vma(mm1, addr1);
> if (!vma)
> return ret;
> prot = vma->vm_page_prot;
> pgprot_val(prot) &= ~VM_WRITE;
> ret = try_to_merge_one_page(mm1, vma, page1, page2, prot);
> }
> else {
> struct page *kpage;
>
> kpage = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
> if (!kpage)
> return ret;
>
> vma = find_vma(mm1, addr1);
> if (!vma)
> return ret;
> prot = vma->vm_page_prot;
> pgprot_val(prot) &= ~VM_WRITE;
>
> copy_highpage(kpage, page1);
> ret = try_to_merge_one_page(mm1, vma, page1, kpage, prot);
> page_cache_release(kpage);
>
> if (ret) {
> vma = find_vma(mm2, addr2);
> if (!vma)
> return ret;
>
> prot = vma->vm_page_prot;
> pgprot_val(prot) &= ~VM_WRITE;
>
> ret += try_to_merge_one_page(mm2, vma, page2, kpage,
> prot);
> }
> }
> return ret;
> }
>
> static int cmp_and_merge_page(struct ksm_scan *ksm_scan, struct page *page)
> {
> struct hlist_head *bucket;
> struct hlist_node *node, *n;
> struct page_item *page_item;
> struct ksm_mem_slot *slot;
> unsigned long hash_index;
> unsigned long addr;
> void *page_addr;
> int ret = 0;
> int used = 0;
>
> page_addr = kmap(page);
> hash_index = calc_hash_index(page_addr);
> bucket = &ksm->page_hash[hash_index];
>
> slot = ksm_scan->slot_index;
> addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
>
> spin_lock(&ksm->hash_lock);
> /*
> * update_hash must to be called everytime that there is a chance
> * that the data inside the page was changed since the last time it
> * was inserted into our hash table, (to avoid cases where the page
> * will be inserted more than once to the hash table)
> */
> update_hash(ksm_scan, hash_index);
> spin_unlock(&ksm->hash_lock);
>
> hlist_for_each_entry_safe(page_item, node, n, bucket, link) {
> int npages;
> struct page *hash_page[1];
> void *hash_page_addr;
>
> if (slot->mm == page_item->mm && addr == page_item->addr) {
> used = 1;
> continue;
> }
>
> down_read(&page_item->mm->mmap_sem);
> /* if the page is swapped out or in the swap cache we dont want
> * to scan it (it is just for performence)
> */
> if (!is_present_pte(page_item->mm, page_item->addr)) {
> up_read(&page_item->mm->mmap_sem);
> continue;
> }
> npages = get_user_pages(current, page_item->mm, page_item->addr,
> 1, 0, 0, hash_page, NULL);
> up_read(&page_item->mm->mmap_sem);
> if (npages != 1)
> break;
>
> hash_page_addr = kmap_atomic(hash_page[0], KM_USER1);
> /*
> * we check if the pages are equal 2 times, one time when they
> * arent write protected and again after we write protect them
> */
> if (!memcmp(page_addr, hash_page_addr, PAGE_SIZE)) {
> kunmap_atomic(hash_page_addr, KM_USER0);
kmap slot mismatch
> kunmap(page_addr);
>
> ret = try_to_merge_two_pages(slot->mm, page,
> page_item->mm,
> hash_page[0], addr,
> page_item->addr);
> page_cache_release(hash_page[0]);
> return ret;
> }
> kunmap_atomic(hash_page_addr, KM_USER0);
kmap slot mismatch
> page_cache_release(hash_page[0]);
> }
> kunmap(page_addr);
> /* if node is NULL and used=0, the page is not inside the hash table */
> if (!node && !used) {
> struct page_item *page_item;
> struct rmap_item *rmap_item;
>
> page_item = alloc_page_item();
> if (!page_item)
> return 1;
>
> rmap_item = alloc_rmap_item();
> if (!rmap_item) {
> kfree(page_item);
> return ret;
> }
>
> spin_lock(&ksm->hash_lock);
> update_hash(ksm_scan, hash_index);
> insert_page_to_hash(ksm_scan, hash_index, page_item, rmap_item);
> spin_unlock(&ksm->hash_lock);
> }
>
> return ret;
> }
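On the two kmap slot mismatch notes above: the kunmap_atomic() slot has to
match the one used for the mapping, i.e.:

	hash_page_addr = kmap_atomic(hash_page[0], KM_USER1);
	/* ... compare against page_addr ... */
	kunmap_atomic(hash_page_addr, KM_USER1);	/* same slot as the kmap */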
>
> /* return 1 - no slots registered, nothing to be done */
> static int scan_get_next_index(struct ksm_scan *ksm_scan, int nscan)
> {
> struct ksm_mem_slot *slot;
>
> if (list_empty(&ksm->slots))
> /* no slot avaible!, return 1 */
> return 1;
>
> slot = ksm_scan->slot_index;
> /*
> * first lets see if in this memory slot there are more pages that we
> * can use as our scan bluck pages
> */
> if ((slot->npages - ksm_scan->page_index - nscan) > 0) {
> ksm_scan->page_index += nscan;
> return 0;
> }
>
> list_for_each_entry_from(slot, &ksm->slots, link) {
> if (slot == ksm_scan->slot_index)
> continue;
> ksm_scan->page_index = 0;
> ksm_scan->slot_index = slot;
> return 0;
> }
>
> /* look like we finished scanning the whole memory, starting again */
> ksm_scan->page_index = 0;
> ksm_scan->slot_index = list_first_entry(&ksm->slots,
> struct ksm_mem_slot, link);
> return 0;
> }
>
> /*
> * update slot_index so it point to vaild data, it is possible that by
> * the time we are here the data that ksm_scan was pointed to was released
> * so we have to call this function every time after taking the slots_lock
> */
> static void scan_update_old_index(struct ksm_scan *ksm_scan)
> {
> struct ksm_mem_slot *slot;
>
> if (list_empty(&ksm->slots))
> return;
>
> list_for_each_entry(slot, &ksm->slots, link) {
> if (ksm_scan->slot_index == slot)
> return;
> }
>
> ksm_scan->slot_index = list_first_entry(&ksm->slots,
> struct ksm_mem_slot, link);
> }
>
> static int ksm_scan_start(struct ksm_scan *ksm_scan, int scan_npages)
> {
> struct ksm_mem_slot *slot;
> struct page *page[1];
just struct page *page and pass &page to get_user_pages
since it's just one page
> int val;
> int ret = 0;
>
> down_read(&ksm->slots_lock);
>
> scan_update_old_index(ksm_scan);
>
> while (scan_npages > 0) {
> if (scan_get_next_index(ksm_scan, 1)) {
> /* we have no slots, another ret value should be used */
> goto out;
> }
>
> slot = ksm_scan->slot_index;
> down_read(&slot->mm->mmap_sem);
> /* if the page is swappd out or in swap cache, we dont want to
> * scan it (it is just for performence)
> */
> if (is_present_pte(slot->mm, slot->addr +
> ksm_scan->page_index * PAGE_SIZE)) {
> val = get_user_pages(current, slot->mm, slot->addr +
> ksm_scan->page_index * PAGE_SIZE ,
> 1, 0, 0, page, NULL);
> up_read(&slot->mm->mmap_sem);
> if (val == 1) {
> if (!PageKsm(page[0]))
> cmp_and_merge_page(ksm_scan, page[0]);
> page_cache_release(page[0]);
> }
> } else
> up_read(&slot->mm->mmap_sem);
> scan_npages--;
> }
>
> scan_get_next_index(ksm_scan, 1);
> out:
> up_read(&ksm->slots_lock);
> return ret;
> }
>
> static int ksm_scan_ioctl_start(struct ksm_scan *ksm_scan, int scan_npages)
> {
> return ksm_scan_start(ksm_scan, scan_npages);
> }
Unnecessary wrapper
> static int ksm_scan_release(struct inode *inode, struct file *filp)
> {
> struct ksm_scan *ksm_scan = filp->private_data;
>
> kfree(ksm_scan);
> return 0;
> }
>
> static long ksm_scan_ioctl(struct file *filp,
> unsigned int ioctl, unsigned long arg)
> {
> struct ksm_scan *ksm_scan = filp->private_data;
> int r = EINVAL;
>
> switch (ioctl) {
> case KSM_SCAN:
> r = ksm_scan_ioctl_start(ksm_scan, arg);
> }
> return r;
> }
>
> static struct file_operations ksm_sma_fops = {
> .release = ksm_sma_release,
> .unlocked_ioctl = ksm_sma_ioctl,
> .compat_ioctl = ksm_sma_ioctl,
> };
>
> static int ksm_dev_ioctl_create_shared_memory_area(void)
> {
> int fd, r;
> struct ksm_sma *ksm_sma;
> struct inode *inode;
> struct file *file;
>
> ksm_sma = kmalloc(sizeof(struct ksm_sma), GFP_KERNEL);
> if (IS_ERR(ksm_sma)) {
failure == NULL
> r = PTR_ERR(ksm_sma);
> goto out;
> }
>
> INIT_LIST_HEAD(&ksm_sma->sma_slots);
>
> r = anon_inode_getfd(&fd, &inode, &file, "ksm-sma", &ksm_sma_fops,
> ksm_sma);
> if (r)
> goto out_free;
>
> return fd;
> out_free:
> kfree(ksm_sma);
> out:
> return -1;
return r to propagate meaningful error
> }
>
> static struct file_operations ksm_scan_fops = {
> .release = ksm_scan_release,
> .unlocked_ioctl = ksm_scan_ioctl,
> .compat_ioctl = ksm_scan_ioctl,
> };
>
> static struct ksm_scan *ksm_scan_create(void)
> {
> struct ksm_scan *ksm_scan;
>
> ksm_scan = kzalloc(sizeof(struct ksm_scan), GFP_KERNEL);
> return ksm_scan;
> }
Unnecessary wrapper
> static int ksm_dev_ioctl_create_scan(void)
> {
> int fd, r;
> struct inode *inode;
> struct file *file;
> struct ksm_scan *ksm_scan;
>
> ksm_scan = ksm_scan_create();
> if (IS_ERR(ksm_scan)) {
failure == NULL
> r = PTR_ERR(ksm_scan);
> goto out;
> }
>
> r = anon_inode_getfd(&fd, &inode, &file, "ksm-scan", &ksm_scan_fops,
> ksm_scan);
> if (r)
> goto out_free;
> return fd;
>
> out_free:
> kfree(ksm_scan);
> out:
> return -1;
return r to propagate error
> }
>
> static long ksm_dev_ioctl(struct file *filp,
> unsigned int ioctl, unsigned long arg)
> {
> long r = -EINVAL;
>
> switch(ioctl) {
> case KSM_GET_API_VERSION:
> r = KSM_API_VERSION;
> break;
> case KSM_CREATE_SHARED_MEMORY_AREA:
> r = ksm_dev_ioctl_create_shared_memory_area();
> break;
> case KSM_CREATE_SCAN:
> r = ksm_dev_ioctl_create_scan();
What's the value of having multiple scanners?
And how expensive is scanning?
> default:
> return r;
> }
> return r;
> }
>
> static int ksm_dev_open(struct inode *inode, struct file *filp)
> {
> try_module_get(THIS_MODULE);
> return 0;
> }
>
> static int ksm_dev_release(struct inode *inode, struct file *filp)
> {
> module_put(THIS_MODULE);
> return 0;
> }
>
> static struct file_operations ksm_chardev_ops = {
> .open = ksm_dev_open,
> .release = ksm_dev_release,
> .unlocked_ioctl = ksm_dev_ioctl,
> .compat_ioctl = ksm_dev_ioctl,
> };
>
> static struct miscdevice ksm_dev = {
> KSM_MINOR,
> "ksm",
> &ksm_chardev_ops,
> };
>
> static int __init ksm_init(void)
> {
> int r;
>
> ksm = kzalloc(sizeof(struct ksm), GFP_KERNEL);
> if (IS_ERR(ksm)) {
failure == NULL
> r = PTR_ERR(ksm);
> goto out;
> }
>
> r = ksm_slab_init();
> if (r)
> goto out_free;
>
> r = page_hash_init();
> if (r)
> goto out_free1;
>
> r = rmap_hash_init();
> if (r)
> goto out_free2;
>
> INIT_LIST_HEAD(&ksm->slots);
> init_rwsem(&ksm->slots_lock);
> spin_lock_init(&ksm->hash_lock);
>
> r = misc_register(&ksm_dev);
> if (r) {
> printk(KERN_ERR "ksm: misc device register failed\n");
> goto out_free3;
> }
> printk(KERN_WARNING "ksm loaded\n");
> return 0;
>
> out_free3:
> rmap_hash_free();
> out_free2:
> page_hash_free();
> out_free1:
> ksm_slab_free();
> out_free:
> kfree(ksm);
> out:
> return r;
> }
>
> static void __exit ksm_exit(void)
> {
> misc_deregister(&ksm_dev);
> rmap_hash_free();
> page_hash_free();
> ksm_slab_free();
> kfree(ksm);
> }
>
> module_init(ksm_init)
> module_exit(ksm_exit)
* Re: [kvm-devel] [RFC][PATCH 3/5] ksm source code
2008-01-24 7:24 ` [kvm-devel] " Chris Wright
@ 2008-01-24 9:07 ` Izik Eidus
2008-01-24 9:22 ` Avi Kivity
2008-01-24 17:51 ` Chris Wright
0 siblings, 2 replies; 9+ messages in thread
From: Izik Eidus @ 2008-01-24 9:07 UTC (permalink / raw)
To: Chris Wright; +Cc: kvm-devel, andrea, avi, dor.laor, linux-mm, yaniv
Chris Wright wrote:
> * Izik Eidus (izike@qumranet.com) wrote:
>
> Just a bunch of nitpicks.
>
>
>> struct ksm *ksm;
>>
>
> static
>
>
yes
>> static int page_hash_size = 0;
>>
>
> no need to initialize to zero
>
ok
>
>> module_param(page_hash_size, int, 0);
>> MODULE_PARM_DESC(page_hash_size, "Hash table size for the pages checksum");
>>
>> static int rmap_hash_size = 0;
>>
>
> no need to initialize to zero
>
>
>> module_param(rmap_hash_size, int, 0);
>> MODULE_PARM_DESC(rmap_hash_size, "Hash table size for the reverse mapping");
>>
>> int ksm_slab_init(void)
>>
>
> static
>
ok
>
>> {
>> int ret = 1;
>>
>> ksm->page_item_cache = kmem_cache_create("ksm_page_item",
>> sizeof(struct page_item), 0,
>> 0, NULL);
>> if (!ksm->page_item_cache)
>> goto out;
>>
>> ksm->rmap_item_cache = kmem_cache_create("ksm_rmap_item",
>> sizeof(struct rmap_item), 0,
>> 0, NULL);
>> if (!ksm->rmap_item_cache)
>> goto out_free;
>> return 0;
>>
>> out_free:
>> kmem_cache_destroy(ksm->page_item_cache);
>> out:
>> return ret;
>> }
>>
>> void ksm_slab_free(void)
>> {
>> kmem_cache_destroy(ksm->rmap_item_cache);
>> kmem_cache_destroy(ksm->page_item_cache);
>> }
>>
>> static struct page_item *alloc_page_item(void)
>> {
>> void *obj;
>>
>> obj = kmem_cache_zalloc(ksm->page_item_cache, GFP_KERNEL);
>> return (struct page_item *)obj;
>> }
>>
>> static void free_page_item(struct page_item *page_item)
>> {
>> kfree(page_item);
>>
>
> kmem_cache_free
>
yes
>
>> }
>>
>> static struct rmap_item *alloc_rmap_item(void)
>> {
>> void *obj;
>>
>> obj = kmem_cache_zalloc(ksm->rmap_item_cache, GFP_KERNEL);
>> return (struct rmap_item *)obj;
>> }
>>
>> static void free_rmap_item(struct rmap_item *rmap_item)
>> {
>> kfree(rmap_item);
>>
>
> kmem_cache_free
>
>
>> }
>>
>> static int inline PageKsm(struct page *page)
>> {
>> return !PageAnon(page);
>> }
>>
>> static int page_hash_init(void)
>> {
>> if (!page_hash_size) {
>> struct sysinfo sinfo;
>>
>> si_meminfo(&sinfo);
>> page_hash_size = sinfo.totalram;
>> }
>> ksm->npages_hash = page_hash_size;
>> ksm->page_hash = vmalloc(ksm->npages_hash *
>> sizeof(struct hlist_head *));
>> if (IS_ERR(ksm->page_hash))
>>
>
> allocator returns NULL on failure (btw, this mistake is pervasive in the
> patch)
>
you are right, will change it
>
>> return PTR_ERR(ksm->page_hash);
>> memset(ksm->page_hash, 0,
>> ksm->npages_hash * sizeof(struct hlist_head *));
>> return 0;
>> }
>>
>> static void page_hash_free(void)
>> {
>> int i;
>> struct hlist_head *bucket;
>> struct hlist_node *node, *n;
>> struct page_item *page_item;
>>
>> for (i = 0; i < ksm->npages_hash; ++i) {
>> bucket = &ksm->page_hash[i];
>> hlist_for_each_entry_safe(page_item, node, n, bucket, link) {
>> hlist_del(&page_item->link);
>> free_page_item(page_item);
>> }
>> }
>> vfree(ksm->page_hash);
>> }
>>
>> static int rmap_hash_init(void)
>> {
>> if (!rmap_hash_size) {
>> struct sysinfo sinfo;
>>
>> si_meminfo(&sinfo);
>> rmap_hash_size = sinfo.totalram;
>> }
>> ksm->nrmaps_hash = rmap_hash_size;
>> ksm->rmap_hash = vmalloc(ksm->nrmaps_hash *
>> sizeof(struct hlist_head *));
>>
>
> failure == NULL
>
>
>> if (IS_ERR(ksm->rmap_hash))
>> return PTR_ERR(ksm->rmap_hash);
>> memset(ksm->rmap_hash, 0,
>> ksm->nrmaps_hash * sizeof(struct hlist_head *));
>> return 0;
>> }
>>
>> static void rmap_hash_free(void)
>> {
>> int i;
>> struct hlist_head *bucket;
>> struct hlist_node *node, *n;
>> struct rmap_item *rmap_item;
>>
>> for (i = 0; i < ksm->nrmaps_hash; ++i) {
>> bucket = &ksm->rmap_hash[i];
>> hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
>> hlist_del(&rmap_item->link);
>> free_rmap_item(rmap_item);
>> }
>> }
>> vfree(ksm->rmap_hash);
>> }
>>
>> static inline u32 calc_hash_index(void *addr)
>> {
>> return jhash(addr, PAGE_SIZE, 17) % ksm->npages_hash;
>> }
>>
>> static void remove_page_from_hash(struct mm_struct *mm, unsigned long addr)
>> {
>> struct rmap_item *rmap_item;
>> struct hlist_head *bucket;
>> struct hlist_node *node, *n;
>>
>> bucket = &ksm->rmap_hash[addr % ksm->nrmaps_hash];
>> hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
>> if (mm == rmap_item->page_item->mm &&
>> rmap_item->page_item->addr == addr) {
>> hlist_del(&rmap_item->page_item->link);
>> free_page_item(rmap_item->page_item);
>> hlist_del(&rmap_item->link);
>> free_rmap_item(rmap_item);
>> return;
>> }
>> }
>> }
>>
>> static int ksm_sma_ioctl_register_memory_region(struct ksm_sma *ksm_sma,
>> struct ksm_memory_region *mem)
>> {
>> struct ksm_mem_slot *slot;
>> int ret = 1;
>>
>
> ret = 1 doesn't make sense as a failure
>
right, will change
>
>> if (!current->mm)
>> goto out;
>>
>> slot = kzalloc(sizeof(struct ksm_mem_slot), GFP_KERNEL);
>> if (IS_ERR(slot)) {
>>
>
> failure == NULL
>
>
>> ret = PTR_ERR(slot);
>> goto out;
>> }
>>
>> slot->mm = get_task_mm(current);
>> slot->addr = mem->addr;
>> slot->npages = mem->npages;
>> list_add_tail(&slot->link, &ksm->slots);
>>
>
> slots_lock?
>
good catch, I forgot to put it here
>
>> list_add_tail(&slot->sma_link, &ksm_sma->sma_slots);
>>
>
> theoretical, but if threaded, registering program could do this
> concurrently
>
you mean the missing slots_lock here? I agree it's needed
>
>> ret = 0;
>> out:
>> return ret;
>> }
>>
>> static void remove_mm_from_hash(struct mm_struct *mm)
>> {
>> struct ksm_mem_slot *slot;
>> int pages_count = 0;
>>
>> list_for_each_entry(slot, &ksm->slots, link)
>> if (slot->mm == mm)
>> break;
>> if (!slot)
>> BUG();
>>
>> spin_lock(&ksm->hash_lock);
>> while (pages_count < slot->npages)
>> {
>> remove_page_from_hash(mm, slot->addr + pages_count * PAGE_SIZE);
>> pages_count++;
>> }
>> spin_unlock(&ksm->hash_lock);
>> list_del(&slot->link);
>> }
>>
>> static int ksm_sma_ioctl_remove_memory_region(struct ksm_sma *ksm_sma)
>> {
>> struct ksm_mem_slot *slot, *node;
>>
>> down_write(&ksm->slots_lock);
>> list_for_each_entry_safe(slot, node, &ksm_sma->sma_slots, sma_link) {
>> remove_mm_from_hash(slot->mm);
>> mmput(slot->mm);
>> list_del(&slot->sma_link);
>> kfree(slot);
>> }
>> up_write(&ksm->slots_lock);
>> return 0;
>> }
>>
>> static int ksm_sma_release(struct inode *inode, struct file *filp)
>> {
>> struct ksm_sma *ksm_sma = filp->private_data;
>> int r;
>>
>> r = ksm_sma_ioctl_remove_memory_region(ksm_sma);
>> return r;
>>
>
> leaked ksm_sma?
>
yup leaked ksm_sma, will fix
>
>> }
>>
>> static long ksm_sma_ioctl(struct file *filp,
>> unsigned int ioctl, unsigned long arg)
>> {
>> struct ksm_sma *sma = filp->private_data;
>> void __user *argp = (void __user *)arg;
>> int r = EINVAL;
>>
>> switch (ioctl) {
>> case KSM_REGISTER_MEMORY_REGION: {
>> struct ksm_memory_region ksm_memory_region;
>>
>> r = -EFAULT;
>> if (copy_from_user(&ksm_memory_region, argp,
>> sizeof ksm_memory_region))
>>
>
> this doesn't look compat safe:
>
> struct ksm_memory_region {
> __u32 npages; /* number of pages to share */
> __u64 addr; /* the begining of the virtual address */
> };
>
>
why isn't it compat safe?
>> goto out;
>> r = ksm_sma_ioctl_register_memory_region(sma,
>> &ksm_memory_region);
>> break;
>> }
>> case KSM_REMOVE_MEMORY_REGION:
>> r = ksm_sma_ioctl_remove_memory_region(sma);
>> break;
>> }
>>
>> out:
>> return r;
>> }
>>
>> static int insert_page_to_hash(struct ksm_scan *ksm_scan,
>> unsigned long hash_index,
>> struct page_item *page_item,
>> struct rmap_item *rmap_item)
>> {
>> struct ksm_mem_slot *slot;
>> struct hlist_head *bucket;
>>
>> slot = ksm_scan->slot_index;
>> page_item->addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
>> page_item->mm = slot->mm;
>> bucket = &ksm->page_hash[hash_index];
>> hlist_add_head(&page_item->link, bucket);
>>
>> rmap_item->page_item = page_item;
>> rmap_item->oldindex = hash_index;
>> bucket = &ksm->rmap_hash[page_item->addr % ksm->nrmaps_hash];
>> hlist_add_head(&rmap_item->link, bucket);
>> return 0;
>> }
>>
>> static void update_hash(struct ksm_scan *ksm_scan,
>> unsigned long hash_index)
>> {
>> struct rmap_item *rmap_item;
>> struct ksm_mem_slot *slot;
>> struct hlist_head *bucket;
>> struct hlist_node *node, *n;
>> unsigned long addr;
>>
>> slot = ksm_scan->slot_index;;
>> addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
>> bucket = &ksm->rmap_hash[addr % ksm->nrmaps_hash];
>> hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
>> if (slot->mm == rmap_item->page_item->mm &&
>> rmap_item->page_item->addr == addr) {
>> if (hash_index != rmap_item->oldindex) {
>> hlist_del(&rmap_item->page_item->link);
>> free_page_item(rmap_item->page_item);
>> hlist_del(&rmap_item->link);
>> free_rmap_item(rmap_item);
>> }
>> return;
>> }
>> }
>> }
>>
>> static void lock_two_pages(struct page *page1, struct page *page2)
>> {
>> if (page1 < page2) {
>> lock_page(page1);
>> lock_page(page2);
>> }
>> else {
>> lock_page(page2);
>> lock_page(page1);
>> }
>> }
>>
>> static void unlock_two_pages(struct page *page1, struct page *page2)
>> {
>> unlock_page(page1);
>> unlock_page(page2);
>> }
>>
>> static unsigned long addr_in_vma(struct vm_area_struct *vma, struct page *page)
>> {
>> pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
>> unsigned long addr;
>>
>> addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
>> if (unlikely(addr < vma->vm_start || addr >= vma->vm_end))
>> return -EFAULT;
>> return addr;
>> }
>>
>> static int is_present_pte(struct mm_struct *mm, unsigned long addr)
>> {
>> pgd_t *pgd;
>> pud_t *pud;
>> pmd_t *pmd;
>> pte_t *ptep;
>> spinlock_t *ptl;
>> int ret = 0;
>>
>> pgd = pgd_offset(mm, addr);
>> if (!pgd_present(*pgd))
>> goto out;
>>
>> pud = pud_offset(pgd, addr);
>> if (!pud_present(*pud))
>> goto out;
>>
>> pmd = pmd_offset(pud, addr);
>> if (!pmd_present(*pmd))
>> goto out;
>>
>> ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
>> if (!ptep)
>> goto out;
>>
>> if (pte_present(*ptep))
>> ret = 1;
>>
>> pte_unmap_unlock(ptep, ptl);
>> out:
>> return ret;
>> }
>>
>
> This is a generic helper.
>
are you recommending moving it into memory.c?
>
>> /*
>> * try_to_merge_one_page - take two pages and merge them into one
>> * note:
>> * oldpage should be anon page while newpage should be file mapped page
>> */
>> static int try_to_merge_one_page(struct mm_struct *mm,
>> struct vm_area_struct *vma,
>> struct page *oldpage,
>> struct page *newpage,
>> pgprot_t newprot)
>> {
>> int ret = 0;
>> unsigned long page_addr_in_vma;
>> void *oldaddr, *newaddr;
>>
>> get_page(newpage);
>> get_page(oldpage);
>>
>> down_write(&mm->mmap_sem);
>>
>> lock_two_pages(oldpage, newpage);
>>
>> page_addr_in_vma = addr_in_vma(vma, oldpage);
>> if (page_addr_in_vma == -EFAULT)
>> goto out_unlock;
>>
>> /*
>> * if the page is swapped or in swap cache, we cannot replace its pte
>> * we might want to run here swap_free in the future (it isnt exported)
>> */
>> if (!is_present_pte(mm, page_addr_in_vma))
>> goto out_unlock;
>>
>> if (!page_wrprotect(oldpage))
>> goto out_unlock;
>>
>> oldaddr = kmap_atomic(oldpage, KM_USER0);
>> newaddr = kmap_atomic(newpage, KM_USER1);
>>
>> ret = 1;
>> if (!memcmp(oldaddr, newaddr, PAGE_SIZE))
>> ret = replace_page(vma, oldpage, newpage, newprot);
>>
>
> Does it make sense to leave oldpage r/o if replace_page fails?
>
the chance that the pages won't still be equal here is very low,
but it could happen, and in that case we leave the page read-only and
later pay a vmexit + page fault + copy of the page data for nothing.
it could be solved by adding to the rmap something like
page_writeble(), which would work the same as page_wrprotect() but mark
the ptes writable again instead of read-only.
the question is: is that wanted in the rmap code?
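roughly, ignoring the kmap_atomic bookkeeping, the failure path would then
look like this sketch (page_writeble() being the hypothetical helper above):

	ret = 1;
	if (!memcmp(oldaddr, newaddr, PAGE_SIZE))
		ret = replace_page(vma, oldpage, newpage, newprot);
	if (ret)
		/* hypothetical rmap helper, the inverse of page_wrprotect():
		 * make the ptes writable again so a failed merge doesn't cost
		 * a vmexit + write fault + page copy later */
		page_writeble(oldpage);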
>
>> kunmap_atomic(oldaddr, KM_USER0);
>> kunmap_atomic(newaddr, KM_USER1);
>>
>> if (!ret)
>> ret = 1;
>> else
>> ret = 0;
>>
>> out_unlock:
>> unlock_two_pages(oldpage, newpage);
>> up_write(&mm->mmap_sem);
>> put_page(oldpage);
>> put_page(newpage);
>> return ret;
>> }
>>
>> static int try_to_merge_two_pages(struct mm_struct *mm1, struct page *page1,
>> struct mm_struct *mm2, struct page *page2,
>> unsigned long addr1, unsigned long addr2)
>> {
>> struct vm_area_struct *vma;
>> pgprot_t prot;
>> int ret = 0;
>>
>> /*
>> * in case page2 isnt shared (it isnt PageKsm we have to allocate a new
>> * file mapped page and make the two ptes of mm1(page1) and mm2(page2)
>> * point to it, in case page is shared page, we can just make the pte of
>> * mm1(page1) point to page2
>> */
>> if (PageKsm(page2)) {
>> vma = find_vma(mm1, addr1);
>> if (!vma)
>> return ret;
>> prot = vma->vm_page_prot;
>> pgprot_val(prot) &= ~VM_WRITE;
>> ret = try_to_merge_one_page(mm1, vma, page1, page2, prot);
>> }
>> else {
>> struct page *kpage;
>>
>> kpage = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
>> if (!kpage)
>> return ret;
>>
>> vma = find_vma(mm1, addr1);
>> if (!vma)
>> return ret;
>> prot = vma->vm_page_prot;
>> pgprot_val(prot) &= ~VM_WRITE;
>>
>> copy_highpage(kpage, page1);
>> ret = try_to_merge_one_page(mm1, vma, page1, kpage, prot);
>> page_cache_release(kpage);
>>
>> if (ret) {
>> vma = find_vma(mm2, addr2);
>> if (!vma)
>> return ret;
>>
>> prot = vma->vm_page_prot;
>> pgprot_val(prot) &= ~VM_WRITE;
>>
>> ret += try_to_merge_one_page(mm2, vma, page2, kpage,
>> prot);
>> }
>> }
>> return ret;
>> }
>>
>> static int cmp_and_merge_page(struct ksm_scan *ksm_scan, struct page *page)
>> {
>> struct hlist_head *bucket;
>> struct hlist_node *node, *n;
>> struct page_item *page_item;
>> struct ksm_mem_slot *slot;
>> unsigned long hash_index;
>> unsigned long addr;
>> void *page_addr;
>> int ret = 0;
>> int used = 0;
>>
>> page_addr = kmap(page);
>> hash_index = calc_hash_index(page_addr);
>> bucket = &ksm->page_hash[hash_index];
>>
>> slot = ksm_scan->slot_index;
>> addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
>>
>> spin_lock(&ksm->hash_lock);
>> /*
>> * update_hash must be called every time there is a chance that the
>> * data inside the page has changed since the last time it was
>> * inserted into our hash table (to avoid cases where the page would
>> * be inserted into the hash table more than once)
>> */
>> update_hash(ksm_scan, hash_index);
>> spin_unlock(&ksm->hash_lock);
>>
>> hlist_for_each_entry_safe(page_item, node, n, bucket, link) {
>> int npages;
>> struct page *hash_page[1];
>> void *hash_page_addr;
>>
>> if (slot->mm == page_item->mm && addr == page_item->addr) {
>> used = 1;
>> continue;
>> }
>>
>> down_read(&page_item->mm->mmap_sem);
>> /* if the page is swapped out or in the swap cache, we don't want
>> * to scan it (it is just for performance)
>> */
>> if (!is_present_pte(page_item->mm, page_item->addr)) {
>> up_read(&page_item->mm->mmap_sem);
>> continue;
>> }
>> npages = get_user_pages(current, page_item->mm, page_item->addr,
>> 1, 0, 0, hash_page, NULL);
>> up_read(&page_item->mm->mmap_sem);
>> if (npages != 1)
>> break;
>>
>> hash_page_addr = kmap_atomic(hash_page[0], KM_USER1);
>> /*
>> * we check if the pages are equal twice: once while they
>> * aren't write-protected and again after we write-protect them
>> */
>> if (!memcmp(page_addr, hash_page_addr, PAGE_SIZE)) {
>> kunmap_atomic(hash_page_addr, KM_USER0);
>>
>
> kmap slot mismatch
>
oops... :)
>
>> kunmap(page_addr);
>>
>> ret = try_to_merge_two_pages(slot->mm, page,
>> page_item->mm,
>> hash_page[0], addr,
>> page_item->addr);
>> page_cache_release(hash_page[0]);
>> return ret;
>> }
>> kunmap_atomic(hash_page_addr, KM_USER0);
>>
>
> kmap slot mismatch
>
>
>> page_cache_release(hash_page[0]);
>> }
>> kunmap(page_addr);
>> /* if node is NULL and used=0, the page is not inside the hash table */
>> if (!node && !used) {
>> struct page_item *page_item;
>> struct rmap_item *rmap_item;
>>
>> page_item = alloc_page_item();
>> if (!page_item)
>> return 1;
>>
>> rmap_item = alloc_rmap_item();
>> if (!rmap_item) {
>> kfree(page_item);
>> return ret;
>> }
>>
>> spin_lock(&ksm->hash_lock);
>> update_hash(ksm_scan, hash_index);
>> insert_page_to_hash(ksm_scan, hash_index, page_item, rmap_item);
>> spin_unlock(&ksm->hash_lock);
>> }
>>
>> return ret;
>> }
>>
>> /* return 1 - no slots registered, nothing to be done */
>> static int scan_get_next_index(struct ksm_scan *ksm_scan, int nscan)
>> {
>> struct ksm_mem_slot *slot;
>>
>> if (list_empty(&ksm->slots))
>> /* no slot available, return 1 */
>> return 1;
>>
>> slot = ksm_scan->slot_index;
>> /*
>> * first let's see if there are more pages in this memory slot that we
>> * can use as our next scan batch
>> */
>> if ((slot->npages - ksm_scan->page_index - nscan) > 0) {
>> ksm_scan->page_index += nscan;
>> return 0;
>> }
>>
>> list_for_each_entry_from(slot, &ksm->slots, link) {
>> if (slot == ksm_scan->slot_index)
>> continue;
>> ksm_scan->page_index = 0;
>> ksm_scan->slot_index = slot;
>> return 0;
>> }
>>
>> /* looks like we finished scanning the whole memory, start again */
>> ksm_scan->page_index = 0;
>> ksm_scan->slot_index = list_first_entry(&ksm->slots,
>> struct ksm_mem_slot, link);
>> return 0;
>> }
>>
>> /*
>> * update slot_index so it points to valid data; it is possible that by
>> * the time we get here the data that ksm_scan pointed to has been released,
>> * so we have to call this function every time after taking the slots_lock
>> */
>> static void scan_update_old_index(struct ksm_scan *ksm_scan)
>> {
>> struct ksm_mem_slot *slot;
>>
>> if (list_empty(&ksm->slots))
>> return;
>>
>> list_for_each_entry(slot, &ksm->slots, link) {
>> if (ksm_scan->slot_index == slot)
>> return;
>> }
>>
>> ksm_scan->slot_index = list_first_entry(&ksm->slots,
>> struct ksm_mem_slot, link);
>> }
>>
>> static int ksm_scan_start(struct ksm_scan *ksm_scan, int scan_npages)
>> {
>> struct ksm_mem_slot *slot;
>> struct page *page[1];
>>
>
> just struct page *page and pass &page to get_user_pages
> since it's just one page
>
Yeah, that sounds better.
>> int val;
>> int ret = 0;
>>
>> down_read(&ksm->slots_lock);
>>
>> scan_update_old_index(ksm_scan);
>>
>> while (scan_npages > 0) {
>> if (scan_get_next_index(ksm_scan, 1)) {
>> /* we have no slots, another ret value should be used */
>> goto out;
>> }
>>
>> slot = ksm_scan->slot_index;
>> down_read(&slot->mm->mmap_sem);
>> /* if the page is swapped out or in the swap cache, we don't want to
>> * scan it (it is just for performance)
>> */
>> if (is_present_pte(slot->mm, slot->addr +
>> ksm_scan->page_index * PAGE_SIZE)) {
>> val = get_user_pages(current, slot->mm, slot->addr +
>> ksm_scan->page_index * PAGE_SIZE ,
>> 1, 0, 0, page, NULL);
>> up_read(&slot->mm->mmap_sem);
>> if (val == 1) {
>> if (!PageKsm(page[0]))
>> cmp_and_merge_page(ksm_scan, page[0]);
>> page_cache_release(page[0]);
>> }
>> } else
>> up_read(&slot->mm->mmap_sem);
>> scan_npages--;
>> }
>>
>> scan_get_next_index(ksm_scan, 1);
>> out:
>> up_read(&ksm->slots_lock);
>> return ret;
>> }
>>
>> static int ksm_scan_ioctl_start(struct ksm_scan *ksm_scan, int scan_npages)
>> {
>> return ksm_scan_start(ksm_scan, scan_npages);
>> }
>>
>
> Unnecessary wrapper
>
ok
>
>> static int ksm_scan_release(struct inode *inode, struct file *filp)
>> {
>> struct ksm_scan *ksm_scan = filp->private_data;
>>
>> kfree(ksm_scan);
>> return 0;
>> }
>>
>> static long ksm_scan_ioctl(struct file *filp,
>> unsigned int ioctl, unsigned long arg)
>> {
>> struct ksm_scan *ksm_scan = filp->private_data;
>> int r = EINVAL;
>>
>> switch (ioctl) {
>> case KSM_SCAN:
>> r = ksm_scan_ioctl_start(ksm_scan, arg);
>> }
>> return r;
>> }
>>
>> static struct file_operations ksm_sma_fops = {
>> .release = ksm_sma_release,
>> .unlocked_ioctl = ksm_sma_ioctl,
>> .compat_ioctl = ksm_sma_ioctl,
>> };
>>
>> static int ksm_dev_ioctl_create_shared_memory_area(void)
>> {
>> int fd, r;
>> struct ksm_sma *ksm_sma;
>> struct inode *inode;
>> struct file *file;
>>
>> ksm_sma = kmalloc(sizeof(struct ksm_sma), GFP_KERNEL);
>> if (IS_ERR(ksm_sma)) {
>>
>
> failure == NULL
>
>
>> r = PTR_ERR(ksm_sma);
>> goto out;
>> }
>>
>> INIT_LIST_HEAD(&ksm_sma->sma_slots);
>>
>> r = anon_inode_getfd(&fd, &inode, &file, "ksm-sma", &ksm_sma_fops,
>> ksm_sma);
>> if (r)
>> goto out_free;
>>
>> return fd;
>> out_free:
>> kfree(ksm_sma);
>> out:
>> return -1;
>>
>
> return r to propagate meaningful error
>
right
>
>> }
>>
>> static struct file_operations ksm_scan_fops = {
>> .release = ksm_scan_release,
>> .unlocked_ioctl = ksm_scan_ioctl,
>> .compat_ioctl = ksm_scan_ioctl,
>> };
>>
>> static struct ksm_scan *ksm_scan_create(void)
>> {
>> struct ksm_scan *ksm_scan;
>>
>> ksm_scan = kzalloc(sizeof(struct ksm_scan), GFP_KERNEL);
>> return ksm_scan;
>> }
>>
>
> Unnecessary wrapper
>
ok
>
>> static int ksm_dev_ioctl_create_scan(void)
>> {
>> int fd, r;
>> struct inode *inode;
>> struct file *file;
>> struct ksm_scan *ksm_scan;
>>
>> ksm_scan = ksm_scan_create();
>> if (IS_ERR(ksm_scan)) {
>>
>
> failure == NULL
>
>
>> r = PTR_ERR(ksm_scan);
>> goto out;
>> }
>>
>> r = anon_inode_getfd(&fd, &inode, &file, "ksm-scan", &ksm_scan_fops,
>> ksm_scan);
>> if (r)
>> goto out_free;
>> return fd;
>>
>> out_free:
>> kfree(ksm_scan);
>> out:
>> return -1;
>>
>
> return r to propagate error
>
>
>> }
>>
>> static long ksm_dev_ioctl(struct file *filp,
>> unsigned int ioctl, unsigned long arg)
>> {
>> long r = -EINVAL;
>>
>> switch(ioctl) {
>> case KSM_GET_API_VERSION:
>> r = KSM_API_VERSION;
>> break;
>> case KSM_CREATE_SHARED_MEMORY_AREA:
>> r = ksm_dev_ioctl_create_shared_memory_area();
>> break;
>> case KSM_CREATE_SCAN:
>> r = ksm_dev_ioctl_create_scan();
>>
>
> What's the value of having multiple scanners?
> And how expensive is scanning?
>
Right now there is no real value in having multiple scanners, but the cost
of this design is nothing.
In the future we might want to make each scanner scan some limited areas at
a given speed, e.g.:
while (1) {
        r = ioctl(fd_scan, KSM_SCAN, 100);
        usleep(1000);
}
A scanner like this takes 0-1% (closer to 0) of the CPU when it doesn't merge
anything (just scanning); it takes essentially no CPU as long as it doesn't
find any pages to merge. Note that in practice scanning should be done more
slowly than in the example above. When a page to merge is found, CPU usage
grows a little, because the data of the merged pages now gets copied.
I will run a few workload tests and report back as soon as I have better
information about this.
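For completeness, a minimal userspace sketch of the intended flow (untested;
the header path, the address/size values and the error handling are
assumptions, the ioctls are the ones defined by this patch):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ksm.h"                        /* KSM_* ioctls from this patch */

int main(void)
{
        int dev_fd, sma_fd, scan_fd;
        struct ksm_memory_region region = {
                .npages = 8192,         /* 32MB worth of pages */
                .addr = 0x60000000,     /* made-up virtual address */
        };

        dev_fd = open("/dev/ksm", O_RDWR);
        if (dev_fd < 0)
                return 1;

        /* register an area of this process whose pages may be merged */
        sma_fd = ioctl(dev_fd, KSM_CREATE_SHARED_MEMORY_AREA);
        if (sma_fd < 0 || ioctl(sma_fd, KSM_REGISTER_MEMORY_REGION, &region) < 0)
                return 1;

        /* create a scanner and drive it at whatever pace we like */
        scan_fd = ioctl(dev_fd, KSM_CREATE_SCAN);
        if (scan_fd < 0)
                return 1;

        for (;;) {
                if (ioctl(scan_fd, KSM_SCAN, 100) < 0)  /* look at 100 pages */
                        break;
                usleep(1000);
        }
        return 0;
}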
>
>> default:
>> return r;
>> }
>> return r;
>> }
>>
>> static int ksm_dev_open(struct inode *inode, struct file *filp)
>> {
>> try_module_get(THIS_MODULE);
>> return 0;
>> }
>>
>> static int ksm_dev_release(struct inode *inode, struct file *filp)
>> {
>> module_put(THIS_MODULE);
>> return 0;
>> }
>>
>> static struct file_operations ksm_chardev_ops = {
>> .open = ksm_dev_open,
>> .release = ksm_dev_release,
>> .unlocked_ioctl = ksm_dev_ioctl,
>> .compat_ioctl = ksm_dev_ioctl,
>> };
>>
>> static struct miscdevice ksm_dev = {
>> KSM_MINOR,
>> "ksm",
>> &ksm_chardev_ops,
>> };
>>
>> static int __init ksm_init(void)
>> {
>> int r;
>>
>> ksm = kzalloc(sizeof(struct ksm), GFP_KERNEL);
>> if (IS_ERR(ksm)) {
>>
>
> failure == NULL
>
>
>> r = PTR_ERR(ksm);
>> goto out;
>> }
>>
>> r = ksm_slab_init();
>> if (r)
>> goto out_free;
>>
>> r = page_hash_init();
>> if (r)
>> goto out_free1;
>>
>> r = rmap_hash_init();
>> if (r)
>> goto out_free2;
>>
>> INIT_LIST_HEAD(&ksm->slots);
>> init_rwsem(&ksm->slots_lock);
>> spin_lock_init(&ksm->hash_lock);
>>
>> r = misc_register(&ksm_dev);
>> if (r) {
>> printk(KERN_ERR "ksm: misc device register failed\n");
>> goto out_free3;
>> }
>> printk(KERN_WARNING "ksm loaded\n");
>> return 0;
>>
>> out_free3:
>> rmap_hash_free();
>> out_free2:
>> page_hash_free();
>> out_free1:
>> ksm_slab_free();
>> out_free:
>> kfree(ksm);
>> out:
>> return r;
>> }
>>
>> static void __exit ksm_exit(void)
>> {
>> misc_deregister(&ksm_dev);
>> rmap_hash_free();
>> page_hash_free();
>> ksm_slab_free();
>> kfree(ksm);
>> }
>>
>> module_init(ksm_init)
>> module_exit(ksm_exit)
>>
Thanks a lot for reviewing!
--
woof.
* Re: [kvm-devel] [RFC][PATCH 3/5] ksm source code
2008-01-24 9:07 ` Izik Eidus
@ 2008-01-24 9:22 ` Avi Kivity
2008-01-24 9:48 ` Izik Eidus
2008-01-24 17:53 ` Chris Wright
2008-01-24 17:51 ` Chris Wright
1 sibling, 2 replies; 9+ messages in thread
From: Avi Kivity @ 2008-01-24 9:22 UTC (permalink / raw)
To: Izik Eidus; +Cc: Chris Wright, kvm-devel, andrea, dor.laor, linux-mm, yaniv
Izik Eidus wrote:
>>
>>> struct ksm *ksm;
>>>
>>
>> static
>>
>>
>
> yes
Actually the entire contents of 'struct ksm' should be module static
variables.
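Something along these lines, for illustration (the names are the existing
fields of struct ksm):

static struct kmem_cache *page_item_cache;
static struct kmem_cache *rmap_item_cache;

static int npages_hash;
static struct hlist_head *page_hash;
static int nrmaps_hash;
static struct hlist_head *rmap_hash;

static LIST_HEAD(slots);
static DECLARE_RWSEM(slots_lock);
static DEFINE_SPINLOCK(hash_lock);

That also lets the kzalloc()/error check in ksm_init() go away.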
--
Any sufficiently difficult bug is indistinguishable from a feature.
* Re: [kvm-devel] [RFC][PATCH 3/5] ksm source code
2008-01-24 9:22 ` Avi Kivity
@ 2008-01-24 9:48 ` Izik Eidus
2008-01-24 17:53 ` Chris Wright
1 sibling, 0 replies; 9+ messages in thread
From: Izik Eidus @ 2008-01-24 9:48 UTC (permalink / raw)
To: Avi Kivity; +Cc: Chris Wright, kvm-devel, andrea, dor.laor, linux-mm, yaniv
Avi Kivity wrote:
> Izik Eidus wrote:
>>>
>>>> struct ksm *ksm;
>>>>
>>>
>>> static
>>>
>>>
>>
>> yes
>
>
> Actually the entire contents of 'struct ksm' should be module static
> variables.
>
I agree with you, I will fix this as well.
--
woof.
* Re: [kvm-devel] [RFC][PATCH 3/5] ksm source code
2008-01-24 9:07 ` Izik Eidus
2008-01-24 9:22 ` Avi Kivity
@ 2008-01-24 17:51 ` Chris Wright
2008-01-24 19:41 ` Avi Kivity
1 sibling, 1 reply; 9+ messages in thread
From: Chris Wright @ 2008-01-24 17:51 UTC (permalink / raw)
To: Izik Eidus
Cc: Chris Wright, kvm-devel, andrea, avi, dor.laor, linux-mm, yaniv
* Izik Eidus (izike@qumranet.com) wrote:
> Chris Wright wrote:
>>> list_add_tail(&slot->link, &ksm->slots);
>>
>> slots_lock?
>
> Good catch, I forgot to take it here.
>
>>> list_add_tail(&slot->sma_link, &ksm_sma->sma_slots);
>>>
>>
>> theoretical, but if threaded, registering program could do this
>> concurrently
>
> You mean the missing slots_lock? I agree it's needed here.
Yeah, I missed that slots_lock protects both.
>>> case KSM_REGISTER_MEMORY_REGION: {
>>> struct ksm_memory_region ksm_memory_region;
>>>
>>> r = -EFAULT;
>>> if (copy_from_user(&ksm_memory_region, argp,
>>> sizeof ksm_memory_region))
>>
>> this doesn't look compat safe:
>>
>> struct ksm_memory_region {
>> __u32 npages; /* number of pages to share */
>> __u64 addr; /* the beginning of the virtual address */
>> };
>
> Why isn't it compat safe?
32-bit has more relaxed alignment requirement for __u64 (4 bytes)
than 64-bit (8 bytes). choices are reverse the order or add padding
(can test by compiling structure in 32 and 64 bit).
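For example, a quick userspace check (not part of the patch; the pad field is
one possible fix). Without the pad, -m32 reports size 12 and offset 4 while
-m64 reports 16 and 8; with it, both report 16 and 8:

/* build with 'gcc -m32' and 'gcc -m64' and compare the output */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

typedef uint32_t __u32;
typedef uint64_t __u64;

struct ksm_memory_region {
        __u32 npages;   /* number of pages to share */
        __u32 pad;      /* explicit padding so both ABIs agree on the layout */
        __u64 addr;     /* the beginning of the virtual address */
};

int main(void)
{
        printf("size=%zu addr_offset=%zu\n",
               sizeof(struct ksm_memory_region),
               offsetof(struct ksm_memory_region, addr));
        return 0;
}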
>>> static int is_present_pte(struct mm_struct *mm, unsigned long addr)
>>> {
>>> pgd_t *pgd;
>>> pud_t *pud;
>>> pmd_t *pmd;
>>> pte_t *ptep;
>>> spinlock_t *ptl;
>>> int ret = 0;
>>>
>>> pgd = pgd_offset(mm, addr);
>>> if (!pgd_present(*pgd))
>>> goto out;
>>>
>>> pud = pud_offset(pgd, addr);
>>> if (!pud_present(*pud))
>>> goto out;
>>>
>>> pmd = pmd_offset(pud, addr);
>>> if (!pmd_present(*pmd))
>>> goto out;
>>>
>>> ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
>>> if (!ptep)
>>> goto out;
>>>
>>> if (pte_present(*ptep))
>>> ret = 1;
>>>
>>> pte_unmap_unlock(ptep, ptl);
>>> out:
>>> return ret;
>>> }
>>
>> This is generic helper.
>
> You recommend moving it into memory.c?
I think so; it would help to convert another user too. But, as is typical in
this area, each has slightly different requirements.
>>> /*
>>> * try_to_merge_one_page - take two pages and merge them into one
>>> * note:
>>> * oldpage should be anon page while newpage should be file mapped page
>>> */
>>> static int try_to_merge_one_page(struct mm_struct *mm,
>>> struct vm_area_struct *vma,
>>> struct page *oldpage,
>>> struct page *newpage,
>>> pgprot_t newprot)
>>> {
>>> int ret = 0;
>>> unsigned long page_addr_in_vma;
>>> void *oldaddr, *newaddr;
>>>
>>> get_page(newpage);
>>> get_page(oldpage);
>>>
>>> down_write(&mm->mmap_sem);
>>>
>>> lock_two_pages(oldpage, newpage);
>>>
>>> page_addr_in_vma = addr_in_vma(vma, oldpage);
>>> if (page_addr_in_vma == -EFAULT)
>>> goto out_unlock;
>>>
>>> /*
>>> * if the page is swapped or in the swap cache, we cannot replace its pte;
>>> * we might want to run swap_free here in the future (it isn't exported)
>>> */
>>> if (!is_present_pte(mm, page_addr_in_vma))
>>> goto out_unlock;
>>>
>>> if (!page_wrprotect(oldpage))
>>> goto out_unlock;
>>>
>>> oldaddr = kmap_atomic(oldpage, KM_USER0);
>>> newaddr = kmap_atomic(newpage, KM_USER1);
>>>
>>> ret = 1;
>>> if (!memcmp(oldaddr, newaddr, PAGE_SIZE))
>>> ret = replace_page(vma, oldpage, newpage, newprot);
>>>
>>
>> Does it make sense to leave oldpage r/o if replace_page fails?
>
> The chance that the pages won't still be identical here is very low, but
> it could happen, and in that case we will take a vmexit and a page fault
> plus a copy of the page data for nothing.
right
> It can be solved by adding to the rmap code something like page_writable(),
> which would work the same as page_wrprotect() but mark the ptes writable
> again instead of read-only.
> The question is whether the rmap code wants it.
maybe it's overkill
>>> case KSM_CREATE_SCAN:
>>> r = ksm_dev_ioctl_create_scan();
>>>
>> What's the value of having multiple scanners?
>> And how expensive is scanning?
> Right now there is no real value in having multiple scanners, but the cost
> of this design is nothing.
Yes, the only cost is at runtime, and whether each guest launch would get
stuck behind a bunch of scanners. Since they need write access to add an sma,
at least /dev/ksm being 0600 would keep it from being a DoS threat.
> In the future we might want to make each scanner scan some limited areas
> at a given speed, e.g.:
>
> while(1) {
> r = ioctl(fd_scan, KSM_SCAN, 100);
> usleep(1000);
> }
>
> A scanner like this takes 0-1% (closer to 0) of the CPU when it doesn't
> merge anything (just scanning); it takes essentially no CPU as long as it
> doesn't find any pages to merge. Note that in practice scanning should be
> done more slowly than in the example above. When a page to merge is found,
> CPU usage grows a little, because the data of the merged pages now gets
> copied.
>
> I will run a few workload tests and report back as soon as I have better
> information about this.
ok, thanks.
-chris
* Re: [kvm-devel] [RFC][PATCH 3/5] ksm source code
2008-01-24 9:22 ` Avi Kivity
2008-01-24 9:48 ` Izik Eidus
@ 2008-01-24 17:53 ` Chris Wright
1 sibling, 0 replies; 9+ messages in thread
From: Chris Wright @ 2008-01-24 17:53 UTC (permalink / raw)
To: Avi Kivity
Cc: Izik Eidus, Chris Wright, kvm-devel, andrea, dor.laor, linux-mm, yaniv
* Avi Kivity (avi@qumranet.com) wrote:
> Actually the entire contents of 'struct ksm' should be module static
> variables.
Yeah, I agree.
* Re: [kvm-devel] [RFC][PATCH 3/5] ksm source code
2008-01-24 17:51 ` Chris Wright
@ 2008-01-24 19:41 ` Avi Kivity
2008-01-24 20:34 ` Chris Wright
0 siblings, 1 reply; 9+ messages in thread
From: Avi Kivity @ 2008-01-24 19:41 UTC (permalink / raw)
To: Chris Wright; +Cc: Izik Eidus, andrea, yaniv, kvm-devel, linux-mm
Chris Wright wrote:
>>> struct ksm_memory_region {
>>> __u32 npages; /* number of pages to share */
>>> __u64 addr; /* the beginning of the virtual address */
>>> };
>>>
>> Why isn't it compat safe?
>>
>
> 32-bit has more relaxed alignment requirement for __u64 (4 bytes)
> than 64-bit (8 bytes). choices are reverse the order or add padding
> (can test by compiling structure in 32 and 64 bit).
>
Reversing the order isn't good enough, since the structure size would be
different, and that is embedded in the ioctl number. Padding is necessary.
[wishlist: struct { ... } __attribute__((abi_x86_64));]
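To make the size dependence visible, a userspace sketch (not from the patch;
the KSMIO magic and command number are placeholders): the unpadded layout
gives the two ABIs different command values, the padded one gives the same
value on both.

/* build with 'gcc -m32' and 'gcc -m64' and compare the printed commands */
#include <stdio.h>
#include <stdint.h>
#include <linux/ioctl.h>

struct region_unpadded { uint32_t npages; uint64_t addr; };
struct region_padded   { uint32_t npages; uint32_t pad; uint64_t addr; };

#define KSMIO 0xAB      /* placeholder ioctl magic */

int main(void)
{
        printf("unpadded cmd=%#x padded cmd=%#x\n",
               _IOW(KSMIO, 0x20, struct region_unpadded),
               _IOW(KSMIO, 0x20, struct region_padded));
        return 0;
}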
--
Any sufficiently difficult bug is indistinguishable from a feature.
* Re: [kvm-devel] [RFC][PATCH 3/5] ksm source code
2008-01-24 19:41 ` Avi Kivity
@ 2008-01-24 20:34 ` Chris Wright
0 siblings, 0 replies; 9+ messages in thread
From: Chris Wright @ 2008-01-24 20:34 UTC (permalink / raw)
To: Avi Kivity; +Cc: Chris Wright, Izik Eidus, andrea, yaniv, kvm-devel, linux-mm
* Avi Kivity (avi@qumranet.com) wrote:
> Chris Wright wrote:
>>>> struct ksm_memory_region {
>>>> __u32 npages; /* number of pages to share */
>>>> __u64 addr; /* the beginning of the virtual address */
>>>> };
>>>>
>>> Why isn't it compat safe?
>>
>> 32-bit has more relaxed alignment requirement for __u64 (4 bytes)
>> than 64-bit (8 bytes). choices are reverse the order or add padding
>> (can test by compiling structure in 32 and 64 bit).
>
> Reversing the order isn't good enough, since the structure size would be
> different, and that is embedded in the ioctl number.
good point.
> Padding is necessary.
>
> [wishlist: struct { ... } __attribute__((abi_x86_64));]
Thread overview: 9+ messages
2008-01-21 16:12 [RFC][PATCH 3/5] ksm source code Izik Eidus
2008-01-24 7:24 ` [kvm-devel] " Chris Wright
2008-01-24 9:07 ` Izik Eidus
2008-01-24 9:22 ` Avi Kivity
2008-01-24 9:48 ` Izik Eidus
2008-01-24 17:53 ` Chris Wright
2008-01-24 17:51 ` Chris Wright
2008-01-24 19:41 ` Avi Kivity
2008-01-24 20:34 ` Chris Wright