* hugetlbfs-2.5.39-5 (was Re: hugetlbfs-2.5.39-3)
2002-09-30 0:35 hugetlbfs-2.5.39-3 William Lee Irwin III
2002-09-30 0:52 ` hugetlbfs-2.5.39-3 Andrew Morton
@ 2002-09-30 21:27 ` William Lee Irwin III
1 sibling, 0 replies; 4+ messages in thread
From: William Lee Irwin III @ 2002-09-30 21:27 UTC (permalink / raw)
To: linux-mm; +Cc: bhartner
On Sun, Sep 29, 2002 at 05:35:58PM -0700, William Lee Irwin III wrote:
> This patch exports functionality similar to ramfs but utiizing the
> hugetlbfs interface to mmap its files. It's a straightforward derivative
> of ramfs requiring only hugetlbpage.c to export some prefaulting logic.
Some testing & fixing will need to be done, but it's where I'm going.
No new testing ("it compiles, ship it") just yet. More to come, though.
Bill, this is the new stuff. I'll get it into better shape by tonight
PST but wanted to flash something around for you before Austin sleeps.
New stuff:
Add SHM_HUGETLB flag as an extension, add hugetlb_zero_setup() similar
to tmpfs, check the flag and call it conditionally in newseg().
Something probably needs to be done to properly tear things down.
umount() appears to fail. Looks like it goes BUG() on unaligned
mappings too -- no big deal just -EINVAL them next post.
diff --minimal -Nru a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
--- a/arch/i386/mm/hugetlbpage.c Mon Sep 30 13:19:13 2002
+++ b/arch/i386/mm/hugetlbpage.c Mon Sep 30 13:19:13 2002
@@ -17,7 +17,7 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-static struct vm_operations_struct hugetlb_vm_ops;
+struct vm_operations_struct hugetlb_vm_ops;
struct list_head htlbpage_freelist;
spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
extern long htlbpagemem;
@@ -44,24 +44,22 @@
static struct page *
alloc_hugetlb_page(void)
{
- struct list_head *curr, *head;
+ int i;
struct page *page;
spin_lock(&htlbpage_lock);
-
- head = &htlbpage_freelist;
- curr = head->next;
-
- if (curr == head) {
+ if (list_empty(&htlbpage_freelist)) {
spin_unlock(&htlbpage_lock);
return NULL;
}
- page = list_entry(curr, struct page, list);
- list_del(curr);
+
+ page = list_entry(htlbpage_freelist.next, struct page, list);
+ list_del(&page->list);
htlbpagemem--;
spin_unlock(&htlbpage_lock);
set_page_count(page, 1);
- memset(page_address(page), 0, HPAGE_SIZE);
+ for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
+ clear_highpage(&page[i]);
return page;
}
@@ -459,6 +457,46 @@
return retval;
}
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+ spin_lock(&mm->page_table_lock);
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ unsigned long idx;
+ pte_t *pte = huge_pte_alloc(mm, addr);
+ struct page *page;
+
+ if (!pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (!pte_none(*pte))
+ continue;
+
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ page = alloc_hugetlb_page();
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ add_to_page_cache(page, mapping, idx);
+ }
+ set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+ }
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
static int
alloc_private_hugetlb_pages(int key, unsigned long addr, unsigned long len,
int prot, int flag)
@@ -540,6 +578,13 @@
return (int) htlbzone_pages;
}
-static struct vm_operations_struct hugetlb_vm_ops = {
+static struct page * hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int unused)
+{
+ BUG();
+ return NULL;
+}
+
+struct vm_operations_struct hugetlb_vm_ops = {
.close = zap_hugetlb_resources,
+ .nopage = hugetlb_nopage,
};
diff --minimal -Nru a/fs/Config.in b/fs/Config.in
--- a/fs/Config.in Mon Sep 30 13:19:13 2002
+++ b/fs/Config.in Mon Sep 30 13:19:13 2002
@@ -56,6 +56,11 @@
bool 'Virtual memory file system support (former shm fs)' CONFIG_TMPFS
define_bool CONFIG_RAMFS y
+if [ "$CONFIG_HUGETLB_PAGE" = "y" ] ; then
+ bool 'HugeTLB file system support' CONFIG_HUGETLBFS
+ define_bool CONFIG_HUGETLBFS y
+fi
+
tristate 'ISO 9660 CDROM file system support' CONFIG_ISO9660_FS
dep_mbool ' Microsoft Joliet CDROM extensions' CONFIG_JOLIET $CONFIG_ISO9660_FS
dep_mbool ' Transparent decompression extension' CONFIG_ZISOFS $CONFIG_ISO9660_FS
diff --minimal -Nru a/fs/Makefile b/fs/Makefile
--- a/fs/Makefile Mon Sep 30 13:19:13 2002
+++ b/fs/Makefile Mon Sep 30 13:19:13 2002
@@ -46,6 +46,7 @@
obj-$(CONFIG_EXT2_FS) += ext2/
obj-$(CONFIG_CRAMFS) += cramfs/
obj-$(CONFIG_RAMFS) += ramfs/
+obj-$(CONFIG_HUGETLBFS) += hugetlbfs/
obj-$(CONFIG_CODA_FS) += coda/
obj-$(CONFIG_INTERMEZZO_FS) += intermezzo/
obj-$(CONFIG_MINIX_FS) += minix/
diff --minimal -Nru a/fs/hugetlbfs/Makefile b/fs/hugetlbfs/Makefile
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/fs/hugetlbfs/Makefile Mon Sep 30 13:19:13 2002
@@ -0,0 +1,9 @@
+#
+# Makefile for the linux ramfs routines.
+#
+
+obj-$(CONFIG_HUGETLBFS) += hugetlbfs.o
+
+hugetlbfs-objs := inode.o
+
+include $(TOPDIR)/Rules.make
diff --minimal -Nru a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/fs/hugetlbfs/inode.c Mon Sep 30 13:19:13 2002
@@ -0,0 +1,419 @@
+/*
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ * 2000 Transmeta Corp.
+ *
+ * Usage limits added by David Gibson, Linuxcare Australia.
+ * This file is released under the GPL.
+ */
+
+/*
+ * NOTE! This filesystem is probably most useful
+ * not as a real filesystem, but as an example of
+ * how virtual filesystems can be written.
+ *
+ * It doesn't get much simpler than this. Consider
+ * that this file implements the full semantics of
+ * a POSIX-compliant read-write filesystem.
+ *
+ * Note in particular how the filesystem does not
+ * need to implement any data structures of its own
+ * to keep track of the virtual data: using the VFS
+ * caches is sufficient.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+
+#include <asm/uaccess.h>
+
+/* some random number */
+#define HUGETLBFS_MAGIC 0x958458f6
+
+static struct super_operations hugetlbfs_ops;
+static struct address_space_operations hugetlbfs_aops;
+struct file_operations hugetlbfs_file_operations;
+static struct inode_operations hugetlbfs_dir_inode_operations;
+
+static struct backing_dev_info hugetlbfs_backing_dev_info = {
+ .ra_pages = 0, /* No readahead */
+ .memory_backed = 1, /* Does not contribute to dirty memory */
+};
+
+static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *inode =file->f_dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ int ret;
+
+ down(&inode->i_sem);
+
+ UPDATE_ATIME(inode);
+ vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
+ vma->vm_ops = &hugetlb_vm_ops;
+ ret = hugetlb_prefault(mapping, vma);
+
+ up(&inode->i_sem);
+
+ return ret;
+}
+
+/*
+ * Read a page. Again trivial. If it didn't already exist
+ * in the page cache, it is zero-filled.
+ */
+static int hugetlbfs_readpage(struct file *file, struct page * page)
+{
+ return -EINVAL;
+}
+
+static int hugetlbfs_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+{
+ return -EINVAL;
+}
+
+static int hugetlbfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+{
+ return -EINVAL;
+}
+
+struct inode *hugetlbfs_get_inode(struct super_block *sb, int mode, int dev)
+{
+ struct inode * inode = new_inode(sb);
+
+ if (inode) {
+ inode->i_mode = mode;
+ inode->i_uid = current->fsuid;
+ inode->i_gid = current->fsgid;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_blocks = 0;
+ inode->i_rdev = NODEV;
+ inode->i_mapping->a_ops = &hugetlbfs_aops;
+ inode->i_mapping->backing_dev_info = &hugetlbfs_backing_dev_info;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ switch (mode & S_IFMT) {
+ default:
+ init_special_inode(inode, mode, dev);
+ break;
+ case S_IFREG:
+ inode->i_fop = &hugetlbfs_file_operations;
+ break;
+ case S_IFDIR:
+ inode->i_op = &hugetlbfs_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inode->i_nlink++;
+ break;
+ case S_IFLNK:
+ inode->i_op = &page_symlink_inode_operations;
+ break;
+ }
+ }
+ return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done..
+ */
+/* SMP-safe */
+static int hugetlbfs_mknod(struct inode *dir, struct dentry *dentry, int mode, int dev)
+{
+ struct inode * inode = hugetlbfs_get_inode(dir->i_sb, mode, dev);
+ int error = -ENOSPC;
+
+ if (inode) {
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
+ error = 0;
+ }
+ return error;
+}
+
+static int hugetlbfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+{
+ int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+ if (!retval)
+ dir->i_nlink++;
+ return retval;
+}
+
+static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode)
+{
+ return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+/*
+ * Link a file..
+ */
+static int hugetlbfs_link(struct dentry *old_dentry, struct inode * dir, struct dentry * dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+
+ inode->i_nlink++;
+ atomic_inc(&inode->i_count); /* New dentry reference */
+ dget(dentry); /* Extra pinning count for the created dentry */
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+static inline int hugetlbfs_positive(struct dentry *dentry)
+{
+ return dentry->d_inode && !d_unhashed(dentry);
+}
+
+/*
+ * Check that a directory is empty (this works
+ * for regular files too, they'll just always be
+ * considered empty..).
+ *
+ * Note that an empty directory can still have
+ * children, they just all have to be negative..
+ */
+static int hugetlbfs_empty(struct dentry *dentry)
+{
+ struct list_head *list;
+
+ spin_lock(&dcache_lock);
+ list = dentry->d_subdirs.next;
+
+ while (list != &dentry->d_subdirs) {
+ struct dentry *de = list_entry(list, struct dentry, d_child);
+
+ if (hugetlbfs_positive(de)) {
+ spin_unlock(&dcache_lock);
+ return 0;
+ }
+ list = list->next;
+ }
+ spin_unlock(&dcache_lock);
+ return 1;
+}
+
+/*
+ * Unlink a hugetlbfs entry
+ */
+static int hugetlbfs_unlink(struct inode * dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ inode->i_nlink--;
+ dput(dentry); /* Undo the count from "create" - this does all the work */
+ return 0;
+}
+
+static int hugetlbfs_rmdir(struct inode * dir, struct dentry *dentry)
+{
+ int retval = -ENOTEMPTY;
+
+ if (hugetlbfs_empty(dentry)) {
+ dentry->d_inode->i_nlink--;
+ hugetlbfs_unlink(dir, dentry);
+ dir->i_nlink--;
+ retval = 0;
+ }
+ return retval;
+}
+
+/*
+ * The VFS layer already does all the dentry stuff for rename,
+ * we just have to decrement the usage count for the target if
+ * it exists so that the VFS layer correctly free's it when it
+ * gets overwritten.
+ */
+static int hugetlbfs_rename(struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir,struct dentry *new_dentry)
+{
+ int error = -ENOTEMPTY;
+
+ if (hugetlbfs_empty(new_dentry)) {
+ struct inode *inode = new_dentry->d_inode;
+ if (inode) {
+ inode->i_nlink--;
+ dput(new_dentry);
+ }
+ if (S_ISDIR(old_dentry->d_inode->i_mode)) {
+ old_dir->i_nlink--;
+ new_dir->i_nlink++;
+ }
+ error = 0;
+ }
+ return error;
+}
+
+static int hugetlbfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+{
+ struct inode *inode;
+ int error = -ENOSPC;
+
+ inode = hugetlbfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+ if (inode) {
+ int l = strlen(symname)+1;
+ error = page_symlink(inode, symname, l);
+ if (!error) {
+ d_instantiate(dentry, inode);
+ dget(dentry);
+ } else
+ iput(inode);
+ }
+ return error;
+}
+
+static int hugetlbfs_sync_file(struct file * file, struct dentry *dentry, int datasync)
+{
+ return 0;
+}
+
+static struct address_space_operations hugetlbfs_aops = {
+ readpage: hugetlbfs_readpage,
+ writepage: fail_writepage,
+ prepare_write: hugetlbfs_prepare_write,
+ commit_write: hugetlbfs_commit_write
+};
+
+struct file_operations hugetlbfs_file_operations = {
+ read: generic_file_read,
+ write: generic_file_write,
+ mmap: hugetlbfs_file_mmap,
+ fsync: hugetlbfs_sync_file,
+ sendfile: generic_file_sendfile,
+};
+
+static struct inode_operations hugetlbfs_dir_inode_operations = {
+ create: hugetlbfs_create,
+ lookup: simple_lookup,
+ link: hugetlbfs_link,
+ unlink: hugetlbfs_unlink,
+ symlink: hugetlbfs_symlink,
+ mkdir: hugetlbfs_mkdir,
+ rmdir: hugetlbfs_rmdir,
+ mknod: hugetlbfs_mknod,
+ rename: hugetlbfs_rename,
+};
+
+static struct super_operations hugetlbfs_ops = {
+ statfs: simple_statfs,
+ drop_inode: generic_delete_inode,
+};
+
+static int hugetlbfs_fill_super(struct super_block * sb, void * data, int silent)
+{
+ struct inode * inode;
+ struct dentry * root;
+
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = HUGETLBFS_MAGIC;
+ sb->s_op = &hugetlbfs_ops;
+ inode = hugetlbfs_get_inode(sb, S_IFDIR | 0755, 0);
+ if (!inode)
+ return -ENOMEM;
+
+ root = d_alloc_root(inode);
+ if (!root) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ sb->s_root = root;
+ return 0;
+}
+
+static struct super_block *hugetlbfs_get_sb(struct file_system_type *fs_type,
+ int flags, char *dev_name, void *data)
+{
+ return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+}
+
+static struct file_system_type hugetlbfs_fs_type = {
+ name: "hugetlbfs",
+ get_sb: hugetlbfs_get_sb,
+ kill_sb: kill_litter_super,
+};
+
+static struct vfsmount *hugetlbfs_vfsmount;
+
+static atomic_t hugetlbfs_counter = ATOMIC_INIT(0);
+
+struct file *hugetlb_zero_setup(size_t size)
+{
+ int error, n;
+ struct file *file;
+ struct inode *inode;
+ struct dentry *dentry, *root;
+ struct qstr quick_string;
+ char buf[16];
+
+ n = atomic_read(&hugetlbfs_counter);
+ atomic_inc(&hugetlbfs_counter);
+
+ root = hugetlbfs_vfsmount->mnt_root;
+ snprintf(buf, 16, "%d", n);
+ quick_string.name = buf;
+ quick_string.len = strlen(quick_string.name);
+ quick_string.hash = 0;
+ dentry = d_alloc(root, &quick_string);
+ if (!dentry)
+ return ERR_PTR(-ENOMEM);
+
+ error = -ENFILE;
+ file = get_empty_filp();
+ if (!file)
+ goto out_dentry;
+
+ error = -ENOSPC;
+ inode = hugetlbfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+ if (!inode)
+ goto out_file;
+
+ d_instantiate(dentry, inode);
+ inode->i_size = size;
+ inode->i_nlink = 0;
+ file->f_vfsmnt = mntget(hugetlbfs_vfsmount);
+ file->f_dentry = dentry;
+ file->f_op = &hugetlbfs_file_operations;
+ file->f_mode = FMODE_WRITE | FMODE_READ;
+ return file;
+
+out_file:
+ put_filp(file);
+out_dentry:
+ dput(dentry);
+ return ERR_PTR(error);
+}
+
+static int __init init_hugetlbfs_fs(void)
+{
+ int error;
+ struct vfsmount *vfsmount;
+
+ error = register_filesystem(&hugetlbfs_fs_type);
+ if (error)
+ return error;
+
+ vfsmount = kern_mount(&hugetlbfs_fs_type);
+
+ if (!IS_ERR(vfsmount)) {
+ hugetlbfs_vfsmount = vfsmount;
+ return 0;
+ }
+
+ error = PTR_ERR(vfsmount);
+ return error;
+}
+
+static void __exit exit_hugetlbfs_fs(void)
+{
+ unregister_filesystem(&hugetlbfs_fs_type);
+}
+
+module_init(init_hugetlbfs_fs)
+module_exit(exit_hugetlbfs_fs)
+
+MODULE_LICENSE("GPL");
diff --minimal -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h Mon Sep 30 13:19:13 2002
+++ b/include/linux/mm.h Mon Sep 30 13:19:13 2002
@@ -386,7 +386,9 @@
extern int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
extern int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
extern int free_hugepages(struct vm_area_struct *);
-
+extern int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
+extern struct file *hugetlb_zero_setup(size_t);
+extern struct file_operations hugetlbfs_file_operations;
#else
#define is_vm_hugetlb_page(vma) (0)
#define follow_hugetlb_page(mm, vma, pages, vmas, start, len, i) (0)
@@ -394,6 +396,7 @@
#define free_hugepages(mpnt) do { } while(0)
#endif
+extern struct vm_operations_struct hugetlb_vm_ops;
/*
* If the mapping doesn't provide a set_page_dirty a_op, then
diff --minimal -Nru a/include/linux/shm.h b/include/linux/shm.h
--- a/include/linux/shm.h Mon Sep 30 13:19:13 2002
+++ b/include/linux/shm.h Mon Sep 30 13:19:13 2002
@@ -88,6 +88,7 @@
/* shm_mode upper byte flags */
#define SHM_DEST 01000 /* segment will be destroyed on last detach */
#define SHM_LOCKED 02000 /* segment will not be swapped */
+#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
asmlinkage long sys_shmget (key_t key, size_t size, int flag);
asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, unsigned long *addr);
diff --minimal -Nru a/ipc/shm.c b/ipc/shm.c
--- a/ipc/shm.c Mon Sep 30 13:19:13 2002
+++ b/ipc/shm.c Mon Sep 30 13:19:13 2002
@@ -185,8 +185,12 @@
shp->shm_perm.key = key;
shp->shm_flags = (shmflg & S_IRWXUGO);
- sprintf (name, "SYSV%08x", key);
- file = shmem_file_setup(name, size, VM_ACCOUNT);
+ if (shmflg & SHM_HUGETLB)
+ file = hugetlb_zero_setup(size);
+ else {
+ sprintf (name, "SYSV%08x", key);
+ file = shmem_file_setup(name, size, VM_ACCOUNT);
+ }
error = PTR_ERR(file);
if (IS_ERR(file))
goto no_file;
@@ -205,7 +209,10 @@
shp->id = shm_buildid(id,shp->shm_perm.seq);
shp->shm_file = file;
file->f_dentry->d_inode->i_ino = shp->id;
- file->f_op = &shm_file_operations;
+ if (shmflg & SHM_HUGETLB)
+ file->f_op = &hugetlbfs_file_operations;
+ else
+ file->f_op = &shm_file_operations;
shm_tot += numpages;
shm_unlock (id);
return shp->id;
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/
^ permalink raw reply [flat|nested] 4+ messages in thread