Commit ceffc078 authored by Carsten Otte, committed by Linus Torvalds

[PATCH] xip: fs/mm: execute in place



- generic_file* file operations no longer have an xip/non-xip split
- filemap_xip.c implements a new set of fops that require the get_xip_page
  aop to work properly. All new fops are exported GPL-only (we don't want
  anything but GPL modules using them); a wiring sketch follows below
- __xip_unmap now uses page_check_address, which is no longer static
  in rmap.c and is now declared in linux/rmap.h
- mm/filemap.h is now much cleaner, containing just Linus' inline
  functions moved over from filemap.c
- fix includes in filemap_xip.c so it builds cleanly on i386
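
For illustration, a filesystem that implements get_xip_page could wire the
new fops into its file_operations roughly like this (a minimal sketch, not
part of this patch; the myfs_* name is hypothetical):

	struct file_operations myfs_xip_file_operations = {
		.llseek		= generic_file_llseek,
		.read		= do_sync_read,		/* routes to .aio_read */
		.write		= do_sync_write,	/* routes to .aio_write */
		.aio_read	= xip_file_aio_read,
		.aio_write	= xip_file_aio_write,
		.readv		= xip_file_readv,
		.writev		= xip_file_writev,
		.mmap		= xip_file_mmap,
		.sendfile	= xip_file_sendfile,
	};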
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 420edbcc
fs/open.c
@@ -808,7 +808,9 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
/* NB: we're sure to have correct a_ops only after f_op->open */
if (f->f_flags & O_DIRECT) {
-		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) {
+		if (!f->f_mapping->a_ops ||
+		    ((!f->f_mapping->a_ops->direct_IO) &&
+		     (!f->f_mapping->a_ops->get_xip_page))) {
fput(f);
f = ERR_PTR(-EINVAL);
}
...
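
The userspace-visible effect of the fs/open.c change above: an O_DIRECT
open no longer fails with -EINVAL on a filesystem that provides
get_xip_page instead of direct_IO. A minimal demonstration (the path
/mnt/xip/file is hypothetical):

	#define _GNU_SOURCE		/* for O_DIRECT */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* pre-patch: -EINVAL unless the fs implements direct_IO */
		int fd = open("/mnt/xip/file", O_RDONLY | O_DIRECT);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		puts("O_DIRECT open accepted");
		close(fd);
		return 0;
	}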
include/linux/fs.h
@@ -330,6 +330,8 @@ struct address_space_operations {
int (*releasepage) (struct page *, int);
ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
+	struct page* (*get_xip_page)(struct address_space *, sector_t,
+			int);
};
struct backing_dev_info;
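
For reference, a conceivable get_xip_page implementation for a directly
addressable backing store (a hedged sketch, not part of this patch;
myfs_block_address is a made-up helper). The callers in mm/filemap_xip.c
pass pgoff * (PAGE_SIZE/512) as the sector and expect an up-to-date page,
or ERR_PTR(-ENODATA) for an unallocated block when create is 0:

	static struct page *
	myfs_get_xip_page(struct address_space *mapping, sector_t sector,
			  int create)
	{
		unsigned long blockno = sector / (PAGE_SIZE / 512);
		void *kaddr;

		/* hypothetical helper: kernel address of the block,
		 * allocating it when create != 0 */
		kaddr = myfs_block_address(mapping->host, blockno, create);
		if (!kaddr)
			return ERR_PTR(create ? -ENOSPC : -ENODATA);
		return virt_to_page(kaddr);	/* page must be PageUptodate */
	}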
@@ -1497,6 +1499,22 @@ extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
+#ifdef CONFIG_FS_XIP
+extern ssize_t xip_file_aio_read(struct kiocb *iocb, char __user *buf,
+				 size_t count, loff_t pos);
+extern ssize_t xip_file_readv(struct file *filp, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t *ppos);
+extern ssize_t xip_file_sendfile(struct file *in_file, loff_t *ppos,
+				 size_t count, read_actor_t actor,
+				 void *target);
+extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
+extern ssize_t xip_file_aio_write(struct kiocb *iocb, const char __user *buf,
+				  size_t count, loff_t pos);
+extern ssize_t xip_file_writev(struct file *file, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t *ppos);
+extern int xip_truncate_page(struct address_space *mapping, loff_t from);
+#endif
static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
read_descriptor_t * desc,
read_actor_t actor)
...
include/linux/rmap.h
@@ -92,6 +92,12 @@ static inline void page_dup_rmap(struct page *page)
int page_referenced(struct page *, int is_locked, int ignore_token);
int try_to_unmap(struct page *);
+/*
+ * Called from mm/filemap_xip.c to unmap empty zero page
+ */
+pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long);
/*
* Used by swapoff to help locate where page is expected in vma.
*/
...
mm/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_FS_XIP) += filemap_xip.o
mm/filemap.c
@@ -28,6 +28,7 @@
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include "filemap.h"
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
@@ -1714,32 +1715,7 @@ int remove_suid(struct dentry *dentry)
}
EXPORT_SYMBOL(remove_suid);
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then clear the page
- * out to (offset+bytes) and return the number of bytes which were copied.
- */
-static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
-			const char __user *buf, unsigned bytes)
-{
-	char *kaddr;
-	int left;
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
-	kunmap_atomic(kaddr, KM_USER0);
-
-	if (left != 0) {
-		/* Do it the slow way */
-		kaddr = kmap(page);
-		left = __copy_from_user(kaddr + offset, buf, bytes);
-		kunmap(page);
-	}
-	return bytes - left;
-}
-
-static size_t
+size_t
__filemap_copy_from_user_iovec(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
{
@@ -1766,52 +1742,6 @@ __filemap_copy_from_user_iovec(char *vaddr,
return copied - left;
}
-/*
- * This has the same sideeffects and return value as filemap_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
- * single-segment behaviour.
- */
-static inline size_t
-filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
-			const struct iovec *iov, size_t base, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-						base, bytes);
-	kunmap_atomic(kaddr, KM_USER0);
-	if (copied != bytes) {
-		kaddr = kmap(page);
-		copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-							base, bytes);
-		kunmap(page);
-	}
-	return copied;
-}
-
-static inline void
-filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
-{
-	const struct iovec *iov = *iovp;
-	size_t base = *basep;
-
-	while (bytes) {
-		int copy = min(bytes, iov->iov_len - base);
-
-		bytes -= copy;
-		base += copy;
-		if (iov->iov_len == base) {
-			iov++;
-			base = 0;
-		}
-	}
-	*iovp = iov;
-	*basep = base;
-}
/*
* Performs necessary checks before doing a write
*
...

mm/filemap.h (new file)
/*
* linux/mm/filemap.h
*
* Copyright (C) 1994-1999 Linus Torvalds
*/
#ifndef __FILEMAP_H
#define __FILEMAP_H
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/uio.h>
#include <linux/config.h>
#include <asm/uaccess.h>
extern size_t
__filemap_copy_from_user_iovec(char *vaddr,
const struct iovec *iov,
size_t base,
size_t bytes);
/*
* Copy as much as we can into the page and return the number of bytes which
 * were successfully copied.  If a fault is encountered then clear the page
* out to (offset+bytes) and return the number of bytes which were copied.
*/
static inline size_t
filemap_copy_from_user(struct page *page, unsigned long offset,
const char __user *buf, unsigned bytes)
{
char *kaddr;
int left;
kaddr = kmap_atomic(page, KM_USER0);
left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
kunmap_atomic(kaddr, KM_USER0);
if (left != 0) {
/* Do it the slow way */
kaddr = kmap(page);
left = __copy_from_user(kaddr + offset, buf, bytes);
kunmap(page);
}
return bytes - left;
}
/*
 * This has the same side effects and return value as filemap_copy_from_user().
* The difference is that on a fault we need to memset the remainder of the
* page (out to offset+bytes), to emulate filemap_copy_from_user()'s
* single-segment behaviour.
*/
static inline size_t
filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
const struct iovec *iov, size_t base, size_t bytes)
{
char *kaddr;
size_t copied;
kaddr = kmap_atomic(page, KM_USER0);
copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
base, bytes);
kunmap_atomic(kaddr, KM_USER0);
if (copied != bytes) {
kaddr = kmap(page);
copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
base, bytes);
kunmap(page);
}
return copied;
}
static inline void
filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
const struct iovec *iov = *iovp;
size_t base = *basep;
while (bytes) {
int copy = min(bytes, iov->iov_len - base);
bytes -= copy;
base += copy;
if (iov->iov_len == base) {
iov++;
base = 0;
}
}
*iovp = iov;
*basep = base;
}
#endif
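
The iovec bookkeeping above is easiest to see in isolation. A userspace
sketch (not kernel code) of the advance logic in filemap_set_next_iovec():
after a partial copy of `bytes' bytes, *iovp and *basep are moved forward
so the next copy resumes in the middle of the array:

	#include <stdio.h>
	#include <sys/uio.h>

	static void set_next_iovec(const struct iovec **iovp, size_t *basep,
				   size_t bytes)
	{
		const struct iovec *iov = *iovp;
		size_t base = *basep;

		while (bytes) {
			size_t copy = bytes < iov->iov_len - base ?
				      bytes : iov->iov_len - base;

			bytes -= copy;
			base += copy;
			if (iov->iov_len == base) {
				iov++;
				base = 0;
			}
		}
		*iovp = iov;
		*basep = base;
	}

	int main(void)
	{
		char a[4], b[8];
		struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
		const struct iovec *cur = vec;
		size_t base = 0;

		set_next_iovec(&cur, &base, 6);	/* eats all of a[], 2 bytes of b[] */
		printf("segment %ld, offset %zu\n", (long)(cur - vec), base);
		return 0;	/* prints: segment 1, offset 2 */
	}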
mm/filemap_xip.c (new file)

/*
* linux/mm/filemap_xip.c
*
* Copyright (C) 2005 IBM Corporation
* Author: Carsten Otte <cotte@de.ibm.com>
*
* derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
*
*/
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/uio.h>
#include <linux/rmap.h>
#include <asm/tlbflush.h>
#include "filemap.h"
/*
* This is a file read routine for execute in place files, and uses
* the mapping->a_ops->get_xip_page() function for the actual low-level
* stuff.
*
* Note the struct file* is not used at all. It may be NULL.
*/
static void
do_xip_mapping_read(struct address_space *mapping,
struct file_ra_state *_ra,
struct file *filp,
loff_t *ppos,
read_descriptor_t *desc,
read_actor_t actor)
{
struct inode *inode = mapping->host;
unsigned long index, end_index, offset;
loff_t isize;
BUG_ON(!mapping->a_ops->get_xip_page);
index = *ppos >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
isize = i_size_read(inode);
if (!isize)
goto out;
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
unsigned long nr, ret;
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
if (index >= end_index) {
if (index > end_index)
goto out;
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (nr <= offset) {
goto out;
}
}
nr = nr - offset;
page = mapping->a_ops->get_xip_page(mapping,
index*(PAGE_SIZE/512), 0);
if (!page)
goto no_xip_page;
if (unlikely(IS_ERR(page))) {
if (PTR_ERR(page) == -ENODATA) {
/* sparse */
page = virt_to_page(empty_zero_page);
} else {
desc->error = PTR_ERR(page);
goto out;
}
} else
BUG_ON(!PageUptodate(page));
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
ret = actor(desc, page, offset, nr);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
if (ret == nr && desc->count)
continue;
goto out;
no_xip_page:
/* Did not get the page. Report it */
desc->error = -EIO;
goto out;
}
out:
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
if (filp)
file_accessed(filp);
}
/*
* This is the "read()" routine for all filesystems
 * that use the get_xip_page address space operation.
*/
static ssize_t
__xip_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
unsigned long seg;
size_t count;
count = 0;
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
count += iv->iov_len;
if (unlikely((ssize_t)(count|iv->iov_len) < 0))
return -EINVAL;
if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
continue;
if (seg == 0)
return -EFAULT;
nr_segs = seg;
count -= iv->iov_len; /* This segment is no good */
break;
}
retval = 0;
if (count) {
for (seg = 0; seg < nr_segs; seg++) {
read_descriptor_t desc;
desc.written = 0;
desc.arg.buf = iov[seg].iov_base;
desc.count = iov[seg].iov_len;
if (desc.count == 0)
continue;
desc.error = 0;
do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
ppos, &desc, file_read_actor);
retval += desc.written;
if (!retval) {
retval = desc.error;
break;
}
}
}
return retval;
}
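
The (count | iov_len) sign test above is a compact overflow check: casting
the OR of the running total and the current segment length to ssize_t goes
negative if either value has its top bit set, catching both a single huge
segment and a cumulative wrap in one comparison. A userspace illustration
(not kernel code):

	#include <stdio.h>
	#include <sys/types.h>

	int main(void)
	{
		size_t huge = (size_t)-1 / 2 + 1;	/* top bit set */
		size_t lens[2] = { huge, huge };
		size_t count = 0;
		int i;

		for (i = 0; i < 2; i++) {
			count += lens[i];
			if ((ssize_t)(count | lens[i]) < 0) {
				printf("segment %d rejected: -EINVAL\n", i);
				return 0;
			}
		}
		puts("accepted");
		return 0;
	}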
ssize_t
xip_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count,
loff_t pos)
{
struct iovec local_iov = { .iov_base = buf, .iov_len = count };
BUG_ON(iocb->ki_pos != pos);
return __xip_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
}
EXPORT_SYMBOL_GPL(xip_file_aio_read);
ssize_t
xip_file_readv(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct kiocb kiocb;
init_sync_kiocb(&kiocb, filp);
return __xip_file_aio_read(&kiocb, iov, nr_segs, ppos);
}
EXPORT_SYMBOL_GPL(xip_file_readv);
ssize_t
xip_file_sendfile(struct file *in_file, loff_t *ppos,
size_t count, read_actor_t actor, void *target)
{
read_descriptor_t desc;
if (!count)
return 0;
desc.written = 0;
desc.count = count;
desc.arg.data = target;
desc.error = 0;
do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
ppos, &desc, actor);
if (desc.written)
return desc.written;
return desc.error;
}
EXPORT_SYMBOL_GPL(xip_file_sendfile);
/*
* __xip_unmap is invoked from xip_unmap and
* xip_write
*
* This function walks all vmas of the address_space and unmaps the
* empty_zero_page when found at pgoff. Should it go in rmap.c?
*/
static void
__xip_unmap (struct address_space * mapping,
unsigned long pgoff)
{
struct vm_area_struct *vma;
struct mm_struct *mm;
struct prio_tree_iter iter;
unsigned long address;
pte_t *pte;
pte_t pteval;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
/*
* We need the page_table_lock to protect us from page faults,
* munmap, fork, etc...
*/
pte = page_check_address(virt_to_page(empty_zero_page), mm,
address);
if (!IS_ERR(pte)) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
BUG_ON(pte_dirty(pteval));
pte_unmap(pte);
spin_unlock(&mm->page_table_lock);
}
}
spin_unlock(&mapping->i_mmap_lock);
}
/*
 * xip_file_nopage() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
* This function is derived from filemap_nopage, but used for execute in place
*/
static struct page *
xip_file_nopage(struct vm_area_struct * area,
unsigned long address,
int *type)
{
struct file *file = area->vm_file;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct page *page;
unsigned long size, pgoff, endoff;
pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
+ area->vm_pgoff;
endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
+ area->vm_pgoff;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (pgoff >= size) {
return NULL;
}
page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
if (!IS_ERR(page)) {
BUG_ON(!PageUptodate(page));
return page;
}
if (PTR_ERR(page) != -ENODATA)
return NULL;
/* sparse block */
if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
(area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
(!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
/* maybe shared writable, allocate new block */
page = mapping->a_ops->get_xip_page (mapping,
pgoff*(PAGE_SIZE/512), 1);
if (IS_ERR(page))
return NULL;
BUG_ON(!PageUptodate(page));
/* unmap page at pgoff from all other vmas */
__xip_unmap(mapping, pgoff);
} else {
/* not shared and writable, use empty_zero_page */
page = virt_to_page(empty_zero_page);
}
return page;
}
static struct vm_operations_struct xip_file_vm_ops = {
.nopage = xip_file_nopage,
};
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
{
BUG_ON(!file->f_mapping->a_ops->get_xip_page);
file_accessed(file);
vma->vm_ops = &xip_file_vm_ops;
return 0;
}
EXPORT_SYMBOL_GPL(xip_file_mmap);
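
From userspace, nothing changes in the API: mapping a file on an
XIP-capable filesystem goes through the normal mmap() interface, and
xip_file_mmap just installs xip_file_vm_ops so faults are served straight
from the backing store via xip_file_nopage instead of the page cache. A
minimal example (the path /mnt/xip/file is hypothetical):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/xip/file", O_RDONLY);
		char *p;

		if (fd < 0)
			return 1;
		p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;
		printf("first byte: %c\n", p[0]);	/* fault -> xip_file_nopage */
		munmap(p, 4096);
		close(fd);
		return 0;
	}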
static ssize_t
do_xip_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, loff_t *ppos,
size_t count)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
struct address_space_operations *a_ops = mapping->a_ops;
struct inode *inode = mapping->host;
long status = 0;
struct page *page;
size_t bytes;
const struct iovec *cur_iov = iov; /* current iovec */
size_t iov_base = 0; /* offset in the current iovec */
char __user *buf;
ssize_t written = 0;
BUG_ON(!mapping->a_ops->get_xip_page);
buf = iov->iov_base;
do {
unsigned long index;
unsigned long offset;
size_t copied;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
index = pos >> PAGE_CACHE_SHIFT;
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count)
bytes = count;
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
fault_in_pages_readable(buf, bytes);
page = a_ops->get_xip_page(mapping,
index*(PAGE_SIZE/512), 0);
if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
/* sparse block: allocate a new page, then unmap the zero page */
page = a_ops->get_xip_page(mapping,