Commit 2fe17c10 authored by Christoph Hellwig's avatar Christoph Hellwig Committed by Al Viro

fallocate should be a file operation

Currently all filesystems except XFS implement fallocate asynchronously,
while XFS forced a commit.  Both of these are suboptimal - in case of O_SYNC
I/O we really want our allocation on disk, especially for the !KEEP_SIZE
case where we actually grow the file with user-visible zeroes.  On the
other hand always commiting the transaction is a bad idea for fast-path
uses of fallocate like for example in recent Samba versions.   Given
that block allocation is a data plane operation anyway change it from
an inode operation to a file operation so that we have the file structure
available that lets us check for O_SYNC.

This also includes moving the code around for a few of the filesystems,
and remove the already unnedded S_ISDIR checks given that we only wire
up fallocate for regular files.
Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
parent 64c23e86
......@@ -60,7 +60,6 @@ ata *);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
void (*truncate_range)(struct inode *, loff_t, loff_t);
long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
locking rules:
......@@ -88,7 +87,6 @@ getxattr: no
listxattr: no
removexattr: yes
truncate_range: yes
fallocate: no
fiemap: no
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
victim.
......@@ -437,6 +435,7 @@ prototypes:
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **);
long (*fallocate)(struct file *, int, loff_t, loff_t);
};
locking rules:
......
......@@ -24,6 +24,7 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
......@@ -1237,6 +1238,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
return 0;
}
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct extent_state *cached_state = NULL;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em;
int ret;
alloc_start = offset & ~mask;
alloc_end = (offset + len + mask) & ~mask;
/* We only support the FALLOC_FL_KEEP_SIZE mode */
if (mode & ~FALLOC_FL_KEEP_SIZE)
return -EOPNOTSUPP;
/*
* wait for ordered IO before we have any locks. We'll loop again
* below with the locks held.
*/
btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
if (ret)
goto out;
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, alloc_start);
if (ret)
goto out;
}
ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
if (ret)
goto out;
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
/* the extent lock is ordered inside the running
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, 0, &cached_state, GFP_NOFS);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
ordered->file_offset + ordered->len > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
alloc_start, locked_end,
&cached_state, GFP_NOFS);
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
*/
btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
} else {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
}
}
cur_offset = alloc_start;
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
BUG_ON(IS_ERR(em) || !em);
last_byte = min(extent_map_end(em), alloc_end);
last_byte = (last_byte + mask) & ~mask;
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
last_byte - cur_offset,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
if (ret < 0) {
free_extent_map(em);
break;
}
}
free_extent_map(em);
cur_offset = last_byte;
if (cur_offset >= alloc_end) {
ret = 0;
break;
}
}
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
}
const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
......@@ -1248,6 +1360,7 @@ const struct file_operations btrfs_file_operations = {
.open = generic_file_open,
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
.fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
......
......@@ -7098,116 +7098,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
static long btrfs_fallocate(struct inode *inode, int mode,
loff_t offset, loff_t len)
{
struct extent_state *cached_state = NULL;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em;
int ret;
alloc_start = offset & ~mask;
alloc_end = (offset + len + mask) & ~mask;
/* We only support the FALLOC_FL_KEEP_SIZE mode */
if (mode & ~FALLOC_FL_KEEP_SIZE)
return -EOPNOTSUPP;
/*
* wait for ordered IO before we have any locks. We'll loop again
* below with the locks held.
*/
btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
if (ret)
goto out;
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, alloc_start);
if (ret)
goto out;
}
ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
if (ret)
goto out;
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
/* the extent lock is ordered inside the running
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, 0, &cached_state, GFP_NOFS);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
ordered->file_offset + ordered->len > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
alloc_start, locked_end,
&cached_state, GFP_NOFS);
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
*/
btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
} else {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
}
}
cur_offset = alloc_start;
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
BUG_ON(IS_ERR(em) || !em);
last_byte = min(extent_map_end(em), alloc_end);
last_byte = (last_byte + mask) & ~mask;
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
last_byte - cur_offset,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
if (ret < 0) {
free_extent_map(em);
break;
}
}
free_extent_map(em);
cur_offset = last_byte;
if (cur_offset >= alloc_end) {
ret = 0;
break;
}
}
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
}
static int btrfs_set_page_dirty(struct page *page)
{
return __set_page_dirty_nobuffers(page);
......@@ -7310,7 +7200,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.fallocate = btrfs_fallocate,
.fiemap = btrfs_fiemap,
};
static const struct inode_operations btrfs_special_inode_operations = {
......
......@@ -2065,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
ssize_t len);
......
......@@ -3627,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
}
/*
* preallocate space for a file. This implements ext4's fallocate inode
* preallocate space for a file. This implements ext4's fallocate file
* operation, which gets called from sys_fallocate system call.
* For block-mapped files, posix_fallocate should fall back to the method
* of writing zeroes to the required new blocks (the same behavior which is
* expected for file systems which do not support fallocate() system call).
*/
long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file->f_path.dentry->d_inode;
handle_t *handle;
loff_t new_size;
unsigned int max_blocks;
......@@ -3655,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
/* preallocation to directories is currently not supported */
if (S_ISDIR(inode->i_mode))
return -ENODEV;
map.m_lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because
......
......@@ -210,6 +210,7 @@ const struct file_operations ext4_file_operations = {
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.fallocate = ext4_fallocate,
};
const struct inode_operations ext4_file_inode_operations = {
......@@ -223,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = {
.removexattr = generic_removexattr,
#endif
.check_acl = ext4_check_acl,
.fallocate = ext4_fallocate,
.fiemap = ext4_fiemap,
};
......@@ -19,6 +19,8 @@
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/ext2_fs.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <asm/uaccess.h>
......@@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
static void empty_write_end(struct page *page, unsigned from,
unsigned to)
{
struct gfs2_inode *ip = GFS2_I(page->mapping->host);
page_zero_new_buffers(page, from, to);
flush_dcache_page(page);
mark_page_accessed(page);
if (!gfs2_is_writeback(ip))
gfs2_page_add_databufs(ip, page, from, to);
block_commit_write(page, from, to);
}
static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
{
unsigned start, end, next;
struct buffer_head *bh, *head;
int error;
if (!page_has_buffers(page)) {
error = __block_write_begin(page, from, to - from, gfs2_block_map);
if (unlikely(error))
return error;
empty_write_end(page, from, to);
return 0;
}
bh = head = page_buffers(page);
next = end = 0;
while (next < from) {
next += bh->b_size;
bh = bh->b_this_page;
}
start = next;
do {
next += bh->b_size;
if (buffer_mapped(bh)) {
if (end) {
error = __block_write_begin(page, start, end - start,
gfs2_block_map);
if (unlikely(error))
return error;
empty_write_end(page, start, end);
end = 0;
}
start = next;
}
else
end = next;
bh = bh->b_this_page;
} while (next < to);
if (end) {
error = __block_write_begin(page, start, end - start, gfs2_block_map);
if (unlikely(error))
return error;
empty_write_end(page, start, end);
}
return 0;
}
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
int mode)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct buffer_head *dibh;
int error;
u64 start = offset >> PAGE_CACHE_SHIFT;
unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
pgoff_t curr;
struct page *page;
unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
unsigned int from, to;
if (!end_offset)
end_offset = PAGE_CACHE_SIZE;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (unlikely(error))
goto out;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
if (gfs2_is_stuffed(ip)) {
error = gfs2_unstuff_dinode(ip, NULL);
if (unlikely(error))
goto out;
}
curr = start;
offset = start << PAGE_CACHE_SHIFT;
from = start_offset;
to = PAGE_CACHE_SIZE;
while (curr <= end) {
page = grab_cache_page_write_begin(inode->i_mapping, curr,
AOP_FLAG_NOFS);
if (unlikely(!page)) {
error = -ENOMEM;
goto out;
}
if (curr == end)
to = end_offset;
error = write_empty_blocks(page, from, to);
if (!error && offset + to > inode->i_size &&
!(mode & FALLOC_FL_KEEP_SIZE)) {
i_size_write(inode, offset + to);
}
unlock_page(page);
page_cache_release(page);
if (error)
goto out;
curr++;
offset += PAGE_CACHE_SIZE;
from = 0;
}
gfs2_dinode_out(ip, dibh->b_data);
mark_inode_dirty(inode);
brelse(dibh);
out:
return error;
}
static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
unsigned int *data_blocks, unsigned int *ind_blocks)
{
const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
for (tmp = max_data; tmp > sdp->sd_diptrs;) {
tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
max_data -= tmp;
}
/* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
so it might end up with fewer data blocks */
if (max_data <= *data_blocks)
return;
*data_blocks = max_data;
*ind_blocks = max_blocks - max_data;
*len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
if (*len > max) {
*len = max;
gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
}
}
static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
loff_t bytes, max_bytes;
struct gfs2_alloc *al;
int error;
loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
/* We only support the FALLOC_FL_KEEP_SIZE mode */
if (mode & ~FALLOC_FL_KEEP_SIZE)
return -EOPNOTSUPP;
offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
sdp->sd_sb.sb_bsize_shift;
len = next - offset;
bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
if (!bytes)
bytes = UINT_MAX;
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
error = gfs2_glock_nq(&ip->i_gh);
if (unlikely(error))
goto out_uninit;
if (!gfs2_write_alloc_required(ip, offset, len))
goto out_unlock;
while (len > 0) {
if (len < bytes)
bytes = len;
al = gfs2_alloc_get(ip);
if (!al) {
error = -ENOMEM;
goto out_unlock;
}
error = gfs2_quota_lock_check(ip);
if (error)
goto out_alloc_put;
retry:
gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
al->al_requested = data_blocks + ind_blocks;
error = gfs2_inplace_reserve(ip);
if (error) {
if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
bytes >>= 1;
goto retry;
}
goto out_qunlock;
}
max_bytes = bytes;
calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
al->al_requested = data_blocks + ind_blocks;
rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
RES_RG_HDR + gfs2_rg_blocks(al);
if (gfs2_is_jdata(ip))
rblocks += data_blocks ? data_blocks : 1;
error = gfs2_trans_begin(sdp, rblocks,
PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
if (error)
goto out_trans_fail;
error = fallocate_chunk(inode, offset, max_bytes, mode);
gfs2_trans_end(sdp);
if (error)
goto out_trans_fail;
len -= max_bytes;
offset += max_bytes;
gfs2_inplace_release(ip);
gfs2_quota_unlock(ip);
gfs2_alloc_put(ip);
}
goto out_unlock;
out_trans_fail:
gfs2_inplace_release(ip);
out_qunlock:
gfs2_quota_unlock(ip);
out_alloc_put:
gfs2_alloc_put(ip);
out_unlock:
gfs2_glock_dq(&ip->i_gh);
out_uninit:
gfs2_holder_uninit(&ip->i_gh);
return error;
}
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
/**
......@@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.setlease = gfs2_setlease,
.fallocate = gfs2_fallocate,
};
const struct file_operations gfs2_dir_fops = {
......@@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
};