Commit 3ea7daa5 authored by NeilBrown's avatar NeilBrown

md/raid10: add reshape support

A 'near' or 'offset' layout RAID10 array can be reshaped to a different
'near' or 'offset' layout, a different chunk size, and a different
number of devices.
However the number of copies cannot change.

Unlike RAID5/6, we do not support having user-space backup data that
is being relocated during a 'critical section'.  Rather, the
data_offset of each device must change so that when writing any block
to a new location, it will not over-write any data that is still
'live'.

This means that RAID10 reshape is not supportable on v0.90 metadata.

The difference between the old data_offset and the new data_offset must be
at least the larger of the chunksize multiplied by offset copies of
each of the old and new layout. (for 'near' mode, offset_copies == 1).

A larger difference of around 64M seems useful for in-place reshapes
as more data can be moved between metadata updates.
Very large differences (e.g. 512M) seem to slow the process down due
to lots of long seeks (on oldish consumer-grade devices at least).

Metadata needs to be updated whenever the place we are about to write
to is considered - by the current metadata - to still contain data in
the old layout.

[unbalanced locking fix from Dan Carpenter <dan.carpenter@oracle.com>]
Signed-off-by: NeilBrown <neilb@suse.de>
parent deb200d0
......@@ -24,6 +24,7 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
......@@ -68,6 +69,11 @@ static int max_queued_requests = 1024;
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio, int error);
static void end_reshape(struct r10conf *conf);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
......@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
if (!r10_bio)
return NULL;
if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
nalloc = conf->copies; /* resync */
else
nalloc = 2; /* recovery */
......@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
struct bio *rbio = r10_bio->devs[j].repl_bio;
bio = r10_bio->devs[j].bio;
for (i = 0; i < RESYNC_PAGES; i++) {
if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
&conf->mddev->recovery)) {
/* we can share bv_page's during recovery */
if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
&conf->mddev->recovery)) {
/* we can share bv_page's during recovery
* and reshape */
struct bio *rbio = r10_bio->devs[0].bio;
page = rbio->bi_io_vec[i].bv_page;
get_page(page);
......@@ -614,10 +622,11 @@ static int raid10_mergeable_bvec(struct request_queue *q,
struct r10conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9;
struct geom *geo = &conf->geo;
chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
if (conf->reshape_progress != MaxSector &&
((sector >= conf->reshape_progress) !=
conf->mddev->reshape_backwards))
......@@ -1032,6 +1041,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
int plugged;
int sectors_handled;
int max_sectors;
int sectors;
if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bio);
......@@ -1096,10 +1106,41 @@ static void make_request(struct mddev *mddev, struct bio * bio)
*/
wait_barrier(conf);
sectors = bio->bi_size >> 9;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_sector < conf->reshape_progress &&
bio->bi_sector + sectors > conf->reshape_progress) {
/* IO spans the reshape position. Need to wait for
* reshape to pass
*/
allow_barrier(conf);
wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_sector ||
conf->reshape_progress >= bio->bi_sector + sectors);
wait_barrier(conf);
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio_data_dir(bio) == WRITE &&
(mddev->reshape_backwards
? (bio->bi_sector < conf->reshape_safe &&
bio->bi_sector + sectors > conf->reshape_progress)
: (bio->bi_sector + sectors > conf->reshape_safe &&
bio->bi_sector < conf->reshape_progress))) {
/* Need to update reshape_position in metadata */
mddev->reshape_position = conf->reshape_progress;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_CHANGE_PENDING, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
conf->reshape_safe = mddev->reshape_position;
}
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
r10_bio->sectors = bio->bi_size >> 9;
r10_bio->sectors = sectors;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_sector;
......@@ -1730,7 +1771,11 @@ static void end_sync_read(struct bio *bio, int error)
struct r10conf *conf = r10_bio->mddev->private;
int d;
d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
if (bio == r10_bio->master_bio) {
/* this is a reshape read */
d = r10_bio->read_slot; /* really the read dev */
} else
d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
set_bit(R10BIO_Uptodate, &r10_bio->state);
......@@ -2631,6 +2676,8 @@ static void raid10d(struct mddev *mddev)
if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
test_bit(R10BIO_WriteError, &r10_bio->state))
handle_write_completed(conf, r10_bio);
else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
reshape_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_IsSync, &r10_bio->state))
sync_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
......@@ -2723,7 +2770,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
skipped:
max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
/* If we aborted, we need to abort the
......@@ -2735,6 +2783,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
* we need to convert that to several
* virtual addresses.
*/
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
return 0;
}
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
......@@ -2766,6 +2819,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
*skipped = 1;
return sectors_skipped;
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
if (chunks_skipped >= conf->geo.raid_disks) {
/* if there has been nothing to do on any drive,
* then there is nothing to do at all..
......@@ -3211,7 +3268,8 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
struct r10conf *conf = mddev->private;
if (!raid_disks)
raid_disks = conf->geo.raid_disks;
raid_disks = min(conf->geo.raid_disks,
conf->prev.raid_disks);
if (!sectors)
sectors = conf->dev_sectors;
......@@ -3321,7 +3379,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf)
goto out;
conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
/* FIXME calc properly */
conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
max(0,mddev->delta_disks)),
GFP_KERNEL);
if (!conf->mirrors)
goto out;
......@@ -3338,9 +3398,21 @@ static struct r10conf *setup_conf(struct mddev *mddev)
goto out;
calc_sectors(conf, mddev->dev_sectors);
conf->prev = conf->geo;
conf->reshape_progress = MaxSector;
if (mddev->reshape_position == MaxSector) {
conf->prev = conf->geo;
conf->reshape_progress = MaxSector;
} else {
if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
err = -EINVAL;
goto out;
}
conf->reshape_progress = mddev->reshape_position;
if (conf->prev.far_offset)
conf->prev.stride = 1 << conf->prev.chunk_shift;
else
/* far_copies must be 1 */
conf->prev.stride = conf->dev_sectors;
}
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
......@@ -3355,8 +3427,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return conf;
out:
printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
mdname(mddev));
if (err == -ENOMEM)
printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
mdname(mddev));
if (conf) {
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
......@@ -3374,12 +3447,8 @@ static int run(struct mddev *mddev)
struct mirror_info *disk;
struct md_rdev *rdev;
sector_t size;
/*
* copy the already verified devices into our private RAID10
* bookkeeping area. [whatever we allocate in run(),
* should be freed in stop()]
*/
sector_t min_offset_diff = 0;
int first = 1;
if (mddev->private == NULL) {
conf = setup_conf(mddev);
......@@ -3403,6 +3472,7 @@ static int run(struct mddev *mddev)
(conf->geo.raid_disks / conf->geo.near_copies));
rdev_for_each(rdev, mddev) {
long long diff;
disk_idx = rdev->raid_disk;
if (disk_idx < 0)
......@@ -3421,12 +3491,20 @@ static int run(struct mddev *mddev)
goto out_free_conf;
disk->rdev = rdev;
}
diff = (rdev->new_data_offset - rdev->data_offset);
if (!mddev->reshape_backwards)
diff = -diff;
if (diff < 0)
diff = 0;
if (first || diff < min_offset_diff)
min_offset_diff = diff;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk->head_position = 0;
}
/* need to check that every block has at least one working mirror */
if (!enough(conf, -1)) {
printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
......@@ -3434,6 +3512,16 @@ static int run(struct mddev *mddev)
goto out_free_conf;
}
if (conf->reshape_progress != MaxSector) {
/* must ensure that shape change is supported */
if (conf->geo.far_copies != 1 &&
conf->geo.far_offset == 0)
goto out_free_conf;
if (conf->prev.far_copies != 1 &&
conf->geo.far_offset == 0)
goto out_free_conf;
}
mddev->degraded = 0;
for (i = 0;
i < conf->geo.raid_disks
......@@ -3486,8 +3574,8 @@ static int run(struct mddev *mddev)
int stripe = conf->geo.raid_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
stripe /= conf->geo.near_copies;
if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
mddev->queue->backing_dev_info.ra_pages = 2* stripe;
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
}
blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
......@@ -3495,6 +3583,30 @@ static int run(struct mddev *mddev)
if (md_integrity_register(mddev))
goto out_free_conf;
if (conf->reshape_progress != MaxSector) {
unsigned long before_length, after_length;
before_length = ((1 << conf->prev.chunk_shift) *
conf->prev.far_copies);
after_length = ((1 << conf->geo.chunk_shift) *
conf->geo.far_copies);
if (max(before_length, after_length) > min_offset_diff) {
/* This cannot work */
printk("md/raid10: offset difference not enough to continue reshape\n");
goto out_free_conf;
}
conf->offset_diff = min_offset_diff;
conf->reshape_safe = conf->reshape_progress;
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
}
return 0;
out_free_conf:
......@@ -3634,6 +3746,735 @@ static void *raid10_takeover(struct mddev *mddev)
return ERR_PTR(-EINVAL);
}
static int raid10_check_reshape(struct mddev *mddev)
{
/* Called when there is a request to change
* - layout (to ->new_layout)
* - chunk size (to ->new_chunk_sectors)
* - raid_disks (by delta_disks)
* or when trying to restart a reshape that was ongoing.
*
* We need to validate the request and possibly allocate
* space if that might be an issue later.
*
* Currently we reject any reshape of a 'far' mode array,
* allow chunk size to change if new is generally acceptable,
* allow raid_disks to increase, and allow
* a switch between 'near' mode and 'offset' mode.
*/
struct r10conf *conf = mddev->private;
struct geom geo;
if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
return -EINVAL;
if (setup_geo(&geo, mddev, geo_start) != conf->copies)
/* mustn't change number of copies */
return -EINVAL;
if (geo.far_copies > 1 && !geo.far_offset)
/* Cannot switch to 'far' mode */
return -EINVAL;
if (mddev->array_sectors & geo.chunk_mask)
/* not factor of array size */
return -EINVAL;
if (mddev->bitmap)
return -EBUSY;
if (!enough(conf, -1))
return -EINVAL;
kfree(conf->mirrors_new);
conf->mirrors_new = NULL;
if (mddev->delta_disks > 0) {
/* allocate new 'mirrors' list */
conf->mirrors_new = kzalloc(
sizeof(struct mirror_info)
*(mddev->raid_disks +
mddev->delta_disks),
GFP_KERNEL);
if (!conf->mirrors_new)
return -ENOMEM;
}
return 0;
}
/*
 * Whether the array has failed matters when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * With no reshape in progress this is a simple count.
 * During a reshape both the 'before' and 'after' sections must
 * be examined: a failed device may affect only one of them, and
 * a device that is not yet In_sync may nevertheless be in sync
 * in the section that is hit hardest by failed devices.
 *
 * Returns the larger of the two per-section degraded counts
 * (or just the 'prev' count when the number of devices is unchanged).
 */
static int calc_degraded(struct r10conf *conf)
{
	int before = 0;
	int after = 0;
	int slot;

	/* The 'prev' (before) section */
	rcu_read_lock();
	for (slot = 0; slot < conf->prev.raid_disks; slot++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[slot].rdev);

		/* Missing, faulty and not-in-sync devices all count.
		 * Once the number of devices in an array can be
		 * reduced, a not-in-sync device might not contribute
		 * to 'degraded' here.  It does now.
		 */
		if (!rdev ||
		    test_bit(Faulty, &rdev->flags) ||
		    !test_bit(In_sync, &rdev->flags))
			before++;
	}
	rcu_read_unlock();

	/* Same device count: there is only one section to consider */
	if (conf->geo.raid_disks == conf->prev.raid_disks)
		return before;

	/* The 'geo' (after) section */
	rcu_read_lock();
	for (slot = 0; slot < conf->geo.raid_disks; slot++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[slot].rdev);

		if (!rdev || test_bit(Faulty, &rdev->flags)) {
			after++;
			continue;
		}
		if (test_bit(In_sync, &rdev->flags))
			continue;
		/* Not in sync.  If the reshape is increasing the
		 * number of devices, this section has already been
		 * recovered, so the device doesn't contribute to
		 * degraded.  Otherwise it does.
		 */
		if (conf->geo.raid_disks <= conf->prev.raid_disks)
			after++;
	}
	rcu_read_unlock();

	return after > before ? after : before;
}
static int raid10_start_reshape(struct mddev *mddev)
{
	/* A 'reshape' has been requested.  This commits
	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
	 * This also checks if there are enough spares and adds them
	 * to the array.
	 * We currently require enough spares to make the final
	 * array non-degraded.  We also require that the difference
	 * between old and new data_offset - on each device - is
	 * enough that we never risk over-writing.
	 *
	 * Returns 0 once the reshape thread has been started,
	 * -EBUSY if a recovery is already running,
	 * -EINVAL if the copy count would change, the data_offset gap
	 *  is too small, there are too few spares, or (when reshaping
	 *  backwards) the array would be larger than the new geometry
	 *  can hold,
	 * -EAGAIN if the reshape thread could not be registered.
	 */

	unsigned long before_length, after_length;
	sector_t min_offset_diff = 0;
	int first = 1;
	struct geom new;
	struct r10conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (setup_geo(&new, mddev, geo_start) != conf->copies)
		return -EINVAL;

	/* NOTE(review): conf->geo has not been switched to the new
	 * geometry yet (that happens further down), so after_length
	 * is computed from the current geometry here - confirm whether
	 * 'new' was intended.
	 */
	before_length = ((1 << conf->prev.chunk_shift) *
			 conf->prev.far_copies);
	after_length = ((1 << conf->geo.chunk_shift) *
			conf->geo.far_copies);

	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0) {
			/* Gap between old and new data start, measured
			 * in the direction of the reshape; a gap in the
			 * wrong direction counts as no gap at all.
			 */
			long long diff = (rdev->new_data_offset
					  - rdev->data_offset);
			if (!mddev->reshape_backwards)
				diff = -diff;
			if (diff < 0)
				diff = 0;
			if (first || diff < min_offset_diff)
				min_offset_diff = diff;
			/* Only the first active device seeds the minimum;
			 * without clearing this the "minimum" would just
			 * be the last device's value.
			 */
			first = 0;
		}
	}

	/* The smallest gap must cover the larger of the two lengths */
	if (max(before_length, after_length) > min_offset_diff)
		return -EINVAL;

	if (spares < mddev->delta_disks)
		return -EINVAL;

	conf->offset_diff = min_offset_diff;
	spin_lock_irq(&conf->device_lock);
	if (conf->mirrors_new) {
		/* Switch to the larger 'mirrors' table prepared by
		 * raid10_check_reshape(), keeping the existing entries.
		 */
		memcpy(conf->mirrors_new, conf->mirrors,
		       sizeof(struct mirror_info)*conf->prev.raid_disks);
		smp_mb();
		kfree(conf->mirrors_old); /* FIXME and elsewhere */
		conf->mirrors_old = conf->mirrors;
		conf->mirrors = conf->mirrors_new;
		conf->mirrors_new = NULL;
	}
	setup_geo(&conf->geo, mddev, geo_start);
	smp_mb();
	if (mddev->reshape_backwards) {
		sector_t size = raid10_size(mddev, 0, 0);
		if (size < mddev->array_sectors) {
			spin_unlock_irq(&conf->device_lock);
			printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
			       mdname(mddev));
			return -EINVAL;
		}
		/* A backwards reshape starts from the end */
		mddev->resync_max_sectors = size;
		conf->reshape_progress = size;
	} else
		conf->reshape_progress = 0;
	spin_unlock_irq(&conf->device_lock);

	if (mddev->delta_disks > 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid10_add_disk(mddev, rdev) == 0) {
					/* A device placed in one of the
					 * newly added slots is marked
					 * In_sync; one placed in a
					 * pre-existing slot has to be
					 * recovered from offset 0.
					 */
					if (rdev->raid_disk >=
					    conf->prev.raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					if (sysfs_link_rdev(mddev, rdev))
						/* Failure here  is OK */;
				}
			} else if (rdev->raid_disk >= conf->prev.raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}
	}
	/* When a reshape changes the number of devices,
	 * ->degraded is measured against the larger of the
	 * pre and  post numbers.
	 */
	spin_lock_irq(&conf->device_lock);
	mddev->degraded = calc_degraded(conf);
	spin_unlock_irq(&conf->device_lock);
	mddev->raid_disks = conf->geo.raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);

	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
	if (!mddev->sync_thread) {
		/* Could not start the thread: undo the geometry switch
		 * and the per-device offset change, and mark that no
		 * reshape is in progress.
		 */
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		conf->geo = conf->prev;
		mddev->raid_disks = conf->geo.raid_disks;
		rdev_for_each(rdev, mddev)
			rdev->new_data_offset = rdev->data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		mddev->reshape_position = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}
/* For the chunk that contains array-address 's', work out the
 * device-address just past the last place that could hold any
 * block of that chunk.
 * The result is chunk-aligned and lies after all data belonging
 * to the chunk containing 's'.
 */
static sector_t last_dev_address(sector_t s, struct geom *geo)
{
	/* Start of the following chunk, expressed as a chunk count */
	sector_t chunks = ((s | geo->chunk_mask) + 1) >> geo->chunk_shift;

	/* Scale by near copies, then divide across the devices, rounding up */
	chunks *= geo->near_copies;
	chunks = DIV_ROUND_UP_SECTOR_T(chunks, geo->raid_disks);

	/* Scale by far copies and convert back to sectors */
	chunks *= geo->far_copies;
	return chunks << geo->chunk_shift;
}
/* For the chunk that contains array-address 's', work out the
 * first device-address that could hold any block of that chunk.
 * The result is also the start of a chunk.
 */
static sector_t first_dev_address(sector_t s, struct geom *geo)
{
	/* Index of the chunk containing 's' (rounds down) */
	sector_t chunks = s >> geo->chunk_shift;

	chunks *= geo->near_copies;
	/* sector_div() leaves the quotient in 'chunks'; its return
	 * value is not needed here.
	 */
	sector_div(chunks, geo->raid_disks);

	/* Scale by far copies and convert back to sectors */
	chunks *= geo->far_copies;
	return chunks << geo->chunk_shift;
}
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
int *skipped)
{
/* We simply copy at most one chunk (smallest of old and new)
* at a time, possibly less if that exceeds RESYNC_PAGES,
* or we hit a bad block or something.
* This might mean we pause for normal IO in the middle of
* a chunk, but that is not a problem was mddev->reshape_position
* can record any location.
*
* If we will want to write to a location that isn't
* yet recorded as 'safe' (i.e. in metadata on disk) then
* we need to flush all reshape requests and update the metadata.
*
* When reshaping forwards (e.g. to more devices), we interpret
* 'safe' as the earliest block which might not have been copied
* down yet. We divide this by previous stripe size and multiply
* by previous stripe length to get lowest device offset that we
* cannot write to yet.
* We interpret 'sector_nr' as an address that we want to write to.
* From this we use last_device_address() to find where we might
* write to, and first_device_address on the 'safe' position.
* If this 'next' write position is after the 'safe' position,