/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

/* bios attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bios per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the
 * sector of the current stripe+device.
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio_sectors(bio);
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
	unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_set(segments, cnt);
}
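
/* For example, a bio that spans three stripes, two of which have been
 * processed, holds 0x00020003 in bi_phys_segments: the processed count (2)
 * lives in bits 16-31 and the active count (3) in bits 0-15.
 */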

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from the first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid6, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}
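
/* For example, with 6 devices in the md (non-ddf) layout and pd_idx == 4,
 * qd_idx == 5: the data disks map to slots 0-3, P maps to slot 4
 * (syndrome_disks) and Q to slot 5 (syndrome_disks + 1).
 */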

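/* Hand each bio on the 'return_bi' chain back to the block layer,
 * completing it with bio_endio().
 */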
static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf (struct r5conf *conf);

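/* True while any asynchronous stripe operation (biofill, compute, check
 * or reconstruct) is still in flight for this stripe_head.
 */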
static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

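/* Put a stripe back on the appropriate conf list: the delayed, bitmap or
 * handle lists if it still needs handling, otherwise the inactive list.
 * Called with conf->device_lock held.
 */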
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);
	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			   sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			list_add_tail(&sh->lru, &conf->handle_list);
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			list_add_tail(&sh->lru, &conf->inactive_list);
			wake_up(&conf->wait_for_stripe);
			if (conf->retry_read_aligned)
				md_wakeup_thread(conf->mddev->thread);
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh);
}

static void release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;

	local_irq_save(flags);
	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
		do_release_stripe(conf, sh);
		spin_unlock(&conf->device_lock);
	}
	local_irq_restore(flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}


/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(&conf->inactive_list))
		goto out;
	first = conf->inactive_list.next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
out:
	return sh;
}

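/* Release the per-device pages attached to a stripe_head. */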
static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

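/* Allocate one page per device for this stripe_head; returns 1 on
 * allocation failure, 0 on success.
 */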
static int grow_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}

static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			    struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);

	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;


	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		raid5_build_block(sh, i, previous);
	}
	insert_hash(conf, sh);
}

static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
static int calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

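/* Find or allocate the stripe_head covering 'sector', waiting for the
 * array to unquiesce and for an inactive stripe to become available if
 * necessary.  Returns with a reference held, or NULL if 'noblock' is set
 * and no stripe is immediately available.
 */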
static struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    conf->device_lock);
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock);
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, previous);
		} else {
			if (atomic_read(&sh->count)) {
				BUG_ON(!list_empty(&sh->lru)
				    && !test_bit(STRIPE_EXPANDING, &sh->state)
				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
			} else {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru) &&
				    !test_bit(STRIPE_EXPANDING, &sh->state))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);
	return sh;
}

/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape, and this is a new-generation stripe,
	 * so use new_data_offset.
	 */
	return 1;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);

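/* Issue the reads and writes that handle_stripe has scheduled for this
 * stripe: one bio per flagged device, plus one per replacement device
 * being written, waiting on unacknowledged bad blocks before writing.
 */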
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				rw |= REQ_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			rw = WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			rw |= REQ_SYNC;

		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (rw & WRITE) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while ((rw & WRITE) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance. */
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_reset(bi);
			bi->bi_bdev = rdev->bdev;
			bi->bi_rw = rw;
			bi->bi_end_io = (rw & WRITE)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
				bi->bi_rw |= REQ_FLUSH;

			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			/*
			 * If this is a discard request, set bi_vcnt to 0.  We don't
			 * want to confuse SCSI, because SCSI will replace the payload.
			 */
			if (rw & REQ_DISCARD)
				bi->bi_vcnt = 0;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);

			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
						      bi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_reset(rbi);
			rbi->bi_bdev = rrdev->bdev;
			rbi->bi_rw = rw;
			BUG_ON(!(rw & WRITE));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %ld on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
				rbi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_sector = (sh->sector
						  + rrdev->data_offset);
			rbi->bi_vcnt = 1;
			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_size = STRIPE_SIZE;
			/*
			 * If this is a discard request, set bi_vcnt to 0.  We don't
			 * want to confuse SCSI, because SCSI will replace the payload.
			 */
			if (rw & REQ_DISCARD)
				rbi->bi_vcnt = 0;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
						      rbi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
			if (rw & WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}

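/* Copy data between a bio and a stripe page via the async_tx API;
 * 'frombio' selects the direction (1: bio to page, 0: page to bio).
 */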
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec *bvl;
	struct page *bio_page;
	int i;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, i) {
		int len = bvl->bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl->bv_offset;
			bio_page = bvl->bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset +=  len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio *return_bi = NULL;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_active_stripes(rbi)) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				rbi = rbi2;
			}
		}
	}
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	return_io(return_bi);

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&sh->stripe_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, dev->page,
					dev->sector, tx);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
}

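/* Rebuild a single block for RAID5 by XOR-ing every other block in the
 * stripe into the target page.
 */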
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = percpu->scribble;
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
	}

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = percpu->scribble;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}


static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);
}

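/* Subtract (XOR out) the blocks that are about to be overwritten from the
 * existing parity, as the first step of a read-modify-write.
 */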
static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

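/* Copy the queued write bios (towrite) into the stripe pages, moving
 * them to 'written' so the parity update can run against the new data.
 */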
static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		struct bio *chosen;

		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
			struct bio *wbi;

			spin_lock_irq(&sh->stripe_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->stripe_lock);

			while (wbi && wbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				if (wbi->bi_rw & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				if (wbi->bi_rw & REQ_SYNC)
					set_bit(R5_SyncIO, &dev->flags);
				if (wbi->bi_rw & REQ_DISCARD)
					set_bit(R5_Discard, &dev->flags);
				else
					tx = async_copy_data(1, wbi, dev->page,
						dev->sector, tx);
				wbi = r5_next_bio(wbi, dev->sector);
			}
		}
	}

	return tx;
}

static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false, sync = false, discard = false;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
	}

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			if (!discard)
				set_bit(R5_UPTODATE, &dev->flags);
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
			if (sync)
				set_bit(R5_SyncIO, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

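/* Compute the new RAID5 parity block: incrementally from the freshly
 * written blocks if a prexor ran first, otherwise from scratch by
 * XOR-ing all data blocks.
 */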
static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	int prexor = 0;
	unsigned long flags;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (pd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}
	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (dev->written)
				xor_srcs[count++] = dev->page;
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	flags = ASYNC_TX_ACK |
		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

	atomic_inc(&sh->count);

	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
			  to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
}

static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks = percpu->scribble;
	int count, i;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (sh->pd_idx == i || sh->qd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}

	count = set_syndrome_sources(blocks, sh);