Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions drivers/md/md.c
Original file line number Diff line number Diff line change
Expand Up @@ -986,14 +986,17 @@ static void super_written(struct bio *bio)
if (bio->bi_status) {
pr_err("md: %s gets error=%d\n", __func__,
blk_status_to_errno(bio->bi_status));
if (bio->bi_opf & MD_FAILFAST)
set_bit(FailfastIOFailure, &rdev->flags);
md_error(mddev, rdev);
if (!test_bit(Faulty, &rdev->flags)
&& (bio->bi_opf & MD_FAILFAST)) {
pr_warn("md: %s: Metadata write will be repeated to %pg\n",
mdname(mddev), rdev->bdev);
set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
set_bit(LastDev, &rdev->flags);
}
} else
clear_bit(LastDev, &rdev->flags);
clear_bit(FailfastIOFailure, &rdev->flags);

bio_put(bio);

Expand Down Expand Up @@ -1035,7 +1038,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,

if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
test_bit(FailFast, &rdev->flags) &&
!test_bit(LastDev, &rdev->flags))
!test_bit(FailfastIOFailure, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;

atomic_inc(&mddev->pending_writes);
Expand Down
7 changes: 4 additions & 3 deletions drivers/md/md.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,9 +281,10 @@ enum flag_bits {
* It is expected that no bad block log
* is present.
*/
LastDev, /* Seems to be the last working dev as
* it didn't fail, so don't use FailFast
* any more for metadata
FailfastIOFailure, /* A device that failed a metadata write
* with failfast.
* error_handler must not fail the array
* if last device has this flag.
*/
CollisionCheck, /*
* check if there is collision between raid1
Expand Down
26 changes: 20 additions & 6 deletions drivers/md/raid1.c
Original file line number Diff line number Diff line change
Expand Up @@ -1748,8 +1748,12 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
* - recovery is interrupted.
* - &mddev->degraded is bumped.
*
* @rdev is marked as &Faulty excluding case when array is failed and
* &mddev->fail_last_dev is off.
* If @rdev is marked with &FailfastIOFailure, it means that super_write
* failed in failfast and will be retried, so the @mddev did not fail.
*
* @rdev is marked as &Faulty except in the following cases:
* - when @mddev is failed and &mddev->fail_last_dev is off
* - when @rdev is last device and &FailfastIOFailure flag is set
*/
static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
{
Expand All @@ -1760,7 +1764,16 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)

if (test_bit(In_sync, &rdev->flags) &&
(conf->raid_disks - mddev->degraded) == 1) {
if (test_bit(FailfastIOFailure, &rdev->flags)) {
spin_unlock_irqrestore(&conf->device_lock, flags);
return;
}

set_bit(MD_BROKEN, &mddev->flags);
pr_crit("md/raid1:%s: Disk failure on %pg, this is the last device.\n"
"md/raid1:%s: Cannot continue operation (%d/%d failed).\n",
mdname(mddev), rdev->bdev,
mdname(mddev), mddev->degraded + 1, conf->raid_disks);

if (!mddev->fail_last_dev) {
conf->recovery_disabled = mddev->recovery_disabled;
Expand All @@ -1772,17 +1785,18 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
if ((conf->raid_disks - mddev->degraded) > 0)
pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n"
"md/raid1:%s: Operation continuing on %d devices.\n",
mdname(mddev), rdev->bdev,
mdname(mddev), conf->raid_disks - mddev->degraded);
spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n"
"md/raid1:%s: Operation continuing on %d devices.\n",
mdname(mddev), rdev->bdev,
mdname(mddev), conf->raid_disks - mddev->degraded);
}

static void print_conf(struct r1conf *conf)
Expand Down
26 changes: 20 additions & 6 deletions drivers/md/raid10.c
Original file line number Diff line number Diff line change
Expand Up @@ -1995,8 +1995,12 @@ static int enough(struct r10conf *conf, int ignore)
* - recovery is interrupted.
* - &mddev->degraded is bumped.
*
* @rdev is marked as &Faulty excluding case when array is failed and
* &mddev->fail_last_dev is off.
* If @rdev is marked with &FailfastIOFailure, it means that super_write
* failed in failfast and will be retried, so the @mddev did not fail.
*
* @rdev is marked as &Faulty except in the following cases:
* - when @mddev is failed and &mddev->fail_last_dev is off
* - when @rdev is last device and &FailfastIOFailure flag is set
*/
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{
Expand All @@ -2006,7 +2010,16 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
spin_lock_irqsave(&conf->device_lock, flags);

if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
if (test_bit(FailfastIOFailure, &rdev->flags)) {
spin_unlock_irqrestore(&conf->device_lock, flags);
return;
}

set_bit(MD_BROKEN, &mddev->flags);
pr_crit("md/raid10:%s: Disk failure on %pg, this is the last device.\n"
"md/raid10:%s: Cannot continue operation (%d/%d failed).\n",
mdname(mddev), rdev->bdev,
mdname(mddev), mddev->degraded + 1, conf->geo.raid_disks);

if (!mddev->fail_last_dev) {
spin_unlock_irqrestore(&conf->device_lock, flags);
Expand All @@ -2021,11 +2034,12 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Faulty, &rdev->flags);
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
if (enough(conf, -1))
pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n"
"md/raid10:%s: Operation continuing on %d devices.\n",
mdname(mddev), rdev->bdev,
mdname(mddev), conf->geo.raid_disks - mddev->degraded);
spin_unlock_irqrestore(&conf->device_lock, flags);
pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n"
"md/raid10:%s: Operation continuing on %d devices.\n",
mdname(mddev), rdev->bdev,
mdname(mddev), conf->geo.raid_disks - mddev->degraded);
}

static void print_conf(struct r10conf *conf)
Expand Down