From 9eb07c259207d048e3ee8be2a77b2a4680b1edd4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 Feb 2010 12:31:47 +1100 Subject: md: fix 'degraded' calculation when starting a reshape. This code was written long ago when it was not possible to reshape a degraded array. Now it is so the current level of degraded-ness needs to be taken in to account. Also newly addded devices should only reduce degradedness if they are deemed to be in-sync. In particular, if you convert a RAID5 to a RAID6, and increase the number of devices at the same time, then the 5->6 conversion will make the array degraded so the current code will produce a wrong value for 'degraded' - "-1" to be precise. If the reshape runs to completion end_reshape will calculate a correct new value for 'degraded', but if a device fails during the reshape an incorrect decision might be made based on the incorrect value of "degraded". This patch is suitable for 2.6.32-stable and if they are still open, 2.6.31-stable and 2.6.30-stable as well. Cc: stable@kernel.org Reported-by: Michael Evans Signed-off-by: NeilBrown --- drivers/md/raid5.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e84204eb12df..b5629c3e14fa 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5464,11 +5464,11 @@ static int raid5_start_reshape(mddev_t *mddev) !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { char nm[20]; - if (rdev->raid_disk >= conf->previous_raid_disks) + if (rdev->raid_disk >= conf->previous_raid_disks) { set_bit(In_sync, &rdev->flags); - else + added_devices++; + } else rdev->recovery_offset = 0; - added_devices++; sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) @@ -5480,9 +5480,12 @@ static int raid5_start_reshape(mddev_t *mddev) break; } + /* When a reshape changes the number of devices, ->degraded + * is measured against the large of the pre and post number of + * devices.*/ if (mddev->delta_disks > 0) { spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) + mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) - added_devices; spin_unlock_irqrestore(&conf->device_lock, flags); } -- cgit v1.2.3 From ef286f6fa673cd7fb367e1b145069d8dbfcc6081 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 Feb 2010 16:34:14 +1100 Subject: md: fix some lockdep issues between md and sysfs. ====== This fix is related to http://bugzilla.kernel.org/show_bug.cgi?id=15142 but does not address that exact issue. ====== sysfs does like attributes being removed while they are being accessed (i.e. read or written) and waits for the access to complete. As accessing some md attributes takes the same lock that is held while removing those attributes a deadlock can occur. This patch addresses 3 issues in md that could lead to this deadlock. Two relate to calling flush_scheduled_work while the lock is held. This is probably a bad idea in general and as we use schedule_work to delete various sysfs objects it is particularly bad. In one case flush_scheduled_work is called from md_alloc (called by md_probe) called from do_md_run which holds the lock. This call is only present to ensure that ->gendisk is set. However we can be sure that gendisk is always set (though possibly we couldn't when that code was originally written. This is because do_md_run is called in three different contexts: 1/ from md_ioctl. This requires that md_open has succeeded, and it fails if ->gendisk is not set. 2/ from writing a sysfs attribute. This can only happen if the mddev has been registered in sysfs which happens in md_alloc after ->gendisk has been set. 3/ from autorun_array which is only called by autorun_devices, which checks for ->gendisk to be set before calling autorun_array. So the call to md_probe in do_md_run can be removed, and the check on ->gendisk can also go. In the other case flush_scheduled_work is being called in do_md_stop, purportedly to wait for all md_delayed_delete calls (which delete the component rdevs) to complete. However there really isn't any need to wait for them - they have already been disconnected in all important ways. The third issue is that raid5->stop() removes some attribute names while the lock is held. There is already some infrastructure in place to delay attribute removal until after the lock is released (using schedule_work). So extend that infrastructure to remove the raid5_attrs_group. This does not address all lockdep issues related to the sysfs "s_active" lock. The rest can be address by splitting that lockdep context between symlinks and non-symlinks which hopefully will happen. Signed-off-by: NeilBrown --- drivers/md/md.c | 14 +++++--------- drivers/md/raid5.c | 3 +-- 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index dd3dfe42d5a9..a20a71e5efd3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4075,8 +4075,10 @@ static void mddev_delayed_delete(struct work_struct *ws) { mddev_t *mddev = container_of(ws, mddev_t, del_work); - if (mddev->private == &md_redundancy_group) { + if (mddev->private) { sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->private != (void*)1) + sysfs_remove_group(&mddev->kobj, mddev->private); if (mddev->sysfs_action) sysfs_put(mddev->sysfs_action); mddev->sysfs_action = NULL; @@ -4287,10 +4289,7 @@ static int do_md_run(mddev_t * mddev) sysfs_notify_dirent(rdev->sysfs_state); } - md_probe(mddev->unit, NULL, NULL); disk = mddev->gendisk; - if (!disk) - return -ENOMEM; spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -4530,8 +4529,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->queue->unplug_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; module_put(mddev->pers->owner); - if (mddev->pers->sync_request) - mddev->private = &md_redundancy_group; + if (mddev->pers->sync_request && mddev->private == NULL) + mddev->private = (void*)1; mddev->pers = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent(mddev->sysfs_state); @@ -4578,9 +4577,6 @@ out: } mddev->bitmap_info.offset = 0; - /* make sure all md_delayed_delete calls have finished */ - flush_scheduled_work(); - export_array(mddev); mddev->array_sectors = 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b5629c3e14fa..ceb24afdc147 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5136,9 +5136,8 @@ static int stop(mddev_t *mddev) mddev->thread = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); free_conf(conf); - mddev->private = NULL; + mddev->private = &raid5_attrs_group; return 0; } -- cgit v1.2.3 From 781248c1b50c776a9ef4be1130f84ced1cba42fe Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Tue, 16 Feb 2010 18:42:47 +0000 Subject: dm stripe: avoid divide by zero with invalid stripe count If a table containing zero as stripe count is passed into stripe_ctr the code attempts to divide by zero. This patch changes DM_TABLE_LOAD to return -EINVAL if the stripe count is zero. We now get the following error messages: device-mapper: table: 253:0: striped: Invalid stripe count device-mapper: ioctl: error adding target to table Signed-off-by: Nikanth Karthikesan Cc: stable@kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index e0efc1adcaff..bd58703ee8f6 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -110,7 +110,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) } stripes = simple_strtoul(argv[0], &end, 10); - if (*end) { + if (!stripes || *end) { ti->error = "Invalid stripe count"; return -EINVAL; } -- cgit v1.2.3 From 55f67f2dedec1e3049abc30b6d82b999a14cafb7 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 16 Feb 2010 18:42:51 +0000 Subject: dm snapshot: persistent annotate work_queue as on stack chunk_io() declares its 'struct mdata_req' on the stack and then initializes its 'struct work_struct' member. Annotate the initialization of this workqueue with INIT_WORK_ON_STACK to suppress a debugobjects warning seen when CONFIG_DEBUG_OBJECTS_WORK is enabled. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap-persistent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 7d08879689ac..c097d8a4823d 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -254,7 +254,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, * Issue the synchronous I/O from a different thread * to avoid generic_make_request recursion. */ - INIT_WORK(&req.work, do_metadata); + INIT_WORK_ON_STACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); flush_workqueue(ps->metadata_wq); -- cgit v1.2.3 From ebfd32bba9b518d684009d9d21a56742337ca1b3 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Tue, 16 Feb 2010 18:42:53 +0000 Subject: dm log: userspace fix overhead_size calcuations This patch fixes two bugs that revolve around the miscalculation and misuse of the variable 'overhead_size'. 'overhead_size' is the size of the various header structures used during communication. The first bug is the use of 'sizeof' with the pointer of a structure instead of the structure itself - resulting in the wrong size being computed. This is then used in a check to see if the payload (data_size) would be to large for the preallocated structure. Since the bug produces a smaller value for the overhead, it was possible for the structure to be breached. (Although the current users of the code do not currently send enough data to trigger this bug.) The second bug is that the 'overhead_size' value is used to compute how much of the preallocated space should be cleared before populating it with fresh data. This should have simply been 'sizeof(struct cn_msg)' not overhead_size. The fact that 'overhead_size' was computed incorrectly made this problem "less bad" - leaving only a pointer's worth of space at the end uncleared. Thus, this bug was never producing a bad result, but still needs to be fixed - especially now that the value is computed correctly. Cc: stable@kernel.org Signed-off-by: Jonathan Brassow --- drivers/md/dm-log-userspace-transfer.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 54abf9e303b7..f1c8cae70b4b 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -172,11 +172,15 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, { int r = 0; size_t dummy = 0; - int overhead_size = - sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); + int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); struct dm_ulog_request *tfr = prealloced_ulog_tfr; struct receiving_pkg pkg; + /* + * Given the space needed to hold the 'struct cn_msg' and + * 'struct dm_ulog_request' - do we have enough payload + * space remaining? + */ if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { DMINFO("Size of tfr exceeds preallocated size"); return -EINVAL; @@ -191,7 +195,7 @@ resend: */ mutex_lock(&dm_ulog_lock); - memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); + memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); tfr->luid = luid; tfr->seq = dm_ulog_seq++; -- cgit v1.2.3 From 5528d17de1cf1462f285c40ccaf8e0d0e4c64dc0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 16 Feb 2010 18:42:55 +0000 Subject: dm raid1: fail writes if errors are not handled and log fails If the mirror log fails when the handle_errors option was not selected and there is no remaining valid mirror leg, writes return success even though they weren't actually written to any device. This patch completes them with EIO instead. This code path is taken: do_writes: bio_list_merge(&ms->failures, &sync); do_failures: if (!get_valid_mirror(ms)) (false) else if (errors_handled(ms)) (false) else bio_endio(bio, 0); The logic in do_failures is based on presuming that the write was already tried: if it succeeded at least on one leg (without handle_errors) it is reported as success. Reference: https://bugzilla.redhat.com/show_bug.cgi?id=555197 Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ad779bd13aec..6c1046df81f6 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -724,7 +724,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) /* * Dispatch io. */ - if (unlikely(ms->log_failure)) { + if (unlikely(ms->log_failure) && errors_handled(ms)) { spin_lock_irq(&ms->lock); bio_list_merge(&ms->failures, &sync); spin_unlock_irq(&ms->lock); -- cgit v1.2.3 From 558569aa9d83e016295bac77d900342908d7fd85 Mon Sep 17 00:00:00 2001 From: Takahiro Yasui Date: Tue, 16 Feb 2010 18:42:58 +0000 Subject: dm raid1: fix null pointer dereference in suspend When suspending a failed mirror, bios are completed by mirror_end_io() and __rh_lookup() in dm_rh_dec() returns NULL where a non-NULL return value is required by design. Fix this by not changing the state of the recovery failed region from DM_RH_RECOVERING to DM_RH_NOSYNC in dm_rh_recovery_end(). Issue On 2.6.33-rc1 kernel, I hit the bug when I suspended the failed mirror by dmsetup command. BUG: unable to handle kernel NULL pointer dereference at 00000020 IP: [] dm_rh_dec+0x35/0xa1 [dm_region_hash] ... EIP: 0060:[] EFLAGS: 00010046 CPU: 0 EIP is at dm_rh_dec+0x35/0xa1 [dm_region_hash] EAX: 00000286 EBX: 00000000 ECX: 00000286 EDX: 00000000 ESI: eff79eac EDI: eff79e80 EBP: f6915cd4 ESP: f6915cc4 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 Process dmsetup (pid: 2849, ti=f6914000 task=eff03e80 task.ti=f6914000) ... Call Trace: [] ? mirror_end_io+0x53/0x1b1 [dm_mirror] [] ? clone_endio+0x4d/0xa2 [dm_mod] [] ? mirror_end_io+0x0/0x1b1 [dm_mirror] [] ? clone_endio+0x0/0xa2 [dm_mod] [] ? bio_endio+0x28/0x2b [] ? hold_bio+0x2d/0x62 [dm_mirror] [] ? mirror_presuspend+0xeb/0xf7 [dm_mirror] [] ? vmap_page_range+0xb/0xd [] ? suspend_targets+0x2d/0x3b [dm_mod] [] ? dm_table_presuspend_targets+0xe/0x10 [dm_mod] [] ? dm_suspend+0x4d/0x150 [dm_mod] [] ? dev_suspend+0x55/0x18a [dm_mod] [] ? _copy_from_user+0x42/0x56 [] ? dm_ctl_ioctl+0x22c/0x281 [dm_mod] [] ? dev_suspend+0x0/0x18a [dm_mod] [] ? dm_ctl_ioctl+0x0/0x281 [dm_mod] [] ? vfs_ioctl+0x22/0x85 [] ? do_vfs_ioctl+0x4cb/0x516 [] ? sys_ioctl+0x40/0x5a [] ? sysenter_do_call+0x12/0x28 Analysis When recovery process of a region failed, dm_rh_recovery_end() function changes the state of the region from RM_RH_RECOVERING to DM_RH_NOSYNC. When recovery_complete() is executed between dm_rh_update_states() and dm_writes() in do_mirror(), bios are processed with the region state, DM_RH_NOSYNC. However, the region data is freed without checking its pending count when dm_rh_update_states() is called next time. When bios are finished by mirror_end_io(), __rh_lookup() in dm_rh_dec() returns NULL even though a valid return value are expected. Solution Remove the state change of the recovery failed region from DM_RH_RECOVERING to DM_RH_NOSYNC in dm_rh_recovery_end(). We can remove the state change because: - If the region data has been released by dm_rh_update_states(), a new region data is created with the state of DM_RH_NOSYNC, and bios are processed according to the DM_RH_NOSYNC state. - If the region data has not been released by dm_rh_update_states(), a state of the region is DM_RH_RECOVERING and bios are put in the delayed_bio list. The flag change from DM_RH_RECOVERING to DM_RH_NOSYNC in dm_rh_recovery_end() was added in the following commit: dm raid1: handle resync failures author Jonathan Brassow Thu, 12 Jul 2007 16:29:04 +0000 (17:29 +0100) http://git.kernel.org/linus/f44db678edcc6f4c2779ac43f63f0b9dfa28b724 Signed-off-by: Takahiro Yasui Reviewed-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-region-hash.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 5f19ceb6fe91..168bd38f5006 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -660,10 +660,9 @@ void dm_rh_recovery_end(struct dm_region *reg, int success) spin_lock_irq(&rh->region_lock); if (success) list_add(®->list, ®->rh->recovered_regions); - else { - reg->state = DM_RH_NOSYNC; + else list_add(®->list, ®->rh->failed_recovered_regions); - } + spin_unlock_irq(&rh->region_lock); rh->wakeup_workers(rh->context); -- cgit v1.2.3 From 9eef87da2a8ea4920e0d913ff977cac064b68ee0 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Tue, 16 Feb 2010 18:43:01 +0000 Subject: dm mpath: fix stall when requeueing io This patch fixes the problem that system may stall if target's ->map_rq returns DM_MAPIO_REQUEUE in map_request(). E.g. stall happens on 1 CPU box when a dm-mpath device with queue_if_no_path bounces between all-paths-down and paths-up on I/O load. When target's ->map_rq returns DM_MAPIO_REQUEUE, map_request() requeues the request and returns to dm_request_fn(). Then, dm_request_fn() doesn't exit the I/O dispatching loop and continues processing the requeued request again. This map and requeue loop can be done with interrupt disabled, so 1 CPU system can be stalled if this situation happens. For example, commands below can stall my 1 CPU box within 1 minute or so: # dmsetup table mp mp: 0 2097152 multipath 1 queue_if_no_path 0 1 1 service-time 0 1 2 8:144 1 1 # while true; do dd if=/dev/mapper/mp of=/dev/null bs=1M count=100; done & # while true; do \ > dmsetup message mp 0 "fail_path 8:144" \ > dmsetup suspend --noflush mp \ > dmsetup resume mp \ > dmsetup message mp 0 "reinstate_path 8:144" \ > done To fix the problem above, this patch changes dm_request_fn() to exit the I/O dispatching loop once if a request is requeued in map_request(). Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: stable@kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3167480b532c..aa4e2aa86d49 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1595,10 +1595,15 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_OK; } -static void map_request(struct dm_target *ti, struct request *clone, - struct mapped_device *md) +/* + * Returns: + * 0 : the request has been processed (not requeued) + * !0 : the request has been requeued + */ +static int map_request(struct dm_target *ti, struct request *clone, + struct mapped_device *md) { - int r; + int r, requeued = 0; struct dm_rq_target_io *tio = clone->end_io_data; /* @@ -1625,6 +1630,7 @@ static void map_request(struct dm_target *ti, struct request *clone, case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ dm_requeue_unmapped_request(clone); + requeued = 1; break; default: if (r > 0) { @@ -1636,6 +1642,8 @@ static void map_request(struct dm_target *ti, struct request *clone, dm_kill_unmapped_request(clone, r); break; } + + return requeued; } /* @@ -1677,12 +1685,17 @@ static void dm_request_fn(struct request_queue *q) atomic_inc(&md->pending[rq_data_dir(clone)]); spin_unlock(q->queue_lock); - map_request(ti, clone, md); + if (map_request(ti, clone, md)) + goto requeued; + spin_lock_irq(q->queue_lock); } goto out; +requeued: + spin_lock_irq(q->queue_lock); + plug_and_out: if (!elv_queue_empty(q)) /* Some requests still remain, retry later */ -- cgit v1.2.3 From 9307f6b19ac4f5887552b5b2992f391b866f7633 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Tue, 16 Feb 2010 18:43:04 +0000 Subject: dm: sysfs revert add empty release function to avoid debug warning Revert commit d2bb7df8cac647b92f51fb84ae735771e7adbfa7 at Greg's request. Author: Milan Broz Date: Thu Dec 10 23:51:53 2009 +0000 dm: sysfs add empty release function to avoid debug warning This patch just removes an unnecessary warning: kobject: 'dm': does not have a release() function, it is broken and must be fixed. The kobject is embedded in mapped device struct, so code does not need to release memory explicitly here. Cc: Greg KH Signed-off-by: Alasdair G Kergon --- drivers/md/dm-sysfs.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index f53392df7b97..f91b40942e07 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -79,13 +79,6 @@ static struct sysfs_ops dm_sysfs_ops = { .show = dm_attr_show, }; -/* - * The sysfs structure is embedded in md struct, nothing to do here - */ -static void dm_sysfs_release(struct kobject *kobj) -{ -} - /* * dm kobject is embedded in mapped_device structure * no need to define release function here @@ -93,7 +86,6 @@ static void dm_sysfs_release(struct kobject *kobj) static struct kobj_type dm_ktype = { .sysfs_ops = &dm_sysfs_ops, .default_attrs = dm_attrs, - .release = dm_sysfs_release }; /* -- cgit v1.2.3