diff -Nru mdadm-3.2.5/ANNOUNCE-3.2.6 mdadm-3.3/ANNOUNCE-3.2.6 --- mdadm-3.2.5/ANNOUNCE-3.2.6 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/ANNOUNCE-3.2.6 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,57 @@ +Subject: ANNOUNCE: mdadm 3.2.6 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.6 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This is a stability release which adds a number of bugfixes to 3.2.5. +There are no real stand-out fixes, just lots of little bits and pieces. + +Below is the "git log --oneline --reverse" list of changes since +3.2.5. + +NeilBrown 25th October 2012 + +b7e05d2 udev-rules: prevent systemd from mount devices before they are ready. +0d478e2 mdadm: Fix Segmentation fault. +42f0ca1 imsm: fix: correct checking volume's degradation +fcf2195 Monitor: fix inconsistencies in values for ->percent +5f862fb Monitor: Report NewArray when an array the disappeared, reappears. +6f51b1c Monitor: fix reporting for Fail vs FailSpare etc. +68ad53b mdmon: fix arg parsing. +517f135 Assemble: don't leak memory with fdlist. +090900c udev-rules: prevent systemd from mount devices before they are ready. +446e000 sha1.h: remove ansidecl.h header inclusion +ec894f5 Manage: zero metadata before adding to 'external' array. +3a84db5 ddf: allow a non-spare to be used to recovery a missing device. +c5d61ca ddf: hack to fix container recognition. +23084aa mdmon: fix arg processing for -a +c4e96a3 mdmon: allow --takeover when original was started with --offroot +80841df find_free_devnum: avoid auto-using names in /etc/mdadm.conf +c5c56d6 mapfile: fix mapfile rebuild for containers +aec89f6 fix segfaults in Detail() +2117ad1 Fix 'enough' function for RAID10. 
+0bc300d Use --offroot flag when assembling md arrays via --incrmental +ac78f24 Grow: make warning about old metadata more explicit. +14026ab Replace sha1.h with slightly older version. +6f6809f Add zlib license to crc32.c +5267ba0 Handles spaces in array names better. +c51f288 imsm: allow --assume-clean to work. +acf7076 Grow: allow --grow --continue to work for native metadata. +335d2a6 Grow: fix a couple of typos with --assume-clean usage +9ff1427 Fix open_container +3713633 mdadm: super0: do not override uuid with homehost +31bff58 Trivial bugfix and spelling fixes. +e1e539f Detail: don't report a faulty device as 'spare' or 'rebuilding'. +22a6461 super0: allow creation of array on 2TB+ devices. +a5d47a2 Create new md devices consistently +eb48676 Monitor: don't complain about non-monitorable arrays in mdadm.conf +ecdf2d7 Query: don't be confused by partition tables. +f7b75c1 Query: allow member of non-0.90 arrays to be better reported. diff -Nru mdadm-3.2.5/ANNOUNCE-3.3 mdadm-3.3/ANNOUNCE-3.3 --- mdadm-3.2.5/ANNOUNCE-3.3 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/ANNOUNCE-3.3 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,63 @@ +Subject: ANNOUNCE: mdadm 3.3 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm + +This is a major new release so don't be too surprised if there are a +few issues. If I hear about them they will be fixed in 3.3.1. +git log reports nearly 500 changes since 3.2.6 so I won't list them +all. + +Some highlights are: + +- Some array reshapes can proceed without needing a backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. 
If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used. +- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e. + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. +- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to back up and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE names=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +and lots of bugfixes and other little changes. 
+ +NeilBrown 3rd September 2013 diff -Nru mdadm-3.2.5/Assemble.c mdadm-3.3/Assemble.c --- mdadm-3.2.5/Assemble.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Assemble.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -48,7 +48,7 @@ static int is_member_busy(char *metadata_version) { /* check if the given member array is active */ - struct mdstat_ent *mdstat = mdstat_read(1, 0); + struct mdstat_ent *mdstat = mdstat_read(0, 0); struct mdstat_ent *ent; int busy = 0; @@ -81,36 +81,35 @@ same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0 && memcmp(content->uuid, uuid_zero, sizeof(int[4])) != 0) { if (devname) - fprintf(stderr, Name ": %s has wrong uuid.\n", - devname); + pr_err("%s has wrong uuid.\n", devname); return 0; } if (ident->name[0] && (!update || strcmp(update, "name")!= 0) && name_matches(content->name, ident->name, homehost)==0) { if (devname) - fprintf(stderr, Name ": %s has wrong name.\n", - devname); + pr_err("%s has wrong name.\n", devname); return 0; } if (ident->super_minor != UnSet && ident->super_minor != content->array.md_minor) { if (devname) - fprintf(stderr, Name ": %s has wrong super-minor.\n", - devname); + pr_err("%s has wrong super-minor.\n", + devname); return 0; } if (ident->level != UnSet && ident->level != content->array.level) { if (devname) - fprintf(stderr, Name ": %s has wrong raid level.\n", - devname); + pr_err("%s has wrong raid level.\n", + devname); return 0; } if (ident->raid_disks != UnSet && + content->array.raid_disks != 0 && /* metadata doesn't know how many to expect */ ident->raid_disks!= content->array.raid_disks) { if (devname) - fprintf(stderr, Name ": %s requires wrong number of drives.\n", - devname); + pr_err("%s requires wrong number of drives.\n", + devname); return 0; } if 
(ident->member && ident->member[0]) { @@ -118,173 +117,43 @@ char *s = strchr(content->text_version+1, '/'); if (s == NULL) { if (devname) - fprintf(stderr, Name ": %s is not a container and one is required.\n", - devname); + pr_err("%s is not a container and one is required.\n", + devname); return 0; } else if (strcmp(ident->member, s+1) != 0) { if (devname) - fprintf(stderr, Name ": skipping wrong member %s is %s\n", - content->text_version, devname); + pr_err("skipping wrong member %s is %s\n", + content->text_version, devname); return 0; } } return 1; } - -int Assemble(struct supertype *st, char *mddev, - struct mddev_ident *ident, - struct mddev_dev *devlist, - char *backup_file, int invalid_backup, - int readonly, int runstop, - char *update, char *homehost, int require_homehost, - int verbose, int force, int freeze_reshape) +static int select_devices(struct mddev_dev *devlist, + struct mddev_ident *ident, + struct supertype **stp, + struct mdinfo **contentp, + struct context *c, + int inargv, int auto_assem) { - /* - * The task of Assemble is to find a collection of - * devices that should (according to their superblocks) - * form an array, and to give this collection to the MD driver. - * In Linux-2.4 and later, this involves submitting a - * SET_ARRAY_INFO ioctl with no arg - to prepare - * the array - and then submit a number of - * ADD_NEW_DISK ioctls to add disks into - * the array. Finally RUN_ARRAY might - * be submitted to start the array. - * - * Much of the work of Assemble is in finding and/or - * checking the disks to make sure they look right. - * - * If mddev is not set, then scan must be set and we - * read through the config file for dev+uuid mapping - * We recurse, setting mddev, for each device that - * - isn't running - * - has a valid uuid (or any uuid if !uuidset) - * - * If mddev is set, we try to determine state of md. - * check version - must be at least 0.90.0 - * check kernel version. must be at least 2.4. 
- * If not, we can possibly fall back on START_ARRAY - * Try to GET_ARRAY_INFO. - * If possible, give up - * If not, try to STOP_ARRAY just to make sure - * - * If !uuidset and scan, look in conf-file for uuid - * If not found, give up - * If !devlist and scan and uuidset, get list of devs from conf-file - * - * For each device: - * Check superblock - discard if bad - * Check uuid (set if we don't have one) - discard if no match - * Check superblock similarity if we have a superblock - discard if different - * Record events, devicenum - * This should give us a list of devices for the array - * We should collect the most recent event number - * - * Count disks with recent enough event count - * While force && !enough disks - * Choose newest rejected disks, update event count - * mark clean and rewrite superblock - * If recent kernel: - * SET_ARRAY_INFO - * foreach device with recent events : ADD_NEW_DISK - * if runstop == 1 || "enough" disks and runstop==0 -> RUN_ARRAY - * If old kernel: - * Check the device numbers in superblock are right - * update superblock if any changes - * START_ARRAY - * - */ - int mdfd; - int clean; - int auto_assem = (mddev == NULL && !ident->uuid_set && - ident->super_minor == UnSet && ident->name[0] == 0 - && (ident->container == NULL || ident->member == NULL)); - int old_linux = 0; - int vers = vers; /* Keep gcc quite - it really is initialised */ - struct { - char *devname; - int uptodate; /* set once we decide that this device is as - * recent as everything else in the array. 
- */ - struct mdinfo i; - } *devices; - char *devmap; - int *best = NULL; /* indexed by raid_disk */ - int bestcnt = 0; - int devcnt = 0; - unsigned int okcnt, sparecnt, rebuilding_cnt; - unsigned int req_cnt; - int i; - int most_recent = 0; - int chosen_drive; - int change = 0; - int inargv = 0; - int report_missmatch; -#ifndef MDASSEMBLE - int bitmap_done; -#endif - int start_partial_ok = (runstop >= 0) && - (force || devlist==NULL || auto_assem); - unsigned int num_devs; struct mddev_dev *tmpdev; - struct mdinfo info; + int num_devs; + struct supertype *st = *stp; struct mdinfo *content = NULL; - char *avail; - int nextspare = 0; - char *name = NULL; - int trustworthy; - char chosen_name[1024]; + int report_mismatch = ((inargv && c->verbose >= 0) || c->verbose > 0); struct domainlist *domains = NULL; - if (get_linux_version() < 2004000) - old_linux = 1; - - /* - * If any subdevs are listed, then any that don't - * match ident are discarded. Remainder must all match and - * become the array. - * If no subdevs, then we scan all devices in the config file, but - * there must be something in the identity - */ - - if (!devlist && - ident->uuid_set == 0 && - (ident->super_minor < 0 || ident->super_minor == UnSet) && - ident->name[0] == 0 && - (ident->container == NULL || ident->member == NULL) && - ident->devices == NULL) { - fprintf(stderr, Name ": No identity information available for %s - cannot assemble.\n", - mddev ? mddev : "further assembly"); - return 1; - } - - if (devlist == NULL) - devlist = conf_get_devs(); - else if (mddev) - inargv = 1; - - report_missmatch = ((inargv && verbose >= 0) || verbose > 0); - try_again: - /* We come back here when doing auto-assembly and attempting some - * set of devices failed. 
Those are now marked as ->used==2 and - * we ignore them and try again - */ - tmpdev = devlist; num_devs = 0; while (tmpdev) { if (tmpdev->used) tmpdev->used = 2; else num_devs++; + tmpdev->disposition = 0; tmpdev = tmpdev->next; } - if (!st && ident->st) st = ident->st; - - if (verbose>0) - fprintf(stderr, Name ": looking for devices for %s\n", - mddev ? mddev : "further assembly"); - /* first walk the list of devices to find a consistent set * that match the criterea, if that is possible. * We flag the ones we like with 'used'. @@ -299,12 +168,25 @@ struct dev_policy *pol = NULL; int found_container = 0; - if (tmpdev->used > 1) continue; + if (tmpdev->used > 1) + continue; - if (ident->devices && - !match_oneof(ident->devices, devname)) { - if (report_missmatch) - fprintf(stderr, Name ": %s is not one of %s\n", devname, ident->devices); + if (ident->container) { + if (ident->container[0] == '/' && + !same_dev(ident->container, devname)) { + if (report_mismatch) + pr_err("%s is not the container required (%s)\n", + devname, ident->container); + continue; + } + } else if (ident->devices && + !match_oneof(ident->devices, devname)) { + /* Note that we ignore the "device=" identifier if a + * "container=" is given. Checking both is unnecessarily + * complicated. + */ + if (report_mismatch) + pr_err("%s is not one of %s\n", devname, ident->devices); continue; } @@ -312,74 +194,74 @@ dfd = dev_open(devname, O_RDONLY); if (dfd < 0) { - if (report_missmatch) - fprintf(stderr, Name ": cannot open device %s: %s\n", - devname, strerror(errno)); + if (report_mismatch) + pr_err("cannot open device %s: %s\n", + devname, strerror(errno)); tmpdev->used = 2; } else if (fstat(dfd, &stb)< 0) { /* Impossible! 
*/ - fprintf(stderr, Name ": fstat failed for %s: %s\n", - devname, strerror(errno)); + pr_err("fstat failed for %s: %s\n", + devname, strerror(errno)); tmpdev->used = 2; } else if ((stb.st_mode & S_IFMT) != S_IFBLK) { - fprintf(stderr, Name ": %s is not a block device.\n", - devname); + pr_err("%s is not a block device.\n", + devname); tmpdev->used = 2; } else if (must_be_container(dfd)) { if (st) { /* already found some components, this cannot * be another one. */ - if (report_missmatch) - fprintf(stderr, Name ": %s is a container, but we are looking for components\n", - devname); + if (report_mismatch) + pr_err("%s is a container, but we are looking for components\n", + devname); tmpdev->used = 2; #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) } if (!tst && (tst = super_by_fd(dfd, NULL)) == NULL) { - if (report_missmatch) - fprintf(stderr, Name ": not a recognisable container: %s\n", - devname); + if (report_mismatch) + pr_err("not a recognisable container: %s\n", + devname); tmpdev->used = 2; #endif } else if (!tst->ss->load_container || tst->ss->load_container(tst, dfd, NULL)) { - if (report_missmatch) - fprintf(stderr, Name ": no correct container type: %s\n", - devname); + if (report_mismatch) + pr_err("no correct container type: %s\n", + devname); tmpdev->used = 2; } else if (auto_assem && - !conf_test_metadata(tst->ss->name, (pol = devnum_policy(stb.st_rdev)), - tst->ss->match_home(tst, homehost) == 1)) { - if (report_missmatch) - fprintf(stderr, Name ": %s has metadata type %s for which " - "auto-assembly is disabled\n", - devname, tst->ss->name); + !conf_test_metadata(tst->ss->name, (pol = devid_policy(stb.st_rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which " + "auto-assembly is disabled\n", + devname, tst->ss->name); tmpdev->used = 2; } else found_container = 1; } else { if (!tst && (tst = guess_super(dfd)) == NULL) { - if (report_missmatch) - 
fprintf(stderr, Name ": no recogniseable superblock on %s\n", - devname); + if (report_mismatch) + pr_err("no recogniseable superblock on %s\n", + devname); tmpdev->used = 2; } else if (tst->ss->load_super(tst,dfd, NULL)) { - if (report_missmatch) - fprintf(stderr, Name ": no RAID superblock on %s\n", - devname); + if (report_mismatch) + pr_err("no RAID superblock on %s\n", + devname); tmpdev->used = 2; } else if (tst->ss->compare_super == NULL) { - if (report_missmatch) - fprintf(stderr, Name ": Cannot assemble %s metadata on %s\n", - tst->ss->name, devname); + if (report_mismatch) + pr_err("Cannot assemble %s metadata on %s\n", + tst->ss->name, devname); tmpdev->used = 2; } else if (auto_assem && st == NULL && - !conf_test_metadata(tst->ss->name, (pol = devnum_policy(stb.st_rdev)), - tst->ss->match_home(tst, homehost) == 1)) { - if (report_missmatch) - fprintf(stderr, Name ": %s has metadata type %s for which " - "auto-assembly is disabled\n", - devname, tst->ss->name); + !conf_test_metadata(tst->ss->name, (pol = devid_policy(stb.st_rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which " + "auto-assembly is disabled\n", + devname, tst->ss->name); tmpdev->used = 2; } } @@ -393,15 +275,14 @@ /* Ignore unrecognised device if looking for * specific array */ goto loop; - - fprintf(stderr, Name ": %s has no superblock - assembly aborted\n", - devname); + pr_err("%s has no superblock - assembly aborted\n", + devname); if (st) st->ss->free_super(st); dev_policy_free(pol); domain_free(domains); - return 1; + return -1; } if (found_container) { @@ -414,60 +295,51 @@ */ dfd = dev_open(devname, O_RDONLY | O_EXCL); if (dfd < 0) { - if (report_missmatch) - fprintf(stderr, Name ": %s is busy - skipping\n", devname); + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); goto loop; } close(dfd); - if (ident->container) { - if (ident->container[0] == '/' && - !same_dev(ident->container, devname)) 
{ - if (report_missmatch) - fprintf(stderr, Name ": %s is not the container required (%s)\n", - devname, ident->container); + if (ident->container && ident->container[0] != '/') { + /* we have a uuid */ + int uuid[4]; + + content = *contentp; + tst->ss->getinfo_super(tst, content, NULL); + + if (!parse_uuid(ident->container, uuid) || + !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) { + if (report_mismatch) + pr_err("%s has wrong UUID to be required container\n", + devname); goto loop; } - if (ident->container[0] != '/') { - /* we have a uuid */ - int uuid[4]; - - content = &info; - tst->ss->getinfo_super(tst, content, NULL); - - if (!parse_uuid(ident->container, uuid) || - !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) { - if (report_missmatch) - fprintf(stderr, Name ": %s has wrong UUID to be required container\n", - devname); - goto loop; - } - } } /* It is worth looking inside this container. */ - if (verbose > 0) - fprintf(stderr, Name ": looking in container %s\n", - devname); + if (c->verbose > 0) + pr_err("looking in container %s\n", + devname); for (content = tst->ss->container_content(tst, NULL); content; content = content->next) { if (!ident_matches(ident, content, tst, - homehost, update, - report_missmatch ? devname : NULL)) + c->homehost, c->update, + report_mismatch ? 
devname : NULL)) /* message already printed */; else if (is_member_busy(content->text_version)) { - if (report_missmatch) - fprintf(stderr, Name ": member %s in %s is already assembled\n", - content->text_version, - devname); + if (report_mismatch) + pr_err("member %s in %s is already assembled\n", + content->text_version, + devname); } else if (content->array.state & (1<text_version, - devname); + pr_err("Cannot activate member %s in %s.\n", + content->text_version, + devname); } else break; } @@ -478,39 +350,61 @@ st = tst; tst = NULL; if (!auto_assem && inargv && tmpdev->next != NULL) { - fprintf(stderr, Name ": %s is a container, but is not " - "only device given: confused and aborting\n", - devname); + pr_err("%s is a container, but is not " + "only device given: confused and aborting\n", + devname); st->ss->free_super(st); dev_policy_free(pol); domain_free(domains); - return 1; + return -1; } - if (verbose > 0) - fprintf(stderr, Name ": found match on member %s in %s\n", - content->text_version, devname); + if (c->verbose > 0) + pr_err("found match on member %s in %s\n", + content->text_version, devname); /* make sure we finished the loop */ tmpdev = NULL; goto loop; } else { + int rv = 0; + struct mddev_ident *match; - content = &info; + content = *contentp; tst->ss->getinfo_super(tst, content, NULL); if (!ident_matches(ident, content, tst, - homehost, update, - report_missmatch ? devname : NULL)) + c->homehost, c->update, + report_mismatch ? devname : NULL)) + goto loop; + + match = conf_match(tst, content, devname, + report_mismatch ? c->verbose : -1, + &rv); + if (!match && rv == 2) + goto loop; + if (match && match->devname && + strcasecmp(match->devname, "") == 0) { + if (report_mismatch) + pr_err("%s is a member of an explicitly ignored array\n", + devname); goto loop; - + } + if (match && !ident_matches(match, content, tst, + c->homehost, c->update, + report_mismatch ? 
devname : NULL)) + /* Array exists in mdadm.conf but some + * details don't match, so reject it + */ + goto loop; + /* should be safe to try an exclusive open now, we * have rejected anything that some other mdadm might * be looking at */ dfd = dev_open(devname, O_RDONLY | O_EXCL); if (dfd < 0) { - if (report_missmatch) - fprintf(stderr, Name ": %s is busy - skipping\n", devname); + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); goto loop; } close(dfd); @@ -541,22 +435,22 @@ */ if (auto_assem) goto loop; - if (homehost) { - int first = st->ss->match_home(st, homehost); - int last = tst->ss->match_home(tst, homehost); + if (c->homehost) { + int first = st->ss->match_home(st, c->homehost); + int last = tst->ss->match_home(tst, c->homehost); if (first != last && (first == 1 || last == 1)) { /* We can do something */ if (first) {/* just ignore this one */ - if (report_missmatch) - fprintf(stderr, Name ": %s misses out due to wrong homehost\n", - devname); + if (report_mismatch) + pr_err("%s misses out due to wrong homehost\n", + devname); goto loop; } else { /* reject all those sofar */ struct mddev_dev *td; - if (report_missmatch) - fprintf(stderr, Name ": %s overrides previous devices due to good homehost\n", - devname); + if (report_mismatch) + pr_err("%s overrides previous devices due to good homehost\n", + devname); for (td=devlist; td != tmpdev; td=td->next) if (td->used == 1) td->used = 0; @@ -565,13 +459,13 @@ } } } - fprintf(stderr, Name ": superblock on %s doesn't match others - assembly aborted\n", - devname); + pr_err("superblock on %s doesn't match others - assembly aborted\n", + devname); tst->ss->free_super(tst); st->ss->free_super(st); dev_policy_free(pol); domain_free(domains); - return 1; + return -1; } tmpdev->used = 1; } @@ -579,7 +473,7 @@ /* Collect domain information from members only */ if (tmpdev && tmpdev->used == 1) { if (!pol) - pol = devnum_policy(stb.st_rdev); + pol = devid_policy(stb.st_rdev); domain_merge(&domains, 
pol, tst?tst->ss->name:NULL); } dev_policy_free(pol); @@ -597,7 +491,7 @@ if (tmpdev->used != 3) continue; tmpdev->used = 1; - content = &info; + content = *contentp; if (!st->sb) { /* we need sb from one of the spares */ @@ -616,11 +510,11 @@ if (tmpdev->used != 3) continue; if (stat(tmpdev->devname, &stb)< 0) { - fprintf(stderr, Name ": fstat failed for %s: %s\n", - tmpdev->devname, strerror(errno)); + pr_err("fstat failed for %s: %s\n", + tmpdev->devname, strerror(errno)); tmpdev->used = 2; } else { - struct dev_policy *pol = devnum_policy(stb.st_rdev); + struct dev_policy *pol = devid_policy(stb.st_rdev); int dt = domain_test(domains, pol, NULL); if (inargv && dt != 0) /* take this spare as domains match @@ -637,106 +531,53 @@ } } domain_free(domains); - - if (!st || !st->sb || !content) - return 2; - - /* Now need to open the array device. Use create_mddev */ - if (content == &info) + *stp = st; + if (st && st->sb && content == *contentp) st->ss->getinfo_super(st, content, NULL); + *contentp = content; - trustworthy = FOREIGN; - name = content->name; - switch (st->ss->match_home(st, homehost) - ?: st->ss->match_home(st, "any")) { - case 1: - trustworthy = LOCAL; - name = strchr(content->name, ':'); - if (name) - name++; - else - name = content->name; - break; - } - if (!auto_assem) - /* If the array is listed in mdadm.conf or on - * command line, then we trust the name - * even if the array doesn't look local - */ - trustworthy = LOCAL; - - if (name[0] == 0 && - content->array.level == LEVEL_CONTAINER) { - name = content->text_version; - trustworthy = METADATA; - } - - if (name[0] && trustworthy != LOCAL && - ! 
require_homehost && - conf_name_is_free(name)) - trustworthy = LOCAL; - - if (trustworthy == LOCAL && - strchr(name, ':')) - /* Ignore 'host:' prefix of name */ - name = strchr(name, ':')+1; - - mdfd = create_mddev(mddev, name, ident->autof, trustworthy, - chosen_name); - if (mdfd < 0) { - st->ss->free_super(st); - if (auto_assem) - goto try_again; - return 1; - } - mddev = chosen_name; - vers = md_get_version(mdfd); - if (vers < 9000) { - fprintf(stderr, Name ": Assemble requires driver version 0.90.0 or later.\n" - " Upgrade your kernel or try --build\n"); - close(mdfd); - return 1; - } - if (mddev_busy(fd2devnum(mdfd))) { - fprintf(stderr, Name ": %s already active, cannot restart it!\n", - mddev); - for (tmpdev = devlist ; - tmpdev && tmpdev->used != 1; - tmpdev = tmpdev->next) - ; - if (tmpdev && auto_assem) - fprintf(stderr, Name ": %s needed for %s...\n", - mddev, tmpdev->devname); - close(mdfd); - mdfd = -3; - st->ss->free_super(st); - if (auto_assem) - goto try_again; - return 1; - } - ioctl(mdfd, STOP_ARRAY, NULL); /* just incase it was started but has no content */ + return num_devs; +} +struct devs { + char *devname; + int uptodate; /* set once we decide that this device is as + * recent as everything else in the array. + */ + int included; /* set if the device is already in the array + * due to a previous '-I' + */ + struct mdinfo i; +}; + +static int load_devices(struct devs *devices, char *devmap, + struct mddev_ident *ident, struct supertype *st, + struct mddev_dev *devlist, struct context *c, + struct mdinfo *content, + int mdfd, char *mddev, + int *most_recentp, int *bestcntp, int **bestp, + int inargv) +{ + struct mddev_dev *tmpdev; + int devcnt = 0; + int nextspare = 0; #ifndef MDASSEMBLE - if (content != &info) { - /* This is a member of a container. Try starting the array. 
*/ - int err; - err = assemble_container_content(st, mdfd, content, runstop, - chosen_name, verbose, - backup_file, freeze_reshape); - close(mdfd); - return err; - } - bitmap_done = 0; + int bitmap_done = 0; #endif - /* Ok, no bad inconsistancy, we can try updating etc */ - devices = malloc(num_devs * sizeof(*devices)); - devmap = calloc(num_devs * content->array.raid_disks, 1); - for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) if (tmpdev->used == 1) { + int most_recent = -1; + int bestcnt = 0; + int *best = *bestp; + + for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) { char *devname = tmpdev->devname; struct stat stb; + int i; + + if (tmpdev->used != 1) + continue; /* looks like a good enough match to update the super block if needed */ #ifndef MDASSEMBLE - if (update) { + if (c->update) { int dfd; /* prepare useful information in info structures */ struct stat stb2; @@ -744,7 +585,7 @@ int err; fstat(mdfd, &stb2); - if (strcmp(update, "uuid")==0 && + if (strcmp(c->update, "uuid")==0 && !ident->uuid_set) { int rfd; if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || @@ -756,18 +597,20 @@ } if (rfd >= 0) close(rfd); } - dfd = dev_open(devname, O_RDWR|O_EXCL); + dfd = dev_open(devname, + tmpdev->disposition == 'I' + ? 
O_RDWR : (O_RDWR|O_EXCL)); tst = dup_super(st); if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { - fprintf(stderr, Name ": cannot re-read metadata from %s - aborting\n", - devname); + pr_err("cannot re-read metadata from %s - aborting\n", + devname); if (dfd >= 0) close(dfd); close(mdfd); free(devices); free(devmap); - return 1; + return -1; } tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); @@ -775,42 +618,42 @@ strcpy(content->name, ident->name); content->array.md_minor = minor(stb2.st_rdev); - if (strcmp(update, "byteorder") == 0) + if (strcmp(c->update, "byteorder") == 0) err = 0; else - err = tst->ss->update_super(tst, content, update, - devname, verbose, + err = tst->ss->update_super(tst, content, c->update, + devname, c->verbose, ident->uuid_set, - homehost); + c->homehost); if (err < 0) { - fprintf(stderr, - Name ": --update=%s not understood" - " for %s metadata\n", - update, tst->ss->name); + if (err == -1) + pr_err("--update=%s not understood" + " for %s metadata\n", + c->update, tst->ss->name); tst->ss->free_super(tst); free(tst); close(mdfd); close(dfd); free(devices); free(devmap); - return 1; + return -1; } - if (strcmp(update, "uuid")==0 && + if (strcmp(c->update, "uuid")==0 && !ident->uuid_set) { ident->uuid_set = 1; memcpy(ident->uuid, content->uuid, 16); } if (tst->ss->store_super(tst, dfd)) - fprintf(stderr, Name ": Could not re-write superblock on %s.\n", - devname); + pr_err("Could not re-write superblock on %s.\n", + devname); close(dfd); - if (strcmp(update, "uuid")==0 && + if (strcmp(c->update, "uuid")==0 && ident->bitmap_fd >= 0 && !bitmap_done) { if (bitmap_update_uuid(ident->bitmap_fd, content->uuid, tst->ss->swapuuid) != 0) - fprintf(stderr, Name ": Could not update uuid on external bitmap.\n"); + pr_err("Could not update uuid on external bitmap.\n"); else bitmap_done = 1; } @@ -820,17 +663,19 @@ { struct supertype *tst = dup_super(st); int dfd; - dfd = dev_open(devname, O_RDWR|O_EXCL); + dfd 
= dev_open(devname, + tmpdev->disposition == 'I' + ? O_RDWR : (O_RDWR|O_EXCL)); if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { - fprintf(stderr, Name ": cannot re-read metadata from %s - aborting\n", - devname); + pr_err("cannot re-read metadata from %s - aborting\n", + devname); if (dfd >= 0) close(dfd); close(mdfd); free(devices); free(devmap); - return 1; + return -1; } tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); tst->ss->free_super(tst); @@ -839,37 +684,47 @@ stat(devname, &stb); - if (verbose > 0) - fprintf(stderr, Name ": %s is identified as a member of %s, slot %d.\n", - devname, mddev, content->disk.raid_disk); + if (c->verbose > 0) + pr_err("%s is identified as a member of %s, slot %d%s.\n", + devname, mddev, content->disk.raid_disk, + (content->disk.state & (1<disposition == 'I'); devices[devcnt].i = *content; devices[devcnt].i.disk.major = major(stb.st_rdev); devices[devcnt].i.disk.minor = minor(stb.st_rdev); - if (most_recent < devcnt) { - if (devices[devcnt].i.events + + if (devices[devcnt].i.disk.state == 6) { + if (most_recent < 0 || + devices[devcnt].i.events > devices[most_recent].i.events) most_recent = devcnt; } + if (content->array.level == LEVEL_MULTIPATH) /* with multipath, the raid_disk from the superblock is meaningless */ i = devcnt; else i = devices[devcnt].i.disk.raid_disk; if (i+1 == 0) { - if (nextspare < content->array.raid_disks) - nextspare = content->array.raid_disks; + if (nextspare < content->array.raid_disks*2) + nextspare = content->array.raid_disks*2; i = nextspare++; } else { - if (i >= content->array.raid_disks && + /* i is raid_disk - double it so there is room for + * replacements */ + i *= 2; + if (devices[devcnt].i.disk.state & (1<= content->array.raid_disks*2 && i >= nextspare) nextspare = i+1; } if (i < 10000) { if (i >= bestcnt) { int newbestcnt = i+10; - int *newbest = malloc(sizeof(int)*newbestcnt); + int *newbest = xmalloc(sizeof(int)*newbestcnt); int c; for (c=0; c 
< newbestcnt; c++) if (c < bestcnt) @@ -891,20 +746,20 @@ * Could be a mis-detection caused by overlapping * partitions. fail-safe. */ - fprintf(stderr, Name ": WARNING %s and %s appear" - " to have very similar superblocks.\n" - " If they are really different, " - "please --zero the superblock on one\n" - " If they are the same or overlap," - " please remove one from %s.\n", - devices[best[i]].devname, devname, - inargv ? "the list" : - "the\n DEVICE list in mdadm.conf" + pr_err("WARNING %s and %s appear" + " to have very similar superblocks.\n" + " If they are really different, " + "please --zero the superblock on one\n" + " If they are the same or overlap," + " please remove one from %s.\n", + devices[best[i]].devname, devname, + inargv ? "the list" : + "the\n DEVICE list in mdadm.conf" ); close(mdfd); free(devices); free(devmap); - return 1; + return -1; } if (best[i] == -1 || (devices[best[i]].i.events @@ -913,92 +768,31 @@ } devcnt++; } + if (most_recent >= 0) + *most_recentp = most_recent; + *bestcntp = bestcnt; + *bestp = best; + return devcnt; +} - if (devcnt == 0) { - fprintf(stderr, Name ": no devices found for %s\n", - mddev); - if (st) - st->ss->free_super(st); - close(mdfd); - free(devices); - free(devmap); - return 1; - } - - if (update && strcmp(update, "byteorder")==0) - st->minor_version = 90; - - st->ss->getinfo_super(st, content, NULL); - clean = content->array.state & 1; - - /* now we have some devices that might be suitable. 
- * I wonder how many - */ - avail = malloc(content->array.raid_disks); - memset(avail, 0, content->array.raid_disks); - okcnt = 0; - sparecnt=0; - rebuilding_cnt=0; - for (i=0; i< bestcnt; i++) { - int j = best[i]; - int event_margin = 1; /* always allow a difference of '1' - * like the kernel does - */ - if (j < 0) continue; - /* note: we ignore error flags in multipath arrays - * as they don't make sense - */ - if (content->array.level != LEVEL_MULTIPATH) - if (!(devices[j].i.disk.state & (1<array.raid_disks > 0 && - devices[most_recent].i.disk.raid_disk >= 0 && - devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) { - if (verbose > -1) - fprintf(stderr, Name ": ignoring %s as it reports %s as failed\n", - devices[j].devname, devices[most_recent].devname); - best[i] = -1; - continue; - } - if (devices[j].i.events+event_margin >= - devices[most_recent].i.events) { - devices[j].uptodate = 1; - if (i < content->array.raid_disks) { - if (devices[j].i.recovery_start == MaxSector || - (content->reshape_active && - ((i >= content->array.raid_disks - content->delta_disks) || - (i >= content->array.raid_disks - content->delta_disks - 1 - && content->array.level == 4)))) { - okcnt++; - avail[i]=1; - } else - rebuilding_cnt++; - } else - sparecnt++; - } - } - free(devmap); - while (force && - (!enough(content->array.level, content->array.raid_disks, - content->array.layout, 1, +static int force_array(struct mdinfo *content, + struct devs *devices, + int *best, int bestcnt, char *avail, + int most_recent, + struct supertype *st, + struct context *c) +{ + int okcnt = 0; + while (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, + avail) + || + (content->reshape_active && content->delta_disks > 0 && + !enough(content->array.level, (content->array.raid_disks + - content->delta_disks), + content->new_layout, 1, avail) - || - (content->reshape_active && content->delta_disks > 0 && - !enough(content->array.level, 
(content->array.raid_disks - - content->delta_disks), - content->new_layout, 1, - avail) - ))) { + )) { /* Choose the newest best drive which is * not up-to-date, update the superblock * and add it. @@ -1006,7 +800,9 @@ int fd; struct supertype *tst; unsigned long long current_events; - chosen_drive = -1; + int chosen_drive = -1; + int i; + for (i = 0; i < content->array.raid_disks && i < bestcnt; i++) { int j = best[i]; if (j>=0 && @@ -1021,36 +817,38 @@ break; current_events = devices[chosen_drive].i.events; add_another: - if (verbose >= 0) - fprintf(stderr, Name ": forcing event count in %s(%d) from %d upto %d\n", - devices[chosen_drive].devname, - devices[chosen_drive].i.disk.raid_disk, - (int)(devices[chosen_drive].i.events), - (int)(devices[most_recent].i.events)); - fd = dev_open(devices[chosen_drive].devname, O_RDWR|O_EXCL); + if (c->verbose >= 0) + pr_err("forcing event count in %s(%d) from %d upto %d\n", + devices[chosen_drive].devname, + devices[chosen_drive].i.disk.raid_disk, + (int)(devices[chosen_drive].i.events), + (int)(devices[most_recent].i.events)); + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? 
O_RDWR + : (O_RDWR|O_EXCL)); if (fd < 0) { - fprintf(stderr, Name ": Couldn't open %s for write - not updating\n", - devices[chosen_drive].devname); + pr_err("Couldn't open %s for write - not updating\n", + devices[chosen_drive].devname); devices[chosen_drive].i.events = 0; continue; } tst = dup_super(st); if (tst->ss->load_super(tst,fd, NULL)) { close(fd); - fprintf(stderr, Name ": RAID superblock disappeared from %s - not updating.\n", - devices[chosen_drive].devname); + pr_err("RAID superblock disappeared from %s - not updating.\n", + devices[chosen_drive].devname); devices[chosen_drive].i.events = 0; continue; } content->events = devices[most_recent].i.events; tst->ss->update_super(tst, content, "force-one", - devices[chosen_drive].devname, verbose, - 0, NULL); + devices[chosen_drive].devname, c->verbose, + 0, NULL); if (tst->ss->store_super(tst, fd)) { close(fd); - fprintf(stderr, Name ": Could not re-write superblock on %s\n", - devices[chosen_drive].devname); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); devices[chosen_drive].i.events = 0; tst->ss->free_super(tst); continue; @@ -1076,37 +874,684 @@ } } } + return okcnt; +} - /* Now we want to look at the superblock which the kernel will base things on - * and compare the devices that we think are working with the devices that the - * superblock thinks are working. - * If there are differences and --force is given, then update this chosen - * superblock. 
- */ - chosen_drive = -1; - st->ss->free_super(st); - for (i=0; chosen_drive < 0 && ibitmap_fd >= 0) { + if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) { + pr_err("SET_BITMAP_FILE failed.\n"); + return 1; + } + } else if (ident->bitmap_file) { + /* From config file */ + int bmfd = open(ident->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s\n", + ident->bitmap_file); + return 1; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + pr_err("Failed to set bitmapfile for %s\n", mddev); + close(bmfd); + return 1; + } + close(bmfd); + } + + /* First, add the raid disks, but add the chosen one last */ + for (i=0; i<= bestcnt; i++) { + int j; + if (i < bestcnt) { + j = best[i]; + if (j == chosen_drive) + continue; + } else + j = chosen_drive; + + if (j >= 0 && !devices[j].included) { + int dfd = dev_open(devices[j].devname, + O_RDWR|O_EXCL); + if (dfd >= 0) { + remove_partitions(dfd); + close(dfd); + } + rv = add_disk(mdfd, st, content, &devices[j].i); + + if (rv) { + pr_err("failed to add " + "%s to %s: %s\n", + devices[j].devname, + mddev, + strerror(errno)); + if (i < content->array.raid_disks * 2 + || i == bestcnt) + okcnt--; + else + sparecnt--; + } else if (c->verbose > 0) + pr_err("added %s to %s as %d%s%s\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk, + devices[j].uptodate?"": + " (possibly out of date)", + (devices[j].i.disk.state & (1<= 0) { + if (c->verbose > 0) + pr_err("%s is already in %s as %d\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk); + } else if (c->verbose > 0 && i < content->array.raid_disks*2 + && (i&1) == 0) + pr_err("no uptodate device for slot %d of %s\n", + i, mddev); + } + + if (content->array.level == LEVEL_CONTAINER) { + if (c->verbose >= 0) { + pr_err("Container %s has been " + "assembled with %d drive%s", + mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", + 
content->array.raid_disks); + fprintf(stderr, "\n"); + } + st->ss->free_super(st); + sysfs_uevent(content, "change"); + return 0; + } + + /* Get number of in-sync devices according to the superblock. + * We must have this number to start the array without -s or -R + */ + req_cnt = content->array.working_disks; + + if (c->runstop == 1 || + (c->runstop <= 0 && + ( enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, avail) && + (okcnt + rebuilding_cnt >= req_cnt || start_partial_ok) + ))) { + /* This array is good-to-go. + * If a reshape is in progress then we might need to + * continue monitoring it. In that case we start + * it read-only and let the grow code make it writable. + */ + int rv; +#ifndef MDASSEMBLE + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP) && + content->delta_disks <= 0) { + if (!c->backup_file) { + pr_err("%s: Need a backup file to complete reshape of this array.\n", + mddev); + pr_err("Please provided one with \"--backup-file=...\"\n"); + if (c->update && + strcmp(c->update, "revert-reshape") == 0) + pr_err("(Don't specify --update=revert-reshape again, that part succeeded.)\n"); + return 1; + } + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + if (rv == 0) + rv = Grow_continue(mdfd, st, content, + c->backup_file, + c->freeze_reshape); + } else if (c->readonly && + sysfs_attribute_available( + content, NULL, "array_state")) { + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + } else +#endif + rv = ioctl(mdfd, RUN_ARRAY, NULL); + if (rv == 0) { + if (c->verbose >= 0) { + pr_err("%s has been started with %d drive%s", + mddev, okcnt, okcnt==1?"":"s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", content->array.raid_disks); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); + 
fprintf(stderr, ".\n"); + } + if (content->reshape_active && + content->array.level >= 4 && + content->array.level <= 6) { + /* might need to increase the size + * of the stripe cache - default is 256 + */ + if (256 < 4 * (content->array.chunk_size/4096)) { + struct mdinfo *sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_num(sra, NULL, + "stripe_cache_size", + (4 * content->array.chunk_size / 4096) + 1); + sysfs_free(sra); + } + } + if (okcnt < (unsigned)content->array.raid_disks) { + /* If any devices did not get added + * because the kernel rejected them based + * on event count, try adding them + * again providing the action policy is + * 're-add' or greater. The bitmap + * might allow them to be included, or + * they will become spares. + */ + for (i = 0; i < bestcnt; i++) { + int j = best[i]; + if (j >= 0 && !devices[j].uptodate) { + if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add)) + continue; + rv = add_disk(mdfd, st, content, + &devices[j].i); + if (rv == 0 && c->verbose >= 0) + pr_err("%s has been re-added.\n", + devices[j].devname); + } + } + } + if (content->array.level == 6 && + okcnt + 1 == (unsigned)content->array.raid_disks && + was_forced) { + struct mdinfo *sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_str(sra, NULL, + "sync_action", "repair"); + sysfs_free(sra); + } + return 0; + } + pr_err("failed to RUN_ARRAY %s: %s\n", + mddev, strerror(errno)); + + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + pr_err("Not enough devices to " + "start the array.\n"); + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, + avail)) + pr_err("Not enough devices to " + "start the array while not clean " + "- consider --force.\n"); + + return 1; + } + if (c->runstop == -1) { + pr_err("%s assembled from %d drive%s", + mddev, okcnt, okcnt==1?"":"s"); + if (okcnt != (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", 
content->array.raid_disks); + fprintf(stderr, ", but not started.\n"); + return 2; + } + if (c->verbose >= -1) { + pr_err("%s assembled from %d drive%s", mddev, okcnt, okcnt==1?"":"s"); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + fprintf(stderr, " - not enough to start the array.\n"); + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, + avail)) + fprintf(stderr, " - not enough to start the " + "array while not clean - consider " + "--force.\n"); + else { + if (req_cnt == (unsigned)content->array.raid_disks) + fprintf(stderr, " - need all %d to start it", req_cnt); + else + fprintf(stderr, " - need %d to start", req_cnt); + fprintf(stderr, " (use --run to insist).\n"); + } + } + return 1; +} + +int Assemble(struct supertype *st, char *mddev, + struct mddev_ident *ident, + struct mddev_dev *devlist, + struct context *c) +{ + /* + * The task of Assemble is to find a collection of + * devices that should (according to their superblocks) + * form an array, and to give this collection to the MD driver. + * In Linux-2.4 and later, this involves submitting a + * SET_ARRAY_INFO ioctl with no arg - to prepare + * the array - and then submit a number of + * ADD_NEW_DISK ioctls to add disks into + * the array. Finally RUN_ARRAY might + * be submitted to start the array. + * + * Much of the work of Assemble is in finding and/or + * checking the disks to make sure they look right. + * + * If mddev is not set, then scan must be set and we + * read through the config file for dev+uuid mapping + * We recurse, setting mddev, for each device that + * - isn't running + * - has a valid uuid (or any uuid if !uuidset) + * + * If mddev is set, we try to determine state of md. 
+ * check version - must be at least 0.90.0 + * check kernel version. must be at least 2.4. + * If not, we can possibly fall back on START_ARRAY + * Try to GET_ARRAY_INFO. + * If possible, give up + * If not, try to STOP_ARRAY just to make sure + * + * If !uuidset and scan, look in conf-file for uuid + * If not found, give up + * If !devlist and scan and uuidset, get list of devs from conf-file + * + * For each device: + * Check superblock - discard if bad + * Check uuid (set if we don't have one) - discard if no match + * Check superblock similarity if we have a superblock - discard if different + * Record events, devicenum + * This should give us a list of devices for the array + * We should collect the most recent event number + * + * Count disks with recent enough event count + * While force && !enough disks + * Choose newest rejected disks, update event count + * mark clean and rewrite superblock + * If recent kernel: + * SET_ARRAY_INFO + * foreach device with recent events : ADD_NEW_DISK + * if runstop == 1 || "enough" disks and runstop==0 -> RUN_ARRAY + * If old kernel: + * Check the device numbers in superblock are right + * update superblock if any changes + * START_ARRAY + * + */ + int rv; + int mdfd; + int clean; + int auto_assem = (mddev == NULL && !ident->uuid_set && + ident->super_minor == UnSet && ident->name[0] == 0 + && (ident->container == NULL || ident->member == NULL)); + struct devs *devices; + char *devmap; + int *best = NULL; /* indexed by raid_disk */ + int bestcnt = 0; + int devcnt; + unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt; + int i; + int was_forced = 0; + int most_recent = 0; + int chosen_drive; + int change = 0; + int inargv = 0; + int start_partial_ok = (c->runstop >= 0) && + (c->force || devlist==NULL || auto_assem); + int num_devs; + struct mddev_dev *tmpdev; + struct mdinfo info; + struct mdinfo *content = NULL; + struct mdinfo *pre_exist = NULL; + char *avail; + char *name = NULL; + char chosen_name[1024]; + struct 
map_ent *map = NULL; + struct map_ent *mp; + + /* + * If any subdevs are listed, then any that don't + * match ident are discarded. Remainder must all match and + * become the array. + * If no subdevs, then we scan all devices in the config file, but + * there must be something in the identity + */ + + if (!devlist && + ident->uuid_set == 0 && + (ident->super_minor < 0 || ident->super_minor == UnSet) && + ident->name[0] == 0 && + (ident->container == NULL || ident->member == NULL) && + ident->devices == NULL) { + pr_err("No identity information available for %s - cannot assemble.\n", + mddev ? mddev : "further assembly"); + return 1; + } + + if (devlist == NULL) + devlist = conf_get_devs(); + else if (mddev) + inargv = 1; + +try_again: + /* We come back here when doing auto-assembly and attempting some + * set of devices failed. Those are now marked as ->used==2 and + * we ignore them and try again + */ + if (!st && ident->st) + st = ident->st; + if (c->verbose>0) + pr_err("looking for devices for %s\n", + mddev ? mddev : "further assembly"); + + content = &info; + if (st) + st->ignore_hw_compat = 1; + num_devs = select_devices(devlist, ident, &st, &content, c, + inargv, auto_assem); + if (num_devs < 0) + return 1; + + if (!st || !st->sb || !content) + return 2; + + /* We have a full set of devices - we now need to find the + * array device. + * However there is a risk that we are racing with "mdadm -I" + * and the array is already partially assembled - we will have + * rejected any devices already in this address. + * So we take a lock on the map file - to prevent further races - + * and look for the uuid in there. If found and the array is + * active, we abort. If found and the array is not active + * we commit to that md device and add all the contained devices + * to our list. We flag them so that we don't try to re-add, + * but can remove if they turn out to not be wanted. 
+ */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile - continue anyway...\n"); + mp = map_by_uuid(&map, content->uuid); + if (mp) { + struct mdinfo *dv; + /* array already exists. */ + pre_exist = sysfs_read(-1, mp->devnm, GET_LEVEL|GET_DEVS); + if (pre_exist->array.level != UnSet) { + pr_err("Found some drive for an array that is already active: %s\n", + mp->path); + pr_err("giving up.\n"); + return 1; + } + for (dv = pre_exist->devs; dv; dv = dv->next) { + /* We want to add this device to our list, + * but it could already be there if "mdadm -I" + * started *after* we checked for O_EXCL. + * If we add it to the top of the list + * it will be preferred over later copies. + */ + struct mddev_dev *newdev; + char *devname = map_dev(dv->disk.major, + dv->disk.minor, + 0); + if (!devname) + continue; + newdev = xmalloc(sizeof(*newdev)); + newdev->devname = devname; + newdev->disposition = 'I'; + newdev->used = 1; + newdev->next = devlist; + devlist = newdev; + num_devs++; + } + strcpy(chosen_name, mp->path); + if (c->verbose > 0 || mddev == NULL || + strcmp(mddev, chosen_name) != 0) + pr_err("Merging with already-assembled %s\n", + chosen_name); + mdfd = open_dev_excl(mp->devnm); + } else { + int trustworthy = FOREIGN; + name = content->name; + switch (st->ss->match_home(st, c->homehost) + ?: st->ss->match_home(st, "any")) { + case 1: + trustworthy = LOCAL; + name = strchr(content->name, ':'); + if (name) + name++; + else + name = content->name; + break; + } + if (!auto_assem) + /* If the array is listed in mdadm.conf or on + * command line, then we trust the name + * even if the array doesn't look local + */ + trustworthy = LOCAL; + + if (name[0] == 0 && + content->array.level == LEVEL_CONTAINER) { + name = content->text_version; + trustworthy = METADATA; + } + + if (name[0] && trustworthy != LOCAL && + ! 
c->require_homehost && + conf_name_is_free(name)) + trustworthy = LOCAL; + + if (trustworthy == LOCAL && + strchr(name, ':')) + /* Ignore 'host:' prefix of name */ + name = strchr(name, ':')+1; + + mdfd = create_mddev(mddev, name, ident->autof, trustworthy, + chosen_name); + } + if (mdfd < 0) { + st->ss->free_super(st); + if (auto_assem) + goto try_again; + return 1; + } + mddev = chosen_name; + if (get_linux_version() < 2004000 || + md_get_version(mdfd) < 9000) { + pr_err("Assemble requires Linux 2.4 or later, and\n" + " md driver version 0.90.0 or later.\n" + " Upgrade your kernel or try --build\n"); + close(mdfd); + return 1; + } + if (pre_exist == NULL) { + if (mddev_busy(fd2devnm(mdfd))) { + pr_err("%s already active, cannot restart it!\n", + mddev); + for (tmpdev = devlist ; + tmpdev && tmpdev->used != 1; + tmpdev = tmpdev->next) + ; + if (tmpdev && auto_assem) + pr_err("%s needed for %s...\n", + mddev, tmpdev->devname); + close(mdfd); + mdfd = -3; + st->ss->free_super(st); + if (auto_assem) + goto try_again; + return 1; + } + /* just incase it was started but has no content */ + ioctl(mdfd, STOP_ARRAY, NULL); + } + +#ifndef MDASSEMBLE + if (content != &info) { + /* This is a member of a container. Try starting the array. 
*/ + int err; + err = assemble_container_content(st, mdfd, content, c, + chosen_name); + close(mdfd); + return err; + } +#endif + /* Ok, no bad inconsistancy, we can try updating etc */ + devices = xcalloc(num_devs, sizeof(*devices)); + devmap = xcalloc(num_devs, content->array.raid_disks); + devcnt = load_devices(devices, devmap, ident, st, devlist, + c, content, mdfd, mddev, + &most_recent, &bestcnt, &best, inargv); + if (devcnt < 0) + return 1; + + if (devcnt == 0) { + pr_err("no devices found for %s\n", + mddev); + if (st) + st->ss->free_super(st); + close(mdfd); + free(devices); + free(devmap); + return 1; + } + + if (c->update && strcmp(c->update, "byteorder")==0) + st->minor_version = 90; + + st->ss->getinfo_super(st, content, NULL); + clean = content->array.state & 1; + + /* now we have some devices that might be suitable. + * I wonder how many + */ + avail = xcalloc(content->array.raid_disks, 1); + okcnt = 0; + replcnt = 0; + sparecnt=0; + rebuilding_cnt=0; + for (i=0; i< bestcnt; i++) { + int j = best[i]; + int event_margin = 1; /* always allow a difference of '1' + * like the kernel does + */ + if (j < 0) continue; + /* note: we ignore error flags in multipath arrays + * as they don't make sense + */ + if (content->array.level != LEVEL_MULTIPATH) + if (!(devices[j].i.disk.state & (1<force && + content->array.raid_disks > 0 && + devices[most_recent].i.disk.raid_disk >= 0 && + devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) { + if (c->verbose > -1) + pr_err("ignoring %s as it reports %s as failed\n", + devices[j].devname, devices[most_recent].devname); + best[i] = -1; + continue; + } + /* Require event counter to be same as, or just less than, + * most recent. If it is bigger, it must be a stray spare and + * should be ignored. 
+ */ + if (devices[j].i.events+event_margin >= + devices[most_recent].i.events && + devices[j].i.events <= + devices[most_recent].i.events + ) { + devices[j].uptodate = 1; + if (i < content->array.raid_disks * 2) { + if (devices[j].i.recovery_start == MaxSector || + (content->reshape_active && + ((i >= content->array.raid_disks - content->delta_disks) || + (i >= content->array.raid_disks - content->delta_disks - 1 + && content->array.level == 4)))) { + if (!avail[i/2]) { + okcnt++; + avail[i/2]=1; + } else + replcnt++; + } else + rebuilding_cnt++; + } else + sparecnt++; + } + } + free(devmap); + if (c->force) { + int force_ok = force_array(content, devices, best, bestcnt, + avail, most_recent, st, c); + okcnt += force_ok; + if (force_ok) + was_forced = 1; + } + /* Now we want to look at the superblock which the kernel will base things on + * and compare the devices that we think are working with the devices that the + * superblock thinks are working. + * If there are differences and --force is given, then update this chosen + * superblock. 
+ */ + chosen_drive = -1; + st->ss->free_super(st); + for (i=0; chosen_drive < 0 && iss->load_super(st,fd, NULL)) { close(fd); - fprintf(stderr, Name ": RAID superblock has disappeared from %s\n", - devices[j].devname); + pr_err("RAID superblock has disappeared from %s\n", + devices[j].devname); close(mdfd); free(devices); return 1; @@ -1114,23 +1559,25 @@ close(fd); } if (st->sb == NULL) { - fprintf(stderr, Name ": No suitable drives found for %s\n", mddev); + pr_err("No suitable drives found for %s\n", mddev); close(mdfd); free(devices); return 1; } st->ss->getinfo_super(st, content, NULL); #ifndef MDASSEMBLE - sysfs_init(content, mdfd, 0); + sysfs_init(content, mdfd, NULL); #endif for (i=0; iarray.raid_disks) - desired_state = (1<= content->array.raid_disks * 2) desired_state = 0; + else if (i & 1) + desired_state = (1<ss->update_super(st, &devices[j].i, "assemble", NULL, - verbose, 0, NULL)) { - if (force) { - if (verbose >= 0) - fprintf(stderr, Name ": " - "clearing FAULTY flag for device %d in %s for %s\n", - j, mddev, devices[j].devname); + c->verbose, 0, NULL)) { + if (c->force) { + if (c->verbose >= 0) + pr_err("clearing FAULTY flag for device %d in %s for %s\n", + j, mddev, devices[j].devname); change = 1; } else { - if (verbose >= -1) - fprintf(stderr, Name ": " - "device %d in %s has wrong state in superblock, but %s seems ok\n", - i, mddev, devices[j].devname); + if (c->verbose >= -1) + pr_err("device %d in %s has wrong state in superblock, but %s seems ok\n", + i, mddev, devices[j].devname); } } #if 0 if (!(super.disks[i].i.disk.state & (1 << MD_DISK_FAULTY))) { - fprintf(stderr, Name ": devices %d of %s is not marked FAULTY in superblock, but cannot be found\n", - i, mddev); + pr_err("devices %d of %s is not marked FAULTY in superblock, but cannot be found\n", + i, mddev); } #endif } - if (force && !clean && + if (c->force && !clean && !enough(content->array.level, content->array.raid_disks, content->array.layout, clean, avail)) { change += 
st->ss->update_super(st, content, "force-array", - devices[chosen_drive].devname, verbose, + devices[chosen_drive].devname, c->verbose, 0, NULL); + was_forced = 1; clean = 1; } if (change) { int fd; - fd = dev_open(devices[chosen_drive].devname, O_RDWR|O_EXCL); + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? + O_RDWR : (O_RDWR|O_EXCL)); if (fd < 0) { - fprintf(stderr, Name ": Could not open %s for write - cannot Assemble array.\n", - devices[chosen_drive].devname); + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[chosen_drive].devname); close(mdfd); free(devices); return 1; } if (st->ss->store_super(st, fd)) { close(fd); - fprintf(stderr, Name ": Could not re-write superblock on %s\n", - devices[chosen_drive].devname); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); close(mdfd); free(devices); return 1; } - if (verbose >= 0) - fprintf(stderr, Name ": Marking array %s as 'clean'\n", - mddev); + if (c->verbose >= 0) + pr_err("Marking array %s as 'clean'\n", + mddev); close(fd); } @@ -1202,20 +1650,24 @@ * The code of doing this lives in Grow.c */ #ifndef MDASSEMBLE - if (content->reshape_active) { + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP)) { int err = 0; - int *fdlist = malloc(sizeof(int)* bestcnt); - if (verbose > 0) - fprintf(stderr, Name ":%s has an active reshape - checking " - "if critical section needs to be restored\n", - chosen_name); - for (i=0; iverbose > 0) + pr_err(":%s has an active reshape - checking " + "if critical section needs to be restored\n", + chosen_name); + enable_fds(bestcnt/2); + for (i = 0; i < bestcnt/2; i++) { + int j = best[i*2]; if (j >= 0) { - fdlist[i] = dev_open(devices[j].devname, O_RDWR|O_EXCL); + fdlist[i] = dev_open(devices[j].devname, + devices[j].included + ? 
O_RDWR : (O_RDWR|O_EXCL)); if (fdlist[i] < 0) { - fprintf(stderr, Name ": Could not open %s for write - cannot Assemble array.\n", - devices[j].devname); + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[j].devname); err = 1; break; } @@ -1226,12 +1678,12 @@ if (st->ss->external && st->ss->recover_backup) err = st->ss->recover_backup(st, content); else - err = Grow_restart(st, content, fdlist, bestcnt, - backup_file, verbose > 0); - if (err && invalid_backup) { - if (verbose > 0) - fprintf(stderr, Name ": continuing" - " without restoring backup\n"); + err = Grow_restart(st, content, fdlist, bestcnt/2, + c->backup_file, c->verbose > 0); + if (err && c->invalid_backup) { + if (c->verbose > 0) + pr_err("continuing" + " without restoring backup\n"); err = 0; } } @@ -1239,320 +1691,83 @@ i--; if (fdlist[i]>=0) close(fdlist[i]); } + free(fdlist); if (err) { - fprintf(stderr, Name ": Failed to restore critical section for reshape, sorry.\n"); - if (backup_file == NULL) - fprintf(stderr," Possibly you needed to specify the --backup-file\n"); + pr_err("Failed to restore critical section for reshape, sorry.\n"); + if (c->backup_file == NULL) + cont_err("Possibly you needed to specify the --backup-file\n"); close(mdfd); free(devices); return err; } } #endif - /* count number of in-sync devices according to the superblock. - * We must have this number to start the array without -s or -R - */ - req_cnt = content->array.working_disks; /* Almost ready to actually *do* something */ - if (!old_linux) { - int rv; - - /* First, fill in the map, so that udev can find our name - * as soon as we become active. 
- */ - map_update(NULL, fd2devnum(mdfd), content->text_version, - content->uuid, chosen_name); - - rv = set_array_info(mdfd, st, content); - if (rv) { - fprintf(stderr, Name ": failed to set array info for %s: %s\n", - mddev, strerror(errno)); - ioctl(mdfd, STOP_ARRAY, NULL); - close(mdfd); - free(devices); - return 1; - } - if (ident->bitmap_fd >= 0) { - if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) { - fprintf(stderr, Name ": SET_BITMAP_FILE failed.\n"); - ioctl(mdfd, STOP_ARRAY, NULL); - close(mdfd); - free(devices); - return 1; - } - } else if (ident->bitmap_file) { - /* From config file */ - int bmfd = open(ident->bitmap_file, O_RDWR); - if (bmfd < 0) { - fprintf(stderr, Name ": Could not open bitmap file %s\n", - ident->bitmap_file); - ioctl(mdfd, STOP_ARRAY, NULL); - close(mdfd); - free(devices); - return 1; - } - if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { - fprintf(stderr, Name ": Failed to set bitmapfile for %s\n", mddev); - close(bmfd); - ioctl(mdfd, STOP_ARRAY, NULL); - close(mdfd); - free(devices); - return 1; - } - close(bmfd); - } - - /* First, add the raid disks, but add the chosen one last */ - for (i=0; i<= bestcnt; i++) { - int j; - if (i < bestcnt) { - j = best[i]; - if (j == chosen_drive) - continue; - } else - j = chosen_drive; - - if (j >= 0 /* && devices[j].uptodate */) { - int dfd = dev_open(devices[j].devname, - O_RDWR|O_EXCL); - if (dfd >= 0) { - remove_partitions(dfd); - close(dfd); - } - rv = add_disk(mdfd, st, content, &devices[j].i); + /* First, fill in the map, so that udev can find our name + * as soon as we become active. 
+ */ + if (c->update && strcmp(c->update, "metadata")==0) { + content->array.major_version = 1; + content->array.minor_version = 0; + strcpy(content->text_version, "1.0"); + } - if (rv) { - fprintf(stderr, Name ": failed to add " - "%s to %s: %s\n", - devices[j].devname, - mddev, - strerror(errno)); - if (i < content->array.raid_disks - || i == bestcnt) - okcnt--; - else - sparecnt--; - } else if (verbose > 0) - fprintf(stderr, Name ": added %s " - "to %s as %d%s\n", - devices[j].devname, mddev, - devices[j].i.disk.raid_disk, - devices[j].uptodate?"": - " (possibly out of date)"); - } else if (verbose > 0 && i < content->array.raid_disks) - fprintf(stderr, Name ": no uptodate device for " - "slot %d of %s\n", - i, mddev); - } - - if (content->array.level == LEVEL_CONTAINER) { - if (verbose >= 0) { - fprintf(stderr, Name ": Container %s has been " - "assembled with %d drive%s", - mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s"); - if (okcnt < (unsigned)content->array.raid_disks) - fprintf(stderr, " (out of %d)", - content->array.raid_disks); - fprintf(stderr, "\n"); - } - st->ss->free_super(st); - sysfs_uevent(content, "change"); - wait_for(chosen_name, mdfd); - close(mdfd); - free(devices); - return 0; - } + map_update(&map, fd2devnm(mdfd), content->text_version, + content->uuid, chosen_name); - if (runstop == 1 || - (runstop <= 0 && - ( enough(content->array.level, content->array.raid_disks, - content->array.layout, clean, avail) && - (okcnt + rebuilding_cnt >= req_cnt || start_partial_ok) - ))) { - /* This array is good-to-go. - * If a reshape is in progress then we might need to - * continue monitoring it. In that case we start - * it read-only and let the grow code make it writable. 
+ rv = start_array(mdfd, mddev, content, + st, ident, best, bestcnt, + chosen_drive, devices, okcnt, sparecnt, + rebuilding_cnt, + c, + clean, avail, start_partial_ok, + pre_exist != NULL, + was_forced); + if (rv == 1 && !pre_exist) + ioctl(mdfd, STOP_ARRAY, NULL); + free(devices); + map_unlock(&map); + if (rv == 0) { + wait_for(chosen_name, mdfd); + close(mdfd); + if (auto_assem) { + int usecs = 1; + /* There is a nasty race with 'mdadm --monitor'. + * If it opens this device before we close it, + * it gets an incomplete open on which IO + * doesn't work and the capacity is + * wrong. + * If we reopen (to check for layered devices) + * before --monitor closes, we loose. + * + * So: wait upto 1 second for there to be + * a non-zero capacity. */ - int rv; -#ifndef MDASSEMBLE - if (content->reshape_active && - content->delta_disks <= 0) { - rv = sysfs_set_str(content, NULL, - "array_state", "readonly"); - if (rv == 0) - rv = Grow_continue(mdfd, st, content, - backup_file, - freeze_reshape); - } else -#endif - rv = ioctl(mdfd, RUN_ARRAY, NULL); - if (rv == 0) { - if (verbose >= 0) { - fprintf(stderr, Name ": %s has been started with %d drive%s", - mddev, okcnt, okcnt==1?"":"s"); - if (okcnt < (unsigned)content->array.raid_disks) - fprintf(stderr, " (out of %d)", content->array.raid_disks); - if (rebuilding_cnt) - fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt); - if (sparecnt) - fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); - fprintf(stderr, ".\n"); - } - if (content->reshape_active && - content->array.level >= 4 && - content->array.level <= 6) { - /* might need to increase the size - * of the stripe cache - default is 256 - */ - if (256 < 4 * (content->array.chunk_size/4096)) { - struct mdinfo *sra = sysfs_read(mdfd, 0, 0); - if (sra) - sysfs_set_num(sra, NULL, - "stripe_cache_size", - (4 * content->array.chunk_size / 4096) + 1); - sysfs_free(sra); - } + while (usecs < 1000) { + mdfd = open(mddev, O_RDONLY); + if (mdfd >= 
0) { + unsigned long long size; + if (get_dev_size(mdfd, NULL, &size) && + size > 0) + break; + close(mdfd); } - if (okcnt < (unsigned)content->array.raid_disks) { - /* If any devices did not get added - * because the kernel rejected them based - * on event count, try adding them - * again providing the action policy is - * 're-add' or greater. The bitmap - * might allow them to be included, or - * they will become spares. - */ - for (i = 0; i < bestcnt; i++) { - int j = best[i]; - if (j >= 0 && !devices[j].uptodate) { - if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add)) - continue; - rv = add_disk(mdfd, st, content, - &devices[j].i); - if (rv == 0 && verbose >= 0) - fprintf(stderr, - Name ": %s has been re-added.\n", - devices[j].devname); - } - } - } - wait_for(mddev, mdfd); - close(mdfd); - if (auto_assem) { - int usecs = 1; - /* There is a nasty race with 'mdadm --monitor'. - * If it opens this device before we close it, - * it gets an incomplete open on which IO - * doesn't work and the capacity is - * wrong. - * If we reopen (to check for layered devices) - * before --monitor closes, we loose. - * - * So: wait upto 1 second for there to be - * a non-zero capacity. 
- */ - while (usecs < 1000) { - mdfd = open(mddev, O_RDONLY); - if (mdfd >= 0) { - unsigned long long size; - if (get_dev_size(mdfd, NULL, &size) && - size > 0) - break; - close(mdfd); - } - usleep(usecs); - usecs <<= 1; - } - } - free(devices); - return 0; - } - fprintf(stderr, Name ": failed to RUN_ARRAY %s: %s\n", - mddev, strerror(errno)); - - if (!enough(content->array.level, content->array.raid_disks, - content->array.layout, 1, avail)) - fprintf(stderr, Name ": Not enough devices to " - "start the array.\n"); - else if (!enough(content->array.level, - content->array.raid_disks, - content->array.layout, clean, - avail)) - fprintf(stderr, Name ": Not enough devices to " - "start the array while not clean " - "- consider --force.\n"); - - if (auto_assem) - ioctl(mdfd, STOP_ARRAY, NULL); - close(mdfd); - free(devices); - return 1; - } - if (runstop == -1) { - fprintf(stderr, Name ": %s assembled from %d drive%s", - mddev, okcnt, okcnt==1?"":"s"); - if (okcnt != (unsigned)content->array.raid_disks) - fprintf(stderr, " (out of %d)", content->array.raid_disks); - fprintf(stderr, ", but not started.\n"); - close(mdfd); - free(devices); - return 0; - } - if (verbose >= -1) { - fprintf(stderr, Name ": %s assembled from %d drive%s", mddev, okcnt, okcnt==1?"":"s"); - if (rebuilding_cnt) - fprintf(stderr, "%s %d rebuilding", sparecnt?", ":" and ", rebuilding_cnt); - if (sparecnt) - fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); - if (!enough(content->array.level, content->array.raid_disks, - content->array.layout, 1, avail)) - fprintf(stderr, " - not enough to start the array.\n"); - else if (!enough(content->array.level, - content->array.raid_disks, - content->array.layout, clean, - avail)) - fprintf(stderr, " - not enough to start the " - "array while not clean - consider " - "--force.\n"); - else { - if (req_cnt == (unsigned)content->array.raid_disks) - fprintf(stderr, " - need all %d to start it", req_cnt); - else - fprintf(stderr, " - need %d of 
%d to start", req_cnt, content->array.raid_disks); - fprintf(stderr, " (use --run to insist).\n"); + usleep(usecs); + usecs <<= 1; } } - if (auto_assem) - ioctl(mdfd, STOP_ARRAY, NULL); + } else close(mdfd); - free(devices); - return 1; - } else { - /* The "chosen_drive" is a good choice, and if necessary, the superblock has - * been updated to point to the current locations of devices. - * so we can just start the array - */ - unsigned long dev; - dev = makedev(devices[chosen_drive].i.disk.major, - devices[chosen_drive].i.disk.minor); - if (ioctl(mdfd, START_ARRAY, dev)) { - fprintf(stderr, Name ": Cannot start array: %s\n", - strerror(errno)); - } - } - close(mdfd); - free(devices); - return 0; + /* '2' means 'OK, but not started yet' */ + return rv == 2 ? 0 : rv; } #ifndef MDASSEMBLE int assemble_container_content(struct supertype *st, int mdfd, - struct mdinfo *content, int runstop, - char *chosen_name, int verbose, - char *backup_file, int freeze_reshape) + struct mdinfo *content, struct context *c, + char *chosen_name) { struct mdinfo *dev, *sra; int working = 0, preexist = 0; @@ -1561,22 +1776,28 @@ int old_raid_disks; int start_reshape; - sysfs_init(content, mdfd, 0); + sysfs_init(content, mdfd, NULL); - sra = sysfs_read(mdfd, 0, GET_VERSION); - if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) + sra = sysfs_read(mdfd, NULL, GET_VERSION); + if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) { + if (content->array.major_version == -1 && + content->array.minor_version == -2 && + c->readonly && + content->text_version[0] == '/') + content->text_version[0] = '-'; if (sysfs_set_array(content, md_get_version(mdfd)) != 0) { if (sra) sysfs_free(sra); return 1; } + } /* There are two types of reshape: container wide or sub-array specific * Check if metadata requests blocking container wide reshapes */ start_reshape = (content->reshape_active && - !((content->reshape_active == CONTAINER_RESHAPE) && - 
(content->array.state & (1<reshape_active == CONTAINER_RESHAPE) && + (content->array.state & (1<text_version, content->uuid, chosen_name); - if (runstop > 0 || - (working + preexist + expansion) >= - content->array.working_disks) { + if (c->runstop > 0 || + (working + preexist + expansion) >= + content->array.working_disks) { int err; if (start_reshape) { int spare = content->array.raid_disks + expansion; if (restore_backup(st, content, working, - spare, backup_file, verbose) == 1) + spare, c->backup_file, c->verbose) == 1) return 1; err = sysfs_set_str(content, NULL, @@ -1621,34 +1842,34 @@ return 1; if (st->ss->external) { - if (!mdmon_running(st->container_dev)) - start_mdmon(st->container_dev); - ping_monitor_by_id(st->container_dev); - if (mdmon_running(st->container_dev) && - st->update_tail == NULL) + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + if (mdmon_running(st->container_devnm) && + st->update_tail == NULL) st->update_tail = &st->updates; } - err = Grow_continue(mdfd, st, content, backup_file, - freeze_reshape); + err = Grow_continue(mdfd, st, content, c->backup_file, + c->freeze_reshape); } else switch(content->array.level) { - case LEVEL_LINEAR: - case LEVEL_MULTIPATH: - case 0: - err = sysfs_set_str(content, NULL, "array_state", - "active"); - break; - default: - err = sysfs_set_str(content, NULL, "array_state", - "readonly"); - /* start mdmon if needed. */ - if (!err) { - if (!mdmon_running(st->container_dev)) - start_mdmon(st->container_dev); - ping_monitor_by_id(st->container_dev); + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(content, NULL, "array_state", + c->readonly ? "readonly" : "active"); + break; + default: + err = sysfs_set_str(content, NULL, "array_state", + "readonly"); + /* start mdmon if needed. 
*/ + if (!err) { + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + } + break; } - break; - } if (!err) sysfs_set_safemode(content, content->safe_mode_delay); @@ -1660,17 +1881,15 @@ !start_reshape) block_subarray(content); - if (verbose >= 0) { + if (c->verbose >= 0) { if (err) - fprintf(stderr, Name - ": array %s now has %d device%s", - chosen_name, working + preexist, - working + preexist == 1 ? "":"s"); + pr_err("array %s now has %d device%s", + chosen_name, working + preexist, + working + preexist == 1 ? "":"s"); else - fprintf(stderr, Name - ": Started %s with %d device%s", - chosen_name, working + preexist, - working + preexist == 1 ? "":"s"); + pr_err("Started %s with %d device%s", + chosen_name, working + preexist, + working + preexist == 1 ? "":"s"); if (preexist) fprintf(stderr, " (%d new)", working); if (expansion) @@ -1683,11 +1902,10 @@ return err; /* FIXME should have an O_EXCL and wait for read-auto */ } else { - if (verbose >= 0) { - fprintf(stderr, Name - ": %s assembled with %d device%s", - chosen_name, preexist + working, - preexist + working == 1 ? "":"s"); + if (c->verbose >= 0) { + pr_err("%s assembled with %d device%s", + chosen_name, preexist + working, + preexist + working == 1 ? 
"":"s"); if (preexist) fprintf(stderr, " (%d new)", working); fprintf(stderr, " but not started\n"); @@ -1696,4 +1914,3 @@ } } #endif - diff -Nru mdadm-3.2.5/bitmap.c mdadm-3.3/bitmap.c --- mdadm-3.2.5/bitmap.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/bitmap.c 2013-09-03 04:47:47.000000000 +0000 @@ -97,7 +97,7 @@ { int i, num = 0; - for (i=0; i < num_bits / 8; i++) + for (i = 0; i < num_bits / 8; i++) num += count_dirty_bits_byte(buf[i], 8); if (num_bits % 8) /* not an even byte boundary */ @@ -121,7 +121,6 @@ return (bits + bits_per_sector - 1) / bits_per_sector; } - bitmap_info_t *bitmap_fd_read(int fd, int brief) { /* Note: fd might be open O_DIRECT, so we must be @@ -133,27 +132,16 @@ unsigned int n, skip; if (posix_memalign(&buf, 4096, 8192) != 0) { - fprintf(stderr, Name ": failed to allocate 8192 bytes\n"); + pr_err("failed to allocate 8192 bytes\n"); return NULL; } n = read(fd, buf, 8192); - info = malloc(sizeof(*info)); - if (info == NULL) { -#if __GNUC__ < 3 - fprintf(stderr, Name ": failed to allocate %d bytes\n", - (int)sizeof(*info)); -#else - fprintf(stderr, Name ": failed to allocate %zd bytes\n", - sizeof(*info)); -#endif - free(buf); - return NULL; - } + info = xmalloc(sizeof(*info)); if (n < sizeof(info->sb)) { - fprintf(stderr, Name ": failed to read superblock of bitmap " - "file: %s\n", strerror(errno)); + pr_err("failed to read superblock of bitmap " + "file: %s\n", strerror(errno)); free(info); free(buf); return NULL; @@ -194,7 +182,7 @@ } if (read_bits < total_bits) { /* file truncated... 
*/ - fprintf(stderr, Name ": WARNING: bitmap file is not large " + pr_err("WARNING: bitmap file is not large " "enough for array size %llu!\n\n", (unsigned long long)info->sb.sync_size); total_bits = read_bits; @@ -214,14 +202,14 @@ struct supertype *st = *stp; if (stat(filename, &stb) < 0) { - fprintf(stderr, Name ": failed to find file %s: %s\n", + pr_err("failed to find file %s: %s\n", filename, strerror(errno)); return NULL; } if ((S_IFMT & stb.st_mode) == S_IFBLK) { - fd = open(filename, O_RDONLY); + fd = open(filename, O_RDONLY|O_DIRECT); if (fd < 0) { - fprintf(stderr, Name ": failed to open bitmap file %s: %s\n", + pr_err("failed to open bitmap file %s: %s\n", filename, strerror(errno)); return NULL; } @@ -231,18 +219,17 @@ /* just look at device... */ lseek(fd, 0, 0); } else if (!st->ss->locate_bitmap) { - fprintf(stderr, Name ": No bitmap possible with %s metadata\n", + pr_err("No bitmap possible with %s metadata\n", st->ss->name); return NULL; } else st->ss->locate_bitmap(st, fd); - ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */ *stp = st; } else { fd = open(filename, O_RDONLY|O_DIRECT); if (fd < 0) { - fprintf(stderr, Name ": failed to open bitmap file %s: %s\n", + pr_err("failed to open bitmap file %s: %s\n", filename, strerror(errno)); return NULL; } @@ -286,12 +273,12 @@ printf(" Filename : %s\n", filename); printf(" Magic : %08x\n", sb->magic); if (sb->magic != BITMAP_MAGIC) { - fprintf(stderr, Name ": invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic); + pr_err("invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic); } printf(" Version : %d\n", sb->version); if (sb->version < BITMAP_MAJOR_LO || sb->version > BITMAP_MAJOR_HI) { - fprintf(stderr, Name ": unknown bitmap version %d, either the bitmap file is corrupted or you need to upgrade your tools\n", sb->version); + pr_err("unknown bitmap version %d, either the bitmap file is corrupted or you need to upgrade your tools\n", 
sb->version); goto free_info; } @@ -357,13 +344,13 @@ long long bytes, filesize; if (!force && access(filename, F_OK) == 0) { - fprintf(stderr, Name ": bitmap file %s already exists, use --force to overwrite\n", filename); + pr_err("bitmap file %s already exists, use --force to overwrite\n", filename); return rv; } fp = fopen(filename, "w"); if (fp == NULL) { - fprintf(stderr, Name ": failed to open bitmap file %s: %s\n", + pr_err("failed to open bitmap file %s: %s\n", filename, strerror(errno)); return rv; } @@ -393,7 +380,7 @@ sb_cpu_to_le(&sb); /* convert to on-disk byte ordering */ if (fwrite(&sb, sizeof(sb), 1, fp) != 1) { - fprintf(stderr, Name ": failed to write superblock to bitmap file %s: %s\n", filename, strerror(errno)); + pr_err("failed to write superblock to bitmap file %s: %s\n", filename, strerror(errno)); goto out; } @@ -410,7 +397,7 @@ while (bytes > 0) { if (fwrite(block, sizeof(block), 1, fp) != 1) { - fprintf(stderr, Name ": failed to write bitmap file %s: %s\n", filename, strerror(errno)); + pr_err("failed to write bitmap file %s: %s\n", filename, strerror(errno)); goto out; } bytes -= sizeof(block); diff -Nru mdadm-3.2.5/Build.c mdadm-3.3/Build.c --- mdadm-3.2.5/Build.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Build.c 2013-09-03 04:47:47.000000000 +0000 @@ -24,14 +24,12 @@ #include "mdadm.h" -#define REGISTER_DEV _IO (MD_MAJOR, 1) -#define START_MD _IO (MD_MAJOR, 2) -#define STOP_MD _IO (MD_MAJOR, 3) - -int Build(char *mddev, int chunk, int level, int layout, - int raiddisks, struct mddev_dev *devlist, int assume_clean, - char *bitmap_file, int bitmap_chunk, int write_behind, - int delay, int verbose, int autof, unsigned long long size) +#define REGISTER_DEV _IO (MD_MAJOR, 1) +#define START_MD _IO (MD_MAJOR, 2) +#define STOP_MD _IO (MD_MAJOR, 3) + +int Build(char *mddev, struct mddev_dev *devlist, + struct shape *s, struct context *c) { /* Build a linear or raid0 arrays without superblocks * We cannot really do any checks, we just do 
it. @@ -66,53 +64,50 @@ continue; } if (stat(dv->devname, &stb)) { - fprintf(stderr, Name ": Cannot find %s: %s\n", + pr_err("Cannot find %s: %s\n", dv->devname, strerror(errno)); return 1; } if ((stb.st_mode & S_IFMT) != S_IFBLK) { - fprintf(stderr, Name ": %s is not a block device.\n", + pr_err("%s is not a block device.\n", dv->devname); return 1; } } - if (raiddisks != subdevs) { - fprintf(stderr, Name ": requested %d devices in array but listed %d\n", - raiddisks, subdevs); + if (s->raiddisks != subdevs) { + pr_err("requested %d devices in array but listed %d\n", + s->raiddisks, subdevs); return 1; } - if (layout == UnSet) - switch(level) { + if (s->layout == UnSet) + switch(s->level) { default: /* no layout */ - layout = 0; + s->layout = 0; break; case 10: - layout = 0x102; /* near=2, far=1 */ - if (verbose > 0) - fprintf(stderr, - Name ": layout defaults to n1\n"); + s->layout = 0x102; /* near=2, far=1 */ + if (c->verbose > 0) + pr_err("layout defaults to n1\n"); break; case 5: case 6: - layout = map_name(r5layout, "default"); - if (verbose > 0) - fprintf(stderr, - Name ": layout defaults to %s\n", map_num(r5layout, layout)); + s->layout = map_name(r5layout, "default"); + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(r5layout, s->layout)); break; case LEVEL_FAULTY: - layout = map_name(faultylayout, "default"); + s->layout = map_name(faultylayout, "default"); - if (verbose > 0) - fprintf(stderr, - Name ": layout defaults to %s\n", map_num(faultylayout, layout)); + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(faultylayout, s->layout)); break; } /* We need to create the device. It can have no name. 
*/ map_lock(&map); - mdfd = create_mddev(mddev, NULL, autof, LOCAL, + mdfd = create_mddev(mddev, NULL, c->autof, LOCAL, chosen_name); if (mdfd < 0) { map_unlock(&map); @@ -120,7 +115,7 @@ } mddev = chosen_name; - map_update(&map, fd2devnum(mdfd), "none", uuid, chosen_name); + map_update(&map, fd2devnm(mdfd), "none", uuid, chosen_name); map_unlock(&map); vers = md_get_version(mdfd); @@ -128,38 +123,42 @@ /* looks Ok, go for it */ if (vers >= 9000) { mdu_array_info_t array; - array.level = level; - array.size = size; - array.nr_disks = raiddisks; - array.raid_disks = raiddisks; + array.level = s->level; + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + array.nr_disks = s->raiddisks; + array.raid_disks = s->raiddisks; array.md_minor = 0; if (fstat(mdfd, &stb)==0) array.md_minor = minor(stb.st_rdev); array.not_persistent = 1; array.state = 0; /* not clean, but no errors */ - if (assume_clean) + if (s->assume_clean) array.state |= 1; - array.active_disks = raiddisks - missing_disks; - array.working_disks = raiddisks - missing_disks; + array.active_disks = s->raiddisks - missing_disks; + array.working_disks = s->raiddisks - missing_disks; array.spare_disks = 0; array.failed_disks = missing_disks; - if (chunk == 0 && (level==0 || level==LEVEL_LINEAR)) - chunk = 64; - array.chunk_size = chunk*1024; - array.layout = layout; + if (s->chunk == 0 && (s->level==0 || s->level==LEVEL_LINEAR)) + s->chunk = 64; + array.chunk_size = s->chunk*1024; + array.layout = s->layout; if (ioctl(mdfd, SET_ARRAY_INFO, &array)) { - fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n", + pr_err("SET_ARRAY_INFO failed for %s: %s\n", mddev, strerror(errno)); goto abort; } - } else if (bitmap_file) { - fprintf(stderr, Name ": bitmaps not supported with this kernel\n"); + } else if (s->bitmap_file) { + pr_err("bitmaps not supported with this kernel\n"); goto abort; } - if (bitmap_file && level <= 0) { - fprintf(stderr, Name ": bitmaps not meaningful with level %s\n", - 
map_num(pers, level)?:"given"); + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); goto abort; } /* now add the devices */ @@ -169,24 +168,24 @@ if (strcmp("missing", dv->devname) == 0) continue; if (stat(dv->devname, &stb)) { - fprintf(stderr, Name ": Weird: %s has disappeared.\n", + pr_err("Weird: %s has disappeared.\n", dv->devname); goto abort; } if ((stb.st_mode & S_IFMT)!= S_IFBLK) { - fprintf(stderr, Name ": Wierd: %s is no longer a block device.\n", + pr_err("Weird: %s is no longer a block device.\n", dv->devname); goto abort; } fd = open(dv->devname, O_RDONLY|O_EXCL); if (fd < 0) { - fprintf(stderr, Name ": Cannot open %s: %s\n", + pr_err("Cannot open %s: %s\n", dv->devname, strerror(errno)); goto abort; } if (get_dev_size(fd, NULL, &dsize) && - (size == 0 || dsize < size)) - size = dsize; + (s->size == 0 || s->size == MAX_SIZE || dsize < s->size)) + s->size = dsize; close(fd); if (vers >= 9000) { mdu_disk_info_t disk; @@ -198,13 +197,13 @@ disk.major = major(stb.st_rdev); disk.minor = minor(stb.st_rdev); if (ioctl(mdfd, ADD_NEW_DISK, &disk)) { - fprintf(stderr, Name ": ADD_NEW_DISK failed for %s: %s\n", + pr_err("ADD_NEW_DISK failed for %s: %s\n", dv->devname, strerror(errno)); goto abort; } } else { if (ioctl(mdfd, REGISTER_DEV, &stb.st_rdev)) { - fprintf(stderr, Name ": REGISTER_DEV failed for %s: %s.\n", + pr_err("REGISTER_DEV failed for %s: %s.\n", dv->devname, strerror(errno)); goto abort; } @@ -213,71 +212,72 @@ /* now to start it */ if (vers >= 9000) { mdu_param_t param; /* not used by syscall */ - if (bitmap_file) { - bitmap_fd = open(bitmap_file, O_RDWR); + if (s->bitmap_file) { + bitmap_fd = open(s->bitmap_file, O_RDWR); if (bitmap_fd < 0) { int major = BITMAP_MAJOR_HI; #if 0 - if (bitmap_chunk == UnSet) { - fprintf(stderr, Name ": %s cannot be openned.", - bitmap_file); + if 
(s->bitmap_chunk == UnSet) { + pr_err("%s cannot be openned.", + s->bitmap_file); goto abort; } #endif if (vers < 9003) { major = BITMAP_MAJOR_HOSTENDIAN; #ifdef __BIG_ENDIAN - fprintf(stderr, Name ": Warning - bitmaps created on this kernel are not portable\n" + pr_err("Warning - bitmaps created on this kernel are not portable\n" " between different architectures. Consider upgrading the Linux kernel.\n"); #endif } - bitmapsize = size>>9; /* FIXME wrong for RAID10 */ - if (CreateBitmap(bitmap_file, 1, NULL, bitmap_chunk, - delay, write_behind, bitmapsize, major)) { + bitmapsize = s->size>>9; /* FIXME wrong for RAID10 */ + if (CreateBitmap(s->bitmap_file, 1, NULL, s->bitmap_chunk, + c->delay, s->write_behind, bitmapsize, major)) { goto abort; } - bitmap_fd = open(bitmap_file, O_RDWR); + bitmap_fd = open(s->bitmap_file, O_RDWR); if (bitmap_fd < 0) { - fprintf(stderr, Name ": %s cannot be openned.", - bitmap_file); + pr_err("%s cannot be openned.", + s->bitmap_file); goto abort; } } if (bitmap_fd >= 0) { if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { - fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n", + pr_err("Cannot set bitmap file for %s: %s\n", mddev, strerror(errno)); goto abort; } } } if (ioctl(mdfd, RUN_ARRAY, &param)) { - fprintf(stderr, Name ": RUN_ARRAY failed: %s\n", + pr_err("RUN_ARRAY failed: %s\n", strerror(errno)); - if (chunk & (chunk-1)) { - fprintf(stderr, " : Problem may be that chunk size" - " is not a power of 2\n"); + if (s->chunk & (s->chunk-1)) { + cont_err("Problem may be that chunk size" + " is not a power of 2\n"); } goto abort; } } else { unsigned long arg; arg=0; - while (chunk > 4096) { + while (s->chunk > 4096) { arg++; - chunk >>= 1; + s->chunk >>= 1; } - if (level == 0) - chunk |= 0x20000; - else chunk |= 0x10000; + if (s->level == 0) + arg |= 0x20000; + else + arg |= 0x10000; if (ioctl(mdfd, START_MD, arg)) { - fprintf(stderr, Name ": START_MD failed: %s\n", + pr_err("START_MD failed: %s\n", strerror(errno)); goto abort; } 
} - if (verbose >= 0) - fprintf(stderr, Name ": array %s built and started.\n", + if (c->verbose >= 0) + pr_err("array %s built and started.\n", mddev); wait_for(mddev, mdfd); close(mdfd); diff -Nru mdadm-3.2.5/ChangeLog mdadm-3.3/ChangeLog --- mdadm-3.2.5/ChangeLog 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/ChangeLog 2013-09-03 04:47:47.000000000 +0000 @@ -1,6 +1,197 @@ Please see git logs for detailed change log. This file just contains highlight. +Changes Prior to release 3.3 +- Some array reshapes can proceed without needing backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used. +- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. 
+- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to backup and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE name=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +Changes Prior to release 3.2.6 + - There are no real stand-out fixes, just lots of little bits and pieces. + +Changes Prior to release 3.2.5 + - This release primarily fixes a serious regression in 3.2.4. + This regression does *not* cause any risk to data. It simply + means that adding a device with "--add" would sometime fail + when it should not. + + - The fix also includes a couple of minor fixes such as making + the "--layout=preserve" option to "--grow" work again. + + +Changes Prior to release 3.2.4 +"--oneline" log of changes is below. Some notable ones are: + + - --offroot argument to improve interactions between mdmon and initrd + - --prefer argument to select which /dev names to display in some + circumstances. + - relax restructions on when "--add" will be allowed + - Fix bug with adding write-intent-bitmap to active array + - Now defaults to "/run/mdadm" for storing run-time files. + +Changes Prior to release 3.2.3 + - The largest single area of change is support for reshape of Intel + IMSM arrays (OnLine Capacity Explansion and Level Migration). + - Among other fixes, this now has a better chance of surviving if a + device fails during reshape. 
+ +Changes Prior to release 3.2.2 + - reshaping IMSM (Intel metadata) arrays is no longer 'experimental', + it should work properly and be largely compatible with IMSM drivers in + other platforms. + - --assume-clean can be used with --grow --size to avoid resyncing the + new part of the array. This is only support with very new kernels. + - RAID0 arrays can have chunksize which is not a power of 2. This has been + supported in the kernel for a while but is only now supprted by + mdadm. + + - A new tool 'raid6check' is available which can check a RAID6 array, + or part of it, and report which device is most inconsistent with the + others if any stripe is inconsistent. This is still under development + and does not have a man page yet. If anyone tries it out and has any + questions or experience to report, they would be most welcome on + linux-raid@vger.kernel.org. + +Changes Prior to release 3.2.1 + - policy framework + Policy can be expressed for moving spare devices between arrays, and + for how to handle hot-plugged devices. This policy can be different + for devices plugged in to different controllers etc. + This, for example, allows a configuration where when a device is plugged + in it is immediately included in an md array as a hot spare and + possibly starts recovery immediately if an array is degraded. + + - some understanding of mbr and gpt paritition tables + This is primarly to support the new hot-plug support. If a + device is plugged in and policy suggests it should have a partition table, + the partition table will be copied from a suitably similar device, and + then the partitions will hot-plug and can then be added to md arrays. + + - "--incremental --remove" can remember where a device was removed from + so if a device gets plugged back in the same place, special policy applies + to it, allowing it to be included in an array even if a general hotplug + will not be included. 
+ + - enhanced reshape options, including growing a RAID0 by converting to RAID4, + restriping, and converting back. Also convertions between RAID0 and + RAID10 and between RAID1 and RAID10 are possible (with a suitably recent + kernel). + + - spare migration for IMSM arrays. + Spare migration can now work across 'containers' using non-native metadata + and specifically Intel's IMSM arrays support spare migrations. + + - OLCE and level migration for Intel IMSM arrays. + OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is + supported for Intel Matrix Storage Manager arrays. + This support is currently 'experimental' for technical reasons. It can + be enabled with "export MDADM_EXPERIMENTAL=1" + + - avoid including wayward devices + If you split a RAID1, mount the two halves as two separate degraded RAID1s, + and then later bring the two back together, it is possible that the md + metadata won't properly show that one must over-ride the other. + mdadm now does extra checking to detect this possibilty and avoid + potentially corrupting data. + + - remove any possible confusion between similar options. + e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't + notice if one was used where the other was expected. + + - allow K,M,G suffixes on chunk sizes + +Changes Prior to release 3.2 + - By far the most significant change in this release related to the + management of reshaping arrays. This code has been substantially + re-written so that it can work with 'externally managed metadata' - + Intel's IMSM in particular. We now support level migration and + OnLine Capacity Expansion on these arrays. + - Policy framework. + Various policy statements can be made in the mdadm.conf to guide + the behaviour of mdadm, particular with regards to how new devices + are treated by "mdadm -I". 
+ Depending on the 'action' associated with a device (identified by + its 'path') such need devices can be automatically re-added to and + existing array that they previously fell out off, or automatically + added as a spare if they appear to contain no data. + + - mdadm now has a limited understanding of partition tables. This + allows the policy framework to make decisions about partitioned + devices as well. + + - --incremental --remove can be told what --path the device was on, + and this info will be recorded so that another device appearing at + the same physical location can be preferentially added to the same + array (provides the spare-same-slot action policy applied to the + path). + + - A new flags "--invalid-backup" flag is available in --assemble + mode. This can be used to re-assemble an array which was stopping + in the middle of a reshape, and for which the 'backup file' is no + longer available or is corrupted. The array may have some + corruption in it at the point where reshape was up to, but at least + the rest of the array will become available. + + + - Various internal restructuring - more is needed. + +Changes Prior to release 3.1.5 + - Fixes for v1.x metadata on big-endian machines. + - man page improvements + - Improve '--detail --export' when run on partitions of an md array. + - Fix regression with removing 'failed' or 'detached' devices. + - Fixes for "--assemble --force" in various unusual cases. + - Allow '-Y' to mean --export. This was documented but not implemented. + - Various fixed for handling 'ddf' metadata. This is now more reliable + but could benefit from more interoperability testing. + - Correctly list subarrays of a container in "--detail" output. + - Improve checks on whether the requested number of devices is supported + by the metadata - both for --create and --grow. + - Don't remove partitions from a device that is being included in an + array until we are fully committed to including it. 
+ - Allow "--assemble --update=no-bitmap" so an array with a corrupt + bitmap can still be assembled. + - Don't allow --add to succeed if it looks like a "--re-add" is probably + wanted, but cannot succeed. This avoids inadvertently turning + devices into spares when an array is failed. + Changes Prior to release 3.1.4 Two fixes related to configs that aren't using udev: - Don't remove md devices which 'standard' names on --stop @@ -9,7 +200,7 @@ - Allow --incremental to add spares to an array - Accept --no-degraded as a deprecated option rather than throwing an error - - Return correct success status when --incrmental assembling + - Return correct success status when --incrmental assembling a container which does not yet have enough devices. - Don't link mdadm with pthreads, only mdmon needs it. - Fix compiler warning due to bad use of snprintf @@ -41,7 +232,7 @@ Changes Prior to release 3.1.2 - The default metadata has change again (sorry about that). It is now v1.2 and will hopefully stay that way. It turned - out there with boot-block issues with v1.1 which make it + out there with boot-block issues with v1.1 which make it unsuitable for a default, though in many cases it is still suitable to use. - Stopping a container is not permitted when members are still @@ -64,7 +255,7 @@ - Add section on 'scrubbing' to 'md' man page. - Various command-line-option parsing improvements. - ... and lots of other bug fixes. - + Changes Prior to release 3.1.1 - Multiple fixes for new --grow levels including fixes for serious data corruption problems. @@ -80,10 +271,10 @@ - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and back. - Support --grow to reduce the number of devices in RAID4/5/6. - - Support restart of these grow options which assembling an array + - Support restart of these grow options which assembling an array which is partially grown. - Assorted tests of this code, and of different RAID6 layouts. 
- + Changes Prior to release 3.0.3 - Improvements for creating arrays giving just a name, like 'foo', rather than the full '/dev/md/foo'. @@ -95,7 +286,7 @@ - Handle merging of devices that have left an IMSM array and are being re-incorporated. - Add missing space in "--detail --brief" output. - + Changes Prior to release 3.0.2 - Fix crash when hosthost is not set, as often happens in early boot. @@ -104,7 +295,7 @@ - Fix various segfaults - Fixed for --examine with containers - Lots of other little fixes. - + Changes Prior to release 3.0 - Support for externally managed metadata, specifically DDF and IMSM. - Depend on udev to create entries in /dev, rather than creating them diff -Nru mdadm-3.2.5/config.c mdadm-3.3/config.c --- mdadm-3.2.5/config.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/config.c 2013-09-03 04:47:47.000000000 +0000 @@ -72,7 +72,9 @@ #define CONFFILE2 "/etc/mdadm/mdadm.conf" #endif char DefaultConfFile[] = CONFFILE; +char DefaultConfDir[] = CONFFILE ".d"; char DefaultAltConfFile[] = CONFFILE2; +char DefaultAltConfDir[] = CONFFILE2 ".d"; enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev, Homehost, AutoMode, Policy, PartPolicy, LTEnd }; @@ -108,49 +110,9 @@ return -1; } -/* - * conf_line reads one logical line from the conffile. - * It skips comments and continues until it finds a line that starts - * with a non blank/comment. This character is pushed back for the next call - * A doubly linked list of words is returned. - * the first word will be a keyword. Other words will have had quotes removed. 
- */ - -char *conf_line(FILE *file) -{ - char *w; - char *list; - - w = conf_word(file, 1); - if (w == NULL) return NULL; - - list = dl_strdup(w); - free(w); - dl_init(list); - - while ((w = conf_word(file,0))){ - char *w2 = dl_strdup(w); - free(w); - dl_add(list, w2); - } -/* printf("got a line\n");*/ - return list; -} - -void free_line(char *line) -{ - char *w; - for (w=dl_next(line); w != line; w=dl_next(line)) { - dl_del(w); - dl_free(w); - } - dl_free(line); -} - - struct conf_dev { - struct conf_dev *next; - char *name; + struct conf_dev *next; + char *name; } *cdevlist = NULL; struct mddev_dev *load_partitions(void) @@ -159,7 +121,7 @@ char buf[1024]; struct mddev_dev *rv = NULL; if (f == NULL) { - fprintf(stderr, Name ": cannot open /proc/partitions\n"); + pr_err("cannot open /proc/partitions\n"); return NULL; } while (fgets(buf, 1024, f)) { @@ -178,10 +140,10 @@ name = map_dev(major, minor, 1); if (!name) continue; - d = malloc(sizeof(*d)); - d->devname = strdup(name); + d = xmalloc(sizeof(*d)); + memset(d, 0, sizeof(*d)); + d->devname = xstrdup(name); d->next = rv; - d->used = 0; rv = d; } fclose(f); @@ -190,10 +152,11 @@ struct mddev_dev *load_containers(void) { - struct mdstat_ent *mdstat = mdstat_read(1, 0); + struct mdstat_ent *mdstat = mdstat_read(0, 0); struct mdstat_ent *ent; struct mddev_dev *d; struct mddev_dev *rv = NULL; + struct map_ent *map = NULL, *me; if (!mdstat) return NULL; @@ -202,18 +165,20 @@ if (ent->metadata_version && strncmp(ent->metadata_version, "external:", 9) == 0 && !is_subarray(&ent->metadata_version[9])) { - d = malloc(sizeof(*d)); - if (!d) - continue; - if (asprintf(&d->devname, "/dev/%s", ent->dev) < 0) { + d = xmalloc(sizeof(*d)); + memset(d, 0, sizeof(*d)); + me = map_by_devnm(&map, ent->dev); + if (me) + d->devname = xstrdup(me->path); + else if (asprintf(&d->devname, "/dev/%s", ent->dev) < 0) { free(d); continue; } d->next = rv; - d->used = 0; rv = d; } free_mdstat(mdstat); + map_free(map); return rv; } @@ -221,6 
+186,7 @@ struct createinfo createinfo = { .autof = 2, /* by default, create devices with standard names */ .symlinks = 1, + .names = 0, /* By default, stick with numbered md devices. */ #ifdef DEBIAN .gid = 6, /* disk */ .mode = 0660, @@ -264,7 +230,7 @@ (len >= 4 && strncasecmp(str,"part",4)==0)) { autof = 6; } else { - fprintf(stderr, Name ": %s arg of \"%s\" unrecognised: use no,yes,md,mdp,part\n" + pr_err("%s arg of \"%s\" unrecognised: use no,yes,md,mdp,part\n" " optionally followed by a number.\n", msg, str); exit(2); @@ -284,7 +250,7 @@ createinfo.autof = parse_auto(w+5, "auto=", 1); else if (strncasecmp(w, "owner=", 6) == 0) { if (w[6] == 0) { - fprintf(stderr, Name ": missing owner name\n"); + pr_err("missing owner name\n"); continue; } createinfo.uid = strtoul(w+6, &ep, 10); @@ -295,11 +261,11 @@ if (pw) createinfo.uid = pw->pw_uid; else - fprintf(stderr, Name ": CREATE user %s not found\n", w+6); + pr_err("CREATE user %s not found\n", w+6); } } else if (strncasecmp(w, "group=", 6) == 0) { if (w[6] == 0) { - fprintf(stderr, Name ": missing group name\n"); + pr_err("missing group name\n"); continue; } createinfo.gid = strtoul(w+6, &ep, 10); @@ -310,17 +276,17 @@ if (gr) createinfo.gid = gr->gr_gid; else - fprintf(stderr, Name ": CREATE group %s not found\n", w+6); + pr_err("CREATE group %s not found\n", w+6); } } else if (strncasecmp(w, "mode=", 5) == 0) { if (w[5] == 0) { - fprintf(stderr, Name ": missing CREATE mode\n"); + pr_err("missing CREATE mode\n"); continue; } createinfo.mode = strtoul(w+5, &ep, 8); if (*ep != 0) { createinfo.mode = 0600; - fprintf(stderr, Name ": unrecognised CREATE mode %s\n", + pr_err("unrecognised CREATE mode %s\n", w+5); } } else if (strncasecmp(w, "metadata=", 9) == 0) { @@ -330,14 +296,18 @@ createinfo.supertype = superlist[i]->match_metadata_desc(w+9); if (!createinfo.supertype) - fprintf(stderr, Name ": metadata format %s unknown, ignoring\n", + pr_err("metadata format %s unknown, ignoring\n", w+9); } else if 
(strncasecmp(w, "symlinks=yes", 12) == 0) createinfo.symlinks = 1; else if (strncasecmp(w, "symlinks=no", 11) == 0) createinfo.symlinks = 0; + else if (strncasecmp(w, "names=yes", 12) == 0) + createinfo.names = 1; + else if (strncasecmp(w, "names=no", 11) == 0) + createinfo.names = 0; else { - fprintf(stderr, Name ": unrecognised word on CREATE line: %s\n", + pr_err("unrecognised word on CREATE line: %s\n", w); } } @@ -351,12 +321,12 @@ for (w=dl_next(line); w != line; w=dl_next(w)) { if (w[0] == '/' || strcasecmp(w, "partitions") == 0 || strcasecmp(w, "containers") == 0) { - cd = malloc(sizeof(*cd)); - cd->name = strdup(w); + cd = xmalloc(sizeof(*cd)); + cd->name = xstrdup(w); cd->next = cdevlist; cdevlist = cd; } else { - fprintf(stderr, Name ": unreconised word on DEVICE line: %s\n", + pr_err("unreconised word on DEVICE line: %s\n", w); } } @@ -414,74 +384,74 @@ if (strcasecmp(w, "") == 0 || strncmp(w, "/dev/md/", 8) == 0 || (w[0] != '/' && w[0] != '<') || - (strncmp(w, "/dev/md", 7) == 0 && + (strncmp(w, "/dev/md", 7) == 0 && is_number(w+7)) || (strncmp(w, "/dev/md_d", 9) == 0 && is_number(w+9)) ) { /* This is acceptable */; if (mis.devname) - fprintf(stderr, Name ": only give one " + pr_err("only give one " "device per ARRAY line: %s and %s\n", mis.devname, w); else mis.devname = w; }else { - fprintf(stderr, Name ": %s is an invalid name for " + pr_err("%s is an invalid name for " "an md device - ignored.\n", w); } } else if (strncasecmp(w, "uuid=", 5)==0 ) { if (mis.uuid_set) - fprintf(stderr, Name ": only specify uuid once, %s ignored.\n", + pr_err("only specify uuid once, %s ignored.\n", w); else { if (parse_uuid(w+5, mis.uuid)) mis.uuid_set = 1; else - fprintf(stderr, Name ": bad uuid: %s\n", w); + pr_err("bad uuid: %s\n", w); } } else if (strncasecmp(w, "super-minor=", 12)==0 ) { if (mis.super_minor != UnSet) - fprintf(stderr, Name ": only specify super-minor once, %s ignored.\n", + pr_err("only specify super-minor once, %s ignored.\n", w); else { char 
*endptr; int minor = strtol(w+12, &endptr, 10); if (w[12]==0 || endptr[0]!=0 || minor < 0) - fprintf(stderr, Name ": invalid super-minor number: %s\n", + pr_err("invalid super-minor number: %s\n", w); else mis.super_minor = minor; } } else if (strncasecmp(w, "name=", 5)==0) { if (mis.name[0]) - fprintf(stderr, Name ": only specify name once, %s ignored.\n", + pr_err("only specify name once, %s ignored.\n", w); else if (strlen(w+5) > 32) - fprintf(stderr, Name ": name too long, ignoring %s\n", w); + pr_err("name too long, ignoring %s\n", w); else strcpy(mis.name, w+5); } else if (strncasecmp(w, "bitmap=", 7) == 0) { if (mis.bitmap_file) - fprintf(stderr, Name ": only specify bitmap file once. %s ignored\n", + pr_err("only specify bitmap file once. %s ignored\n", w); else - mis.bitmap_file = strdup(w+7); + mis.bitmap_file = xstrdup(w+7); } else if (strncasecmp(w, "devices=", 8 ) == 0 ) { if (mis.devices) - fprintf(stderr, Name ": only specify devices once (use a comma separated list). %s ignored\n", + pr_err("only specify devices once (use a comma separated list). %s ignored\n", w); else - mis.devices = strdup(w+8); + mis.devices = xstrdup(w+8); } else if (strncasecmp(w, "spare-group=", 12) == 0 ) { if (mis.spare_group) - fprintf(stderr, Name ": only specify one spare group per array. %s ignored.\n", + pr_err("only specify one spare group per array. 
%s ignored.\n", w); else - mis.spare_group = strdup(w+12); + mis.spare_group = xstrdup(w+12); } else if (strncasecmp(w, "level=", 6) == 0 ) { /* this is mainly for compatability with --brief output */ mis.level = map_name(pers, w+6); @@ -502,30 +472,30 @@ mis.st = superlist[i]->match_metadata_desc(w+9); if (!mis.st) - fprintf(stderr, Name ": metadata format %s unknown, ignored.\n", w+9); + pr_err("metadata format %s unknown, ignored.\n", w+9); } else if (strncasecmp(w, "auto=", 5) == 0 ) { /* whether to create device special files as needed */ mis.autof = parse_auto(w+5, "auto type", 0); } else if (strncasecmp(w, "member=", 7) == 0) { /* subarray within a container */ - mis.member = strdup(w+7); + mis.member = xstrdup(w+7); } else if (strncasecmp(w, "container=", 10) == 0) { /* the container holding this subarray. Either a device name * or a uuid */ - mis.container = strdup(w+10); + mis.container = xstrdup(w+10); } else { - fprintf(stderr, Name ": unrecognised word on ARRAY line: %s\n", + pr_err("unrecognised word on ARRAY line: %s\n", w); } } if (mis.uuid_set == 0 && mis.devices == NULL && mis.super_minor == UnSet && mis.name[0] == 0 && (mis.container == NULL || mis.member == NULL)) - fprintf(stderr, Name ": ARRAY line %s has no identity information.\n", mis.devname); + pr_err("ARRAY line %s has no identity information.\n", mis.devname); else { - mi = malloc(sizeof(*mi)); + mi = xmalloc(sizeof(*mi)); *mi = mis; - mi->devname = mis.devname ? strdup(mis.devname) : NULL; + mi->devname = mis.devname ? 
xstrdup(mis.devname) : NULL; mi->next = NULL; *mddevlp = mi; mddevlp = &mi->next; @@ -537,13 +507,9 @@ { char *w; - for (w=dl_next(line); w != line ; w=dl_next(w)) { + for (w=dl_next(line); w != line ; w=dl_next(w)) if (alert_email == NULL) - alert_email = strdup(w); - else - fprintf(stderr, Name ": excess address on MAIL line: %s - ignored\n", - w); - } + alert_email = xstrdup(w); } static char *alert_mail_from = NULL; @@ -553,7 +519,7 @@ for (w=dl_next(line); w != line ; w=dl_next(w)) { if (alert_mail_from == NULL) - alert_mail_from = strdup(w); + alert_mail_from = xstrdup(w); else { char *t = NULL; @@ -565,19 +531,14 @@ } } - static char *alert_program = NULL; void programline(char *line) { char *w; - for (w=dl_next(line); w != line ; w=dl_next(w)) { + for (w=dl_next(line); w != line ; w=dl_next(w)) if (alert_program == NULL) - alert_program = strdup(w); - else - fprintf(stderr, Name ": excess program on PROGRAM line: %s - ignored\n", - w); - } + alert_program = xstrdup(w); } static char *home_host = NULL; @@ -591,12 +552,10 @@ require_homehost = 0; else if (home_host == NULL) { if (strcasecmp(w, "")==0) - home_host = strdup(""); + home_host = xstrdup(""); else - home_host = strdup(w); - }else - fprintf(stderr, Name ": excess host name on HOMEHOST line: %s - ignored\n", - w); + home_host = xstrdup(w); + } } } @@ -614,11 +573,9 @@ int homehost = 0; int i; - if (auto_seen) { - fprintf(stderr, Name ": AUTO line may only be give once." - " Subsequent lines ignored\n"); + if (auto_seen) return; - } + /* Parse the 'auto' line creating policy statements for the 'auto' policy. * * The default is 'yes' but the 'auto' line might over-ride that. @@ -645,9 +602,27 @@ * been seen gets an appropriate auto= entry. */ + /* If environment variable MDADM_CONF_AUTO is defined, then + * it is prepended to the auto line. This allow a script + * to easily disable some metadata types. 
+ */ + w = getenv("MDADM_CONF_AUTO"); + if (w && *w) { + char *l = xstrdup(w); + char *head = line; + w = strtok(l, " \t"); + while (w) { + char *nw = dl_strdup(w); + dl_insert(head, nw); + head = nw; + w = strtok(NULL, " \t"); + } + free(l); + } + for (super_cnt = 0; superlist[super_cnt]; super_cnt++) ; - seen = calloc(super_cnt, 1); + seen = xcalloc(super_cnt, 1); for (w = dl_next(line); w != line ; w = dl_next(w)) { char *val; @@ -719,44 +694,9 @@ conffile = file; } -void load_conffile(void) +void conf_file(FILE *f) { - FILE *f; char *line; - - if (loaded) return; - if (conffile == NULL) - conffile = DefaultConfFile; - - if (strcmp(conffile, "none") == 0) { - loaded = 1; - return; - } - if (strcmp(conffile, "partitions")==0) { - char *list = dl_strdup("DEV"); - dl_init(list); - dl_add(list, dl_strdup("partitions")); - devline(list); - free_line(list); - loaded = 1; - return; - } - f = fopen(conffile, "r"); - /* Debian chose to relocate mdadm.conf into /etc/mdadm/. - * To allow Debian users to compile from clean source and still - * have a working mdadm, we read /etc/mdadm/mdadm.conf - * if /etc/mdadm.conf doesn't exist - */ - if (f == NULL && - conffile == DefaultConfFile) { - f = fopen(DefaultAltConfFile, "r"); - if (f) - conffile = DefaultAltConfFile; - } - if (f == NULL) - return; - - loaded = 1; while ((line=conf_line(f))) { switch(match_keyword(line)) { case Devices: @@ -790,14 +730,128 @@ policyline(line, rule_part); break; default: - fprintf(stderr, Name ": Unknown keyword %s\n", line); + pr_err("Unknown keyword %s\n", line); } free_line(line); } +} - fclose(f); +struct fname { + struct fname *next; + char name[]; +}; -/* printf("got file\n"); */ +void conf_file_or_dir(FILE *f) +{ + struct stat st; + DIR *dir; + struct dirent *dp; + struct fname *list = NULL; + + fstat(fileno(f), &st); + if (S_ISREG(st.st_mode)) + conf_file(f); + else if (!S_ISDIR(st.st_mode)) + return; +#if _XOPEN_SOURCE >= 700 || _POSIX_C_SOURCE >= 200809L + dir = fdopendir(fileno(f)); 
+ if (!dir) + return; + while ((dp = readdir(dir)) != NULL) { + int l; + struct fname *fn, **p; + if (dp->d_ino == 0) + continue; + if (dp->d_name[0] == '.') + continue; + l = strlen(dp->d_name); + if (l < 6 || strcmp(dp->d_name+l-5, ".conf") != 0) + continue; + fn = xmalloc(sizeof(*fn)+l+1); + strcpy(fn->name, dp->d_name); + for (p = &list; + *p && strcmp((*p)->name, fn->name) < 0; + p = & (*p)->next) + ; + fn->next = *p; + *p = fn; + } + while (list) { + int fd; + FILE *f2; + struct fname *fn = list; + list = list->next; + fd = openat(fileno(f), fn->name, O_RDONLY); + free(fn); + if (fd < 0) + continue; + f2 = fdopen(fd, "r"); + if (!f2) { + close(fd); + continue; + } + conf_file(f2); + fclose(f2); + } + closedir(dir); +#endif +} + +void load_conffile(void) +{ + FILE *f; + char *confdir = NULL; + char *head; + + if (loaded) + return; + if (conffile == NULL) { + conffile = DefaultConfFile; + confdir = DefaultConfDir; + } + + if (strcmp(conffile, "partitions")==0) { + char *list = dl_strdup("DEV"); + dl_init(list); + dl_add(list, dl_strdup("partitions")); + devline(list); + free_line(list); + } else if (strcmp(conffile, "none") != 0) { + f = fopen(conffile, "r"); + /* Debian chose to relocate mdadm.conf into /etc/mdadm/. + * To allow Debian users to compile from clean source and still + * have a working mdadm, we read /etc/mdadm/mdadm.conf + * if /etc/mdadm.conf doesn't exist + */ + if (f == NULL && + conffile == DefaultConfFile) { + f = fopen(DefaultAltConfFile, "r"); + if (f) { + conffile = DefaultAltConfFile; + confdir = DefaultAltConfDir; + } + } + if (f) { + conf_file_or_dir(f); + fclose(f); + } + if (confdir) { + f = fopen(confdir, "r"); + if (f) { + conf_file_or_dir(f); + fclose(f); + } + } + } + /* If there was no AUTO line, process an empty line + * now so that the MDADM_CONF_AUTO env var gets processed. 
+ */ + head = dl_strdup("AUTO"); + dl_init(head); + autoline(head); + free_line(head); + + loaded = 1; } char *conf_get_mailaddr(void) @@ -885,10 +939,10 @@ } if (flags & GLOB_APPEND) { for (i=0; idevname = strdup(globbuf.gl_pathv[i]); + struct mddev_dev *t = xmalloc(sizeof(*t)); + memset(t, 0, sizeof(*t)); + t->devname = xstrdup(globbuf.gl_pathv[i]); t->next = dlist; - t->used = 0; dlist = t; /* printf("one dev is %s\n", t->devname);*/ } @@ -942,26 +996,26 @@ int match_oneof(char *devices, char *devname) { - /* check if one of the comma separated patterns in devices - * matches devname - */ - - while (devices && *devices) { - char patn[1024]; - char *p = devices; - devices = strchr(devices, ','); - if (!devices) - devices = p + strlen(p); - if (devices-p < 1024) { - strncpy(patn, p, devices-p); - patn[devices-p] = 0; - if (fnmatch(patn, devname, FNM_PATHNAME)==0) - return 1; + /* check if one of the comma separated patterns in devices + * matches devname + */ + + while (devices && *devices) { + char patn[1024]; + char *p = devices; + devices = strchr(devices, ','); + if (!devices) + devices = p + strlen(p); + if (devices-p < 1024) { + strncpy(patn, p, devices-p); + patn[devices-p] = 0; + if (fnmatch(patn, devname, FNM_PATHNAME)==0) + return 1; + } + if (*devices == ',') + devices++; } - if (*devices == ',') - devices++; - } - return 0; + return 0; } int devname_matches(char *name, char *match) @@ -984,7 +1038,6 @@ else if (strncmp(match, "/dev/", 5) == 0) match += 5; - if (strncmp(name, "md", 2) == 0 && isdigit(name[2])) name += 2; @@ -997,7 +1050,7 @@ int conf_name_is_free(char *name) { - /* Check if this name is already take by an ARRAY entry in + /* Check if this name is already taken by an ARRAY entry in * the config file. * It can be taken either by a match on devname, name, or * even super-minor. 
@@ -1032,33 +1085,29 @@ same_uuid(array_list->uuid, info->uuid, st->ss->swapuuid) == 0) { if (verbose >= 2 && array_list->devname) - fprintf(stderr, Name - ": UUID differs from %s.\n", - array_list->devname); + pr_err("UUID differs from %s.\n", + array_list->devname); continue; } if (array_list->name[0] && strcasecmp(array_list->name, info->name) != 0) { if (verbose >= 2 && array_list->devname) - fprintf(stderr, Name - ": Name differs from %s.\n", - array_list->devname); + pr_err("Name differs from %s.\n", + array_list->devname); continue; } if (array_list->devices && devname && !match_oneof(array_list->devices, devname)) { if (verbose >= 2 && array_list->devname) - fprintf(stderr, Name - ": Not a listed device for %s.\n", - array_list->devname); + pr_err("Not a listed device for %s.\n", + array_list->devname); continue; } if (array_list->super_minor != UnSet && array_list->super_minor != info->array.md_minor) { if (verbose >= 2 && array_list->devname) - fprintf(stderr, Name - ": Different super-minor to %s.\n", - array_list->devname); + pr_err("Different super-minor to %s.\n", + array_list->devname); continue; } if (!array_list->uuid_set && @@ -1066,10 +1115,9 @@ !array_list->devices && array_list->super_minor == UnSet) { if (verbose >= 2 && array_list->devname) - fprintf(stderr, Name - ": %s doesn't have any identifying" - " information.\n", - array_list->devname); + pr_err("%s doesn't have any identifying" + " information.\n", + array_list->devname); continue; } /* FIXME, should I check raid_disks and level too?? 
*/ @@ -1077,15 +1125,13 @@ if (match) { if (verbose >= 0) { if (match->devname && array_list->devname) - fprintf(stderr, Name - ": we match both %s and %s - " - "cannot decide which to use.\n", - match->devname, - array_list->devname); + pr_err("we match both %s and %s - " + "cannot decide which to use.\n", + match->devname, + array_list->devname); else - fprintf(stderr, Name - ": multiple lines in mdadm.conf" - " match\n"); + pr_err("multiple lines in mdadm.conf" + " match\n"); } if (rvp) *rvp = 2; @@ -1104,6 +1150,8 @@ for (a1 = array_list; a1; a1 = a1->next) { if (!a1->devname) continue; + if (strcmp(a1->devname, "") == 0) + continue; for (a2 = a1->next; a2; a2 = a2->next) { if (!a2->devname) continue; @@ -1113,15 +1161,14 @@ if (a1->uuid_set && a2->uuid_set) { char nbuf[64]; __fname_from_uuid(a1->uuid, 0, nbuf, ':'); - fprintf(stderr, - Name ": Devices %s and ", - nbuf); + pr_err("Devices %s and ", + nbuf); __fname_from_uuid(a2->uuid, 0, nbuf, ':'); fprintf(stderr, "%s have the same name: %s\n", nbuf, a1->devname); } else - fprintf(stderr, Name ": Device %s given twice" + pr_err("Device %s given twice" " in config file\n", a1->devname); return 1; } diff -Nru mdadm-3.2.5/crc32.c mdadm-3.3/crc32.c --- mdadm-3.2.5/crc32.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/crc32.c 2013-09-03 04:47:47.000000000 +0000 @@ -2,6 +2,26 @@ * Copyright (C) 1995-2003 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * + * Note: zlib license from from zlib.h added explicitly as mdadm does + * not include zlib.h. License from v1.2.2 of zlib: + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. 
The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * * Thanks to Rodney Brown for his contribution of faster * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing * tables for updating the shift register in one step with three exclusive-ors diff -Nru mdadm-3.2.5/Create.c mdadm-3.3/Create.c --- mdadm-3.2.5/Create.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Create.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -22,7 +22,7 @@ * Email: */ -#include "mdadm.h" +#include "mdadm.h" #include "md_u.h" #include "md_p.h" #include @@ -42,37 +42,30 @@ case 10: layout = 0x102; /* near=2, far=1 */ if (verbose > 0) - fprintf(stderr, - Name ": layout defaults to n2\n"); + pr_err("layout defaults to n2\n"); break; case 5: case 6: layout = map_name(r5layout, "default"); if (verbose > 0) - fprintf(stderr, - Name ": layout defaults to %s\n", map_num(r5layout, layout)); + pr_err("layout defaults to %s\n", map_num(r5layout, layout)); break; case LEVEL_FAULTY: layout = map_name(faultylayout, "default"); if (verbose > 0) - fprintf(stderr, - Name ": layout defaults to %s\n", map_num(faultylayout, layout)); + pr_err("layout defaults to %s\n", map_num(faultylayout, layout)); break; } return layout; } - int Create(struct supertype *st, char *mddev, - int chunk, int level, int layout, unsigned long long size, - int raiddisks, int 
sparedisks, - char *name, char *homehost, int *uuid, + char *name, int *uuid, int subdevs, struct mddev_dev *devlist, - int runstop, int verbose, int force, int assume_clean, - char *bitmap_file, int bitmap_chunk, int write_behind, - int delay, int autof) + struct shape *s, + struct context *c, unsigned long long data_offset) { /* * Create a new raid array. @@ -123,31 +116,26 @@ int major_num = BITMAP_MAJOR_HI; memset(&info, 0, sizeof(info)); - if (level == UnSet && st && st->ss->default_geometry) - st->ss->default_geometry(st, &level, NULL, NULL); - if (level == UnSet) { - fprintf(stderr, - Name ": a RAID level is needed to create an array.\n"); + if (s->level == UnSet && st && st->ss->default_geometry) + st->ss->default_geometry(st, &s->level, NULL, NULL); + if (s->level == UnSet) { + pr_err("a RAID level is needed to create an array.\n"); return 1; } - if (raiddisks < 4 && level == 6) { - fprintf(stderr, - Name ": at least 4 raid-devices needed for level 6\n"); + if (s->raiddisks < 4 && s->level == 6) { + pr_err("at least 4 raid-devices needed for level 6\n"); return 1; } - if (raiddisks > 256 && level == 6) { - fprintf(stderr, - Name ": no more than 256 raid-devices supported for level 6\n"); + if (s->raiddisks > 256 && s->level == 6) { + pr_err("no more than 256 raid-devices supported for level 6\n"); return 1; } - if (raiddisks < 2 && level >= 4) { - fprintf(stderr, - Name ": at least 2 raid-devices needed for level 4 or 5\n"); + if (s->raiddisks < 2 && s->level >= 4) { + pr_err("at least 2 raid-devices needed for level 4 or 5\n"); return 1; } - if (level <= 0 && sparedisks) { - fprintf(stderr, - Name ": This level does not support spare devices\n"); + if (s->level <= 0 && s->sparedisks) { + pr_err("This level does not support spare devices\n"); return 1; } @@ -179,7 +167,7 @@ st = NULL; } if (have_container) { - subdevs = raiddisks; + subdevs = s->raiddisks; first_missing = subdevs * 2; second_missing = subdevs * 2; insert_point = subdevs * 2; @@ -188,108 
+176,109 @@ if (fd >= 0) close(fd); } - if (st && st->ss->external && sparedisks) { - fprintf(stderr, - Name ": This metadata type does not support " - "spare disks at create time\n"); + if (st && st->ss->external && s->sparedisks) { + pr_err("This metadata type does not support " + "spare disks at create time\n"); return 1; } - if (subdevs > raiddisks+sparedisks) { - fprintf(stderr, Name ": You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks); + if (subdevs > s->raiddisks+s->sparedisks) { + pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks); return 1; } - if (!have_container && subdevs < raiddisks+sparedisks) { - fprintf(stderr, Name ": You haven't given enough devices (real or missing) to create this array\n"); + if (!have_container && subdevs < s->raiddisks+s->sparedisks) { + pr_err("You haven't given enough devices (real or missing) to create this array\n"); return 1; } - if (bitmap_file && level <= 0) { - fprintf(stderr, Name ": bitmaps not meaningful with level %s\n", - map_num(pers, level)?:"given"); + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); return 1; } /* now set some defaults */ - - if (layout == UnSet) { + if (s->layout == UnSet) { do_default_layout = 1; - layout = default_layout(st, level, verbose); + s->layout = default_layout(st, s->level, c->verbose); } - if (level == 10) + if (s->level == 10) /* check layout fits in array*/ - if ((layout&255) * ((layout>>8)&255) > raiddisks) { - fprintf(stderr, Name ": that layout requires at least %d devices\n", - (layout&255) * ((layout>>8)&255)); + if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) { + pr_err("that layout requires at least %d devices\n", + (s->layout&255) * ((s->layout>>8)&255)); return 1; } - switch(level) { + switch(s->level) { case 4: case 5: case 10: case 6: case 0: - if (chunk == 0 || chunk == 
UnSet) { - chunk = UnSet; + if (s->chunk == 0 || s->chunk == UnSet) { + s->chunk = UnSet; do_default_chunk = 1; /* chunk will be set later */ } break; case LEVEL_LINEAR: /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */ - if (get_linux_version() < 2006016 && chunk == 0) { - chunk = 64; - if (verbose > 0) - fprintf(stderr, Name ": chunk size defaults to 64K\n"); + if (get_linux_version() < 2006016 && s->chunk == 0) { + s->chunk = 64; + if (c->verbose > 0) + pr_err("chunk size defaults to 64K\n"); } break; case 1: case LEVEL_FAULTY: case LEVEL_MULTIPATH: case LEVEL_CONTAINER: - if (chunk) { - chunk = 0; - if (verbose > 0) - fprintf(stderr, Name ": chunk size ignored for this level\n"); + if (s->chunk) { + s->chunk = 0; + if (c->verbose > 0) + pr_err("chunk size ignored for this level\n"); } break; default: - fprintf(stderr, Name ": unknown level %d\n", level); + pr_err("unknown level %d\n", s->level); return 1; } - - if (size && chunk && chunk != UnSet) - size &= ~(unsigned long long)(chunk - 1); - newsize = size * 2; - if (st && ! st->ss->validate_geometry(st, level, layout, raiddisks, - &chunk, size*2, NULL, &newsize, verbose>=0)) + if (s->size == MAX_SIZE) + /* use '0' to mean 'max' now... */ + s->size = 0; + if (s->size && s->chunk && s->chunk != UnSet) + s->size &= ~(unsigned long long)(s->chunk - 1); + newsize = s->size * 2; + if (st && ! 
st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + data_offset, NULL, + &newsize, c->verbose>=0)) return 1; - if (chunk && chunk != UnSet) { - newsize &= ~(unsigned long long)(chunk*2 - 1); + if (s->chunk && s->chunk != UnSet) { + newsize &= ~(unsigned long long)(s->chunk*2 - 1); if (do_default_chunk) { /* default chunk was just set */ - if (verbose > 0) - fprintf(stderr, Name ": chunk size " - "defaults to %dK\n", chunk); - size &= ~(unsigned long long)(chunk - 1); + if (c->verbose > 0) + pr_err("chunk size " + "defaults to %dK\n", s->chunk); + s->size &= ~(unsigned long long)(s->chunk - 1); do_default_chunk = 0; } } - if (size == 0) { - size = newsize / 2; - if (level == 1) + if (s->size == 0) { + s->size = newsize / 2; + if (s->level == 1) /* If this is ever reshaped to RAID5, we will * need a chunksize. So round it off a bit * now just to be safe */ - size &= ~(64ULL-1); + s->size &= ~(64ULL-1); - if (size && verbose > 0) - fprintf(stderr, Name ": setting size to %lluK\n", - (unsigned long long)size); + if (s->size && c->verbose > 0) + pr_err("setting size to %lluK\n", s->size); } /* now look at the subdevs */ @@ -300,6 +289,7 @@ char *dname = dv->devname; unsigned long long freesize; int dfd; + char *doff; if (strcasecmp(dname, "missing")==0) { if (first_missing > dnum) @@ -309,22 +299,32 @@ missing_disks ++; continue; } + if (data_offset == VARIABLE_OFFSET) { + doff = strchr(dname, ':'); + if (doff) { + *doff++ = 0; + dv->data_offset = parse_size(doff); + } else + dv->data_offset = INVALID_SECTORS; + } else + dv->data_offset = data_offset; + dfd = open(dname, O_RDONLY); if (dfd < 0) { - fprintf(stderr, Name ": cannot open %s: %s\n", + pr_err("cannot open %s: %s\n", dname, strerror(errno)); exit(2); } if (fstat(dfd, &stb) != 0 || (stb.st_mode & S_IFMT) != S_IFBLK) { close(dfd); - fprintf(stderr, Name ": %s is not a block device\n", + pr_err("%s is not a block device\n", dname); exit(2); } close(dfd); 
info.array.working_disks++; - if (dnum < raiddisks) + if (dnum < s->raiddisks) info.array.active_disks++; if (st == NULL) { struct createinfo *ci = conf_get_create_info(); @@ -342,11 +342,12 @@ if (!st) continue; if (do_default_layout) - layout = default_layout(st, level, verbose); + s->layout = default_layout(st, s->level, c->verbose); switch (st->ss->validate_geometry( - st, level, layout, raiddisks, - &chunk, size*2, dname, &freesize, - verbose > 0)) { + st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, dname, + &freesize, c->verbose > 0)) { case -1: /* Not valid, message printed, and not * worth checking any further */ exit(2); @@ -354,7 +355,7 @@ case 0: /* Geometry not valid */ free(st); st = NULL; - chunk = do_default_chunk ? UnSet : chunk; + s->chunk = do_default_chunk ? UnSet : s->chunk; break; case 1: /* All happy */ break; @@ -364,11 +365,11 @@ if (!st) { int dfd = open(dname, O_RDONLY|O_EXCL); if (dfd < 0) { - fprintf(stderr, Name ": cannot open %s: %s\n", + pr_err("cannot open %s: %s\n", dname, strerror(errno)); exit(2); } - fprintf(stderr, Name ": device %s not suitable " + pr_err("device %s not suitable " "for any style of array\n", dname); exit(2); @@ -378,40 +379,45 @@ did_default = 1; } else { if (do_default_layout) - layout = default_layout(st, level, 0); - if (!st->ss->validate_geometry(st, level, layout, - raiddisks, - &chunk, size*2, dname, - &freesize, - verbose >= 0)) { - - fprintf(stderr, - Name ": %s is not suitable for " - "this array.\n", - dname); + s->layout = default_layout(st, s->level, 0); + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, + dname, &freesize, + c->verbose >= 0)) { + + pr_err("%s is not suitable for " + "this array.\n", + dname); fail = 1; continue; } } freesize /= 2; /* convert to K */ - if (chunk && chunk != UnSet) { + if (s->chunk && s->chunk != UnSet) { /* round to chunk size */ - freesize = freesize & ~(chunk-1); + 
freesize = freesize & ~(s->chunk-1); if (do_default_chunk) { /* default chunk was just set */ - if (verbose > 0) - fprintf(stderr, Name ": chunk size " - "defaults to %dK\n", chunk); - size &= ~(unsigned long long)(chunk - 1); + if (c->verbose > 0) + pr_err("chunk size " + "defaults to %dK\n", s->chunk); + s->size &= ~(unsigned long long)(s->chunk - 1); do_default_chunk = 0; } } + if (!freesize) { + pr_err("no free space left on %s\n", dname); + fail = 1; + continue; + } - if (size && freesize < size) { - fprintf(stderr, Name ": %s is smaller than given size." + if (s->size && freesize < s->size) { + pr_err("%s is smaller than given size." " %lluK < %lluK + metadata\n", - dname, freesize, size); + dname, freesize, s->size); fail = 1; continue; } @@ -423,10 +429,10 @@ mindisc = dname; minsize = freesize; } - if (runstop != 1 || verbose >= 0) { + if (c->runstop != 1 || c->verbose >= 0) { int fd = open(dname, O_RDONLY); if (fd <0 ) { - fprintf(stderr, Name ": Cannot open %s: %s\n", + pr_err("Cannot open %s: %s\n", dname, strerror(errno)); fail=1; continue; @@ -438,20 +444,20 @@ st->minor_version >= 1) /* metadata at front */ warn |= check_partitions(fd, dname, 0, 0); - else if (level == 1 || level == LEVEL_CONTAINER - || (level == 0 && raiddisks == 1)) + else if (s->level == 1 || s->level == LEVEL_CONTAINER + || (s->level == 0 && s->raiddisks == 1)) /* partitions could be meaningful */ - warn |= check_partitions(fd, dname, freesize*2, size*2); + warn |= check_partitions(fd, dname, freesize*2, s->size*2); else /* partitions cannot be meaningful */ warn |= check_partitions(fd, dname, 0, 0); if (strcmp(st->ss->name, "1.x") == 0 && st->minor_version >= 1 && did_default && - level == 1 && + s->level == 1 && (warn & 1024) == 0) { warn |= 1024; - fprintf(stderr, Name ": Note: this array has metadata at the start and\n" + pr_err("Note: this array has metadata at the start and\n" " may not be suitable as a boot device. 
If you plan to\n" " store '/boot' on this device please ensure that\n" " your boot-loader understands md/v1.x metadata, or use\n" @@ -460,69 +466,81 @@ close(fd); } } - if (raiddisks + sparedisks > st->max_devs) { - fprintf(stderr, Name ": Too many devices:" + if (s->raiddisks + s->sparedisks > st->max_devs) { + pr_err("Too many devices:" " %s metadata only supports %d\n", st->ss->name, st->max_devs); return 1; } if (have_container) - info.array.working_disks = raiddisks; + info.array.working_disks = s->raiddisks; if (fail) { - fprintf(stderr, Name ": create aborted\n"); + pr_err("create aborted\n"); return 1; } - if (size == 0) { + if (s->size == 0) { if (mindisc == NULL && !have_container) { - fprintf(stderr, Name ": no size and no drives given - aborting create.\n"); + pr_err("no size and no drives given - aborting create.\n"); return 1; } - if (level > 0 || level == LEVEL_MULTIPATH - || level == LEVEL_FAULTY + if (s->level > 0 || s->level == LEVEL_MULTIPATH + || s->level == LEVEL_FAULTY || st->ss->external ) { /* size is meaningful */ - if (!st->ss->validate_geometry(st, level, layout, - raiddisks, - &chunk, minsize*2, + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, minsize*2, + data_offset, NULL, NULL, 0)) { - fprintf(stderr, Name ": devices too large for RAID level %d\n", level); + pr_err("devices too large for RAID level %d\n", s->level); return 1; } - size = minsize; - if (level == 1) + s->size = minsize; + if (s->level == 1) /* If this is ever reshaped to RAID5, we will * need a chunksize. 
So round it off a bit * now just to be safe */ - size &= ~(64ULL-1); - if (verbose > 0) - fprintf(stderr, Name ": size set to %lluK\n", size); + s->size &= ~(64ULL-1); + if (c->verbose > 0) + pr_err("size set to %lluK\n", s->size); } } - if (!have_container && level > 0 && ((maxsize-size)*100 > maxsize)) { - if (runstop != 1 || verbose >= 0) - fprintf(stderr, Name ": largest drive (%s) exceeds size (%lluK) by more than 1%%\n", - maxdisc, size); + + if (!s->bitmap_file && + s->level >= 1 && + (s->write_behind || s->size > 100*1024*1024ULL)) { + if (c->verbose > 0) + pr_err("automatically enabling write-intent bitmap on large array\n"); + s->bitmap_file = "internal"; + } + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + + if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n", + maxdisc, s->size); warn = 1; } - if (st->ss->detail_platform && st->ss->detail_platform(0, 1) != 0) { - if (runstop != 1 || verbose >= 0) - fprintf(stderr, Name ": %s unable to enumerate platform support\n" + if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("%s unable to enumerate platform support\n" " array may not be compatible with hardware/firmware\n", st->ss->name); warn = 1; } if (warn) { - if (runstop!= 1) { + if (c->runstop!= 1) { if (!ask("Continue creating array? ")) { - fprintf(stderr, Name ": create aborted.\n"); + pr_err("create aborted.\n"); return 1; } } else { - if (verbose > 0) - fprintf(stderr, Name ": creation continuing despite oddities due to --run\n"); + if (c->verbose > 0) + pr_err("creation continuing despite oddities due to --run\n"); } } @@ -531,12 +549,12 @@ * FIX: Can we do this for raid6 as well? 
*/ if (st->ss->external == 0 && - assume_clean==0 && force == 0 && first_missing >= raiddisks) { - switch ( level ) { + s->assume_clean==0 && c->force == 0 && first_missing >= s->raiddisks) { + switch ( s->level ) { case 4: case 5: - insert_point = raiddisks-1; - sparedisks++; + insert_point = s->raiddisks-1; + s->sparedisks++; info.array.active_disks--; missing_disks++; break; @@ -547,26 +565,25 @@ /* For raid6, if creating with 1 missing drive, make a good drive * into a spare, else the create will fail */ - if (assume_clean == 0 && force == 0 && first_missing < raiddisks && + if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks && st->ss->external == 0 && - second_missing >= raiddisks && level == 6) { - insert_point = raiddisks - 1; + second_missing >= s->raiddisks && s->level == 6) { + insert_point = s->raiddisks - 1; if (insert_point == first_missing) insert_point--; - sparedisks ++; + s->sparedisks ++; info.array.active_disks--; missing_disks++; } - if (level <= 0 && first_missing < subdevs * 2) { - fprintf(stderr, - Name ": This level does not support missing devices\n"); + if (s->level <= 0 && first_missing < subdevs * 2) { + pr_err("This level does not support missing devices\n"); return 1; } /* We need to create the device */ map_lock(&map); - mdfd = create_mddev(mddev, name, autof, LOCAL, chosen_name); + mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name); if (mdfd < 0) { map_unlock(&map); return 1; @@ -577,7 +594,7 @@ */ if (strncmp(chosen_name, "/dev/md/", 8) == 0 && map_by_name(&map, chosen_name+8) != NULL) { - fprintf(stderr, Name ": Array name %s is in use already.\n", + pr_err("Array name %s is in use already.\n", chosen_name); close(mdfd); map_unlock(&map); @@ -587,14 +604,14 @@ vers = md_get_version(mdfd); if (vers < 9000) { - fprintf(stderr, Name ": Create requires md driver version 0.90.0 or later\n"); + pr_err("Create requires md driver version 0.90.0 or later\n"); goto abort_locked; } else { mdu_array_info_t 
inf; memset(&inf, 0, sizeof(inf)); ioctl(mdfd, GET_ARRAY_INFO, &inf); if (inf.working_disks != 0) { - fprintf(stderr, Name ": another array by this name" + pr_err("another array by this name" " is already running.\n"); goto abort_locked; } @@ -602,9 +619,9 @@ /* Ok, lets try some ioctls */ - info.array.level = level; - info.array.size = size; - info.array.raid_disks = raiddisks; + info.array.level = s->level; + info.array.size = s->size; + info.array.raid_disks = s->raiddisks; /* The kernel should *know* what md_minor we are dealing * with, but it chooses to trust me instead. Sigh */ @@ -613,15 +630,15 @@ info.array.md_minor = minor(stb.st_rdev); info.array.not_persistent = 0; - if ( ( (level == 4 || level == 5) && - (insert_point < raiddisks || first_missing < raiddisks) ) + if ( ( (s->level == 4 || s->level == 5) && + (insert_point < s->raiddisks || first_missing < s->raiddisks) ) || - ( level == 6 && (insert_point < raiddisks - || second_missing < raiddisks)) + ( s->level == 6 && (insert_point < s->raiddisks + || second_missing < s->raiddisks)) || - ( level <= 0 ) + ( s->level <= 0 ) || - assume_clean + s->assume_clean ) { info.array.state = 1; /* clean, but one+ drive will be missing*/ info.resync_start = MaxSector; @@ -629,16 +646,16 @@ info.array.state = 0; /* not clean, but no errors */ info.resync_start = 0; } - if (level == 10) { + if (s->level == 10) { /* for raid10, the bitmap size is the capacity of the array, * which is array.size * raid_disks / ncopies; * .. but convert to sectors. 
*/ - int ncopies = ((layout>>8) & 255) * (layout & 255); - bitmapsize = (unsigned long long)size * raiddisks / ncopies * 2; -/* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, size, raiddisks, ncopies);*/ + int ncopies = ((s->layout>>8) & 255) * (s->layout & 255); + bitmapsize = s->size * s->raiddisks / ncopies * 2; +/* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, s->size, s->raiddisks, ncopies);*/ } else - bitmapsize = (unsigned long long)size * 2; + bitmapsize = s->size * 2; /* There is lots of redundancy in these disk counts, * raid_disks is the most meaningful value @@ -654,22 +671,23 @@ * including spares * failed_disks is the number of disks marked failed * - * Ideally, the kernel would keep these (except raid_disks) + * Ideally, the kernel would keep these (except raid_disks) * up-to-date as we ADD_NEW_DISK, but it doesn't (yet). * So for now, we assume that all raid and spare * devices will be given. */ - info.array.spare_disks=sparedisks; + info.array.spare_disks=s->sparedisks; info.array.failed_disks=missing_disks; info.array.nr_disks = info.array.working_disks + info.array.failed_disks; - info.array.layout = layout; - info.array.chunk_size = chunk*1024; + info.array.layout = s->layout; + info.array.chunk_size = s->chunk*1024; if (name == NULL || *name == 0) { /* base name on mddev */ /* /dev/md0 -> 0 * /dev/md_d0 -> d0 + * /dev/md_foo -> foo * /dev/md/1 -> 1 * /dev/md/d1 -> d1 * /dev/md/home -> home @@ -679,9 +697,8 @@ name = strrchr(mddev, '/'); if (name) { name++; - if (strncmp(name, "md_d", 4)==0 && - strlen(name) > 4 && - isdigit(name[4]) && + if (strncmp(name, "md_", 3)==0 && + strlen(name) > 3 && (name-mddev) == 5 /* /dev/ */) name += 3; else if (strncmp(name, "md", 2)==0 && @@ -691,70 +708,70 @@ name += 2; } } - if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid)) + if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid, + data_offset)) goto abort_locked; total_slots = info.array.nr_disks; 
st->ss->getinfo_super(st, &info, NULL); - sysfs_init(&info, mdfd, 0); + sysfs_init(&info, mdfd, NULL); - if (did_default && verbose >= 0) { + if (did_default && c->verbose >= 0) { if (is_subarray(info.text_version)) { - int dnum = devname2devnum(info.text_version+1); - char *path; - int mdp = get_mdp_major(); + char devnm[32]; + char *ep; struct mdinfo *mdi; - if (dnum > 0) - path = map_dev(MD_MAJOR, dnum, 1); - else - path = map_dev(mdp, (-1-dnum)<< 6, 1); - mdi = sysfs_read(-1, dnum, GET_VERSION); + strncpy(devnm, info.text_version+1, 32); + devnm[31] = 0; + ep = strchr(devnm, '/'); + if (ep) + *ep = 0; - fprintf(stderr, Name ": Creating array inside " - "%s container %s\n", - mdi?mdi->text_version:"managed", path); + mdi = sysfs_read(-1, devnm, GET_VERSION); + + pr_err("Creating array inside %s container %s\n", + mdi?mdi->text_version:"managed", devnm); sysfs_free(mdi); } else - fprintf(stderr, Name ": Defaulting to version" + pr_err("Defaulting to version" " %s metadata\n", info.text_version); } - map_update(&map, fd2devnum(mdfd), info.text_version, + map_update(&map, fd2devnm(mdfd), info.text_version, info.uuid, chosen_name); map_unlock(&map); - if (bitmap_file && vers < 9003) { + if (s->bitmap_file && vers < 9003) { major_num = BITMAP_MAJOR_HOSTENDIAN; #ifdef __BIG_ENDIAN - fprintf(stderr, Name ": Warning - bitmaps created on this kernel are not portable\n" + pr_err("Warning - bitmaps created on this kernel are not portable\n" " between different architectured. 
Consider upgrading the Linux kernel.\n"); #endif } - if (bitmap_file && strcmp(bitmap_file, "internal")==0) { + if (s->bitmap_file && strcmp(s->bitmap_file, "internal")==0) { if ((vers%100) < 2) { - fprintf(stderr, Name ": internal bitmaps not supported by this kernel.\n"); + pr_err("internal bitmaps not supported by this kernel.\n"); goto abort; } if (!st->ss->add_internal_bitmap) { - fprintf(stderr, Name ": internal bitmaps not supported with %s metadata\n", + pr_err("internal bitmaps not supported with %s metadata\n", st->ss->name); goto abort; } - if (!st->ss->add_internal_bitmap(st, &bitmap_chunk, - delay, write_behind, + if (!st->ss->add_internal_bitmap(st, &s->bitmap_chunk, + c->delay, s->write_behind, bitmapsize, 1, major_num)) { - fprintf(stderr, Name ": Given bitmap chunk size not supported.\n"); + pr_err("Given bitmap chunk size not supported.\n"); goto abort; } - bitmap_file = NULL; + s->bitmap_file = NULL; } + sysfs_init(&info, mdfd, NULL); - sysfs_init(&info, mdfd, 0); - - if (st->ss->external && st->container_dev != NoMdDev) { + if (st->ss->external && st->container_devnm[0]) { /* member */ /* When creating a member, we need to be careful @@ -769,57 +786,53 @@ * * For now, fail if it is already running. 
*/ - container_fd = open_dev_excl(st->container_dev); + container_fd = open_dev_excl(st->container_devnm); if (container_fd < 0) { - fprintf(stderr, Name ": Cannot get exclusive " + pr_err("Cannot get exclusive " "open on container - weird.\n"); goto abort; } - if (mdmon_running(st->container_dev)) { - if (verbose) - fprintf(stderr, Name ": reusing mdmon " + if (mdmon_running(st->container_devnm)) { + if (c->verbose) + pr_err("reusing mdmon " "for %s.\n", - devnum2devname(st->container_dev)); + st->container_devnm); st->update_tail = &st->updates; } else need_mdmon = 1; } rv = set_array_info(mdfd, st, &info); if (rv) { - fprintf(stderr, Name ": failed to set array info for %s: %s\n", + pr_err("failed to set array info for %s: %s\n", mddev, strerror(errno)); goto abort; } - if (bitmap_file) { + if (s->bitmap_file) { int uuid[4]; st->ss->uuid_from_super(st, uuid); - if (CreateBitmap(bitmap_file, force, (char*)uuid, bitmap_chunk, - delay, write_behind, + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk, + c->delay, s->write_behind, bitmapsize, major_num)) { goto abort; } - bitmap_fd = open(bitmap_file, O_RDWR); + bitmap_fd = open(s->bitmap_file, O_RDWR); if (bitmap_fd < 0) { - fprintf(stderr, Name ": weird: %s cannot be openned\n", - bitmap_file); + pr_err("weird: %s cannot be openned\n", + s->bitmap_file); goto abort; } if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { - fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n", + pr_err("Cannot set bitmap file for %s: %s\n", mddev, strerror(errno)); goto abort; } } - infos = malloc(sizeof(*infos) * total_slots); - if (!infos) { - fprintf(stderr, Name ": Unable to allocate memory\n"); - goto abort; - } - + infos = xmalloc(sizeof(*infos) * total_slots); + enable_fds(total_slots); for (pass=1; pass <=2 ; pass++) { struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */ @@ -849,7 +862,7 @@ inf->disk.number = dnum; inf->disk.raid_disk = dnum; - if 
(inf->disk.raid_disk < raiddisks) + if (inf->disk.raid_disk < s->raiddisks) inf->disk.state = (1<ss->external && - st->container_dev != NoMdDev) + st->container_devnm[0]) fd = open(dv->devname, O_RDWR); else fd = open(dv->devname, O_RDWR|O_EXCL); if (fd < 0) { - fprintf(stderr, Name ": failed to open %s " + pr_err("failed to open %s " "after earlier success - aborting\n", dv->devname); goto abort; @@ -880,15 +893,16 @@ if (fd >= 0) remove_partitions(fd); if (st->ss->add_to_super(st, &inf->disk, - fd, dv->devname)) { + fd, dv->devname, + dv->data_offset)) { ioctl(mdfd, STOP_ARRAY, NULL); goto abort; } st->ss->getinfo_super(st, inf, NULL); safe_mode_delay = inf->safe_mode_delay; - if (have_container && verbose > 0) - fprintf(stderr, Name ": Using %s for device %d\n", + if (have_container && c->verbose > 0) + pr_err("Using %s for device %d\n", map_dev(inf->disk.major, inf->disk.minor, 0), dnum); @@ -905,10 +919,9 @@ rv = add_disk(mdfd, st, &info, inf); if (rv) { - fprintf(stderr, - Name ": ADD_NEW_DISK for %s " - "failed: %s\n", - dv->devname, strerror(errno)); + pr_err("ADD_NEW_DISK for %s " + "failed: %s\n", + dv->devname, strerror(errno)); goto abort; } break; @@ -928,12 +941,12 @@ */ map_lock(&map); st->ss->getinfo_super(st, &info_new, NULL); - if (st->ss->external && level != LEVEL_CONTAINER && + if (st->ss->external && s->level != LEVEL_CONTAINER && !same_uuid(info_new.uuid, info.uuid, 0)) { - map_update(&map, fd2devnum(mdfd), + map_update(&map, fd2devnm(mdfd), info_new.text_version, info_new.uuid, chosen_name); - me = map_by_devnum(&map, st->container_dev); + me = map_by_devnm(&map, st->container_devnm); } if (st->ss->write_init_super(st)) { @@ -943,10 +956,10 @@ /* update parent container uuid */ if (me) { - char *path = strdup(me->path); + char *path = xstrdup(me->path); st->ss->getinfo_super(st, &info_new, NULL); - map_update(&map, st->container_dev, + map_update(&map, st->container_devnm, info_new.text_version, info_new.uuid, path); free(path); @@ -959,22 
+972,24 @@ } free(infos); - if (level == LEVEL_CONTAINER) { + if (s->level == LEVEL_CONTAINER) { /* No need to start. But we should signal udev to * create links */ sysfs_uevent(&info, "change"); - if (verbose >= 0) - fprintf(stderr, Name ": container %s prepared.\n", mddev); + if (c->verbose >= 0) + pr_err("container %s prepared.\n", mddev); wait_for(chosen_name, mdfd); - } else if (runstop == 1 || subdevs >= raiddisks) { + } else if (c->runstop == 1 || subdevs >= s->raiddisks) { if (st->ss->external) { int err; - switch(level) { + switch(s->level) { case LEVEL_LINEAR: case LEVEL_MULTIPATH: case 0: err = sysfs_set_str(&info, NULL, "array_state", - "active"); + c->readonly + ? "readonly" + : "active"); need_mdmon = 0; break; default: @@ -984,8 +999,18 @@ } sysfs_set_safemode(&info, safe_mode_delay); if (err) { - fprintf(stderr, Name ": failed to" - " activate array.\n"); + pr_err("failed to" + " activate array.\n"); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + } else if (c->readonly && + sysfs_attribute_available( + &info, NULL, "array_state")) { + if (sysfs_set_str(&info, NULL, + "array_state", "readonly") < 0) { + pr_err("Failed to start array: %s\n", + strerror(errno)); ioctl(mdfd, STOP_ARRAY, NULL); goto abort; } @@ -993,28 +1018,33 @@ /* param is not actually used */ mdu_param_t param; if (ioctl(mdfd, RUN_ARRAY, ¶m)) { - fprintf(stderr, Name ": RUN_ARRAY failed: %s\n", - strerror(errno)); + pr_err("RUN_ARRAY failed: %s\n", + strerror(errno)); if (info.array.chunk_size & (info.array.chunk_size-1)) { - fprintf(stderr, " : Problem may be that " - "chunk size is not a power of 2\n"); + cont_err("Problem may be that " + "chunk size is not a power of 2\n"); } ioctl(mdfd, STOP_ARRAY, NULL); goto abort; } + /* if start_ro module parameter is set, array is + * auto-read-only, which is bad as the resync won't + * start. So lets make it read-write now. 
+ */ + ioctl(mdfd, RESTART_ARRAY_RW, NULL); } - if (verbose >= 0) - fprintf(stderr, Name ": array %s started.\n", mddev); - if (st->ss->external && st->container_dev != NoMdDev) { + if (c->verbose >= 0) + pr_err("array %s started.\n", mddev); + if (st->ss->external && st->container_devnm[0]) { if (need_mdmon) - start_mdmon(st->container_dev); + start_mdmon(st->container_devnm); - ping_monitor_by_id(st->container_dev); + ping_monitor(st->container_devnm); close(container_fd); } wait_for(chosen_name, mdfd); } else { - fprintf(stderr, Name ": not starting array - not enough devices.\n"); + pr_err("not starting array - not enough devices.\n"); } close(mdfd); return 0; @@ -1022,7 +1052,7 @@ abort: map_lock(&map); abort_locked: - map_remove(&map, fd2devnum(mdfd)); + map_remove(&map, fd2devnm(mdfd)); map_unlock(&map); if (mdfd >= 0) diff -Nru mdadm-3.2.5/debian/changelog mdadm-3.3/debian/changelog --- mdadm-3.2.5/debian/changelog 2014-04-30 19:42:45.000000000 +0000 +++ mdadm-3.3/debian/changelog 2014-07-17 13:49:19.000000000 +0000 @@ -1,3 +1,106 @@ +mdadm (3.3-2ubuntu1) utopic; urgency=medium + + * Merge from Debian unstable, remaining changes: + - debian/initramfs/mdadm-functions: add call wait_for_udev to wait a + little longer for RAID devices to appear. + - debian/control: we need udev and util-linux in the right version. We + also remove the build dependency from quilt and docbook-to-man as both + are not used in Ubuntus mdadm. + - debian/initramfs/hook: kept the Ubuntus version for handling the absence + of active raid arrays in /etc/mdadm/mdadm.conf + - debian/initramfs/script.local-top.DEBIAN, debian/mdadm-startall, + debian/mdadm.raid.DEBIAN: removed. udev does its job now instead. + - debian/mdadm-startall.sgml, debian/mdadm-startall.8: documentation of + unused startall script + - debian/mdadm.config, debian/mdadm.postinst - let udev do the handling + instead. Resolved merge conflict by keeping Ubuntu's version. + - debian/mkconf.in is the older mkconf. 
Kept the Ubuntu version. + - debian/rules: Kept Ubuntus version for installing apport hooks, not + installing un-used startall script. + - debian/presubj: Dropped this unused bug reporting file. Instead use + source_mdadm.py act as an apport hook for bug handling. + - d/p/debian-changes-3.1.4-1+8efb9d1ubuntu4: mdadm udev rule + incrementally adds mdadm member when detected. Starting such an + array in degraded mode is possible by mdadm -IRs. Using mdadm + -ARs without stopping the array first does nothing when no + mdarray-unassociated device is available. Using mdadm -IRs to + start a previously partially assembled array through incremental + mode. Keeping the mdadm -ARs for assembling arrays which were for + some reason not assembled through incremental mode (i.e through + mdadm's udev rule). + - Take kernel cmdline parameters "nomdmonisw" and "nomdmonddf" into + account, when assembling imsm/ddf arrays. This defaults to assembling + those arrays using mdmon/mdadm, with a fallback to dmraid upon + request. + - Use default/grub.d snippet to continue using dmraid to assemble + fakeraid arrays. + + * Install incremental assembly rules. + * Disable use-external-blkid.diff, udev in Ubuntu is recent enough. + + -- Dimitri John Ledkov Thu, 17 Jul 2014 14:49:19 +0100 + +mdadm (3.3-2) unstable; urgency=low + + * use 63-md-raid-arrays.rules instead of old 64-md-raid.rules + (Closes: #726237) + * do not use builtin blkid in udev rules, as our udev (at least + on wheezy) does not have it (use-external-blkid.diff) + + -- Michael Tokarev Mon, 14 Oct 2013 15:49:54 +0400 + +mdadm (3.3-1) unstable; urgency=low + + [ Michael Tokarev ] + * new upstream 3.3 release (Closes: #718896) + See ANNOUNCE-3.3 for details. 
+ Patches: + - refreshed debian-conffile-location.diff + (added .conf.d) + - removed debian-disable-udev-incr-assembly.diff + (do not ship udev-md-raid-assembly.rules for now) + - refreshed debian-no-Werror.diff + - refreshed sha1-includes.diff + - removed patches (included upstream)A: + spelling-and-manpages.patch + fix-enough-function-for-RAID10.patch + fix-segfaults-in-detail.patch + super0-do-not-override-uuid-with-homehost.patch + mdmon-allow-takeover-when-original-was-started-with-.patch + mdmon-fix-arg-parsing.patch + mdmon-fix-arg-processing-for-a.path + Install udev-md-raid-arrays.rules instead of udev-md-raid.rules, + don't install new udev-md-raid-assembly.rules for now. + * remove Martin F. Krafft from uploaders per his request. + Thank you for your contributions! + * added remove-bashism-from-makefile.patch patch to work around + newly introduced bashism + * remove debian/source/options, there's no need to set compression + options for debian.tar.gz. + * remove outdated debian/docs/md_superblock_formats.txt and + debian/docs/md.txt (Closes: #714977, #714978) + * ship ANNOUNCE-*, external-reshape-design.txt, mdmon-design.txt + files as documentation (Closes: #715324) + + [ Dmitrijs Ledkovs ] + * Properly remove 65-mdadm.vol_id.rules, instead of trying to remove a + never-existed 65_mdadm.vol_id.rules (note the 65- vs 65_). + + -- Michael Tokarev Fri, 11 Oct 2013 10:12:47 +0400 + +mdadm (3.2.5-6) unstable; urgency=low + + * replace home-grown and not-working-since-etch udevsettle call + in initramfs script with proper wait_for_udev function (from + common initramfs functions). This unbreaks situations when + the underlying device needs some udev magic to happen before + being available, which includes stacked devices (md on lvm) + and other cases. Thanks to Thomas Parmelan and Dave Whitla + for finding the root cause of breakage and for providing + the fix. 
(Closes: #644876) + + -- Michael Tokarev Tue, 05 Mar 2013 13:32:21 +0400 + mdadm (3.2.5-5ubuntu5) utopic; urgency=medium * Instead of calling update-grub in postinst, call update-grub2. (LP: diff -Nru mdadm-3.2.5/debian/control mdadm-3.3/debian/control --- mdadm-3.2.5/debian/control 2014-01-24 14:53:21.000000000 +0000 +++ mdadm-3.3/debian/control 2014-07-16 15:56:33.000000000 +0000 @@ -2,8 +2,8 @@ Section: admin Priority: optional XSBC-Original-Maintainer: Debian mdadm maintainers -Maintainer: Ubuntu Core Developers -Uploaders: martin f. krafft , Michael Tokarev +Maintainer: Ubuntu Developers +Uploaders: Michael Tokarev Build-Depends: debhelper (>= 7.4.2), po-debconf, groff-base Standards-Version: 3.9.3 Vcs-Git: git://git.debian.org/git/pkg-mdadm/mdadm @@ -12,7 +12,7 @@ Package: mdadm Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends}, lsb-base (>= 3.1-6), debconf (>= 1.4.72), udev (>= 136-1), initramfs-tools (>= 0.85eubuntu24), util-linux (>= 2.15-1), initscripts (>= 2.88dsf-13.3) +Depends: ${shlibs:Depends}, ${misc:Depends}, lsb-base (>= 3.1-6), debconf (>= 1.4.72), udev (>= 204), initramfs-tools (>= 0.85eubuntu24), util-linux (>= 2.15-1), initscripts (>= 2.88dsf-13.3) Recommends: default-mta | mail-transport-agent, module-init-tools Replaces: mdctl Breaks: udev (<< 136-1), mdctl (<< 0.7.2), raidtools2 (<< 1.00.3-12.1), dmraid (<= 1.0.0.rc16-4.2ubuntu1) diff -Nru mdadm-3.2.5/debian/docs/md_superblock_formats.txt mdadm-3.3/debian/docs/md_superblock_formats.txt --- mdadm-3.2.5/debian/docs/md_superblock_formats.txt 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/docs/md_superblock_formats.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,534 +0,0 @@ -# From: http://linux-raid.osdl.org/index.php/RAID_superblock_formats - -RAID superblock formats - -From Linux-raid - -Jump to: navigation, search - -Contents - - • 1 RAID superblock formats - □ 1.1 The version-0.90 Superblock Format - □ 1.2 The version-1 Superblock Format - □ 1.3 Sub-versions of the 
version-1 superblock - □ 1.4 The version-1 superblock format on-disk layout - ☆ 1.4.1 Total Size of superblock - ☆ 1.4.2 Section: Superblock/"Magic-Number" Identification area - ☆ 1.4.3 Section: Per-Array Identification & Configuration area - ☆ 1.4.4 Section: RAID-Reshape In-Process Metadata Storage/Recovery - area - ☆ 1.4.5 Section: This-Component-Device Information area - ☆ 1.4.6 Section: Array-State Information area - ☆ 1.4.7 Section: Device-Roles (Positions-in-Array) area - -[edit] - -RAID superblock formats - -Currently, the Linux RAID subsystem recognizes two distinct variant -superblocks. - -They are known as "version-0.90" and "version-1" Superblock formats. - -[edit] - -The version-0.90 Superblock Format - -The version-0.90 superblock format has several limitations. It limits the -number of component devices within an array to 28, and limits each component -device to a maximum size of 2TB. - -[edit] - -The version-1 Superblock Format - -The version-1 superblock format represents a more-expandable format, capable of -supporting arrays with 384+ devices, with 64-bit sector lengths. - -[edit] - -Sub-versions of the version-1 superblock - -The "version-1" superblock format is currently used in three different -"sub-versions". - -The sub-versions differ primarily (solely?) in the location on each component -device at which they actually store the superblock. 
- -┌───────────┬───────────────────────────────────┐ -│Sub-Version│ Superblock Position on Device │ -├───────────┼───────────────────────────────────┤ -│1.0 │At the end of the device │ -├───────────┼───────────────────────────────────┤ -│1.1 │At the beginning of the device │ -├───────────┼───────────────────────────────────┤ -│1.2 │4K from the beginning of the device│ -└───────────┴───────────────────────────────────┘ -[edit] - -The version-1 superblock format on-disk layout - -[edit] - -Total Size of superblock - -Total Size of superblock: 256 Bytes, plus 2 bytes per device in the array - -[edit] - -Section: Superblock/"Magic-Number" Identification area - -16 Bytes, Offset 0-15 (0x00 - 0x0F) - -┌──────┬──────┬──────┬─────────────┬───────────┬─────┬──────────────────────────┬───────┐ -│Offset│Offset│Length│ │ Usage/ │Data │ │ │ -│(Hex) │(Dec) │ (in │ Field Name │ Meaning │Type │ Data Value │ Notes │ -│ │ │bytes)│ │ │ │ │ │ -├──────┼──────┼──────┼─────────────┼───────────┼─────┼──────────────────────────┼───────┤ -│ │ │ │ │"Magic │ │ │ │ -│0x00 -│0 - 3 │4 │magic │Number" │__u32│0xa92b4efc │ │ -│0x03 │ │ │ │(Superblock│ │(little-endian) │ │ -│ │ │ │ │ID) │ │ │ │ -├──────┼──────┼──────┼─────────────┼───────────┼─────┼──────────────────────────┼───────┤ -│ │ │ │ │Major │ │ │ │ -│0x04 -│4 - 7 │4 │major_version│Version │__u32│1 │ │ -│0x07 │ │ │ │of the │ │ │ │ -│ │ │ │ │Superblock │ │ │ │ -├──────┼──────┼──────┼─────────────┼───────────┼─────┼──────────────────────────┼───────┤ -│ │ │ │ │ │ │0 │ │ -│ │ │ │ │ │ │Bit-Mapped Field │ │ -│ │ │ │ │ │ │ │ │ -│ │ │ │ │ │ │┌─────┬──────────────────┐│ │ -│ │ │ │ │ │ ││ Bit │ Meaning ││ │ -│ │ │ │ │ │ ││Value│ ││ │ -│ │ │ │ │ │ │├─────┼──────────────────┤│ │ -│ │ │ │ │ │ ││1 │RAID Bitmap is ││ │ -│ │ │ │ │ │ ││ │used ││ │ -│ │ │ │ │ │ │├─────┼──────────────────┤│ │ -│ │ │ │ │Feature Map│ ││ │RAID Recovery is ││ │ -│ │ │ │ │- which │ ││2 │in progress ││ │ -│ │ │ │ │extended │ ││ │(See ││ │ -│ │ │ │ │features │ ││ 
│"recovery_offset")││ │ -│ │ │ │ │(such as │ │├─────┼──────────────────┤│ │ -│0x08 -│ │ │ │volume │ ││4 │RAID Reshape is in││ │ -│0x0B │8 - 11│4 │feature_map │bitmaps, │__u32││ │progress ││ │ -│ │ │ │ │recovery, │ │├─────┼──────────────────┤│ │ -│ │ │ │ │or reshape)│ ││8 │undefined/reserved││ │ -│ │ │ │ │are in use │ ││ │(0) ││ │ -│ │ │ │ │on this │ │├─────┼──────────────────┤│ │ -│ │ │ │ │array │ ││16 │undefined/reserved││ │ -│ │ │ │ │ │ ││ │(0) ││ │ -│ │ │ │ │ │ │├─────┼──────────────────┤│ │ -│ │ │ │ │ │ ││32 │undefined/reserved││ │ -│ │ │ │ │ │ ││ │(0) ││ │ -│ │ │ │ │ │ │├─────┼──────────────────┤│ │ -│ │ │ │ │ │ ││64 │undefined/reserved││ │ -│ │ │ │ │ │ ││ │(0) ││ │ -│ │ │ │ │ │ │├─────┼──────────────────┤│ │ -│ │ │ │ │ │ ││128 │undefined/reserved││ │ -│ │ │ │ │ │ ││ │(0) ││ │ -│ │ │ │ │ │ │└─────┴──────────────────┘│ │ -├──────┼──────┼──────┼─────────────┼───────────┼─────┼──────────────────────────┼───────┤ -│ │ │ │ │ │ │ │Always │ -│0x0C -│12 - │ │ │Padding │ │ │set to │ -│0x0F │15 │4 │pad0 │Block 0 │__u32│0 │zero │ -│ │ │ │ │ │ │ │when │ -│ │ │ │ │ │ │ │writing│ -└──────┴──────┴──────┴─────────────┴───────────┴─────┴──────────────────────────┴───────┘ - - -[edit] - -Section: Per-Array Identification & Configuration area - -48 Bytes, Offset 16-63 (0x10 - 0x3F) - -┌──────┬──────┬──────┬─────────────┬──────────┬─────┬────────────────┬───────────┐ -│Offset│Offset│Length│ │ Usage/ │Data │ │ │ -│(Hex) │(Dec) │ (in │ Field Name │ Meaning │Type │ Data Value │ Notes │ -│ │ │bytes)│ │ │ │ │ │ -├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ -│0x10 -│16 - │ │ │UUID for │__u8 │Set by │ │ -│0x1F │31 │16 │set_uuid │the Array │[16] │user-space │ │ -│ │ │ │ │(?) │ │formatting util │ │ -├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ -│0x20 -│32 - │ │ │Name for │char │Set and used by │ │ -│0x3F │63 │32 │set_name │the Array │[32] │user-space utils│Nt │ -│ │ │ │ │(?) 
│ │ │ │ -├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ -│ │ │ │ │ │ │low 40-bits are │ │ -│0x40 -│64 - │8 │ctime │Creation │__u64│seconds │ │ -│0x47 │71 │ │ │Time(?) │ │high 24-bits are│ │ -│ │ │ │ │ │ │uSeconds │ │ -├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ -│ │ │ │ │ │ │┌──┬───────────┐│ │ -│ │ │ │ │ │ ││-4│Multi-Path ││ │ -│ │ │ │ │ │ │├──┼───────────┤│ │ -│ │ │ │ │ │ ││-1│Linear ││ │ -│ │ │ │ │ │ │├──┼───────────┤│ │ -│ │ │ │ │ │ ││0 │RAID-0 ││ │ -│ │ │ │ │ │ ││ │(Striped) ││ │ -│ │ │ │ │ │ │├──┼───────────┤│ │ -│ │ │ │ │ │ ││1 │RAID-1 ││ │ -│ │ │ │ │ │ ││ │(Mirrored) ││mdadm │ -│ │ │ │ │ │ │├──┼───────────┤│versions │ -│ │ │ │ │ │ ││ │RAID-4 ││(as of │ -│ │ │ │ │ │ ││ │(Striped ││v2.6.4) │ -│0x48 -│72 - │ │ │RAID Level│ ││4 │with ││limit │ -│0x4B │75 │4 │level │of the │__u32││ │Dedicated ││RAID-6 │ -│ │ │ │ │Array │ ││ │Block-Level││(creation) │ -│ │ │ │ │ │ ││ │Parity) ││to 256 │ -│ │ │ │ │ │ │├──┼───────────┤│disks or │ -│ │ │ │ │ │ ││ │RAID-5 ││less │ -│ │ │ │ │ │ ││ │(Striped ││ │ -│ │ │ │ │ │ ││5 │with ││ │ -│ │ │ │ │ │ ││ │Distributed││ │ -│ │ │ │ │ │ ││ │Parity) ││ │ -│ │ │ │ │ │ │├──┼───────────┤│ │ -│ │ │ │ │ │ ││ │RAID-6 ││ │ -│ │ │ │ │ │ ││6 │(Striped ││ │ -│ │ │ │ │ │ ││ │with Dual ││ │ -│ │ │ │ │ │ ││ │Parity) ││ │ -│ │ │ │ │ │ │└──┴───────────┘│ │ -├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ -│ │ │ │ │ │ │┌─┬────────────┐│ │ -│ │ │ │ │ │ ││0│left ││ │ -│ │ │ │ │ │ ││ │asymmetric ││ │ -│ │ │ │ │ │ │├─┼────────────┤│Controls │ -│ │ │ │ │ │ ││1│right ││the │ -│ │ │ │ │layout of │ ││ │asymmetric ││relative │ -│0x4C -│76 - │4 │layout │array │__u32│├─┼────────────┤│arrangement│ -│0x4F │79 │ │ │(RAID5(and│ ││ │left ││of data and│ -│ │ │ │ │6?) only) │ ││2│symmetric ││parity │ -│ │ │ │ │ │ ││ │(default) ││blocks on │ -│ │ │ │ │ │ │├─┼────────────┤│the disks. 
│ -│ │ │ │ │ │ ││3│right ││ │ -│ │ │ │ │ │ ││ │symmetric ││ │ -│ │ │ │ │ │ │└─┴────────────┘│ │ -├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ -│ │ │ │ │ │ │size of │ │ -│ │ │ │ │used-size │ │component │ │ -│0x50 -│80 - │8 │size │of │__u64│devices │ │ -│0x57 │87 │ │ │component │ │(in # of │ │ -│ │ │ │ │devices │ │512-byte │ │ -│ │ │ │ │ │ │sectors) │ │ -├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ -│ │ │ │ │ │ │ │default is │ -│ │ │ │ │ │ │ │64K? for │ -│ │ │ │ │ │ │ │raid levels│ -│ │ │ │ │ │ │ │0, 10, 4, │ -│ │ │ │ │ │ │ │5, and 6 │ -│ │ │ │ │ │ │ │chunksize │ -│ │ │ │ │ │ │ │not used in│ -│ │ │ │ │ │ │ │raid levels│ -│ │ │ │ │ │ │chunk-size of │1, linear, │ -│ │ │ │ │chunk-size│ │the array │and │ -│0x58 -│88 - │4 │chunksize │of the │__u32│(in # of │multi-path │ -│0x5B │91 │ │ │array │ │512-byte │ │ -│ │ │ │ │ │ │sectors) │Note: │ -│ │ │ │ │ │ │ │During │ -│ │ │ │ │ │ │ │creation │ -│ │ │ │ │ │ │ │this │ -│ │ │ │ │ │ │ │appears to │ -│ │ │ │ │ │ │ │be created │ -│ │ │ │ │ │ │ │as a │ -│ │ │ │ │ │ │ │multiple of│ -│ │ │ │ │ │ │ │1024 rather│ -│ │ │ │ │ │ │ │than 512. │ -├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ -│ │ │ │ │ │ │ │raid4 │ -│ │ │ │ │ │ │ │requires a │ -│ │ │ │ │ │ │ │minimum of │ -│ │ │ │ │ │ │ │2 member │ -│ │ │ │ │ │ │ │devs │ -│ │ │ │ │ │ │ │raid5 │ -│ │ │ │ │ │ │ │requires a │ -│ │ │ │ │ │ │ │minimum of │ -│ │ │ │ │(?)number │ │ │2 member │ -│0x5C -│92 - │4 │raid_disks │of disks │__u32│# │devs │ -│0x5F │95 │ │ │in array │ │ │raid6 │ -│ │ │ │ │(?) 
│ │ │requires a │ -│ │ │ │ │ │ │ │minimum of │ -│ │ │ │ │ │ │ │4 member │ -│ │ │ │ │ │ │ │devs │ -│ │ │ │ │ │ │ │raid6 │ -│ │ │ │ │ │ │ │limited to │ -│ │ │ │ │ │ │ │a max of │ -│ │ │ │ │ │ │ │256 member │ -│ │ │ │ │ │ │ │devs │ -├──────┼──────┼──────┼─────────────┼──────────┼─────┼────────────────┼───────────┤ -│ │ │ │ │ │ │ │This is │ -│ │ │ │ │# of │ │ │only valid │ -│ │ │ │ │sectors │ │ │if │ -│ │ │ │ │after │ │ │feature_map│ -│ │ │ │ │superblock│ │ │[1] is set │ -│ │ │ │ │that │ │ │ │ -│0x60 -│96 - │4 │bitmap_offset│bitmap │__u32│(signed) │Signed │ -│0x63 │99 │ │ │starts │ │ │value │ -│ │ │ │ │(See note │ │ │allows │ -│ │ │ │ │about │ │ │bitmap │ -│ │ │ │ │signed │ │ │to appear │ -│ │ │ │ │value) │ │ │before │ -│ │ │ │ │ │ │ │superblock │ -│ │ │ │ │ │ │ │on the disk│ -└──────┴──────┴──────┴─────────────┴──────────┴─────┴────────────────┴───────────┘ - - -[edit] - -Section: RAID-Reshape In-Process Metadata Storage/Recovery area - -28 Bytes, Offset 100-127 (0x64 - 0x7F) -(Note: Only contains valid data if feature_map bit '4' is set) - -┌──────┬──────┬──────┬────────────────┬───────────┬─────┬─────────────┬───────┐ -│Offset│Offset│Length│ │ Usage/ │Data │ │ │ -│(Hex) │(Dec) │ (in │ Field Name │ Meaning │Type │ Data Value │ Notes │ -│ │ │bytes)│ │ │ │ │ │ -├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ -│ │ │ │ │the new │ │ │ │ -│0x64 -│100 - │4 │new_level │RAID level │__u32│see level │ │ -│0x67 │103 │ │ │being │ │field (above)│ │ -│ │ │ │ │reshaped-to│ │ │ │ -├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ -│ │ │ │ │Next │ │current │ │ -│0x68 -│104 - │8 │reshape_position│address of │__u64│position of │ │ -│0x6F │111 │ │ │the array │ │the reshape │ │ -│ │ │ │ │to reshape │ │operation │ │ -├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ -├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ -│ │ │ │ │this holds │ │ │ │ -│0x70 -│112 - │4
│delta_disks │the change │__u32│change in # │ │ -│0x73 │115 │ │ │in # of │ │of raid disks│ │ -│ │ │ │ │raid disks │ │ │ │ -├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ -│0x74 -│116 - │4 │new_layout │new layout │__u32│see layout │ │ -│0x77 │119 │ │ │for array │ │field (above)│ │ -├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ -│0x78 -│120 - │4 │new_chunk │new chunk │__u32│see chunksize│ │ -│0x7B │123 │ │ │size │ │field (above)│ │ -├──────┼──────┼──────┼────────────────┼───────────┼─────┼─────────────┼───────┤ -│ │ │ │ │ │ │ │Always │ -│0x7C -│124 - │ │ │Padding │__u8 │ │set to │ -│0x7F │127 │4 │pad1 │Block #1 │[4] │0 │zero │ -│ │ │ │ │ │ │ │when │ -│ │ │ │ │ │ │ │writing│ -└──────┴──────┴──────┴────────────────┴───────────┴─────┴─────────────┴───────┘ - - - -[edit] - -Section: This-Component-Device Information area - -64 Bytes, Offset 128-191 (0x80 - 0xbf) - -┌──────┬──────┬──────┬──────────────────┬────────────┬─────┬────────────────────┬────────────┐ -│Offset│Offset│Length│ │ Usage/ │Data │ │ │ -│(Hex) │(Dec) │ (in │ Field Name │ Meaning │Type │ Data Value │ Notes │ -│ │ │bytes)│ │ │ │ │ │ -├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ -│0x80 -│128 - │ │ │the sector #│ │sector # where data │ │ -│0x87 │135 │8 │data_offset │upon which │__u64│begins │ │ -│ │ │ │ │data starts │ │(Often 0) │ │ -├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ -│ │ │ │ │sectors in │ │ │ │ -│0x88 -│136 - │ │ │the device │ │# of sectors that │ │ -│0x8F │143 │8 │data_size │that are │__u64│can be used for data│ │ -│ │ │ │ │used for │ │ │ │ -│ │ │ │ │data │ │ │ │ -├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ -│ │ │ │ │# of the │ │ │ │ -│0x90 -│144 - │ │ │sector upon │ │# of the sector upon│ │ -│0x97 │151 │8 │super_offset │which this │__u64│which this │ │ -│ │ │ │ │superblock │ 
│superblock starts │ │ -│ │ │ │ │starts │ │ │ │ -├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ -│ │ │ │ │sectors │ │ │ │ -│ │ │ │ │before this │ │ │ │ -│0x98 -│152 - │ │ │offset │ │ │ │ -│0x9F │159 │8 │recovery_offset │(from │__u64│sector # │ │ -│ │ │ │ │data_offset)│ │ │ │ -│ │ │ │ │have been │ │ │ │ -│ │ │ │ │recovered │ │ │ │ -├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ -│0xA0 -│160 - │ │ │ │ │Permanent identifier│ │ -│0xA3 │163 │4 │dev_number │Fm │__u32│of this device (Not │ │ -│ │ │ │ │ │ │its role in RAID(?))│ │ -├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ -│ │ │ │ │Number of │ │ │ │ -│0xA4 -│164 - │ │ │read-errors │ │ │ │ -│0xA7 │167 │4 │cnt_corrected_read│that were │__u32│Dv │ │ -│ │ │ │ │corrected by│ │ │ │ -│ │ │ │ │re-writing │ │ │ │ -├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ -│ │ │ │ │UUID of the │ │ │Set by │ -│0xA8 -│168 - │16 │device_uuid │component │__u8 │ │User-Space │ -│0xB7 │183 │ │ │device │[16] │ │Ignored by │ -│ │ │ │ │ │ │ │kernel │ -├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ -│ │ │ │ │ │ │Bit-Mapped Field │ │ -│ │ │ │ │ │ │ │ │ -│ │ │ │ │ │ │┌─────┬────────────┐│ │ -│ │ │ │ │ │ ││ Bit │ Meaning ││ │ -│ │ │ │ │ │ ││Value│ ││WriteMostly1│ -│ │ │ │ │ │ │├─────┼────────────┤│indicates │ -│ │ │ │ │ │ ││1 │WriteMostly1││that this │ -│ │ │ │ │ │ │├─────┼────────────┤│device │ -│ │ │ │ │ │ ││2 │(?) ││should only │ -│ │ │ │ │Per-Device │ │├─────┼────────────┤│be updated │ -│0xB8 │184 │1 │devflags │Flags │__u8 ││4 │(?) ││on writes, │ -│ │ │ │ │(Bit-Mapped │ │├─────┼────────────┤│not read │ -│ │ │ │ │Field) │ ││8 │(?) ││from. │ -│ │ │ │ │ │ │├─────┼────────────┤│(Useful with│ -│ │ │ │ │ │ ││16 │(?) ││slow devices│ -│ │ │ │ │ │ │├─────┼────────────┤│in RAID1 │ -│ │ │ │ │ │ ││32 │(?) ││arrays?) 
│ -│ │ │ │ │ │ │├─────┼────────────┤│ │ -│ │ │ │ │ │ ││64 │(?) ││ │ -│ │ │ │ │ │ │├─────┼────────────┤│ │ -│ │ │ │ │ │ ││128 │(?) ││ │ -│ │ │ │ │ │ │└─────┴────────────┘│ │ -├──────┼──────┼──────┼──────────────────┼────────────┼─────┼────────────────────┼────────────┤ -│ │ │ │ │ │ │ │Always set │ -│0xB9 -│185 - │7 │pad2 │Padding │__u8 │0 │to │ -│0xBF │191 │ │ │block 2 │[7] │ │zero when │ -│ │ │ │ │ │ │ │writing │ -└──────┴──────┴──────┴──────────────────┴────────────┴─────┴────────────────────┴────────────┘ - - -[edit] - -Section: Array-State Information area - -64 Bytes, Offset 192-255 (0xC0 - 0xFF) - -┌──────┬──────┬──────┬─────────────┬─────────────┬─────┬────────┬─────────────┐ -│Offset│Offset│Length│ │ │Data │ Data │ │ -│(Hex) │(Dec) │ (in │ Field Name │Usage/Meaning│Type │ Value │ Notes │ -│ │ │bytes)│ │ │ │ │ │ -├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ -│ │ │ │ │ │ │low │ │ -│ │ │ │ │ │ │40-bits │ │ -│ │ │ │ │ │ │are │ │ -│0xC0 -│192 - │8 │utime │Fm │__u64│seconds │Nt │ -│0xC7 │199 │ │ │ │ │high │ │ -│ │ │ │ │ │ │24-bits │ │ -│ │ │ │ │ │ │are │ │ -│ │ │ │ │ │ │uSeconds│ │ -├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ -│ │ │ │ │ │ │ │Updated │ -│ │ │ │ │ │ │ │whenever the │ -│ │ │ │ │ │ │ │superblock is│ -│ │ │ │ │ │ │ │updated. │ -│ │ │ │ │ │ │ │Used by mdadm│ -│0xC8 -│200 - │8 │events │Event Count │__u64│# │in │ -│0xCF │207 │ │ │for the Array│ │ │re-assembly │ -│ │ │ │ │ │ │ │to detect │ -│ │ │ │ │ │ │ │failed/ │ -│ │ │ │ │ │ │ │out-of-sync │ -│ │ │ │ │ │ │ │component │ -│ │ │ │ │ │ │ │devices. │ -├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ -│ │ │ │ │Offsets │ │ │ │ -│ │ │ │ │before this │ │ │ │ -│ │ │ │ │one (starting│ │ │ │ -│0xD0 -│208 - │8 │resync_offset│from │__u64│offset #│ │ -│0xD7 │215 │ │ │data_offset) │ │ │ │ -│ │ │ │ │are 'known' │ │ │ │ -│ │ │ │ │to be in │ │ │ │ -│ │ │ │ │sync. 
│ │ │ │ -├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ -│ │ │ │ │ │ │ │This value │ -│ │ │ │ │Checksum of │ │ │will be │ -│0xD8 -│216 - │ │ │this │ │ │different for│ -│0xDB │219 │4 │sb_csum │superblock up│__u32│# │each │ -│ │ │ │ │to devs │ │ │component │ -│ │ │ │ │[max_dev] │ │ │device's │ -│ │ │ │ │ │ │ │superblock. │ -├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ -│ │ │ │ │How many │ │ │ │ -│0xDC -│220 - │ │ │devices are │ │ │ │ -│0xDF │223 │4 │max_dev │part of (or │__u32│# │ │ -│ │ │ │ │related to) │ │ │ │ -│ │ │ │ │the array │ │ │ │ -├──────┼──────┼──────┼─────────────┼─────────────┼─────┼────────┼─────────────┤ -│0xE0 -│224 - │ │ │Padding Block│__u8 │ │Always set to│ -│0xFF │255 │32 │pad3 │3 │[32] │0 │zero when │ -│ │ │ │ │ │ │ │writing │ -└──────┴──────┴──────┴─────────────┴─────────────┴─────┴────────┴─────────────┘ - - -[edit] - -Section: Device-Roles (Positions-in-Array) area - -Length: Variable number of bytes (but at least 768 bytes?) -2 Bytes per device in the array, including both spare-devices and -faulty-devices - -┌──────────────────────────────────────────────────────────────────────────────┐ -│ Section: Device-Roles (Positions-in-Array) area │ -├──────────────────────────────────────────────────────────────────────────────┤ -│(Variable length - 2 Bytes per Device in Array (including Spares/Faulty-Devs) │ -├──────────────────────────────────────────────────────────────────────────────┤ -│ │ -├────────┬───────┬──────┬─────────┬────────┬─────┬───────────────────────┬─────┤ -│ Offset │Offset │Length│ Field │ Usage/ │Data │ │ │ -│ (Hex) │ (Dec) │ (in │ Name │Meaning │Type │ Data Value │Notes│ -│ │ │bytes)│ │ │ │ │ │ -├────────┴───────┴──────┴─────────┴────────┴─────┴───────────────────────┴─────┤ -│ ?? Bytes, Offset 256-??? (0x100 - 0x???) │ -├────────┬───────┬──────┬─────────┬────────┬─────┬───────────────────────┬─────┤ -│ │ │ │ │ │ │Role or Position of │ │ -│0x100 - │256 │? 
│dev_roles│Fm │__u16│device in the array. │ │ -│0x??? │- ??? │ │ │ │ │0xFFFF means "spare". │ │ -│ │ │ │ │ │ │0xFFFE means "faulty". │ │ -└────────┴───────┴──────┴─────────┴────────┴─────┴───────────────────────┴─────┘ -Retrieved from "http://linux-raid.osdl.org/index.php/RAID_superblock_formats" - -Views - - • Article - • Discussion - • Edit - • History - -Personal tools - - • Log in / create account - - - -Navigation - - • Linux Raid - • Community portal - • Current events - • Recent changes - • Random page - • Help - • Donations - -Search - -[ ] [Go] [Search] -Toolbox - - • What links here - • Related changes - • Special pages - • Printable version - • Permanent link - -MediaWiki -GNU Free Documentation License 1.2 - - • This page was last modified 04:50, 3 June 2008. - • This page has been accessed 5,723 times. - • Content is available under GNU Free Documentation License 1.2. - • Privacy policy - • About Linux-raid - • Disclaimers - diff -Nru mdadm-3.2.5/debian/docs/md.txt mdadm-3.3/debian/docs/md.txt --- mdadm-3.2.5/debian/docs/md.txt 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/docs/md.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,600 +0,0 @@ -Tools that manage md devices can be found at - http://www.kernel.org/pub/linux/utils/raid/ - - -Boot time assembly of RAID arrays ---------------------------------- - -You can boot with your md device with the following kernel command -lines: - -for old raid arrays without persistent superblocks: - md=,,,,dev0,dev1,...,devn - -for raid arrays with persistent superblocks - md=,dev0,dev1,...,devn -or, to assemble a partitionable array: - md=d,dev0,dev1,...,devn - -md device no. = the number of the md device ... - 0 means md0, - 1 md1, - 2 md2, - 3 md3, - 4 md4 - -raid level = -1 linear mode - 0 striped mode - other modes are only supported with persistent super blocks - -chunk size factor = (raid-0 and raid-1 only) - Set the chunk size as 4k << n. - -fault level = totally ignored - -dev0-devn: e.g. 
/dev/hda1,/dev/hdc1,/dev/sda1,/dev/sdb1 - -A possible loadlin line (Harald Hoyer ) looks like this: - -e:\loadlin\loadlin e:\zimage root=/dev/md0 md=0,0,4,0,/dev/hdb2,/dev/hdc3 ro - - -Boot time autodetection of RAID arrays --------------------------------------- - -When md is compiled into the kernel (not as module), partitions of -type 0xfd are scanned and automatically assembled into RAID arrays. -This autodetection may be suppressed with the kernel parameter -"raid=noautodetect". As of kernel 2.6.9, only drives with a type 0 -superblock can be autodetected and run at boot time. - -The kernel parameter "raid=partitionable" (or "raid=part") means -that all auto-detected arrays are assembled as partitionable. - -Boot time assembly of degraded/dirty arrays -------------------------------------------- - -If a raid5 or raid6 array is both dirty and degraded, it could have -undetectable data corruption. This is because the fact that it is -'dirty' means that the parity cannot be trusted, and the fact that it -is degraded means that some datablocks are missing and cannot reliably -be reconstructed (due to no parity). - -For this reason, md will normally refuse to start such an array. This -requires the sysadmin to take action to explicitly start the array -despite possible corruption. This is normally done with - mdadm --assemble --force .... - -This option is not really available if the array has the root -filesystem on it. In order to support booting from such an -array, md supports a module parameter "start_dirty_degraded" which, -when set to 1, bypasses the checks and allows dirty degraded -arrays to be started. - -So, to boot with a root filesystem of a dirty degraded raid[56], use - - md-mod.start_dirty_degraded=1 - - -Superblock formats ------------------- - -The md driver can support a variety of different superblock formats. -Currently, it supports superblock formats "0.90.0" and the "md-1" format -introduced in the 2.5 development series.
- -The kernel will autodetect which format superblock is being used. - -Superblock format '0' is treated differently to others for legacy -reasons - it is the original superblock format. - - -General Rules - apply for all superblock formats ------------------------------------------------- - -An array is 'created' by writing appropriate superblocks to all -devices. - -It is 'assembled' by associating each of these devices with a -particular md virtual device. Once it is completely assembled, it can -be accessed. - -An array should be created by a user-space tool. This will write -superblocks to all devices. It will usually mark the array as -'unclean', or with some devices missing so that the kernel md driver -can create appropriate redundancy (copying in raid1, parity -calculation in raid4/5). - -When an array is assembled, it is first initialized with the -SET_ARRAY_INFO ioctl. This contains, in particular, a major and minor -version number. The major version number selects which superblock -format is to be used. The minor number might be used to tune handling -of the format, such as suggesting where on each device to look for the -superblock. - -Then each device is added using the ADD_NEW_DISK ioctl. This -provides, in particular, a major and minor number identifying the -device to add. - -The array is started with the RUN_ARRAY ioctl. - -Once started, new devices can be added. They should have an -appropriate superblock written to them, and then passed in with -ADD_NEW_DISK. - -Devices that have failed or are not yet active can be detached from an -array using HOT_REMOVE_DISK. - - -Specific Rules that apply to format-0 super block arrays, and - arrays with no superblock (non-persistent). -------------------------------------------------------------- - -An array can be 'created' by describing the array (level, chunksize -etc) in a SET_ARRAY_INFO ioctl. This must have major_version==0 and -raid_disks != 0.
- -Then uninitialized devices can be added with ADD_NEW_DISK. The -structure passed to ADD_NEW_DISK must specify the state of the device -and its role in the array. - -Once started with RUN_ARRAY, uninitialized spares can be added with -HOT_ADD_DISK. - - - -MD devices in sysfs -------------------- -md devices appear in sysfs (/sys) as regular block devices, -e.g. - /sys/block/md0 - -Each 'md' device will contain a subdirectory called 'md' which -contains further md-specific information about the device. - -All md devices contain: - level - a text file indicating the 'raid level'. e.g. raid0, raid1, - raid5, linear, multipath, faulty. - If no raid level has been set yet (array is still being - assembled), the value will reflect whatever has been written - to it, which may be a name like the above, or may be a number - such as '0', '5', etc. - - raid_disks - a text file with a simple number indicating the number of devices - in a fully functional array. If this is not yet known, the file - will be empty. If an array is being resized this will contain - the new number of devices. - Some raid levels allow this value to be set while the array is - active. This will reconfigure the array. Otherwise it can only - be set while assembling an array. - A change to this attribute will not be permitted if it would - reduce the size of the array. To reduce the number of drives - in an e.g. raid5, the array size must first be reduced by - setting the 'array_size' attribute. - - chunk_size - This is the size in bytes for 'chunks' and is only relevant to - raid levels that involve striping (0,4,5,6,10). The address space - of the array is conceptually divided into chunks and consecutive - chunks are striped onto neighbouring devices. - The size should be at least PAGE_SIZE (4k) and should be a power - of 2. This can only be set while assembling an array - - layout - The "layout" for the array for the particular level. 
This is - simply a number that is interpreted differently by different - levels. It can be written while assembling an array. - - array_size - This can be used to artificially constrain the available space in - the array to be less than is actually available on the combined - devices. Writing a number (in Kilobytes) which is less than - the available size will set the size. Any reconfiguration of the - array (e.g. adding devices) will not cause the size to change. - Writing the word 'default' will cause the effective size of the - array to be whatever size is actually available based on - 'level', 'chunk_size' and 'component_size'. - - This can be used to reduce the size of the array before reducing - the number of devices in a raid4/5/6, or to support external - metadata formats which mandate such clipping. - - reshape_position - This is either "none" or a sector number within the devices of - the array where "reshape" is up to. If this is set, the three - attributes mentioned above (raid_disks, chunk_size, layout) can - potentially have 2 values, an old and a new value. If these - values differ, reading the attribute returns - new (old) - and writing will affect the 'new' value, leaving the 'old' - unchanged. - - component_size - For arrays with data redundancy (i.e. not raid0, linear, faulty, - multipath), all components must be the same size - or at least - there must be a size that they all provide space for. This is a key - part of the geometry of the array. It is measured in sectors - and can be read from here. Writing to this value may resize - the array if the personality supports it (raid1, raid5, raid6), - and if the component drives are large enough. - - metadata_version - This indicates the format that is being used to record metadata - about the array. It can be 0.90 (traditional format), 1.0, 1.1, - 1.2 (newer format in varying locations) or "none" indicating that - the kernel isn't managing metadata at all.
- Alternately it can be "external:" followed by a string which - is set by user-space. This indicates that metadata is managed - by a user-space program. Any device failure or other event that - requires a metadata update will cause array activity to be - suspended until the event is acknowledged. - - resync_start - The point at which resync should start. If no resync is needed, - this will be a very large number (or 'none' since 2.6.30-rc1). At - array creation it will default to 0, though starting the array as - 'clean' will set it much larger. - - new_dev - This file can be written but not read. The value written should - be a block device number as major:minor. e.g. 8:0 - This will cause that device to be attached to the array, if it is - available. It will then appear at md/dev-XXX (depending on the - name of the device) and further configuration is then possible. - - safe_mode_delay - When an md array has seen no write requests for a certain period - of time, it will be marked as 'clean'. When another write - request arrives, the array is marked as 'dirty' before the write - commences. This is known as 'safe_mode'. - The 'certain period' is controlled by this file which stores the - period as a number of seconds. The default is 200msec (0.200). - Writing a value of 0 disables safemode. - - array_state - This file contains a single word which describes the current - state of the array. In many cases, the state can be set by - writing the word for the desired state, however some states - cannot be explicitly set, and some transitions are not allowed. - - Select/poll works on this file. All changes except between - active_idle and active (which can be frequent and are not - very interesting) are notified. active->active_idle is - reported if the metadata is externally managed. 
- - clear - No devices, no size, no level - Writing is equivalent to STOP_ARRAY ioctl - inactive - May have some settings, but array is not active - all IO results in error - When written, doesn't tear down array, but just stops it - suspended (not supported yet) - All IO requests will block. The array can be reconfigured. - Writing this, if accepted, will block until array is quiescent - readonly - no resync can happen. no superblocks get written. - write requests fail - read-auto - like readonly, but behaves like 'clean' on a write request. - - clean - no pending writes, but otherwise active. - When written to inactive array, starts without resync - If a write request arrives then - if metadata is known, mark 'dirty' and switch to 'active'. - if not known, block and switch to write-pending - If written to an active array that has pending writes, then fails. - active - fully active: IO and resync can be happening. - When written to inactive array, starts with resync - - write-pending - clean, but writes are blocked waiting for 'active' to be written. - - active-idle - like active, but no writes have been seen for a while (safe_mode_delay). - - bitmap/location - This indicates where the write-intent bitmap for the array is - stored. - It can be one of "none", "file" or "[+-]N". - "file" may later be extended to "file:/file/name" - "[+-]N" means that many sectors from the start of the metadata. - This is replicated on all devices. For arrays with externally - managed metadata, the offset is from the beginning of the - device. - bitmap/chunksize - The size, in bytes, of the chunk which will be represented by a - single bit. For RAID456, it is a portion of an individual - device. For RAID10, it is a portion of the array. For RAID1, it - is both (they come to the same thing). - bitmap/time_base - The time, in seconds, between looking for bits in the bitmap to - be cleared.
In the current implementation, a bit will be cleared - between 2 and 3 times "time_base" after all the covered blocks - are known to be in-sync. - bitmap/backlog - When write-mostly devices are active in a RAID1, write requests - to those devices proceed in the background - the filesystem (or - other user of the device) does not have to wait for them. - 'backlog' sets a limit on the number of concurrent background - writes. If there are more than this, new writes will be - synchronous. - bitmap/metadata - This can be either 'internal' or 'external'. - 'internal' is the default and means the metadata for the bitmap - is stored in the first 256 bytes of the allocated space and is - managed by the md module. - 'external' means that bitmap metadata is managed externally to - the kernel (i.e. by some userspace program) - bitmap/can_clear - This is either 'true' or 'false'. If 'true', then bits in the - bitmap will be cleared when the corresponding blocks are thought - to be in-sync. If 'false', bits will never be cleared. - This is automatically set to 'false' if a write happens on a - degraded array, or if the array becomes degraded during a write. - When metadata is managed externally, it should be set to true - once the array becomes non-degraded, and this fact has been - recorded in the metadata. - - - - -As component devices are added to an md array, they appear in the 'md' -directory as new directories named - dev-XXX -where XXX is a name that the kernel knows for the device, e.g. hdb1. -Each directory contains: - - block - a symlink to the block device in /sys/block, e.g. - /sys/block/md0/md/dev-hdb1/block -> ../../../../block/hdb/hdb1 - - super - A file containing an image of the superblock read from, or - written to, that device.
- - state - A file recording the current state of the device in the array - which can be a comma separated list of - faulty - device has been kicked from active use due to - a detected fault or it has unacknowledged bad - blocks - in_sync - device is a fully in-sync member of the array - writemostly - device will only be subject to read - requests if there are no other options. - This applies only to raid1 arrays. - blocked - device has failed, and the failure hasn't been - acknowledged yet by the metadata handler. - Writes that would write to this device if - it were not faulty are blocked. - spare - device is working, but not a full member. - This includes spares that are in the process - of being recovered to - write_error - device has ever seen a write error. - This list may grow in future. - This can be written to. - Writing "faulty" simulates a failure on the device. - Writing "remove" removes the device from the array. - Writing "writemostly" sets the writemostly flag. - Writing "-writemostly" clears the writemostly flag. - Writing "blocked" sets the "blocked" flag. - Writing "-blocked" clears the "blocked" flags and allows writes - to complete and possibly simulates an error. - Writing "in_sync" sets the in_sync flag. - Writing "write_error" sets writeerrorseen flag. - Writing "-write_error" clears writeerrorseen flag. - - This file responds to select/poll. Any change to 'faulty' - or 'blocked' causes an event. - - errors - An approximate count of read errors that have been detected on - this device but have not caused the device to be evicted from - the array (either because they were corrected or because they - happened while the array was read-only). When using version-1 - metadata, this value persists across restarts of the array. - - This value can be written while assembling an array thus - providing an ongoing count for arrays with metadata managed by - userspace. - - slot - This gives the role that the device has in the array. 
It will - either be 'none' if the device is not active in the array - (i.e. is a spare or has failed) or an integer less than the - 'raid_disks' number for the array indicating which position - it currently fills. This can only be set while assembling an - array. A device for which this is set is assumed to be working. - - offset - This gives the location in the device (in sectors from the - start) where data from the array will be stored. Any part of - the device before this offset us not touched, unless it is - used for storing metadata (Formats 1.1 and 1.2). - - size - The amount of the device, after the offset, that can be used - for storage of data. This will normally be the same as the - component_size. This can be written while assembling an - array. If a value less than the current component_size is - written, it will be rejected. - - recovery_start - When the device is not 'in_sync', this records the number of - sectors from the start of the device which are known to be - correct. This is normally zero, but during a recovery - operation is will steadily increase, and if the recovery is - interrupted, restoring this value can cause recovery to - avoid repeating the earlier blocks. With v1.x metadata, this - value is saved and restored automatically. - - This can be set whenever the device is not an active member of - the array, either before the array is activated, or before - the 'slot' is set. - - Setting this to 'none' is equivalent to setting 'in_sync'. - Setting to any other value also clears the 'in_sync' flag. - - bad_blocks - This gives the list of all known bad blocks in the form of - start address and length (in sectors respectively). If output - is too big to fit in a page, it will be truncated. Writing - "sector length" to this file adds new acknowledged (i.e. - recorded to disk safely) bad blocks. - - unacknowledged_bad_blocks - This gives the list of known-but-not-yet-saved-to-disk bad - blocks in the same form of 'bad_blocks'. 
If output is too big - to fit in a page, it will be truncated. Writing to this file - adds bad blocks without acknowledging them. This is largely - for testing. - - - -An active md device will also contain and entry for each active device -in the array. These are named - - rdNN - -where 'NN' is the position in the array, starting from 0. -So for a 3 drive array there will be rd0, rd1, rd2. -These are symbolic links to the appropriate 'dev-XXX' entry. -Thus, for example, - cat /sys/block/md*/md/rd*/state -will show 'in_sync' on every line. - - - -Active md devices for levels that support data redundancy (1,4,5,6) -also have - - sync_action - a text file that can be used to monitor and control the rebuild - process. It contains one word which can be one of: - resync - redundancy is being recalculated after unclean - shutdown or creation - recover - a hot spare is being built to replace a - failed/missing device - idle - nothing is happening - check - A full check of redundancy was requested and is - happening. This reads all block and checks - them. A repair may also happen for some raid - levels. - repair - A full check and repair is happening. This is - similar to 'resync', but was requested by the - user, and the write-intent bitmap is NOT used to - optimise the process. - - This file is writable, and each of the strings that could be - read are meaningful for writing. - - 'idle' will stop an active resync/recovery etc. There is no - guarantee that another resync/recovery may not be automatically - started again, though some event will be needed to trigger - this. - 'resync' or 'recovery' can be used to restart the - corresponding operation if it was stopped with 'idle'. - 'check' and 'repair' will start the appropriate process - providing the current state is 'idle'. - - This file responds to select/poll. Any important change in the value - triggers a poll event. 
Sometimes the value will briefly be - "recover" if a recovery seems to be needed, but cannot be - achieved. In that case, the transition to "recover" isn't - notified, but the transition away is. - - degraded - This contains a count of the number of devices by which the - arrays is degraded. So an optimal array with show '0'. A - single failed/missing drive will show '1', etc. - This file responds to select/poll, any increase or decrease - in the count of missing devices will trigger an event. - - mismatch_count - When performing 'check' and 'repair', and possibly when - performing 'resync', md will count the number of errors that are - found. The count in 'mismatch_cnt' is the number of sectors - that were re-written, or (for 'check') would have been - re-written. As most raid levels work in units of pages rather - than sectors, this my be larger than the number of actual errors - by a factor of the number of sectors in a page. - - bitmap_set_bits - If the array has a write-intent bitmap, then writing to this - attribute can set bits in the bitmap, indicating that a resync - would need to check the corresponding blocks. Either individual - numbers or start-end pairs can be written. Multiple numbers - can be separated by a space. - Note that the numbers are 'bit' numbers, not 'block' numbers. - They should be scaled by the bitmap_chunksize. - - sync_speed_min - sync_speed_max - This are similar to /proc/sys/dev/raid/speed_limit_{min,max} - however they only apply to the particular array. - If no value has been written to these, of if the word 'system' - is written, then the system-wide value is used. If a value, - in kibibytes-per-second is written, then it is used. - When the files are read, they show the currently active value - followed by "(local)" or "(system)" depending on whether it is - a locally set or system-wide value. 
- - sync_completed - This shows the number of sectors that have been completed of - whatever the current sync_action is, followed by the number of - sectors in total that could need to be processed. The two - numbers are separated by a '/' thus effectively showing one - value, a fraction of the process that is complete. - A 'select' on this attribute will return when resync completes, - when it reaches the current sync_max (below) and possibly at - other times. - - sync_max - This is a number of sectors at which point a resync/recovery - process will pause. When a resync is active, the value can - only ever be increased, never decreased. The value of 'max' - effectively disables the limit. - - - sync_speed - This shows the current actual speed, in K/sec, of the current - sync_action. It is averaged over the last 30 seconds. - - suspend_lo - suspend_hi - The two values, given as numbers of sectors, indicate a range - within the array where IO will be blocked. This is currently - only supported for raid4/5/6. - - sync_min - sync_max - The two values, given as numbers of sectors, indicate a range - within the array where 'check'/'repair' will operate. Must be - a multiple of chunk_size. When it reaches "sync_max" it will - pause, rather than complete. - You can use 'select' or 'poll' on "sync_completed" to wait for - that number to reach sync_max. Then you can either increase - "sync_max", or can write 'idle' to "sync_action". - - -Each active md device may also have attributes specific to the -personality module that manages it. -These are specific to the implementation of the module and could -change substantially if the implementation changes. - -These currently include - - stripe_cache_size (currently raid5 only) - number of entries in the stripe cache. This is writable, but - there are upper and lower limits (32768, 16). Default is 128. 
- strip_cache_active (currently raid5 only) - number of active entries in the stripe cache - preread_bypass_threshold (currently raid5 only) - number of times a stripe requiring preread will be bypassed by - a stripe that does not require preread. For fairness defaults - to 1. Setting this to 0 disables bypass accounting and - requires preread stripes to wait until all full-width stripe- - writes are complete. Valid values are 0 to stripe_cache_size. diff -Nru mdadm-3.2.5/debian/initramfs/hook mdadm-3.3/debian/initramfs/hook --- mdadm-3.2.5/debian/initramfs/hook 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/initramfs/hook 2014-07-16 15:52:38.000000000 +0000 @@ -54,13 +54,14 @@ copy_exec /sbin/mdmon /sbin # Copy udev rules, which udev no longer does -UDEV_RULE=64-md-raid.rules +for UDEV_RULE in 63-md-raid-arrays.rules 64-md-raid-assembly.rules; do for rules_folder in /lib/udev/rules.d /etc/udev/rules.d; do if [ -f $rules_folder/$UDEV_RULE ]; then mkdir -p $DESTDIR$rules_folder cp $rules_folder/$UDEV_RULE $DESTDIR$rules_folder/$UDEV_RULE fi done +done # copy the mdadm configuration CONFIG=/etc/mdadm/mdadm.conf diff -Nru mdadm-3.2.5/debian/initramfs/mdadm-functions mdadm-3.3/debian/initramfs/mdadm-functions --- mdadm-3.2.5/debian/initramfs/mdadm-functions 2014-02-17 14:31:46.000000000 +0000 +++ mdadm-3.3/debian/initramfs/mdadm-functions 2014-07-16 15:29:30.000000000 +0000 @@ -1,5 +1,7 @@ #!/bin/sh +. /scripts/functions + txt_message () { if [ -x /bin/plymouth ] && plymouth --ping; then @@ -23,6 +25,7 @@ mountroot_fail() { + wait_for_udev 10 message "Incrementally starting RAID arrays..." if mdadm --incremental --run --scan; then message "Incrementally started RAID arrays." 
diff -Nru mdadm-3.2.5/debian/mdadm.doc-base.md-txt mdadm-3.3/debian/mdadm.doc-base.md-txt --- mdadm-3.2.5/debian/mdadm.doc-base.md-txt 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/mdadm.doc-base.md-txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,11 +0,0 @@ -Document: mdadm-md-txt -Title: Linux kernel documentation on the md driver (2.6.26) -Author: Neil Brown -Abstract: The document comes from the Linux kernel sources' Documentation/ - directory and contains notes and other information about the md kernel - driver (which mdadm uses). It is current for version 2.6.26 of the kernel. -Section: System/Administration - -Format: text -Index: /usr/share/doc/mdadm/md.txt.gz -Files: /usr/share/doc/mdadm/md.txt.gz diff -Nru mdadm-3.2.5/debian/mdadm.doc-base.superblock-formats mdadm-3.3/debian/mdadm.doc-base.superblock-formats --- mdadm-3.2.5/debian/mdadm.doc-base.superblock-formats 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/mdadm.doc-base.superblock-formats 1970-01-01 00:00:00.000000000 +0000 @@ -1,10 +0,0 @@ -Document: mdadm-superblock-formats -Title: Description of md superblock formats -Author: GrangerX -Abstract: This document details the different md superblock formats and their - disk layouts -Section: System/Administration - -Format: text -Index: /usr/share/doc/mdadm/md_superblock_formats.txt.gz -Files: /usr/share/doc/mdadm/md_superblock_formats.txt.gz diff -Nru mdadm-3.2.5/debian/mdadm.docs mdadm-3.3/debian/mdadm.docs --- mdadm-3.2.5/debian/mdadm.docs 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/mdadm.docs 2014-07-16 15:04:25.000000000 +0000 @@ -3,3 +3,6 @@ debian/README.recipes debian/README.checkarray debian/FAQ +ANNOUNCE-* +external-reshape-design.txt +mdmon-design.txt diff -Nru mdadm-3.2.5/debian/mdadm.preinst mdadm-3.3/debian/mdadm.preinst --- mdadm-3.2.5/debian/mdadm.preinst 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/mdadm.preinst 2014-07-16 15:19:26.000000000 +0000 @@ -53,6 +53,7 @@ $MDADM -Esc 
/var/backups/mdadm-Es_v1.dump >> /var/backups/mdadm-Es_v1.dump || : fi + # Used incorrect name s/_/-/, keep all throughout until after jessie is released. rm_conffile /etc/udev/rules.d/65-mdadm.vol_id.rules ;; diff -Nru mdadm-3.2.5/debian/patches/debian-conffile-location.diff mdadm-3.3/debian/patches/debian-conffile-location.diff --- mdadm-3.2.5/debian/patches/debian-conffile-location.diff 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/debian-conffile-location.diff 2014-07-16 15:04:25.000000000 +0000 @@ -17,12 +17,10 @@ mdassemble.8 | 2 +- 5 files changed, 11 insertions(+), 13 deletions(-) -diff --git a/Makefile b/Makefile -index 72087be..8c1fa08 100644 --- a/Makefile +++ b/Makefile -@@ -60,8 +60,8 @@ else - endif +@@ -61,8 +61,8 @@ + PKG_CONFIG ?= pkg-config SYSCONFDIR = /etc -CONFFILE = $(SYSCONFDIR)/mdadm.conf @@ -32,74 +30,67 @@ MAILCMD =/usr/sbin/sendmail -t CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" # Both MAP_DIR and MDMON_DIR should be somewhere that persists across the -diff --git a/ReadMe.c b/ReadMe.c -index b658841..a5dac94 100644 --- a/ReadMe.c +++ b/ReadMe.c -@@ -565,7 +565,7 @@ char Help_incr[] = +@@ -590,7 +590,7 @@ ; char Help_config[] = -"The /etc/mdadm.conf config file:\n\n" +"The /etc/mdadm/mdadm.conf config file:\n\n" " The config file contains, apart from blank lines and comment lines that\n" - " start with a hash(#), four sorts of configuration lines: array lines, \n" - " device lines, mailaddr lines and program lines.\n" -diff --git a/mdadm.8.in b/mdadm.8.in -index 7e8981e..5342d5c 100644 + " start with a hash(#), array lines, device lines, and various\n" + " configuration lines.\n" --- a/mdadm.8.in +++ b/mdadm.8.in -@@ -262,9 +262,9 @@ the exact meaning of this option in different contexts. +@@ -264,13 +264,13 @@ .TP .BR \-c ", " \-\-config= - Specify the config file. Default is to use --.BR /etc/mdadm.conf , --or if that is missing then --.BR /etc/mdadm/mdadm.conf . 
-+.BR /etc/mdadm/mdadm.conf , -+or if that is missing, then -+.BR /etc/mdadm.conf . + Specify the config file or directory. Default is to use +-.B /etc/mdadm.conf ++.B /etc/mdadm/mdadm.conf + and +-.BR /etc/mdadm.conf.d , ++.BR /etc/mdadm/mdadm.conf.d , + or if those are missing then +-.B /etc/mdadm/mdadm.conf ++.B /etc/mdadm.conf + and +-.BR /etc/mdadm/mdadm.conf.d . ++.BR /etc/mdadm.conf.d . If the config file given is .B "partitions" then nothing will be read, but -@@ -1529,8 +1529,6 @@ The config file is only used if explicitly named with +@@ -1742,9 +1742,9 @@ or requested with (a possibly implicit) .BR \-\-scan . In the later case, -.B /etc/mdadm.conf -or .B /etc/mdadm/mdadm.conf ++or ++.B /etc/mdadm.conf is used. -@@ -1859,7 +1857,7 @@ or - .B \-\-scan - will cause the output to be less detailed and the format to be - suitable for inclusion in --.BR mdadm.conf . -+.BR /etc/mdadm/mdadm.conf . - The exit status of - .I mdadm - will normally be 0 unless -@@ -1936,7 +1934,7 @@ or - is given, then multiple devices that are components of the one array - are grouped together and reported in a single entry suitable - for inclusion in --.BR mdadm.conf . -+.BR /etc/mdadm/mdadm.conf . - - Having - .B \-\-scan -@@ -2698,7 +2696,7 @@ uses this to find arrays when + If +@@ -3003,7 +3003,7 @@ is given in Misc mode, and to monitor array reconstruction on Monitor mode. -.SS /etc/mdadm.conf -+.SS /etc/mdadm/mdadm.conf ++.SS /etc/mdadm/mdadm.conf (or /etc/mdadm.conf) The config file lists which devices may be scanned to see if they contain MD super block, and gives identifying information -diff --git a/mdadm.conf.5 b/mdadm.conf.5 -index 9f31c73..f0b07a1 100644 +@@ -3011,7 +3011,7 @@ + .BR mdadm.conf (5) + for more details. + +-.SS /etc/mdadm.conf.d ++.SS /etc/mdadm/mdadm.conf.d (or /etc/mdadm.conf.d) + + A directory containing configuration files which are read in lexical + order. 
--- a/mdadm.conf.5 +++ b/mdadm.conf.5 @@ -8,7 +8,7 @@ @@ -111,11 +102,9 @@ .SH DESCRIPTION .PP .I mdadm -diff --git a/mdassemble.8 b/mdassemble.8 -index 0210524..27779af 100644 --- a/mdassemble.8 +++ b/mdassemble.8 -@@ -40,7 +40,7 @@ There are no options to +@@ -40,7 +40,7 @@ .SH FILES diff -Nru mdadm-3.2.5/debian/patches/debian-disable-udev-incr-assembly.diff mdadm-3.3/debian/patches/debian-disable-udev-incr-assembly.diff --- mdadm-3.2.5/debian/patches/debian-disable-udev-incr-assembly.diff 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/debian-disable-udev-incr-assembly.diff 1970-01-01 00:00:00.000000000 +0000 @@ -1,27 +0,0 @@ -From: martin f. krafft -Subject: Disable udev incremental assembly - -Upstream enabled incremental assembly in the udev rules by default for 3.1.3, -but this is too early. Hence, this patch simply reverts the change. - -Signed-off-by: martin f. krafft - ---- - udev-md-raid.rules | 4 ++++ - 1 files changed, 4 insertions(+), 0 deletions(-) - -diff --git a/udev-md-raid.rules b/udev-md-raid.rules -index f564f70..241c31d 100644 ---- a/udev-md-raid.rules -+++ b/udev-md-raid.rules -@@ -8,6 +8,10 @@ GOTO="md_inc_skip" - - LABEL="md_inc" - -+## DISABLED: Incremental udev assembly disabled -+## ** this is a Debian-specific change ** -+GOTO="md_inc_skip" -+ - # remember you can limit what gets auto/incrementally assembled by - # mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY' - ACTION=="add", RUN+="/sbin/mdadm --incremental $tempnode" diff -Nru mdadm-3.2.5/debian/patches/debian-no-Werror.diff mdadm-3.3/debian/patches/debian-no-Werror.diff --- mdadm-3.2.5/debian/patches/debian-no-Werror.diff 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/debian-no-Werror.diff 2014-07-16 15:04:25.000000000 +0000 @@ -11,16 +11,14 @@ Makefile | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) -diff --git a/Makefile b/Makefile -index b8d363f..63d9742 100644 --- a/Makefile +++ b/Makefile -@@ -42,7 +42,7 @@ KLIBC_GCC = 
gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIB +@@ -43,7 +43,7 @@ CC = $(CROSS_COMPILE)gcc - CXFLAGS = -ggdb + CXFLAGS ?= -ggdb -CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter +CWFLAGS = -Wall -Wstrict-prototypes -Wextra -Wno-unused-parameter ifdef WARN_UNUSED - CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O + CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3 endif diff -Nru mdadm-3.2.5/debian/patches/dmraid-fallback.patch mdadm-3.3/debian/patches/dmraid-fallback.patch --- mdadm-3.2.5/debian/patches/dmraid-fallback.patch 2014-01-24 13:48:15.000000000 +0000 +++ mdadm-3.3/debian/patches/dmraid-fallback.patch 2014-07-16 15:40:56.000000000 +0000 @@ -5,16 +5,17 @@ request. Author: Dimitri John Ledkov ---- mdadm-3.2.5.orig/udev-md-raid.rules -+++ mdadm-3.2.5/udev-md-raid.rules -@@ -3,6 +3,10 @@ - SUBSYSTEM!="block", GOTO="md_end" +--- a/udev-md-raid-assembly.rules ++++ b/udev-md-raid-assembly.rules +@@ -4,6 +4,11 @@ + + SUBSYSTEM!="block", GOTO="md_inc_end" - # handle potential components of arrays (the ones supported by md) +IMPORT{cmdline}="nomdmonisw" +IMPORT{cmdline}="nomdmonddf" -+ENV{nomdmonisw}=="1", ENV{ID_FS_TYPE}=="isw_raid_member", GOTO="md_inc_skip" -+ENV{nomdmonddf}=="1", ENV{ID_FS_TYPE}=="ddf_raid_member", GOTO="md_inc_skip" ++ENV{nomdmonisw}=="1", ENV{ID_FS_TYPE}=="isw_raid_member", GOTO="md_inc_end" ++ENV{nomdmonddf}=="1", ENV{ID_FS_TYPE}=="ddf_raid_member", GOTO="md_inc_end" ++ + # handle potential components of arrays (the ones supported by md) ENV{ID_FS_TYPE}=="ddf_raid_member|isw_raid_member|linux_raid_member", GOTO="md_inc" - GOTO="md_inc_skip" - + GOTO="md_inc_end" diff -Nru mdadm-3.2.5/debian/patches/fix-enough-function-for-RAID10.patch mdadm-3.3/debian/patches/fix-enough-function-for-RAID10.patch --- mdadm-3.2.5/debian/patches/fix-enough-function-for-RAID10.patch 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/fix-enough-function-for-RAID10.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,46 +0,0 @@ 
-From 2117ad1dd1b79cf6d02a065d9e38076aa9f4788d Mon Sep 17 00:00:00 2001 -From: NeilBrown -Date: Thu, 27 Sep 2012 16:58:44 +1000 -Subject: Fix 'enough' function for RAID10. -Bug-Debian: http://bugs.debian.org/691668 -Comment: from stable/bugfix 3.2.6 upstream version - -The 'enough' function is written to work with 'near' arrays only -in that is implicitly assumes that the offset from one 'group' of -devices to the next is the same as the number of copies. -In reality it is the number of 'near' copies. - -So change it to make this number explicit. - -Reported-by: Jakub Husák -Signed-off-by: NeilBrown ---- - util.c | 7 ++++--- - 1 file changed, 4 insertions(+), 3 deletions(-) - -diff --git a/util.c b/util.c -index 83f3187..eef0d6f 100644 ---- a/util.c -+++ b/util.c -@@ -332,14 +332,15 @@ int enough(int level, int raid_disks, int layout, int clean, char *avail) - /* there must be one of the 'copies' form 'first' */ - int n = copies; - int cnt=0; -+ int this = first; - while (n--) { -- if (avail[first]) -+ if (avail[this]) - cnt++; -- first = (first+1) % raid_disks; -+ this = (this+1) % raid_disks; - } - if (cnt == 0) - return 0; -- -+ first = (first+(layout&255)) % raid_disks; - } while (first != 0); - return 1; - --- -1.7.10.4 - diff -Nru mdadm-3.2.5/debian/patches/fix-segfaults-in-detail.patch mdadm-3.3/debian/patches/fix-segfaults-in-detail.patch --- mdadm-3.2.5/debian/patches/fix-segfaults-in-detail.patch 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/fix-segfaults-in-detail.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -From aec89f63e98322a141d146a84c67b0cc2d1cd1a4 Mon Sep 17 00:00:00 2001 -From: Lukasz Dorau -Date: Thu, 27 Sep 2012 16:58:44 +1000 -Subject: fix segfaults in Detail() -Bug-Debian: http://bugs.debian.org/691670 -Comment: from stable/bugfix upstream 3.2.6 version - -If disk has been removed, 'st' and 'info' can be NULL. It causes segfault. -'st' and 'info' should be checked against being NULL before being used. 
- -Signed-off-by: Lukasz Dorau -Signed-off-by: NeilBrown ---- - Detail.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/Detail.c b/Detail.c -index 85e2b89..67ddc80 100644 ---- a/Detail.c -+++ b/Detail.c -@@ -424,7 +424,7 @@ int Detail(char *dev, int brief, int export, int test, char *homehost, char *pre - } - free_mdstat(ms); - -- if (st->sb && info->reshape_active) { -+ if ((st && st->sb) && (info && info->reshape_active)) { - #if 0 - This is pretty boring - printf(" Reshape pos'n : %llu%s\n", (unsigned long long) info->reshape_progress<<9, -@@ -575,7 +575,8 @@ This is pretty boring - if (spares && brief && array.raid_disks) printf(" spares=%d", spares); - if (brief && st && st->sb) - st->ss->brief_detail_super(st); -- st->ss->free_super(st); -+ if (st) -+ st->ss->free_super(st); - - if (brief > 1 && devices) printf("\n devices=%s", devices); - if (brief) printf("\n"); --- -1.7.10.4 - diff -Nru mdadm-3.2.5/debian/patches/mdmon-allow-takeover-when-original-was-started-with-.patch mdadm-3.3/debian/patches/mdmon-allow-takeover-when-original-was-started-with-.patch --- mdadm-3.2.5/debian/patches/mdmon-allow-takeover-when-original-was-started-with-.patch 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/mdmon-allow-takeover-when-original-was-started-with-.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,37 +0,0 @@ -From c4e96a305fbca4f83ae9f3a81482481524380905 Mon Sep 17 00:00:00 2001 -From: NeilBrown -Date: Mon, 20 Aug 2012 10:37:21 +1000 -Subject: mdmon: allow --takeover when original was started with --offroot -Bug-Debian: http://bugs.debian.org/691671 -Comment: from stable/bugfix upstream 3.2.6 version -Comment: one of the 3 patches fixing mentioned issues - -As --offroot causes ARGV[0] to be changed, we need to be more -lenient when checking that the mdmon we are about to kill really -is mdmon. i.e. allow name to be "@dmon" instead. 
- -Signed-off-by: NeilBrown ---- - mdmon.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/mdmon.c b/mdmon.c -index 2d0503d..54e1238 100644 ---- a/mdmon.c -+++ b/mdmon.c -@@ -184,7 +184,11 @@ static void try_kill_monitor(pid_t pid, char *devname, int sock) - buf[sizeof(buf)-1] = 0; - close(fd); - -- if (n < 0 || !strstr(buf, "mdmon")) -+ /* Note that if started with --offroot, the name -+ * might be "@dmon" -+ */ -+ if (n < 0 || !(strstr(buf, "mdmon") || -+ strstr(buf, "@dmon"))) - return; - - kill(pid, SIGTERM); --- -1.7.10.4 - diff -Nru mdadm-3.2.5/debian/patches/mdmon-fix-arg-parsing.patch mdadm-3.3/debian/patches/mdmon-fix-arg-parsing.patch --- mdadm-3.2.5/debian/patches/mdmon-fix-arg-parsing.patch 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/mdmon-fix-arg-parsing.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,32 +0,0 @@ -From 68ad53b301b6fc722fee6d32a5267c1a4506452d Mon Sep 17 00:00:00 2001 -From: NeilBrown -Date: Mon, 9 Jul 2012 16:50:22 +1000 -Subject: mdmon: fix arg parsing. -Bug-Debian: http://bugs.debian.org/691671 -Comment: from stable/bugfix upstream 3.2.6 version -Comment: one of the 3 patches fixing mentioned issues - --t aka --takeover should not be setting container_name. 
-It sets it to NULL which causes failure when you try - mdmon --all --takeover - -Signed-off-by: NeilBrown ---- - mdmon.c | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/mdmon.c b/mdmon.c -index 2093476..dee02a9 100644 ---- a/mdmon.c -+++ b/mdmon.c -@@ -306,7 +306,6 @@ int main(int argc, char *argv[]) - all = 1; - break; - case 't': -- container_name = optarg; - takeover = 1; - break; - case OffRootOpt: --- -1.7.10.4 - diff -Nru mdadm-3.2.5/debian/patches/mdmon-fix-arg-processing-for-a.patch mdadm-3.3/debian/patches/mdmon-fix-arg-processing-for-a.patch --- mdadm-3.2.5/debian/patches/mdmon-fix-arg-processing-for-a.patch 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/mdmon-fix-arg-processing-for-a.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,31 +0,0 @@ -From 23084aaa76d9fc3121ff2ab38c04ee9893793e00 Mon Sep 17 00:00:00 2001 -From: NeilBrown -Date: Mon, 20 Aug 2012 10:33:50 +1000 -Subject: mdmon: fix arg processing for -a -Bug-Debian: http://bugs.debian.org/691671 -Comment: from stable/bugfix upstream 3.2.6 version -Comment: one of the 3 patches fixing mentioned issues - -'-a' was not being recognised as an abbreviation for '--all'. 
- -Signed-off-by: NeilBrown ---- - mdmon.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/mdmon.c b/mdmon.c -index dee02a9..2d0503d 100644 ---- a/mdmon.c -+++ b/mdmon.c -@@ -299,7 +299,7 @@ int main(int argc, char *argv[]) - {NULL, 0, NULL, 0} - }; - -- while ((opt = getopt_long(argc, argv, "th", options, NULL)) != -1) { -+ while ((opt = getopt_long(argc, argv, "tha", options, NULL)) != -1) { - switch (opt) { - case 'a': - container_name = argv[optind-1]; --- -1.7.10.4 - diff -Nru mdadm-3.2.5/debian/patches/remove-bashism-from-makefile.patch mdadm-3.3/debian/patches/remove-bashism-from-makefile.patch --- mdadm-3.2.5/debian/patches/remove-bashism-from-makefile.patch 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/debian/patches/remove-bashism-from-makefile.patch 2014-07-16 15:04:25.000000000 +0000 @@ -0,0 +1,22 @@ +From: Michael Tokarev +Subject: Remove bashism from Makefile + +Makefile uses [ x == y ] construct which does not work +with POSIX shell. Since this is just testing a flag, +replace it with string comparison (=) operator instead. + +Signed-off-By: Michael Tokarev + +diff --git a/Makefile b/Makefile +index e8da3a5..c60cc2c 100644 +--- a/Makefile ++++ b/Makefile +@@ -156,7 +156,7 @@ all : check_rundir mdadm mdmon + man : mdadm.man md.man mdadm.conf.man mdmon.man raid6check.man + + check_rundir: +- @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" == 1 ]; then \ ++ @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" = 1 ]; then \ + echo "***** Parent of $(RUN_DIR) does not exist. Maybe set different RUN_DIR="; \ + echo "***** e.g. 
make RUN_DIR=/dev/.mdadm" ; \ + echo "***** or set CHECK_RUN_DIR=0"; exit 1; \ diff -Nru mdadm-3.2.5/debian/patches/series mdadm-3.3/debian/patches/series --- mdadm-3.2.5/debian/patches/series 2014-02-17 13:44:06.000000000 +0000 +++ mdadm-3.3/debian/patches/series 2014-07-16 15:54:32.000000000 +0000 @@ -1,14 +1,7 @@ debian-conffile-location.diff -#Ubuntu: Revert back to incremental assembly -#debian-disable-udev-incr-assembly.diff debian-no-Werror.diff debian-changes-3.1.4-1+8efb9d1ubuntu4 sha1-includes.diff -spelling-and-manpages.patch -fix-enough-function-for-RAID10.patch -fix-segfaults-in-detail.patch -super0-do-not-override-uuid-with-homehost.patch -mdmon-allow-takeover-when-original-was-started-with-.patch -mdmon-fix-arg-parsing.patch -mdmon-fix-arg-processing-for-a.patch +remove-bashism-from-makefile.patch +#disable use-external-blkid.diff in ubuntu dmraid-fallback.patch diff -Nru mdadm-3.2.5/debian/patches/sha1-includes.diff mdadm-3.3/debian/patches/sha1-includes.diff --- mdadm-3.2.5/debian/patches/sha1-includes.diff 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/sha1-includes.diff 2014-07-16 15:04:25.000000000 +0000 @@ -16,7 +16,7 @@ --- a/sha1.h +++ b/sha1.h -@@ -22,12 +22,10 @@ +@@ -22,7 +22,7 @@ #include @@ -25,12 +25,7 @@ # include #endif --#include "ansidecl.h" -- - /* The following contortions are an attempt to use the C preprocessor - to determine an unsigned integral type that is 32 bits wide. An - alternative approach is to use autoconf's AC_CHECK_SIZEOF macro, but -@@ -35,9 +33,9 @@ +@@ -33,9 +33,9 @@ the resulting executable. Locally running cross-compiled executables is usually not possible. 
*/ diff -Nru mdadm-3.2.5/debian/patches/spelling-and-manpages.patch mdadm-3.3/debian/patches/spelling-and-manpages.patch --- mdadm-3.2.5/debian/patches/spelling-and-manpages.patch 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/spelling-and-manpages.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,101 +0,0 @@ ---- - Build.c | 2 +- - md.4 | 2 +- - mdadm.8.in | 26 -------------------------- - mdadm.conf.5 | 2 +- - super0.c | 2 +- - 5 files changed, 4 insertions(+), 30 deletions(-) - ---- a/Build.c -+++ b/Build.c -@@ -174,7 +174,7 @@ - goto abort; - } - if ((stb.st_mode & S_IFMT)!= S_IFBLK) { -- fprintf(stderr, Name ": Wierd: %s is no longer a block device.\n", -+ fprintf(stderr, Name ": Weird: %s is no longer a block device.\n", - dv->devname); - goto abort; - } ---- a/md.4 -+++ b/md.4 -@@ -773,7 +773,7 @@ - .I n - gives the md device number, - .I l --gives the level, 0 for RAID0 or -1 for LINEAR, -+gives the level, 0 for RAID0 or \-1 for LINEAR, - .I c - gives the chunk size as a base-2 logarithm offset by twelve, so 0 - means 4K, 1 means 8K. ---- a/mdadm.conf.5 -+++ b/mdadm.conf.5 -@@ -573,7 +573,7 @@ - .br - HOMEHOST - .br --AUTO +1.x homehost -all -+AUTO +1.x homehost \-all - - .SH SEE ALSO - .BR mdadm (8), ---- a/super0.c -+++ b/super0.c -@@ -445,7 +445,7 @@ - sb32+MD_SB_GENERIC_CONSTANT_WORDS+7+1, - (MD_SB_WORDS - (MD_SB_GENERIC_CONSTANT_WORDS+7+1))*4); - if (verbose >= 0) -- fprintf (stderr, Name ": adjusting superblock of %s for 2.2/sparc compatability.\n", -+ fprintf (stderr, Name ": adjusting superblock of %s for 2.2/sparc compatibility.\n", - devname); - } else if (strcmp(update, "super-minor") ==0) { - sb->md_minor = info->array.md_minor; ---- a/mdadm.8.in -+++ b/mdadm.8.in -@@ -332,7 +332,6 @@ - .IP "0, 0.90, default" - .el - .IP "0, 0.90" --.. - Use the original 0.90 format superblock. This format limits arrays to - 28 component devices and limits component devices of levels 1 and - greater to 2 terabytes. 
It is also possible for there to be confusion -@@ -342,7 +341,6 @@ - .IP "1, 1.0, 1.1, 1.2" - .el - .IP "1, 1.0, 1.1, 1.2 default" --.. - Use the new version-1 format superblock. This has fewer restrictions. - It can easily be moved between hosts with different endian-ness, and a - recovery operation can be checkpointed and restarted. The different -@@ -888,30 +886,6 @@ - or - .BR \-\-build . - --.ig XX --.\".TP --.\".BR \-\-symlink = no --.\"Normally when --.\".B \-\-auto --.\"causes --.\".I mdadm --.\"to create devices in --.\".B /dev/md/ --.\"it will also create symlinks from --.\".B /dev/ --.\"with names starting with --.\".B md --.\"or --.\".BR md_ . --.\"Use --.\".B \-\-symlink=no --.\"to suppress this, or --.\".B \-\-symlink=yes --.\"to enforce this even if it is suppressing --.\".IR mdadm.conf . --.\" --.XX -- - .TP - .BR \-a ", " "\-\-add" - This option can be used in Grow mode in two cases. diff -Nru mdadm-3.2.5/debian/patches/super0-do-not-override-uuid-with-homehost.patch mdadm-3.3/debian/patches/super0-do-not-override-uuid-with-homehost.patch --- mdadm-3.2.5/debian/patches/super0-do-not-override-uuid-with-homehost.patch 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/patches/super0-do-not-override-uuid-with-homehost.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,34 +0,0 @@ -From 3713633a30349773a83bd8257cdd64b86ce32dbd Mon Sep 17 00:00:00 2001 -From: Michael Tokarev -Date: Sat, 20 Oct 2012 15:40:02 +0400 -Subject: super0: do not override uuid with homehost -Bug-Debian: http://bugs.debian.org/686703 -Comment: from stable/bugfix upstream 3.2.6 version - -When --uuid is specified in the command line, even for v0.90 -superblock we override last portion of uuid with data from ---homehost, which is wrong (and disagrees with the manpage). -Only use homehost in super0 if no uuid is specified. 
- -Signed-off-By: Michael Tokarev -Signed-off-by: NeilBrown ---- - super0.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/super0.c b/super0.c -index 1375799..ca4c082 100644 ---- a/super0.c -+++ b/super0.c -@@ -657,7 +657,7 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info, - if (rfd >= 0) - close(rfd); - } -- if (homehost) { -+ if (homehost && !uuid) { - char buf[20]; - char *hash = sha1_buffer(homehost, - strlen(homehost), --- -1.7.10.4 - diff -Nru mdadm-3.2.5/debian/patches/use-external-blkid.diff mdadm-3.3/debian/patches/use-external-blkid.diff --- mdadm-3.2.5/debian/patches/use-external-blkid.diff 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/debian/patches/use-external-blkid.diff 2014-07-16 15:04:25.000000000 +0000 @@ -0,0 +1,16 @@ +From: Michael Tokarev +Subject: blkid is not udev builtin, use /sbin/blkid + +--- a/udev-md-raid-arrays.rules ++++ b/udev-md-raid-arrays.rules +@@ -26,9 +26,7 @@ ENV{DEVTYPE}=="partition", ENV{MD_UUID}= + ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n" + ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n" + +-IMPORT{builtin}="blkid" +-OPTIONS+="link_priority=100" +-OPTIONS+="watch" ++IMPORT{program}="/sbin/blkid -o udev -p -u noraid $tempnode" + ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" + ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" + diff -Nru mdadm-3.2.5/debian/rules mdadm-3.3/debian/rules --- mdadm-3.2.5/debian/rules 2014-02-18 08:50:03.000000000 +0000 +++ mdadm-3.3/debian/rules 2014-07-16 15:52:13.000000000 +0000 @@ -81,7 +81,8 @@ install -m0644 debian/source_mdadm.py $(DESTDIR)/usr/share/apport/package-hooks/ install -m0755 mdadm.udeb $(DESTDIR_UDEB)/sbin/mdadm install -m0755 mdmon.udeb $(DESTDIR_UDEB)/sbin/mdmon - install -m0644 udev-md-raid.rules 
$(DESTDIR_UDEB)/lib/udev/rules.d/64-md-raid.rules + install -m0644 udev-md-raid-arrays.rules $(DESTDIR_UDEB)/lib/udev/rules.d/63-md-raid-arrays.rules + install -m0644 udev-md-raid-assembly.rules $(DESTDIR_UDEB)/lib/udev/rules.d/64-md-raid-assembly.rules binary-indep: build install diff -Nru mdadm-3.2.5/debian/source/options mdadm-3.3/debian/source/options --- mdadm-3.2.5/debian/source/options 2013-12-06 13:09:58.000000000 +0000 +++ mdadm-3.3/debian/source/options 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ -compression = "bzip2" -compression-level = 9 diff -Nru mdadm-3.2.5/Detail.c mdadm-3.3/Detail.c --- mdadm-3.2.5/Detail.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Detail.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -27,7 +27,28 @@ #include "md_u.h" #include -int Detail(char *dev, int brief, int export, int test, char *homehost, char *prefer) +static int cmpstringp(const void *p1, const void *p2) +{ + return strcmp(* (char * const *) p1, * (char * const *) p2); +} + +static int add_device(const char *dev, char ***p_devices, + int *p_max_devices, int n_devices) +{ + if (n_devices + 1 >= *p_max_devices) { + *p_max_devices += 16; + *p_devices = xrealloc(*p_devices, *p_max_devices * + sizeof(**p_devices)); + if (!*p_devices) { + *p_max_devices = 0; + return 0; + } + }; + (*p_devices)[n_devices] = xstrdup(dev); + return n_devices + 1; +} + +int Detail(char *dev, struct context *c) { /* * Print out details for an md array by using @@ -41,8 +62,9 @@ int next; int d; time_t atime; - char *c; - char *devices = NULL; + char *str; + char **devices = NULL; + int max_devices = 0, n_devices = 0; int spares = 0; struct stat stb; int is_26 = get_linux_version() >= 2006000; @@ -53,43 +75,49 @@ int max_disks = MD_SB_DISKS; /* just a default */ 
struct mdinfo *info = NULL; struct mdinfo *sra; + struct mdinfo *subdev; char *member = NULL; char *container = NULL; - int rv = test ? 4 : 1; + int rv = c->test ? 4 : 1; int avail_disks = 0; char *avail = NULL; + int external; + int inactive; if (fd < 0) { - fprintf(stderr, Name ": cannot open %s: %s\n", + pr_err("cannot open %s: %s\n", dev, strerror(errno)); return rv; } vers = md_get_version(fd); if (vers < 0) { - fprintf(stderr, Name ": %s does not appear to be an md device\n", + pr_err("%s does not appear to be an md device\n", dev); close(fd); return rv; } if (vers < 9000) { - fprintf(stderr, Name ": cannot get detail for md device %s: driver version too old.\n", + pr_err("cannot get detail for md device %s: driver version too old.\n", dev); close(fd); return rv; } - if (ioctl(fd, GET_ARRAY_INFO, &array)<0) { - if (errno == ENODEV) - fprintf(stderr, Name ": md device %s does not appear to be active.\n", - dev); - else - fprintf(stderr, Name ": cannot get array detail for %s: %s\n", - dev, strerror(errno)); + sra = sysfs_read(fd, NULL, GET_VERSION|GET_DEVS); + external = (sra != NULL && sra->array.major_version == -1 + && sra->array.minor_version == -2); + st = super_by_fd(fd, &subarray); + if (ioctl(fd, GET_ARRAY_INFO, &array) == 0) { + inactive = 0; + } else if (errno == ENODEV) { + array = sra->array; + inactive = 1; + } else { + pr_err("cannot get array detail for %s: %s\n", + dev, strerror(errno)); close(fd); return rv; } - sra = sysfs_read(fd, 0, GET_VERSION); - st = super_by_fd(fd, &subarray); if (fstat(fd, &stb) != 0 && !S_ISBLK(stb.st_mode)) stb.st_rdev = 0; @@ -102,25 +130,40 @@ /* This is a subarray of some container. 
* We want the name of the container, and the member */ - int dn = st->container_dev; + int devid = devnm2devid(st->container_devnm); + int cfd, err; member = subarray; - container = map_dev_preferred(dev2major(dn), dev2minor(dn), 1, prefer); + container = map_dev_preferred(major(devid), minor(devid), + 1, c->prefer); + cfd = open_dev(st->container_devnm); + if (cfd >= 0) { + err = st->ss->load_container(st, cfd, NULL); + close(cfd); + if (err == 0) + info = st->ss->container_content(st, subarray); + } } - /* try to load a superblock */ - if (st) for (d = 0; d < max_disks; d++) { + /* try to load a superblock. Try sra->devs first, then try ioctl */ + if (st && !info) for (d = 0, subdev = sra ? sra->devs : NULL; + d < max_disks || subdev; + subdev ? (void)(subdev = subdev->next) : (void)(d++)){ mdu_disk_info_t disk; char *dv; int fd2; int err; - disk.number = d; - if (ioctl(fd, GET_DISK_INFO, &disk) < 0) - continue; - if (d >= array.raid_disks && - disk.major == 0 && - disk.minor == 0) - continue; + if (subdev) + disk = subdev->disk; + else { + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (d >= array.raid_disks && + disk.major == 0 && + disk.minor == 0) + continue; + } if (array.raid_disks > 0 && (disk.state & (1 << MD_DISK_ACTIVE)) == 0) @@ -146,7 +189,7 @@ if (subarray) info = st->ss->container_content(st, subarray); else { - info = malloc(sizeof(*info)); + info = xmalloc(sizeof(*info)); st->ss->getinfo_super(st, info, NULL); } if (!info) @@ -176,15 +219,16 @@ } /* Ok, we have some info to print... 
*/ - c = map_num(pers, array.level); + str = map_num(pers, array.level); - if (export) { + if (c->export) { if (array.raid_disks) { - if (c) - printf("MD_LEVEL=%s\n", c); + if (str) + printf("MD_LEVEL=%s\n", str); printf("MD_DEVICES=%d\n", array.raid_disks); } else { - printf("MD_LEVEL=container\n"); + if (!inactive) + printf("MD_LEVEL=container\n"); printf("MD_DEVICES=%d\n", array.nr_disks); } if (container) { @@ -197,7 +241,7 @@ printf("MD_METADATA=%d.%d\n", array.major_version, array.minor_version); } - + if (st && st->sb && info) { char nbuf[64]; struct map_ent *mp, *map = NULL; @@ -206,73 +250,111 @@ printf("MD_UUID=%s\n", nbuf+5); mp = map_by_uuid(&map, info->uuid); if (mp && mp->path && - strncmp(mp->path, "/dev/md/", 8) == 0) - printf("MD_DEVNAME=%s\n", mp->path+8); + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path+8); + putchar('\n'); + } if (st->ss->export_detail_super) st->ss->export_detail_super(st); } else { struct map_ent *mp, *map = NULL; char nbuf[64]; - mp = map_by_devnum(&map, fd2devnum(fd)); + mp = map_by_devnm(&map, fd2devnm(fd)); if (mp) { __fname_from_uuid(mp->uuid, 0, nbuf, ':'); printf("MD_UUID=%s\n", nbuf+5); } if (mp && mp->path && - strncmp(mp->path, "/dev/md/", 8) == 0) - printf("MD_DEVNAME=%s\n", mp->path+8); + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path+8); + putchar('\n'); + } + } + if (sra) { + struct mdinfo *mdi; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + char *path = + map_dev(mdi->disk.major, + mdi->disk.minor, 0); + + if (mdi->disk.raid_disk >= 0) + printf("MD_DEVICE_%s_ROLE=%d\n", + mdi->sys_name+4, + mdi->disk.raid_disk); + else + printf("MD_DEVICE_%s_ROLE=spare\n", + mdi->sys_name+4); + if (path) + printf("MD_DEVICE_%s_DEV=%s\n", + mdi->sys_name+4, path); + } } goto out; } - disks = malloc(max_disks * sizeof(mdu_disk_info_t)); - for (d=0; ddevs; mdi; mdi = mdi->next) { + disks[next++] = mdi->disk; + disks[next-1].number = -1; + } + } 
else for (d = 0; d < max_disks; d++) { mdu_disk_info_t disk; disk.number = d; if (ioctl(fd, GET_DISK_INFO, &disk) < 0) { if (d < array.raid_disks) - fprintf(stderr, Name ": cannot get device detail for device %d: %s\n", + pr_err("cannot get device detail for device %d: %s\n", d, strerror(errno)); continue; } if (disk.major == 0 && disk.minor == 0) continue; - if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks) - disks[disk.raid_disk] = disk; + if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks + && disks[disk.raid_disk*2].state == (1<= 0 && disk.raid_disk < array.raid_disks + && disks[disk.raid_disk*2+1].state == (1<brief) { mdu_bitmap_file_t bmf; - printf("ARRAY %s", dev); - if (brief > 1) { + printf("%sARRAY %s", inactive ? "INACTIVE-":"", dev); + if (c->verbose > 0) { if (array.raid_disks) printf(" level=%s num-devices=%d", - c?c:"-unknown-", + str?str:"-unknown-", array.raid_disks ); - else + else if (!inactive) printf(" level=container num-devices=%d", array.nr_disks); + else + printf(" num-devices=%d", array.nr_disks); } if (container) { printf(" container=%s", container); @@ -296,12 +378,11 @@ unsigned long long larray_size; struct mdstat_ent *ms = mdstat_read(0, 0); struct mdstat_ent *e; - int devnum = array.md_minor; - if (major(stb.st_rdev) == (unsigned)get_mdp_major()) - devnum = -1 - devnum; + char *devnm; + devnm = stat2devnm(&stb); for (e=ms; e; e=e->next) - if (e->devnum == devnum) + if (strcmp(e->devnm, devnm) == 0) break; if (!get_dev_size(fd, NULL, &larray_size)) larray_size = 0; @@ -321,11 +402,16 @@ atime = array.ctime; if (atime) printf(" Creation Time : %.24s\n", ctime(&atime)); - if (array.raid_disks == 0) c = "container"; - printf(" Raid Level : %s\n", c?c:"-unknown-"); + if (array.raid_disks == 0 && external) + str = "container"; + if (str) + printf(" Raid Level : %s\n", str); if (larray_size) - printf(" Array Size : %llu%s\n", (larray_size>>10), human_size(larray_size)); + printf(" Array Size : %llu%s\n", 
(larray_size>>10), + human_size(larray_size)); if (array.level >= 1) { + if (sra) + array.major_version = sra->array.major_version; if (array.major_version != 0 && (larray_size >= 0xFFFFFFFFULL|| array.size == 0)) { unsigned long long dsize = get_component_size(fd); @@ -342,7 +428,7 @@ if (array.raid_disks) printf(" Raid Devices : %d\n", array.raid_disks); printf(" Total Devices : %d\n", array.nr_disks); - if (!container && + if (!container && ((sra == NULL && array.major_version == 0) || (sra && sra->array.major_version == 0))) printf("Preferred Minor : %d\n", array.md_minor); @@ -362,7 +448,9 @@ if (atime) printf(" Update Time : %.24s\n", ctime(&atime)); if (array.raid_disks) { - static char *sync_action[] = {", recovering",", resyncing",", reshaping",", checking"}; + static char *sync_action[] = { + ", recovering", ", resyncing", + ", reshaping", ", checking" }; char *st; if (avail_disks == array.raid_disks) st = ""; @@ -374,27 +462,30 @@ printf(" State : %s%s%s%s%s%s \n", (array.state&(1<percent < 0 && e->percent != PROCESS_PENDING && - e->percent != PROCESS_DELAYED)) ? "" : sync_action[e->resync], + (!e || (e->percent < 0 && e->percent != RESYNC_PENDING && + e->percent != RESYNC_DELAYED)) ? "" : sync_action[e->resync], larray_size ? "": ", Not Started", - e->percent == PROCESS_DELAYED ? " (DELAYED)": "", - e->percent == PROCESS_PENDING ? " (PENDING)": ""); + e->percent == RESYNC_DELAYED ? " (DELAYED)": "", + e->percent == RESYNC_PENDING ? 
" (PENDING)": ""); + } else if (inactive) { + printf(" State : inactive\n"); } if (array.raid_disks) printf(" Active Devices : %d\n", array.active_disks); - printf("Working Devices : %d\n", array.working_disks); + if (array.working_disks > 0) + printf("Working Devices : %d\n", array.working_disks); if (array.raid_disks) { printf(" Failed Devices : %d\n", array.failed_disks); printf(" Spare Devices : %d\n", array.spare_disks); } printf("\n"); if (array.level == 5) { - c = map_num(r5layout, array.layout); - printf(" Layout : %s\n", c?c:"-unknown-"); + str = map_num(r5layout, array.layout); + printf(" Layout : %s\n", str?str:"-unknown-"); } if (array.level == 6) { - c = map_num(r6layout, array.layout); - printf(" Layout : %s\n", c?c:"-unknown-"); + str = map_num(r6layout, array.layout); + printf(" Layout : %s\n", str?str:"-unknown-"); } if (array.level == 10) { printf(" Layout :"); @@ -418,13 +509,15 @@ } if (e && e->percent >= 0) { - static char *sync_action[] = {"Rebuild", "Resync", "Reshape", "Check"}; + static char *sync_action[] = { + "Rebuild", "Resync", + "Reshape", "Check"}; printf(" %7s Status : %d%% complete\n", sync_action[e->resync], e->percent); is_rebuilding = 1; } free_mdstat(ms); - if (st->sb && info->reshape_active) { + if ((st && st->sb) && (info && info->reshape_active)) { #if 0 This is pretty boring printf(" Reshape pos'n : %llu%s\n", (unsigned long long) info->reshape_progress<<9, @@ -432,22 +525,24 @@ #endif if (info->delta_disks != 0) printf(" Delta Devices : %d, (%d->%d)\n", - info->delta_disks, array.raid_disks - info->delta_disks, array.raid_disks); + info->delta_disks, + array.raid_disks - info->delta_disks, + array.raid_disks); if (info->new_level != array.level) { - char *c = map_num(pers, info->new_level); - printf(" New Level : %s\n", c?c:"-unknown-"); + str = map_num(pers, info->new_level); + printf(" New Level : %s\n", str?str:"-unknown-"); } if (info->new_level != array.level || info->new_layout != array.layout) { if (info->new_level 
== 5) { - char *c = map_num(r5layout, info->new_layout); + str = map_num(r5layout, info->new_layout); printf(" New Layout : %s\n", - c?c:"-unknown-"); + str?str:"-unknown-"); } if (info->new_level == 6) { - char *c = map_num(r6layout, info->new_layout); + str = map_num(r6layout, info->new_layout); printf(" New Layout : %s\n", - c?c:"-unknown-"); + str?str:"-unknown-"); } if (info->new_level == 10) { printf(" New Layout : near=%d, %s=%d\n", @@ -462,7 +557,7 @@ } else if (e && e->percent >= 0) printf("\n"); if (st && st->sb) - st->ss->detail_super(st, homehost); + st->ss->detail_super(st, c->homehost); if (array.raid_disks == 0 && sra && sra->array.major_version == -1 && sra->array.minor_version == -2 && sra->text_version[0] != '/') { @@ -478,7 +573,7 @@ char path[200]; char vbuf[1024]; int nlen = strlen(sra->sys_name); - int dn; + int devid; if (de->d_name[0] == '.') continue; sprintf(path, "/sys/block/%s/md/metadata_version", @@ -490,10 +585,10 @@ strncmp(vbuf+10, sra->sys_name, nlen) != 0 || vbuf[10+nlen] != '/') continue; - dn = devname2devnum(de->d_name); + devid = devnm2devid(de->d_name); printf(" %s", map_dev_preferred( - dev2major(dn), - dev2minor(dn), 1, prefer)); + major(devid), + minor(devid), 1, c->prefer)); } if (dir) closedir(dir); @@ -511,20 +606,27 @@ char *dv; mdu_disk_info_t disk = disks[d]; - if (d >= array.raid_disks && + if (d >= array.raid_disks*2 && disk.major == 0 && disk.minor == 0) continue; - if (!brief) { - if (d == array.raid_disks) printf("\n"); - if (disk.raid_disk < 0) + if ((d & 1) && + disk.major == 0 && + disk.minor == 0) + continue; + if (!c->brief) { + if (d == array.raid_disks*2) printf("\n"); + if (disk.number < 0) + printf(" - %5d %5d - ", + disk.major, disk.minor); + else if (disk.raid_disk < 0) printf(" %5d %5d %5d - ", disk.number, disk.major, disk.minor); else printf(" %5d %5d %5d %5d ", disk.number, disk.major, disk.minor, disk.raid_disk); } - if (!brief && array.raid_disks) { + if (!c->brief && array.raid_disks) { if 
(disk.state & (1<> 8) & 0xff; + int copies = nc*fc; + if (fc == 1 && array.raid_disks % copies == 0 && copies <= 26) { + /* We can divide the devices into 'sets' */ + int set = disk.raid_disk % copies; + printf(" set-%c", set + 'A'); + } + } + } if (disk.state & (1<test && d < array.raid_disks && !(disk.state & (1<prefer); + if (dv != NULL) { + if (c->brief) + n_devices = add_device(dv, &devices, + &max_devices, + n_devices); + else printf(" %s", dv); } - if (!brief) printf("\n"); + if (!c->brief) printf("\n"); } - if (spares && brief && array.raid_disks) printf(" spares=%d", spares); - if (brief && st && st->sb) + if (spares && c->brief && array.raid_disks) printf(" spares=%d", spares); + if (c->brief && st && st->sb) st->ss->brief_detail_super(st); - st->ss->free_super(st); + if (st) + st->ss->free_super(st); - if (brief > 1 && devices) printf("\n devices=%s", devices); - if (brief) printf("\n"); - if (test && + if (c->brief && c->verbose > 0 && devices) { + qsort(devices, n_devices, sizeof(*devices), cmpstringp); + printf("\n devices=%s", devices[0]); + for (d = 1; d < n_devices; d++) + printf(",%s", devices[d]); + } + if (c->brief) + printf("\n"); + if (c->test && !enough(array.level, array.raid_disks, array.layout, 1, avail)) rv = 2; @@ -589,11 +709,14 @@ close(fd); free(subarray); free(avail); + for (d = 0; d < n_devices; d++) + free(devices[d]); + free(devices); sysfs_free(sra); return rv; } -int Detail_Platform(struct superswitch *ss, int scan, int verbose) +int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path) { /* display platform capabilities for the given metadata format * 'scan' in this context means iterate over all metadata types @@ -601,34 +724,39 @@ int i; int err = 1; - if (ss && ss->detail_platform) - err = ss->detail_platform(verbose, 0); + if (ss && export && ss->export_detail_platform) + err = ss->export_detail_platform(verbose, controller_path); + else if (ss && ss->detail_platform) + err = 
ss->detail_platform(verbose, 0, controller_path); else if (ss) { - if (verbose) - fprintf(stderr, Name ": %s metadata is platform independent\n", + if (verbose > 0) + pr_err("%s metadata is platform independent\n", ss->name ? : "[no name]"); } else if (!scan) { - if (verbose) - fprintf(stderr, Name ": specify a metadata type or --scan\n"); + if (verbose > 0) + pr_err("specify a metadata type or --scan\n"); } if (!scan) return err; + err = 0; for (i = 0; superlist[i]; i++) { struct superswitch *meta = superlist[i]; if (meta == ss) continue; - if (verbose) - fprintf(stderr, Name ": checking metadata %s\n", + if (verbose > 0) + pr_err("checking metadata %s\n", meta->name ? : "[no name]"); if (!meta->detail_platform) { - if (verbose) - fprintf(stderr, Name ": %s metadata is platform independent\n", + if (verbose > 0) + pr_err("%s metadata is platform independent\n", meta->name ? : "[no name]"); + } else if (export && meta->export_detail_platform) { + err |= meta->export_detail_platform(verbose, controller_path); } else - err |= meta->detail_platform(verbose, 0); + err |= meta->detail_platform(verbose, 0, controller_path); } return err; diff -Nru mdadm-3.2.5/dlink.c mdadm-3.3/dlink.c --- mdadm-3.2.5/dlink.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/dlink.c 2013-09-03 04:47:47.000000000 +0000 @@ -8,9 +8,9 @@ #ifdef __dietlibc__ char *strncpy(char *dest, const char *src, size_t n) __THROW; #endif +void *xcalloc(size_t num, size_t size); #include "dlink.h" - void *dl_head() { void *h; @@ -63,14 +63,9 @@ if (s == NULL) return NULL; n = dl_newv(char, l+1); - if (n == NULL) - return NULL; - else - { - strncpy(n, s, l); - n[l] = 0; - return n; - } + strncpy(n, s, l); + n[l] = 0; + return n; } char *dl_strdup(char *s) diff -Nru mdadm-3.2.5/dlink.h mdadm-3.3/dlink.h --- mdadm-3.2.5/dlink.h 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/dlink.h 2013-09-03 04:47:47.000000000 +0000 @@ -8,7 +8,7 @@ void * dh_next; }; -#define dl_alloc(size) 
((void*)(((char*)calloc(1,(size)+sizeof(struct __dl_head)))+sizeof(struct __dl_head))) +#define dl_alloc(size) ((void*)(((char*)xcalloc(1,(size)+sizeof(struct __dl_head)))+sizeof(struct __dl_head))) #define dl_new(t) ((t*)dl_alloc(sizeof(t))) #define dl_newv(t,n) ((t*)dl_alloc(sizeof(t)*n)) diff -Nru mdadm-3.2.5/Dump.c mdadm-3.3/Dump.c --- mdadm-3.2.5/Dump.c 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/Dump.c 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,311 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2013 Neil Brown + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include + +int Dump_metadata(char *dev, char *dir, struct context *c, + struct supertype *st) +{ + /* create a new file in 'dir' named for the basename of 'dev'. + * Truncate to the same size as 'dev' and ask the metadata + * handler to copy metadata there. + * For every name in /dev/disk/by-id that points to this device, + * create a hardlink in 'dir'. + * Complain if any of those hardlinks cannot be created. 
+ */ + int fd, fl; + struct stat stb, dstb; + char *base; + char *fname = NULL; + unsigned long long size; + DIR *dirp; + struct dirent *de; + + if (stat(dir, &stb) != 0 || + (S_IFMT & stb.st_mode) != S_IFDIR) { + pr_err("--dump requires an existing directory, not: %s\n", + dir); + return 16; + } + + fd = dev_open(dev, O_RDONLY); + if (fd < 0) { + pr_err("Cannot open %s to dump metadata: %s\n", + dev, strerror(errno)); + return 1; + } + if (!get_dev_size(fd, dev, &size)) { + close(fd); + return 1; + } + + if (st == NULL) + st = guess_super_type(fd, guess_array); + if (!st) { + pr_err("Cannot find RAID metadata on %s\n", dev); + close(fd); + return 1; + } + + st->ignore_hw_compat = 1; + if (st->ss->load_super(st, fd, NULL) != 0) { + pr_err("No %s metadata found on %s\n", + st->ss->name, dev); + close(fd); + return 1; + } + if (st->ss->copy_metadata == NULL) { + pr_err("%s metadata on %s cannot be copied\n", + st->ss->name, dev); + close(fd); + return 1; + } + + base = strrchr(dev, '/'); + if (base) + base++; + else + base = dev; + xasprintf(&fname, "%s/%s", dir, base); + fl = open(fname, O_RDWR|O_CREAT|O_EXCL, 0666); + if (fl < 0) { + pr_err("Cannot create dump file %s: %s\n", + fname, strerror(errno)); + close(fd); + free(fname); + return 1; + } + if (ftruncate(fl, size) < 0) { + pr_err("failed to set size of dump file: %s\n", + strerror(errno)); + close(fd); + close(fl); + free(fname); + return 1; + } + + if (st->ss->copy_metadata(st, fd, fl) != 0) { + pr_err("Failed to copy metadata from %s to %s\n", + dev, fname); + close(fd); + close(fl); + unlink(fname); + free(fname); + return 1; + } + if (c->verbose >= 0) + printf("%s saved as %s.\n", dev, fname); + fstat(fd, &dstb); + close(fd); + close(fl); + if ((dstb.st_mode & S_IFMT) != S_IFBLK) { + /* Not a block device, so cannot create links */ + free(fname); + return 0; + } + /* mostly done: just want to find some other names */ + dirp = opendir("/dev/disk/by-id"); + if (!dirp) { + free(fname); + return 0; + } + 
while ((de = readdir(dirp)) != NULL) { + char *p = NULL; + if (de->d_name[0] == '.') + continue; + xasprintf(&p, "/dev/disk/by-id/%s", de->d_name); + if (stat(p, &stb) != 0 || + (stb.st_mode & S_IFMT) != S_IFBLK || + stb.st_rdev != dstb.st_rdev) { + /* Not this one */ + free(p); + continue; + } + free(p); + xasprintf(&p, "%s/%s", dir, de->d_name); + if (link(fname, p) == 0) { + if (c->verbose >= 0) + printf("%s also saved as %s.\n", + dev, p); + } else { + pr_err("Could not save %s as %s!!\n", + dev, p); + } + free(p); + } + closedir(dirp); + free(fname); + return 0; +} + +int Restore_metadata(char *dev, char *dir, struct context *c, + struct supertype *st, int only) +{ + /* If 'dir' really is a directory we choose a name + * from it that matches a suitable name in /dev/disk/by-id, + * and copy metadata from the file to the device. + * If two names from by-id match and aren't both the same + * inode, we fail. If none match and basename of 'dev' + * can be found in dir, use that. + * If 'dir' is really a file then it is only permitted if + * 'only' is set (meaning there was only one device given) + * and the metadata is restored irrespective of file names. + */ + int fd, fl; + struct stat stb, dstb; + char *fname = NULL; + unsigned long long size; + + if (stat(dir, &stb) != 0) { + pr_err("%s does not exist: cannot restore from there.\n", + dir); + return 16; + } else if ((S_IFMT & stb.st_mode) != S_IFDIR && !only) { + pr_err("--restore requires a directory when multiple devices given\n"); + return 16; + } + + fd = dev_open(dev, O_RDWR); + if (fd < 0) { + pr_err("Cannot open %s to restore metadata: %s\n", + dev, strerror(errno)); + return 1; + } + if (!get_dev_size(fd, dev, &size)) { + close(fd); + return 1; + } + + if ((S_IFMT & stb.st_mode) == S_IFDIR) { + /* choose one name from the directory. 
*/ + DIR *d = opendir(dir); + struct dirent *de; + char *chosen = NULL; + unsigned int chosen_inode = 0; + + fstat(fd, &dstb); + + while (d && (de = readdir(d)) != NULL) { + if (de->d_name[0] == '.') + continue; + xasprintf(&fname, "/dev/disk/by-id/%s", de->d_name); + if (stat(fname, &stb) != 0) { + free(fname); + continue; + } + free(fname); + if ((S_IFMT & stb.st_mode) != S_IFBLK) + continue; + if (stb.st_rdev != dstb.st_rdev) + continue; + /* This file is a good match for our device. */ + xasprintf(&fname, "%s/%s", dir, de->d_name); + if (stat(fname, &stb) != 0) { + /* Weird! */ + free(fname); + continue; + } + if (chosen == NULL) { + chosen = fname; + chosen_inode = stb.st_ino; + continue; + } + if (chosen_inode == stb.st_ino) { + /* same, no need to change */ + free(fname); + continue; + } + /* Oh dear, two names both match. Must give up. */ + pr_err("Both %s and %s seem suitable for %s. Please choose one.\n", + chosen, fname, dev); + free(fname); + free(chosen); + close(fd); + closedir(d); + return 1; + } + closedir(d); + if (!chosen) { + /* One last chance: try basename of device */ + char *base = strrchr(dev, '/'); + if (base) + base++; + else + base = dev; + xasprintf(&fname, "%s/%s", dir, base); + if (stat(fname, &stb) == 0) + chosen = fname; + else + free(fname); + } + fname = chosen; + } else + fname = strdup(dir); + + if (!fname) { + pr_err("Cannot find suitable file in %s for %s\n", + dir, dev); + close(fd); + return 1; + } + + fl = open(fname, O_RDONLY); + if (!fl) { + pr_err("Could not open %s for --restore.\n", + fname); + goto err; + } + if (((unsigned long long)stb.st_size) != size) { + pr_err("%s is not the same size as %s - cannot restore.\n", + fname, dev); + goto err; + } + if (st == NULL) + st = guess_super_type(fl, guess_array); + if (!st) { + pr_err("Cannot find metadata on %s\n", fname); + goto err; + } + st->ignore_hw_compat = 1; + if (st->ss->load_super(st, fl, NULL) != 0) { + pr_err("No %s metadata found on %s\n", + st->ss->name, 
fname); + goto err; + } + if (st->ss->copy_metadata == NULL) { + pr_err("%s metadata on %s cannot be copied\n", + st->ss->name, dev); + goto err; + } + if (st->ss->copy_metadata(st, fl, fd) != 0) { + pr_err("Failed to copy metadata from %s to %s\n", + fname, dev); + goto err; + } + if (c->verbose >= 0) + printf("%s restored from %s.\n", dev, fname); + return 0; + +err: + close(fd); + close(fl); + free(fname); + return 1; +} diff -Nru mdadm-3.2.5/Examine.c mdadm-3.3/Examine.c --- mdadm-3.2.5/Examine.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Examine.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -30,9 +30,9 @@ #endif #include "md_u.h" #include "md_p.h" -int Examine(struct mddev_dev *devlist, int brief, int export, int scan, - int SparcAdjust, struct supertype *forcest, - char *homehost) +int Examine(struct mddev_dev *devlist, + struct context *c, + struct supertype *forcest) { /* Read the raid superblock from a device and @@ -47,7 +47,8 @@ * * utime, state etc * - * If (brief) gather devices for same array and just print a mdadm.conf line including devices= + * If (brief) gather devices for same array and just print a mdadm.conf + * line including devices= * if devlist==NULL, use conf_get_devs() */ int fd; @@ -62,15 +63,15 @@ int spares; } *arrays = NULL; - for (; devlist ; devlist=devlist->next) { + for (; devlist ; devlist = devlist->next) { struct supertype *st; int have_container = 0; fd = dev_open(devlist->devname, O_RDONLY); if (fd < 0) { - if (!scan) { - fprintf(stderr,Name ": cannot open %s: %s\n", - devlist->devname, strerror(errno)); + if (!c->scan) { + pr_err("cannot open %s: %s\n", + devlist->devname, strerror(errno)); rv = 1; } err = 1; @@ -90,19 +91,19 @@ st->ignore_hw_compat = 1; if (!container) err = st->ss->load_super(st, 
fd, - (brief||scan) ? NULL + (c->brief||c->scan) ? NULL :devlist->devname); if (err && st->ss->load_container) { err = st->ss->load_container(st, fd, - (brief||scan) ? NULL + (c->brief||c->scan) ? NULL :devlist->devname); if (!err) have_container = 1; } st->ignore_hw_compat = 0; } else { - if (!brief) { - fprintf(stderr, Name ": No md superblock detected on %s.\n", devlist->devname); + if (!c->brief) { + pr_err("No md superblock detected on %s.\n", devlist->devname); rv = 1; } err = 1; @@ -112,25 +113,25 @@ if (err) continue; - if (SparcAdjust) + if (c->SparcAdjust) st->ss->update_super(st, NULL, "sparc2.2", devlist->devname, 0, 0, NULL); /* Ok, its good enough to try, though the checksum could be wrong */ - if (brief && st->ss->brief_examine_super == NULL) { - if (!scan) - fprintf(stderr, Name ": No brief listing for %s on %s\n", + if (c->brief && st->ss->brief_examine_super == NULL) { + if (!c->scan) + pr_err("No brief listing for %s on %s\n", st->ss->name, devlist->devname); - } else if (brief) { + } else if (c->brief) { struct array *ap; char *d; - for (ap=arrays; ap; ap=ap->next) { + for (ap = arrays; ap; ap = ap->next) { if (st->ss == ap->st->ss && - st->ss->compare_super(ap->st, st)==0) + st->ss->compare_super(ap->st, st) == 0) break; } if (!ap) { - ap = malloc(sizeof(*ap)); + ap = xmalloc(sizeof(*ap)); ap->devs = dl_head(); ap->next = arrays; ap->spares = 0; @@ -144,29 +145,31 @@ ap->spares++; d = dl_strdup(devlist->devname); dl_add(ap->devs, d); - } else if (export) { + } else if (c->export) { if (st->ss->export_examine_super) st->ss->export_examine_super(st); st->ss->free_super(st); } else { printf("%s:\n",devlist->devname); - st->ss->examine_super(st, homehost); + st->ss->examine_super(st, c->homehost); st->ss->free_super(st); } } - if (brief) { + if (c->brief) { struct array *ap; - for (ap=arrays; ap; ap=ap->next) { + for (ap = arrays; ap; ap = ap->next) { char sep='='; char *d; int newline = 0; - ap->st->ss->brief_examine_super(ap->st, brief > 1); + 
ap->st->ss->brief_examine_super(ap->st, c->verbose > 0); if (ap->spares) newline += printf(" spares=%d", ap->spares); - if (brief > 1) { + if (c->verbose > 0) { newline += printf(" devices"); - for (d=dl_next(ap->devs); d!= ap->devs; d=dl_next(d)) { + for (d = dl_next(ap->devs); + d != ap->devs; + d=dl_next(d)) { printf("%c%s", sep, d); sep=','; } @@ -174,13 +177,49 @@ if (ap->st->ss->brief_examine_subarrays) { if (newline) printf("\n"); - ap->st->ss->brief_examine_subarrays(ap->st, brief > 1); + ap->st->ss->brief_examine_subarrays(ap->st, c->verbose); } ap->st->ss->free_super(ap->st); /* FIXME free ap */ - if (ap->spares || brief > 1) + if (ap->spares || c->verbose > 0) printf("\n"); } } return rv; } + +int ExamineBadblocks(char *devname, int brief, struct supertype *forcest) +{ + int fd = dev_open(devname, O_RDONLY); + struct supertype *st = forcest; + int err = 1; + + if (fd < 0) { + pr_err("cannot open %s: %s\n", devname, strerror(errno)); + return 1; + } + if (!st) + st = guess_super(fd); + if (!st) { + if (!brief) + pr_err("No md superblock detected on %s\n", devname); + goto out; + } + if (!st->ss->examine_badblocks) { + pr_err("%s metadata does not support badblocks\n", st->ss->name); + goto out; + } + err = st->ss->load_super(st, fd, brief ? 
NULL : devname); + if (err) + goto out; + err = st->ss->examine_badblocks(st, fd, devname); + +out: + if (fd >= 0) + close(fd); + if (st) { + st->ss->free_super(st); + free(st); + } + return err; +} diff -Nru mdadm-3.2.5/.gitignore mdadm-3.3/.gitignore --- mdadm-3.2.5/.gitignore 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/.gitignore 2013-09-03 04:47:47.000000000 +0000 @@ -4,7 +4,15 @@ /mdadm /mdadm.8 /mdadm.udeb +/mdassemble /mdmon /swap_super /test_stripe /TAGS +/mdadm.O2 +/mdadm.Os +/mdadm.static +/mdassemble.auto +/mdassemble.static +/mdmon.O2 +/raid6check diff -Nru mdadm-3.2.5/Grow.c mdadm-3.3/Grow.c --- mdadm-3.2.5/Grow.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Grow.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -24,6 +24,8 @@ #include "mdadm.h" #include "dlink.h" #include +#include +#include #if ! defined(__BIG_ENDIAN) && ! 
defined(__LITTLE_ENDIAN) #error no endian defined @@ -49,12 +51,9 @@ int disk_count = next_spare + working_disks; dprintf("Called restore_backup()\n"); - fdlist = malloc(sizeof(int) * disk_count); - if (fdlist == NULL) { - fprintf(stderr, - Name ": cannot allocate memory for disk list\n"); - return 1; - } + fdlist = xmalloc(sizeof(int) * disk_count); + + enable_fds(next_spare); for (i = 0; i < next_spare; i++) fdlist[i] = -1; for (dev = content->devs; dev; dev = dev->next) { @@ -84,10 +83,10 @@ } free(fdlist); if (err) { - fprintf(stderr, Name ": Failed to restore critical" - " section for reshape - sorry.\n"); + pr_err("Failed to restore critical" + " section for reshape - sorry.\n"); if (!backup_file) - fprintf(stderr, Name ": Possibly you need" + pr_err("Possibly you need" " to specify a --backup-file\n"); return 1; } @@ -115,23 +114,24 @@ char *subarray = NULL; if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) { - fprintf(stderr, Name ": cannot get array info for %s\n", devname); + pr_err("cannot get array info for %s\n", devname); return 1; } if (info.array.level != -1) { - fprintf(stderr, Name ": can only add devices to linear arrays\n"); + pr_err("can only add devices to linear arrays\n"); return 1; } st = super_by_fd(fd, &subarray); if (!st) { - fprintf(stderr, Name ": cannot handle arrays with superblock version %d\n", info.array.major_version); + pr_err("cannot handle arrays with superblock version %d\n", + info.array.major_version); return 1; } if (subarray) { - fprintf(stderr, Name ": Cannot grow linear sub-arrays yet\n"); + pr_err("Cannot grow linear sub-arrays yet\n"); free(subarray); free(st); return 1; @@ -139,18 +139,19 @@ nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT); if (nfd < 0) { - fprintf(stderr, Name ": cannot open %s\n", newdev); + pr_err("cannot open %s\n", newdev); free(st); return 1; } fstat(nfd, &stb); if ((stb.st_mode & S_IFMT) != S_IFBLK) { - fprintf(stderr, Name ": %s is not a block device!\n", newdev); + pr_err("%s is not a block 
device!\n", newdev); close(nfd); free(st); return 1; } - /* now check out all the devices and make sure we can read the superblock */ + /* now check out all the devices and make sure we can read the + * superblock */ for (d=0 ; d < info.array.raid_disks ; d++) { mdu_disk_info_t disk; char *dv; @@ -159,7 +160,7 @@ disk.number = d; if (ioctl(fd, GET_DISK_INFO, &disk) < 0) { - fprintf(stderr, Name ": cannot get device detail for device %d\n", + pr_err("cannot get device detail for device %d\n", d); close(nfd); free(st); @@ -167,7 +168,7 @@ } dv = map_dev(disk.major, disk.minor, 1); if (!dv) { - fprintf(stderr, Name ": cannot find device file for device %d\n", + pr_err("cannot find device file for device %d\n", d); close(nfd); free(st); @@ -175,14 +176,14 @@ } fd2 = dev_open(dv, O_RDWR); if (fd2 < 0) { - fprintf(stderr, Name ": cannot open device file %s\n", dv); + pr_err("cannot open device file %s\n", dv); close(nfd); free(st); return 1; } if (st->ss->load_super(st, fd2, NULL)) { - fprintf(stderr, Name ": cannot find super block on %s\n", dv); + pr_err("cannot find super block on %s\n", dv); close(nfd); close(fd2); free(st); @@ -203,7 +204,7 @@ 0, 0, NULL); if (st->ss->store_super(st, nfd)) { - fprintf(stderr, Name ": Cannot store new superblock on %s\n", + pr_err("Cannot store new superblock on %s\n", newdev); close(nfd); return 1; @@ -211,7 +212,7 @@ close(nfd); if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) { - fprintf(stderr, Name ": Cannot add new disk to this array\n"); + pr_err("Cannot add new disk to this array\n"); return 1; } /* Well, that seems to have worked. 
@@ -219,7 +220,7 @@ */ if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) { - fprintf(stderr, Name ": cannot get array info for %s\n", devname); + pr_err("cannot get array info for %s\n", devname); return 1; } @@ -230,23 +231,23 @@ disk.number = d; if (ioctl(fd, GET_DISK_INFO, &disk) < 0) { - fprintf(stderr, Name ": cannot get device detail for device %d\n", + pr_err("cannot get device detail for device %d\n", d); return 1; } dv = map_dev(disk.major, disk.minor, 1); if (!dv) { - fprintf(stderr, Name ": cannot find device file for device %d\n", + pr_err("cannot find device file for device %d\n", d); return 1; } fd2 = dev_open(dv, O_RDWR); if (fd2 < 0) { - fprintf(stderr, Name ": cannot open device file %s\n", dv); + pr_err("cannot open device file %s\n", dv); return 1; } if (st->ss->load_super(st, fd2, NULL)) { - fprintf(stderr, Name ": cannot find super block on %s\n", dv); + pr_err("cannot find super block on %s\n", dv); close(fd); return 1; } @@ -259,7 +260,7 @@ 0, 0, NULL); if (st->ss->store_super(st, fd2)) { - fprintf(stderr, Name ": Cannot store new superblock on %s\n", dv); + pr_err("Cannot store new superblock on %s\n", dv); close(fd2); return 1; } @@ -269,7 +270,7 @@ return 0; } -int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force) +int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) { /* * First check that array doesn't have a bitmap @@ -290,7 +291,7 @@ if (vers < 9003) { major = BITMAP_MAJOR_HOSTENDIAN; - fprintf(stderr, Name ": Warning - bitmaps created on this kernel" + pr_err("Warning - bitmaps created on this kernel" " are not portable\n" " between different architectures. 
Consider upgrading" " the Linux kernel.\n"); @@ -298,48 +299,48 @@ if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) { if (errno == ENOMEM) - fprintf(stderr, Name ": Memory allocation failure.\n"); + pr_err("Memory allocation failure.\n"); else - fprintf(stderr, Name ": bitmaps not supported by this kernel.\n"); + pr_err("bitmaps not supported by this kernel.\n"); return 1; } if (bmf.pathname[0]) { - if (strcmp(file,"none")==0) { + if (strcmp(s->bitmap_file,"none")==0) { if (ioctl(fd, SET_BITMAP_FILE, -1)!= 0) { - fprintf(stderr, Name ": failed to remove bitmap %s\n", + pr_err("failed to remove bitmap %s\n", bmf.pathname); return 1; } return 0; } - fprintf(stderr, Name ": %s already has a bitmap (%s)\n", + pr_err("%s already has a bitmap (%s)\n", devname, bmf.pathname); return 1; } if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { - fprintf(stderr, Name ": cannot get array status for %s\n", devname); + pr_err("cannot get array status for %s\n", devname); return 1; } if (array.state & (1<bitmap_file, "none")==0) { array.state &= ~(1<bitmap_file, "none") == 0) { + pr_err("no bitmap found on %s\n", devname); return 1; } if (array.level <= 0) { - fprintf(stderr, Name ": Bitmaps not meaningful with level %s\n", + pr_err("Bitmaps not meaningful with level %s\n", map_num(pers, array.level)?:"of this array"); return 1; } @@ -353,7 +354,7 @@ bitmapsize = get_component_size(fd); } if (bitmapsize == 0) { - fprintf(stderr, Name ": Cannot reliably determine size of array to create bitmap - sorry.\n"); + pr_err("Cannot reliably determine size of array to create bitmap - sorry.\n"); return 1; } @@ -364,27 +365,27 @@ st = super_by_fd(fd, &subarray); if (!st) { - fprintf(stderr, Name ": Cannot understand version %d.%d\n", + pr_err("Cannot understand version %d.%d\n", array.major_version, array.minor_version); return 1; } if (subarray) { - fprintf(stderr, Name ": Cannot add bitmaps to sub-arrays yet\n"); + pr_err("Cannot add bitmaps to sub-arrays yet\n"); free(subarray); free(st); return 1; } 
- if (strcmp(file, "internal") == 0) { + if (strcmp(s->bitmap_file, "internal") == 0) { int rv; int d; int offset_setable = 0; struct mdinfo *mdi; if (st->ss->add_internal_bitmap == NULL) { - fprintf(stderr, Name ": Internal bitmaps not supported " + pr_err("Internal bitmaps not supported " "with %s metadata\n", st->ss->name); return 1; } - mdi = sysfs_read(fd, -1, GET_BITMAP_LOCATION); + mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION); if (mdi) offset_setable = 1; for (d=0; d< st->max_devs; d++) { @@ -406,14 +407,14 @@ if (st->ss->load_super(st, fd2, NULL)==0) { if (st->ss->add_internal_bitmap( st, - &chunk, delay, write_behind, + &s->bitmap_chunk, c->delay, s->write_behind, bitmapsize, offset_setable, major) ) st->ss->write_bitmap(st, fd2); else { - fprintf(stderr, Name ": failed " - "to create internal bitmap - chunksize problem.\n"); + pr_err("failed to create internal bitmap" + " - chunksize problem.\n"); close(fd2); return 1; } @@ -423,7 +424,7 @@ } if (offset_setable) { st->ss->getinfo_super(st, mdi, NULL); - sysfs_init(mdi, fd, -1); + sysfs_init(mdi, fd, NULL); rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location", mdi->bitmap_offset); } else { @@ -432,10 +433,9 @@ } if (rv < 0) { if (errno == EBUSY) - fprintf(stderr, Name - ": Cannot add bitmap while array is" - " resyncing or reshaping etc.\n"); - fprintf(stderr, Name ": failed to set internal bitmap.\n"); + pr_err("Cannot add bitmap while array is" + " resyncing or reshaping etc.\n"); + pr_err("failed to set internal bitmap.\n"); return 1; } } else { @@ -445,7 +445,7 @@ int max_devs = st->max_devs; /* try to load a superblock */ - for (d=0; d= 0) { if (st->ss->load_super(st, fd2, NULL) == 0) { @@ -468,26 +469,25 @@ } } if (d == max_devs) { - fprintf(stderr, Name ": cannot find UUID for array!\n"); + pr_err("cannot find UUID for array!\n"); return 1; } - if (CreateBitmap(file, force, (char*)uuid, chunk, - delay, write_behind, bitmapsize, major)) { + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, 
s->bitmap_chunk, + c->delay, s->write_behind, bitmapsize, major)) { return 1; } - bitmap_fd = open(file, O_RDWR); + bitmap_fd = open(s->bitmap_file, O_RDWR); if (bitmap_fd < 0) { - fprintf(stderr, Name ": weird: %s cannot be opened\n", - file); + pr_err("weird: %s cannot be opened\n", + s->bitmap_file); return 1; } if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) { int err = errno; if (errno == EBUSY) - fprintf(stderr, Name - ": Cannot add bitmap while array is" - " resyncing or reshaping etc.\n"); - fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n", + pr_err("Cannot add bitmap while array is" + " resyncing or reshaping etc.\n"); + pr_err("Cannot set bitmap file for %s: %s\n", devname, strerror(err)); return 1; } @@ -526,7 +526,7 @@ { int i; int csum = 0; - for (i=0; icontainer_dev != NoMdDev - ? st->container_dev : st->devnum); - char container[40]; + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); struct mdstat_ent *ent, *e; int is_idle = 1; - fmt_devname(container, container_dev); ent = mdstat_read(0, 0); for (e = ent ; e; e = e->next) { if (!is_container_member(e, container)) @@ -558,17 +556,14 @@ static int freeze_container(struct supertype *st) { - int container_dev = (st->container_dev != NoMdDev - ? st->container_dev : st->devnum); - char container[40]; + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); if (!check_idle(st)) return -1; - fmt_devname(container, container_dev); - if (block_monitor(container, 1)) { - fprintf(stderr, Name ": failed to freeze container\n"); + pr_err("failed to freeze container\n"); return -2; } @@ -577,11 +572,8 @@ static void unfreeze_container(struct supertype *st) { - int container_dev = (st->container_dev != NoMdDev - ? st->container_dev : st->devnum); - char container[40]; - - fmt_devname(container, container_dev); + char *container = (st->container_devnm[0] + ? 
st->container_devnm : st->devnm); unblock_monitor(container, 1); } @@ -597,7 +589,7 @@ if (st->ss->external) return freeze_container(st); else { - struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION); + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); int err; char buf[20]; @@ -619,7 +611,7 @@ if (st->ss->external) return unfreeze_container(st); else { - struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION); + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); if (sra) sysfs_set_str(sra, NULL, "sync_action", "idle"); @@ -635,18 +627,14 @@ if (fd < 0) return; - while (sysfs_fd_get_str(fd, action, 20) > 0 && - strncmp(action, "reshape", 7) == 0) { - fd_set rfds; - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - select(fd+1, NULL, NULL, &rfds, NULL); - } + while (sysfs_fd_get_str(fd, action, 20) > 0 && + strncmp(action, "reshape", 7) == 0) + sysfs_wait(fd, NULL); close(fd); } -static int reshape_super(struct supertype *st, long long size, int level, - int layout, int chunksize, int raid_disks, +static int reshape_super(struct supertype *st, unsigned long long size, + int level, int layout, int chunksize, int raid_disks, int delta_disks, char *backup_file, char *dev, int direction, int verbose) { @@ -655,7 +643,7 @@ return 0; if (!st->ss->reshape_super || !st->ss->manage_reshape) { - fprintf(stderr, Name ": %s metadata does not support reshape\n", + pr_err("%s metadata does not support reshape\n", st->ss->name); return 1; } @@ -742,7 +730,8 @@ sysfs_set_num(sra, NULL, "suspend_hi", 0); sysfs_set_num(sra, NULL, "suspend_lo", 0); sysfs_set_num(sra, NULL, "sync_min", 0); - sysfs_set_str(sra, NULL, "sync_max", "max"); + // It isn't safe to reset sync_max as we aren't monitoring. + // Array really should be stopped at this point. 
} int remove_disks_for_takeover(struct supertype *st, @@ -848,6 +837,7 @@ int d = 0; struct mdinfo *sd; + enable_fds(nrdisks); for (d = 0; d <= nrdisks; d++) fdlist[d] = -1; d = raid_disks; @@ -861,9 +851,8 @@ = dev_open(dn, O_RDONLY); offsets[sd->disk.raid_disk] = sd->data_offset*512; if (fdlist[sd->disk.raid_disk] < 0) { - fprintf(stderr, - Name ": %s: cannot open component %s\n", - devname, dn ? dn : "-unknown-"); + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); d = -1; goto release; } @@ -874,7 +863,7 @@ fdlist[d] = dev_open(dn, O_RDWR); offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512; if (fdlist[d] < 0) { - fprintf(stderr, Name ": %s: cannot open component %s\n", + pr_err("%s: cannot open component %s\n", devname, dn ? dn : "-unknown-"); d = -1; goto release; @@ -905,7 +894,7 @@ S_IRUSR | S_IWUSR); *offsets = 8 * 512; if (*fdlist < 0) { - fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n", + pr_err("%s: cannot create backup file %s: %s\n", devname, backup_file, strerror(errno)); return 0; } @@ -918,7 +907,7 @@ dev = stb.st_dev; fstat(fd, &stb); if (stb.st_rdev == dev) { - fprintf(stderr, Name ": backup file must NOT be" + pr_err("backup file must NOT be" " on the array being reshaped.\n"); close(*fdlist); return 0; @@ -927,14 +916,14 @@ memset(buf, 0, 512); for (i=0; i < blocks + 8 ; i++) { if (write(*fdlist, buf, 512) != 512) { - fprintf(stderr, Name ": %s: cannot create" + pr_err("%s: cannot create" " backup file %s: %s\n", devname, backup_file, strerror(errno)); return 0; } } if (fsync(*fdlist) != 0) { - fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n", + pr_err("%s: cannot create backup file %s: %s\n", devname, backup_file, strerror(errno)); return 0; } @@ -954,19 +943,14 @@ a = (ochunk/512) * odata; b = (nchunk/512) * ndata; /* Find GCD */ - while (a != b) { - if (a < b) - b -= a; - if (b < a) - a -= b; - } + a = GCD(a, b); /* LCM == product / GCD */ blocks = (ochunk/512) * 
(nchunk/512) * odata * ndata / a; return blocks; } -char *analyse_change(struct mdinfo *info, struct reshape *re) +char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re) { /* Based on the current array state in info->array and * the changes in info->new_* etc, determine: @@ -983,12 +967,16 @@ * This can be called as part of starting a reshape, or * when assembling an array that is undergoing reshape. */ + int near, far, offset, copies; int new_disks; + int old_chunk, new_chunk; /* delta_parity records change in number of devices * caused by level change */ int delta_parity = 0; + memset(re, 0, sizeof(*re)); + /* If a new level not explicitly given, we assume no-change */ if (info->new_level == UnSet) info->new_level = info->array.level; @@ -1003,9 +991,16 @@ /* chunk size is meaningful, must divide component_size * evenly */ - if (info->component_size % (info->new_chunk/512)) - return "New chunk size does not" - " divide component size"; + if (info->component_size % (info->new_chunk/512)) { + unsigned long long shrink = info->component_size; + shrink &= ~(unsigned long long)(info->new_chunk/512-1); + pr_err("New chunk size (%dK) does not evenly divide device size (%lluk)\n", + info->new_chunk/1024, info->component_size/2); + pr_err("After shrinking any filesystem, \"mdadm --grow %s --size %llu\"\n", + devname, shrink/2); + pr_err("will shrink the array so the given chunk size would work.\n"); + return ""; + } break; default: return "chunk size not meaningful for this level"; @@ -1033,9 +1028,6 @@ re->level = 0; re->before.data_disks = 1; re->after.data_disks = 1; - re->before.layout = 0; - re->backup_blocks = 0; - re->parity = 0; return NULL; } if (info->new_level == 1) { @@ -1043,8 +1035,6 @@ /* Don't know what to do */ return "no change requested for Growing RAID1"; re->level = 1; - re->backup_blocks = 0; - re->parity = 0; return NULL; } if (info->array.raid_disks == 2 && @@ -1070,38 +1060,94 @@ return "Impossibly level change request for 
RAID1"; case 10: - /* RAID10 can only be converted from near mode to - * RAID0 by removing some devices + /* RAID10 can be converted from near mode to + * RAID0 by removing some devices. + * It can also be reshaped if the kernel supports + * new_data_offset. */ - if ((info->array.layout & ~0xff) != 0x100) - return "Cannot Grow RAID10 with far/offset layout"; - /* number of devices must be multiple of number of copies */ - if (info->array.raid_disks % (info->array.layout & 0xff)) - return "RAID10 layout too complex for Grow operation"; + switch (info->new_level) { + case 0: + if ((info->array.layout & ~0xff) != 0x100) + return "Cannot Grow RAID10 with far/offset layout"; + /* number of devices must be multiple of number of copies */ + if (info->array.raid_disks % (info->array.layout & 0xff)) + return "RAID10 layout too complex for Grow operation"; - if (info->new_level != 0) - return "RAID10 can only be changed to RAID0"; - new_disks = (info->array.raid_disks - / (info->array.layout & 0xff)); - if (info->delta_disks == UnSet) - info->delta_disks = (new_disks - - info->array.raid_disks); - - if (info->delta_disks != new_disks - info->array.raid_disks) - return "New number of raid-devices impossible for RAID10"; - if (info->new_chunk && - info->new_chunk != info->array.chunk_size) - return "Cannot change chunk-size with RAID10 Grow"; - - /* looks good */ - re->level = 0; - re->parity = 0; - re->before.data_disks = new_disks; - re->after.data_disks = re->before.data_disks; - re->before.layout = 0; - re->backup_blocks = 0; - return NULL; + new_disks = (info->array.raid_disks + / (info->array.layout & 0xff)); + if (info->delta_disks == UnSet) + info->delta_disks = (new_disks + - info->array.raid_disks); + + if (info->delta_disks != new_disks - info->array.raid_disks) + return "New number of raid-devices impossible for RAID10"; + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID10 Grow"; + + /* looks good */ + 
re->level = 0; + re->before.data_disks = new_disks; + re->after.data_disks = re->before.data_disks; + return NULL; + case 10: + near = info->array.layout & 0xff; + far = (info->array.layout >> 8) & 0xff; + offset = info->array.layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 in far-mode"; + copies = near * far; + + old_chunk = info->array.chunk_size * far; + + if (info->new_layout == UnSet) + info->new_layout = info->array.layout; + else { + near = info->new_layout & 0xff; + far = (info->new_layout >> 8) & 0xff; + offset = info->new_layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 to far-mode"; + if (near * far != copies) + return "Cannot change number of copies" + " when reshaping RAID10"; + } + if (info->delta_disks == UnSet) + info->delta_disks = 0; + new_disks = (info->array.raid_disks + + info->delta_disks); + + new_chunk = info->new_chunk * far; + + re->level = 10; + re->before.layout = info->array.layout; + re->before.data_disks = info->array.raid_disks; + re->after.layout = info->new_layout; + re->after.data_disks = new_disks; + /* For RAID10 we don't do backup but do allow reshape, + * so set backup_blocks to INVALID_SECTORS rather than + * zero. + * And there is no need to synchronise stripes on both + * 'old' and 'new'. So the important + * number is the minimum data_offset difference + * which is the larger of (offset copies * chunk). 
+ */ + re->backup_blocks = INVALID_SECTORS; + re->min_offset_change = max(old_chunk, new_chunk) / 512; + if (new_disks < re->before.data_disks && + info->space_after < re->min_offset_change) + /* Reduce component size by one chunk */ + re->new_size = (info->component_size - + re->min_offset_change); + else + re->new_size = info->component_size; + re->new_size = re->new_size * new_disks / copies; + return NULL; + + default: + return "RAID10 can only be changed to RAID0"; + } case 0: /* RAID0 can be converted to RAID10, or to RAID456 */ if (info->new_level == 10) { @@ -1132,12 +1178,10 @@ return "Cannot change chunk-size with RAID0->RAID10"; /* looks good */ re->level = 10; - re->parity = 0; re->before.data_disks = (info->array.raid_disks + info->delta_disks); re->after.data_disks = re->before.data_disks; re->before.layout = info->new_layout; - re->backup_blocks = 0; return NULL; } @@ -1155,11 +1199,15 @@ delta_parity = 1; re->level = 5; re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r5layout, "default"); break; case 6: delta_parity = 2; re->level = 6; re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r6layout, "default"); break; default: return "Impossible level change requested"; @@ -1219,7 +1267,8 @@ return "Cannot set raid_disk when " "converting RAID5->RAID1"; re->level = 1; - break; + info->new_chunk = 0; + return NULL; default: return "Impossible level change requested"; } @@ -1257,9 +1306,12 @@ switch (re->level) { case 4: - re->after.layout = 0 ; break; + re->before.layout = 0; + re->after.layout = 0; + break; case 5: - re->after.layout = ALGORITHM_PARITY_N; break; + re->after.layout = ALGORITHM_PARITY_N; + break; } break; @@ -1270,9 +1322,12 @@ switch (re->level) { case 4: - re->after.layout = 0 ; break; + re->before.layout = 0; + re->after.layout = 0; + break; case 5: - re->after.layout = ALGORITHM_PARITY_N; break; + re->after.layout = 
ALGORITHM_PARITY_N; + break; } break; @@ -1344,25 +1399,31 @@ + info->delta_disks - delta_parity); switch (re->level) { - case 6: re->parity = 2; break; + case 6: re->parity = 2; + break; case 4: - case 5: re->parity = 1; break; - default: re->parity = 0; break; + case 5: re->parity = 1; + break; + default: re->parity = 0; + break; } /* So we have a restripe operation, we need to calculate the number * of blocks per reshape operation. */ + re->new_size = info->component_size * re->before.data_disks; if (info->new_chunk == 0) info->new_chunk = info->array.chunk_size; if (re->after.data_disks == re->before.data_disks && re->after.layout == re->before.layout && info->new_chunk == info->array.chunk_size) { - /* Nothing to change */ + /* Nothing to change, can change level immediately. */ + re->level = info->new_level; re->backup_blocks = 0; return NULL; } if (re->after.data_disks == 1 && re->before.data_disks == 1) { /* chunk and layout changes make no difference */ + re->level = info->new_level; re->backup_blocks = 0; return NULL; } @@ -1379,6 +1440,7 @@ info->new_chunk, info->array.chunk_size, re->after.data_disks, re->before.data_disks); + re->min_offset_change = re->backup_blocks / re->before.data_disks; re->new_size = info->component_size * re->after.data_disks; return NULL; @@ -1425,7 +1487,8 @@ static int reshape_array(char *container, int fd, char *devname, struct supertype *st, struct mdinfo *info, int force, struct mddev_dev *devlist, - char *backup_file, int quiet, int forked, + unsigned long long data_offset, + char *backup_file, int verbose, int forked, int restart, int freeze_reshape); static int reshape_container(char *container, char *devname, int mdfd, @@ -1433,13 +1496,12 @@ struct mdinfo *info, int force, char *backup_file, - int quiet, int restart, int freeze_reshape); + int verbose, int restart, int freeze_reshape); -int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, - long long size, - int level, char *layout_str, int chunksize, 
int raid_disks, +int Grow_reshape(char *devname, int fd, struct mddev_dev *devlist, - int assume_clean, int force) + unsigned long long data_offset, + struct context *c, struct shape *s) { /* Make some changes in the shape of an array. * The kernel must support the change. @@ -1466,7 +1528,6 @@ int frozen; int changed = 0; char *container = NULL; - char container_buf[20]; int cfd = -1; struct mddev_dev *dv; @@ -1476,35 +1537,40 @@ struct mdinfo *sra; if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) { - fprintf(stderr, Name ": %s is not an active md array - aborting\n", + pr_err("%s is not an active md array - aborting\n", devname); return 1; } + if (data_offset != INVALID_SECTORS && array.level != 10 + && (array.level < 4 || array.level > 6)) { + pr_err("--grow --data-offset not yet supported\n"); + return 1; + } - if (size >= 0 && - (chunksize || level!= UnSet || layout_str || raid_disks)) { - fprintf(stderr, Name ": cannot change component size at the same time " + if (s->size > 0 && + (s->chunk || s->level!= UnSet || s->layout_str || s->raiddisks)) { + pr_err("cannot change component size at the same time " "as other changes.\n" " Change size first, then check data is intact before " "making other changes.\n"); return 1; } - if (raid_disks && raid_disks < array.raid_disks && array.level > 1 && + if (s->raiddisks && s->raiddisks < array.raid_disks && array.level > 1 && get_linux_version() < 2006032 && !check_env("MDADM_FORCE_FEWER")) { - fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n" + pr_err("reducing the number of devices is not safe before Linux 2.6.32\n" " Please use a newer kernel\n"); return 1; } st = super_by_fd(fd, &subarray); if (!st) { - fprintf(stderr, Name ": Unable to determine metadata format for %s\n", devname); + pr_err("Unable to determine metadata format for %s\n", devname); return 1; } - if (raid_disks > st->max_devs) { - fprintf(stderr, Name ": Cannot increase raid-disks on this array" + if (s->raiddisks > 
st->max_devs) { + pr_err("Cannot increase raid-disks on this array" " beyond %d\n", st->max_devs); return 1; } @@ -1514,32 +1580,28 @@ * pre-requisite spare devices (mdmon owns final validation) */ if (st->ss->external) { - int container_dev; int rv; if (subarray) { - container_dev = st->container_dev; - cfd = open_dev_excl(st->container_dev); + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); } else { - container_dev = st->devnum; + container = st->devnm; close(fd); - cfd = open_dev_excl(st->devnum); + cfd = open_dev_excl(st->devnm); fd = cfd; } if (cfd < 0) { - fprintf(stderr, Name ": Unable to open container for %s\n", + pr_err("Unable to open container for %s\n", devname); free(subarray); return 1; } - fmt_devname(container_buf, container_dev); - container = container_buf; - rv = st->ss->load_container(st, cfd, NULL); if (rv) { - fprintf(stderr, Name ": Cannot read superblock for %s\n", + pr_err("Cannot read superblock for %s\n", devname); free(subarray); return 1; @@ -1563,11 +1625,10 @@ & (1<update_tail = &st->updates; } added_disks = 0; for (dv = devlist; dv; dv = dv->next) added_disks++; - if (raid_disks > array.raid_disks && - array.spare_disks +added_disks < (raid_disks - array.raid_disks) && - !force) { - fprintf(stderr, - Name ": Need %d spare%s to avoid degraded array," - " and only have %d.\n" - " Use --force to over-ride this check.\n", - raid_disks - array.raid_disks, - raid_disks - array.raid_disks == 1 ? "" : "s", - array.spare_disks + added_disks); + if (s->raiddisks > array.raid_disks && + array.spare_disks +added_disks < (s->raiddisks - array.raid_disks) && + !c->force) { + pr_err("Need %d spare%s to avoid degraded array," + " and only have %d.\n" + " Use --force to over-ride this check.\n", + s->raiddisks - array.raid_disks, + s->raiddisks - array.raid_disks == 1 ? 
"" : "s", + array.spare_disks + added_disks); return 1; } - sra = sysfs_read(fd, 0, GET_LEVEL | GET_DISKS | GET_DEVS + sra = sysfs_read(fd, NULL, GET_LEVEL | GET_DISKS | GET_DEVS | GET_STATE | GET_VERSION); if (sra) { if (st->ss->external && subarray == NULL) { @@ -1603,7 +1663,7 @@ sra->array.level = LEVEL_CONTAINER; } } else { - fprintf(stderr, Name ": failed to read sysfs parameters for %s\n", + pr_err("failed to read sysfs parameters for %s\n", devname); return 1; } @@ -1613,24 +1673,30 @@ sysfs_free(sra); return 1; } else if (frozen < 0) { - fprintf(stderr, Name ": %s is performing resync/recovery and cannot" + pr_err("%s is performing resync/recovery and cannot" " be reshaped\n", devname); sysfs_free(sra); return 1; } /* ========= set size =============== */ - if (size >= 0 && (size == 0 || size != array.size)) { - long long orig_size = get_component_size(fd)/2; - long long min_csize; + if (s->size > 0 && (s->size == MAX_SIZE || s->size != (unsigned)array.size)) { + unsigned long long orig_size = get_component_size(fd)/2; + unsigned long long min_csize; struct mdinfo *mdi; int raid0_takeover = 0; if (orig_size == 0) - orig_size = array.size; + orig_size = (unsigned) array.size; + + if (orig_size == 0) { + pr_err("Cannot set device size in this type of array.\n"); + rv = 1; + goto release; + } - if (reshape_super(st, size, UnSet, UnSet, 0, 0, UnSet, NULL, - devname, APPLY_METADATA_CHANGES, !quiet)) { + if (reshape_super(st, s->size, UnSet, UnSet, 0, 0, UnSet, NULL, + devname, APPLY_METADATA_CHANGES, c->verbose > 0)) { rv = 1; goto release; } @@ -1652,7 +1718,7 @@ dprintf("Metadata size correction from %llu to " "%llu (%llu)\n", orig_size, new_size, new_size * data_disks); - size = new_size; + s->size = new_size; sysfs_free(sizeinfo); } } @@ -1665,7 +1731,8 @@ min_csize = 0; rv = 0; for (mdi = sra->devs; mdi; mdi = mdi->next) { - if (sysfs_set_num(sra, mdi, "size", size) < 0) { + if (sysfs_set_num(sra, mdi, "size", + s->size == MAX_SIZE ? 
0 : s->size) < 0) { /* Probably kernel refusing to let us * reduce the size - not an error. */ @@ -1680,29 +1747,29 @@ if (csize >= 2ULL*1024*1024*1024) csize = 2ULL*1024*1024*1024; if ((min_csize == 0 || (min_csize - > (long long)csize))) + > csize))) min_csize = csize; } } } if (rv) { - fprintf(stderr, Name ": Cannot set size on " + pr_err("Cannot set size on " "array members.\n"); goto size_change_error; } - if (min_csize && size > min_csize) { - fprintf(stderr, Name ": Cannot safely make this array " + if (min_csize && s->size > min_csize) { + pr_err("Cannot safely make this array " "use more than 2TB per device on this kernel.\n"); rv = 1; goto size_change_error; } - if (min_csize && size == 0) { + if (min_csize && s->size == MAX_SIZE) { /* Don't let the kernel choose a size - it will get * it wrong */ - fprintf(stderr, Name ": Limited v0.90 array to " - "2TB per device\n"); - size = min_csize; + pr_err("Limited v0.90 array to " + "2TB per device\n"); + s->size = min_csize; } if (st->ss->external) { if (sra->array.level == 0) { @@ -1718,22 +1785,24 @@ } /* make sure mdmon is * aware of the new level */ - if (!mdmon_running(st->container_dev)) - start_mdmon(st->container_dev); + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); ping_monitor(container); - if (mdmon_running(st->container_dev) && + if (mdmon_running(st->container_devnm) && st->update_tail == NULL) st->update_tail = &st->updates; } - array.size = size; - if (array.size != size) { + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + if ((unsigned)array.size != s->size) { /* got truncated to 32bit, write to * component_size instead */ if (sra) rv = sysfs_set_num(sra, NULL, - "component_size", size); + "component_size", s->size); else rv = -1; } else { @@ -1764,53 +1833,54 @@ if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0, UnSet, NULL, devname, ROLLBACK_METADATA_CHANGES, - !quiet) == 0) + c->verbose) == 0) sync_metadata(st); - fprintf(stderr, Name ": 
Cannot set device size for %s: %s\n", + pr_err("Cannot set device size for %s: %s\n", devname, strerror(err)); if (err == EBUSY && (array.state & (1<assume_clean) { + /* This will fail on kernels older than 3.0 unless * a backport has been arranged. */ if (sra == NULL || sysfs_set_str(sra, NULL, "resync_start", "none") < 0) - fprintf(stderr, Name ": --assume-clean not support with --grow on this kernel\n"); + pr_err("--assume-clean not supported with --grow on this kernel\n"); } ioctl(fd, GET_ARRAY_INFO, &array); - size = get_component_size(fd)/2; - if (size == 0) - size = array.size; - if (!quiet) { - if (size == orig_size) - fprintf(stderr, Name ": component size of %s " + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; + if (c->verbose >= 0) { + if (s->size == orig_size) + pr_err("component size of %s " "unchanged at %lluK\n", - devname, size); + devname, s->size); else - fprintf(stderr, Name ": component size of %s " + pr_err("component size of %s " "has been set to %lluK\n", - devname, size); + devname, s->size); } changed = 1; } else if (array.level != LEVEL_CONTAINER) { - size = get_component_size(fd)/2; - if (size == 0) - size = array.size; + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; } /* See if there is anything else to do */ - if ((level == UnSet || level == array.level) && - (layout_str == NULL) && - (chunksize == 0 || chunksize == array.chunk_size) && - (raid_disks == 0 || raid_disks == array.raid_disks)) { + if ((s->level == UnSet || s->level == array.level) && + (s->layout_str == NULL) && + (s->chunk == 0 || s->chunk == array.chunk_size) && + data_offset == INVALID_SECTORS && + (s->raiddisks == 0 || s->raiddisks == array.raid_disks)) { /* Nothing more to do */ - if (!changed && !quiet) - fprintf(stderr, Name ": %s: no change requested\n", + if (!changed && c->verbose >= 0) + pr_err("%s: no change requested\n", devname); goto release; } @@ -1821,9 +1891,9 @@ * - far_copies == 1 * - 
near_copies == 2 */ - if ((level == 0 && array.level == 10 && sra && + if ((s->level == 0 && array.level == 10 && sra && array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) || - (level == 0 && array.level == 1 && sra)) { + (s->level == 0 && array.level == 1 && sra)) { int err; err = remove_disks_for_takeover(st, sra, array.layout); if (err) { @@ -1843,36 +1913,34 @@ memset(&info, 0, sizeof(info)); info.array = array; - sysfs_init(&info, fd, NoMdDev); + sysfs_init(&info, fd, NULL); strcpy(info.text_version, sra->text_version); - info.component_size = size*2; - info.new_level = level; - info.new_chunk = chunksize * 1024; + info.component_size = s->size*2; + info.new_level = s->level; + info.new_chunk = s->chunk * 1024; if (info.array.level == LEVEL_CONTAINER) { info.delta_disks = UnSet; - info.array.raid_disks = raid_disks; - } else if (raid_disks) - info.delta_disks = raid_disks - info.array.raid_disks; + info.array.raid_disks = s->raiddisks; + } else if (s->raiddisks) + info.delta_disks = s->raiddisks - info.array.raid_disks; else info.delta_disks = UnSet; - if (layout_str == NULL) { + if (s->layout_str == NULL) { info.new_layout = UnSet; if (info.array.level == 6 && (info.new_level == 6 || info.new_level == UnSet) && info.array.layout >= 16) { - fprintf(stderr, Name - ": %s has a non-standard layout. If you" - " wish to preserve this\n" - " during the reshape, please specify" - " --layout=preserve\n" - " If you want to change it, specify a" - " layout or use --layout=normalise\n", - devname); + pr_err("%s has a non-standard layout. 
If you" + " wish to preserve this\n", devname); + cont_err("during the reshape, please specify" + " --layout=preserve\n"); + cont_err("If you want to change it, specify a" + " layout or use --layout=normalise\n"); rv = 1; goto release; } - } else if (strcmp(layout_str, "normalise") == 0 || - strcmp(layout_str, "normalize") == 0) { + } else if (strcmp(s->layout_str, "normalise") == 0 || + strcmp(s->layout_str, "normalize") == 0) { /* If we have a -6 RAID6 layout, remove the '-6'. */ info.new_layout = UnSet; if (info.array.level == 6 && info.new_level == UnSet) { @@ -1884,13 +1952,12 @@ info.new_layout = map_name(r6layout, l); } } else { - fprintf(stderr, Name - ": %s is only meaningful when reshaping" - " a RAID6 array.\n", layout_str); + pr_err("%s is only meaningful when reshaping" + " a RAID6 array.\n", s->layout_str); rv = 1; goto release; } - } else if (strcmp(layout_str, "preserve") == 0) { + } else if (strcmp(s->layout_str, "preserve") == 0) { /* This means that a non-standard RAID6 layout * is OK. 
* In particular: @@ -1908,9 +1975,8 @@ strcat(l, "-6"); info.new_layout = map_name(r6layout, l); } else { - fprintf(stderr, Name - ": %s in only meaningful when reshaping" - " to RAID6\n", layout_str); + pr_err("%s in only meaningful when reshaping" + " to RAID6\n", s->layout_str); rv = 1; goto release; } @@ -1920,55 +1986,55 @@ l = info.array.level; switch (l) { case 5: - info.new_layout = map_name(r5layout, layout_str); + info.new_layout = map_name(r5layout, s->layout_str); break; case 6: - info.new_layout = map_name(r6layout, layout_str); + info.new_layout = map_name(r6layout, s->layout_str); break; case 10: - info.new_layout = parse_layout_10(layout_str); + info.new_layout = parse_layout_10(s->layout_str); break; case LEVEL_FAULTY: - info.new_layout = parse_layout_faulty(layout_str); + info.new_layout = parse_layout_faulty(s->layout_str); break; default: - fprintf(stderr, Name ": layout not meaningful" + pr_err("layout not meaningful" " with this level\n"); rv = 1; goto release; } if (info.new_layout == UnSet) { - fprintf(stderr, Name ": layout %s not understood" + pr_err("layout %s not understood" " for this level\n", - layout_str); + s->layout_str); rv = 1; goto release; } } if (array.level == LEVEL_FAULTY) { - if (level != UnSet && level != array.level) { - fprintf(stderr, Name ": cannot change level of Faulty device\n"); + if (s->level != UnSet && s->level != array.level) { + pr_err("cannot change level of Faulty device\n"); rv =1 ; } - if (chunksize) { - fprintf(stderr, Name ": cannot set chunksize of Faulty device\n"); + if (s->chunk) { + pr_err("cannot set chunksize of Faulty device\n"); rv =1 ; } - if (raid_disks && raid_disks != 1) { - fprintf(stderr, Name ": cannot set raid_disks of Faulty device\n"); + if (s->raiddisks && s->raiddisks != 1) { + pr_err("cannot set raid_disks of Faulty device\n"); rv =1 ; } - if (layout_str) { + if (s->layout_str) { if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { dprintf("Cannot get array information.\n"); goto release; 
} array.layout = info.new_layout; if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - fprintf(stderr, Name ": failed to set new layout\n"); + pr_err("failed to set new layout\n"); rv = 1; - } else if (!quiet) + } else if (c->verbose >= 0) printf("layout for %s set to %d\n", devname, array.layout); } @@ -1982,7 +2048,7 @@ * performed at the level of the container */ rv = reshape_container(container, devname, -1, st, &info, - force, backup_file, quiet, 0, 0); + c->force, c->backup_file, c->verbose, 0, 0); frozen = 0; } else { /* get spare devices from external metadata @@ -2001,17 +2067,18 @@ /* Impose these changes on a single array. First * check that the metadata is OK with the change. */ - if (reshape_super(st, -1, info.new_level, + if (reshape_super(st, 0, info.new_level, info.new_layout, info.new_chunk, info.array.raid_disks, info.delta_disks, - backup_file, devname, APPLY_METADATA_CHANGES, - quiet)) { + c->backup_file, devname, APPLY_METADATA_CHANGES, + c->verbose)) { rv = 1; goto release; } sync_metadata(st); - rv = reshape_array(container, fd, devname, st, &info, force, - devlist, backup_file, quiet, 0, 0, 0); + rv = reshape_array(container, fd, devname, st, &info, c->force, + devlist, data_offset, c->backup_file, c->verbose, + 0, 0, 0); frozen = 0; } release: @@ -2056,11 +2123,11 @@ info->reshape_progress = position; ret_val = 1; } else if (info->reshape_progress > position) { - fprintf(stderr, Name ": Fatal error: array " - "reshape was not properly frozen " - "(expected reshape position is %llu, " - "but reshape progress is %llu.\n", - position, info->reshape_progress); + pr_err("Fatal error: array " + "reshape was not properly frozen " + "(expected reshape position is %llu, " + "but reshape progress is %llu.\n", + position, info->reshape_progress); ret_val = -1; } else { dprintf("Reshape position in md and metadata " @@ -2078,17 +2145,608 @@ return ret_val; } +static unsigned long long choose_offset(unsigned long long lo, + unsigned long long hi, + unsigned 
long long min, + unsigned long long max) +{ + /* Choose a new offset between hi and lo. + * It must be between min and max, but + * we would prefer something near the middle of hi/lo, and also + * prefer to be aligned to a big power of 2. + * + * So we start with the middle, then for each bit, + * starting at '1' and increasing, if it is set, we either + * add it or subtract it if possible, preferring the option + * which is furthest from the boundary. + * + * We stop once we get a 1MB alignment. As units are in sectors, + * 1MB = 2*1024 sectors. + */ + unsigned long long choice = (lo + hi) / 2; + unsigned long long bit = 1; + + for (bit = 1; bit < 2*1024; bit = bit << 1) { + unsigned long long bigger, smaller; + if (! (bit & choice)) + continue; + bigger = choice + bit; + smaller = choice - bit; + if (bigger > max && smaller < min) + break; + if (bigger > max) + choice = smaller; + else if (smaller < min) + choice = bigger; + else if (hi - bigger > smaller - lo) + choice = bigger; + else + choice = smaller; + } + return choice; +} + +static int set_new_data_offset(struct mdinfo *sra, struct supertype *st, + char *devname, int delta_disks, + unsigned long long data_offset, + unsigned long long min, + int can_fallback) +{ + struct mdinfo *sd; + int dir = 0; + int err = 0; + unsigned long long before, after; + + /* Need to find min space before and after so same is used + * on all devices + */ + before = UINT64_MAX; + after = UINT64_MAX; + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int dfd; + int rv; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? 
dn : "-unknown-"); + goto release; + } + st2 = dup_super(st); + rv = st2->ss->load_super(st2,dfd, NULL); + close(dfd); + if (rv) { + free(st2); + pr_err("%s: cannot get superblock from %s\n", + devname, dn); + goto release; + } + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (info2.space_before == 0 && + info2.space_after == 0) { + /* Metadata doesn't support data_offset changes */ + return 1; + } + if (before > info2.space_before) + before = info2.space_before; + if (after > info2.space_after) + after = info2.space_after; + + if (data_offset != INVALID_SECTORS) { + if (dir == 0) { + if (info2.data_offset == data_offset) { + pr_err("%s: already has that data_offset\n", + dn); + goto release; + } + if (data_offset < info2.data_offset) + dir = -1; + else + dir = 1; + } else if ((data_offset <= info2.data_offset && dir == 1) || + (data_offset >= info2.data_offset && dir == -1)) { + pr_err("%s: differing data offsets on devices make this --data-offset setting impossible\n", + dn); + goto release; + } + } + } + if (before == UINT64_MAX) + /* impossible really, there must be no devices */ + return 1; + + for (sd = sra->devs; sd; sd = sd->next) { + char *dn = map_dev(sd->disk.major, sd->disk.minor, 0); + unsigned long long new_data_offset; + + if (sd->disk.state & (1<data_offset + min; + else { + if (data_offset < sd->data_offset + min) { + pr_err("--data-offset too small for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else if (delta_disks > 0) { + /* need space before */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient head-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset == INVALID_SECTORS) + new_data_offset = sd->data_offset - min; + else { + if (data_offset > sd->data_offset - min) { + pr_err("--data-offset too large for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else { + if (dir == 0) { + /* can move up or 
down. If 'data_offset' + * was set we would have already decided, + * so just choose direction with most space. + */ + if (before > after) + dir = -1; + else + dir = 1; + } + sysfs_set_str(sra, NULL, "reshape_direction", + dir == 1 ? "backwards" : "forwards"); + if (dir > 0) { + /* Increase data offset */ + if (after < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient tail-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset + min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset, + sd->data_offset + after, + sd->data_offset + min, + sd->data_offset + after); + } else { + /* Decrease data offset */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("insufficient head-room on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset - min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset - before, + sd->data_offset, + sd->data_offset - before, + sd->data_offset - min); + } + } + err = sysfs_set_num(sra, sd, "new_offset", new_data_offset); + if (err < 0 && errno == E2BIG) { + /* try again after increasing data size to max */ + err = sysfs_set_num(sra, sd, "size", 0); + if (err < 0 && errno == EINVAL && + !(sd->disk.state & (1<component_size + after)/2); + } + err = sysfs_set_num(sra, sd, "new_offset", + new_data_offset); + } + if (err < 0) { + if (errno == E2BIG && data_offset != INVALID_SECTORS) { + pr_err("data-offset is too big for %s\n", + dn); + goto release; + } + if (sd == sra->devs && + (errno == ENOENT || errno == E2BIG)) + /* Early kernel, no 'new_offset' file, + * or kernel doesn't like us. 
+ * For RAID5/6 this is not fatal + */ + return 1; + pr_err("Cannot set new_offset for %s\n", + dn); + break; + } + } + return err; +release: + return -1; +fallback: + /* Just use a backup file */ + return 1; +} + +static int raid10_reshape(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + struct reshape *reshape, + unsigned long long data_offset, + int force, int verbose) +{ + /* Changing raid_disks, layout, chunksize or possibly + * just data_offset for a RAID10. + * We must always change data_offset. We change by at least + * ->min_offset_change which is the largest of the old and new + * chunk sizes. + * If raid_disks is increasing, then data_offset must decrease + * by at least this copy size. + * If raid_disks is unchanged, data_offset must increase or + * decrease by at least min_offset_change but preferably by much more. + * We choose half of the available space. + * If raid_disks is decreasing, data_offset must increase by + * at least min_offset_change. To allow of this, component_size + * must be decreased by the same amount. + * + * So we calculate the required minimum and direction, possibly + * reduce the component_size, then iterate through the devices + * and set the new_data_offset. + * If that all works, we set chunk_size, layout, raid_disks, and start + * 'reshape' + */ + struct mdinfo *sra; + unsigned long long min; + int err = 0; + + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK + ); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + min = reshape->min_offset_change; + + if (info->delta_disks) + sysfs_set_str(sra, NULL, "reshape_direction", + info->delta_disks < 0 ? 
"backwards" : "forwards"); + if (info->delta_disks < 0 && + info->space_after < min) { + int rv = sysfs_set_num(sra, NULL, "component_size", + (sra->component_size - + min)/2); + if (rv) { + pr_err("cannot reduce component size\n"); + goto release; + } + } + err = set_new_data_offset(sra, st, devname, info->delta_disks, data_offset, + min, 0); + if (err == 1) { + pr_err("Cannot set new_data_offset: RAID10 reshape not\n"); + cont_err("supported on this kernel\n"); + err = -1; + } + if (err < 0) + goto release; + + if (!err && sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", reshape->after.layout) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "raid_disks", + info->array.raid_disks + info->delta_disks) < 0) + err = errno; + if (!err && sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) + err = errno; + if (err) { + pr_err("Cannot set array shape for %s\n", + devname); + if (err == EBUSY && + (info->array.state & (1<devs; sd; sd = sd->next) { + char *dn; + int dfd; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + break; + st2 = dup_super(st); + if (st2->ss->load_super(st2,dfd, NULL)) { + close(dfd); + free(st2); + break; + } + close(dfd); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (first || + min_space_before > info2.space_before) + min_space_before = info2.space_before; + if (first || + min_space_after > info2.space_after) + min_space_after = info2.space_after; + first = 0; + } + if (sd == NULL && !first) { + info->space_after = min_space_after; + info->space_before = min_space_before; + } + sysfs_free(sra); +} + +static void update_cache_size(char *container, struct mdinfo *sra, + struct mdinfo *info, + int disks, unsigned long long blocks) +{ + /* Check that the internal stripe cache is + * large enough, or it won't 
work. + * It must hold at least 4 stripes of the larger + * chunk size + */ + unsigned long cache; + cache = max(info->array.chunk_size, info->new_chunk); + cache *= 4; /* 4 stripes minimum */ + cache /= 512; /* convert to sectors */ + /* make sure there is room for 'blocks' with a bit to spare */ + if (cache < 16 + blocks / disks) + cache = 16 + blocks / disks; + cache /= (4096/512); /* Covert from sectors to pages */ + + if (sra->cache_size < cache) + subarray_set_num(container, sra, "stripe_cache_size", + cache+1); +} + +static int impose_reshape(struct mdinfo *sra, + struct mdinfo *info, + struct supertype *st, + int fd, + int restart, + char *devname, char *container, + struct reshape *reshape) +{ + struct mdu_array_info_s array; + + sra->new_chunk = info->new_chunk; + + if (restart) { + /* for external metadata checkpoint saved by mdmon can be lost + * or missed /due to e.g. crash/. Check if md is not during + * restart farther than metadata points to. + * If so, this means metadata information is obsolete. 
+ */ + if (st->ss->external) + verify_reshape_position(info, reshape->level); + sra->reshape_progress = info->reshape_progress; + } else { + sra->reshape_progress = 0; + if (reshape->after.data_disks < reshape->before.data_disks) + /* start from the end of the new array */ + sra->reshape_progress = (sra->component_size + * reshape->after.data_disks); + } + + ioctl(fd, GET_ARRAY_INFO, &array); + if (info->array.chunk_size == info->new_chunk && + reshape->before.layout == reshape->after.layout && + st->ss->external == 0) { + /* use SET_ARRAY_INFO but only if reshape hasn't started */ + array.raid_disks = reshape->after.data_disks + reshape->parity; + if (!restart && + ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + int err = errno; + + pr_err("Cannot set device shape for %s: %s\n", + devname, strerror(errno)); + + if (err == EBUSY && + (array.state & (1<new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", + reshape->after.layout) < 0) + err = errno; + if (!err && subarray_set_num(container, sra, "raid_disks", + reshape->after.data_disks + + reshape->parity) < 0) + err = errno; + if (err) { + pr_err("Cannot set device shape for %s\n", + devname); + + if (err == EBUSY && + (array.state & (1<= 4 && array.level <= 6)) { + /* To convert to RAID0 we need to fail and + * remove any non-data devices. 
*/ + int found = 0; + int d; + int data_disks = array.raid_disks - 1; + if (array.level == 6) + data_disks -= 1; + if (array.level == 5 && + array.layout != ALGORITHM_PARITY_N) + return -1; + if (array.level == 6 && + array.layout != ALGORITHM_PARITY_N_6) + return -1; + sysfs_set_str(&info, NULL,"sync_action", "idle"); + /* First remove any spares so no recovery starts */ + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; + d++) { + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) + && disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, HOT_REMOVE_DISK, + makedev(disk.major, disk.minor)); + } + /* Now fail anything left */ + ioctl(fd, GET_ARRAY_INFO, &array); + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; + d++) { + int cnt; + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) + && disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, SET_DISK_FAULTY, + makedev(disk.major, disk.minor)); + cnt = 5; + while (ioctl(fd, HOT_REMOVE_DISK, + makedev(disk.major, disk.minor)) < 0 + && errno == EBUSY + && cnt--) { + usleep(10000); + } + } + } + c = map_num(pers, level); + if (c) { + int err = sysfs_set_str(&info, NULL, "level", c); + if (err) { + err = errno; + pr_err("%s: could not set level to %s\n", + devname, c); + if (err == EBUSY && + (array.state & (1<= 0) + pr_err("level of %s changed to %s\n", + devname, c); + } + return 0; +} + +int sigterm = 0; +static void catch_term(int sig) +{ + sigterm = 1; +} + static int reshape_array(char *container, int fd, char *devname, struct supertype *st, struct mdinfo *info, int force, struct mddev_dev *devlist, - char *backup_file, int quiet, int forked, + 
unsigned long long data_offset, + char *backup_file, int verbose, int forked, int restart, int freeze_reshape) { struct reshape reshape; int spares_needed; char *msg; int orig_level = UnSet; - int disks, odisks; + int odisks; + int delayed; struct mdu_array_info_s array; char *c; @@ -2102,7 +2760,6 @@ int nrdisks; int err; unsigned long blocks; - unsigned long cache; unsigned long long array_size; int done; struct mdinfo *sra = NULL; @@ -2119,12 +2776,16 @@ info->component_size = array_size / array.raid_disks; } + if (array.level == 10) + /* Need space_after info */ + get_space_after(fd, st, info); + if (info->reshape_active) { int new_level = info->new_level; info->new_level = UnSet; if (info->delta_disks > 0) info->array.raid_disks -= info->delta_disks; - msg = analyse_change(info, &reshape); + msg = analyse_change(devname, info, &reshape); info->new_level = new_level; if (info->delta_disks > 0) info->array.raid_disks += info->delta_disks; @@ -2132,9 +2793,11 @@ /* Make sure the array isn't read-only */ ioctl(fd, RESTART_ARRAY_RW, 0); } else - msg = analyse_change(info, &reshape); + msg = analyse_change(devname, info, &reshape); if (msg) { - fprintf(stderr, Name ": %s\n", msg); + /* if msg == "", error has already been printed */ + if (msg[0]) + pr_err("%s\n", msg); goto release; } if (restart && @@ -2142,7 +2805,7 @@ reshape.before.layout != info->array.layout || reshape.before.data_disks + reshape.parity != info->array.raid_disks - max(0, info->delta_disks))) { - fprintf(stderr, Name ": reshape info is not in native format -" + pr_err("reshape info is not in native format -" " cannot continue.\n"); goto release; } @@ -2163,6 +2826,18 @@ /* reshape already started. 
just skip to monitoring the reshape */ if (reshape.backup_blocks == 0) return 0; + if (restart & RESHAPE_NO_BACKUP) + return 0; + + /* Need 'sra' down at 'started:' */ + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| + GET_CACHE); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } goto started; } /* The container is frozen but the array may not be. @@ -2182,13 +2857,12 @@ if (!force && info->new_level > 1 && info->array.level > 1 && spares_needed > info->array.spare_disks + added_disks) { - fprintf(stderr, - Name ": Need %d spare%s to avoid degraded array," - " and only have %d.\n" - " Use --force to over-ride this check.\n", - spares_needed, - spares_needed == 1 ? "" : "s", - info->array.spare_disks + added_disks); + pr_err("Need %d spare%s to avoid degraded array," + " and only have %d.\n" + " Use --force to over-ride this check.\n", + spares_needed, + spares_needed == 1 ? "" : "s", + info->array.spare_disks + added_disks); goto release; } /* Check we have enough spares to not fail */ @@ -2197,47 +2871,32 @@ - array.raid_disks; if ((info->new_level > 1 || info->new_level == 0) && spares_needed > info->array.spare_disks +added_disks) { - fprintf(stderr, - Name ": Need %d spare%s to create working array," - " and only have %d.\n", - spares_needed, - spares_needed == 1 ? "" : "s", - info->array.spare_disks + added_disks); + pr_err("Need %d spare%s to create working array," + " and only have %d.\n", + spares_needed, + spares_needed == 1 ? 
"" : "s", + info->array.spare_disks + added_disks); goto release; } if (reshape.level != array.level) { - char *c = map_num(pers, reshape.level); - int err; - if (c == NULL) - goto release; - - err = sysfs_set_str(info, NULL, "level", c); - if (err) { - err = errno; - fprintf(stderr, Name ": %s: could not set level to %s\n", - devname, c); - if (err == EBUSY && - (info->array.state & (1<new_layout = UnSet; /* after level change, + * layout is meaningless */ orig_level = array.level; sysfs_freeze_array(info); if (reshape.level > 0 && st->ss->external) { /* make sure mdmon is aware of the new level */ - if (mdmon_running(st->container_dev)) + if (mdmon_running(container)) flush_mdmon(container); - if (!mdmon_running(st->container_dev)) - start_mdmon(st->container_dev); + if (!mdmon_running(container)) + start_mdmon(container); ping_monitor(container); - if (mdmon_running(st->container_dev) && + if (mdmon_running(container) && st->update_tail == NULL) st->update_tail = &st->updates; } @@ -2254,7 +2913,7 @@ struct mdinfo *d; if (info2) { - sysfs_init(info2, fd, st->devnum); + sysfs_init(info2, fd, st->devnm); /* When increasing number of devices, we need to set * new raid_disks before adding these, or they might * be rejected. @@ -2281,9 +2940,11 @@ * level and frozen, we can safely add them. 
*/ if (devlist) - Manage_subdevs(devname, fd, devlist, !quiet, + Manage_subdevs(devname, fd, devlist, verbose, 0,NULL, 0); + if (reshape.backup_blocks == 0 && data_offset != INVALID_SECTORS) + reshape.backup_blocks = reshape.before.data_disks * info->array.chunk_size/512; if (reshape.backup_blocks == 0) { /* No restriping needed, but we might need to impose * some more changes: layout, raid_disks, chunk_size @@ -2299,9 +2960,9 @@ info->new_layout != array.layout) { array.layout = info->new_layout; if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - fprintf(stderr, Name ": failed to set new layout\n"); + pr_err("failed to set new layout\n"); goto release; - } else if (!quiet) + } else if (verbose >= 0) printf("layout for %s set to %d\n", devname, array.layout); } @@ -2310,9 +2971,9 @@ array.raid_disks != (info->array.raid_disks + info->delta_disks)) { array.raid_disks += info->delta_disks; if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - fprintf(stderr, Name ": failed to set raid disks\n"); + pr_err("failed to set raid disks\n"); goto release; - } else if (!quiet) { + } else if (verbose >= 0) { printf("raid_disks for %s set to %d\n", devname, array.raid_disks); } @@ -2321,9 +2982,9 @@ info->new_chunk != array.chunk_size) { if (sysfs_set_num(info, NULL, "chunk_size", info->new_chunk) != 0) { - fprintf(stderr, Name ": failed to set chunk size\n"); + pr_err("failed to set chunk size\n"); goto release; - } else if (!quiet) + } else if (verbose >= 0) printf("chunk size for %s set to %d\n", devname, array.chunk_size); } @@ -2364,27 +3025,90 @@ * - request the shape change. * - fork to handle backup etc. */ -started: /* Check that we can hold all the data */ get_dev_size(fd, NULL, &array_size); if (reshape.new_size < (array_size/512)) { - fprintf(stderr, - Name ": this change will reduce the size of the array.\n" - " use --grow --array-size first to truncate array.\n" - " e.g. 
mdadm --grow %s --array-size %llu\n", - devname, reshape.new_size/2); + pr_err("this change will reduce the size of the array.\n" + " use --grow --array-size first to truncate array.\n" + " e.g. mdadm --grow %s --array-size %llu\n", + devname, reshape.new_size/2); goto release; } - sra = sysfs_read(fd, 0, + if (array.level == 10) { + /* Reshaping RAID10 does not require any data backup by + * user-space. Instead it requires that the data_offset + * is changed to avoid the need for backup. + * So this is handled very separately + */ + if (restart) + /* Nothing to do. */ + return 0; + return raid10_reshape(container, fd, devname, st, info, + &reshape, data_offset, + force, verbose); + } + sra = sysfs_read(fd, NULL, GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| GET_CACHE); if (!sra) { - fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n", + pr_err("%s: Cannot get array details from sysfs\n", devname); goto release; } + if (!backup_file) + switch(set_new_data_offset(sra, st, devname, + reshape.after.data_disks - reshape.before.data_disks, + data_offset, + reshape.min_offset_change, 1)) { + case -1: + goto release; + case 0: + /* Updated data_offset, so it's easy now */ + update_cache_size(container, sra, info, + min(reshape.before.data_disks, + reshape.after.data_disks), + reshape.backup_blocks); + + /* Right, everything seems fine. Let's kick things off. 
+ */ + sync_metadata(st); + + if (impose_reshape(sra, info, st, fd, restart, + devname, container, &reshape) < 0) + goto release; + if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) { + pr_err("Failed to initiate reshape!\n"); + goto release; + } + if (info->new_level == reshape.level) + return 0; + /* need to adjust level when reshape completes */ + switch(fork()) { + case -1: /* ignore error, but don't wait */ + return 0; + default: /* parent */ + return 0; + case 0: + map_fork(); + break; + } + close(fd); + wait_reshape(sra); + fd = open_dev(sra->sys_name); + if (fd >= 0) + impose_level(fd, info->new_level, devname, verbose); + return 0; + case 1: /* Couldn't set data_offset, try the old way */ + if (data_offset != INVALID_SECTORS) { + pr_err("Cannot update data_offset on this array\n"); + goto release; + } + break; + } + +started: /* Decide how many blocks (sectors) for a reshape * unit. The number we have so far is just a minimum */ @@ -2399,11 +3123,11 @@ blocks < 16*1024*2) blocks *= 2; } else - fprintf(stderr, Name ": Need to backup %luK of critical " + pr_err("Need to backup %luK of critical " "section..\n", blocks/2); if (blocks >= sra->component_size/2) { - fprintf(stderr, Name ": %s: Something wrong" + pr_err("%s: Something wrong" " - reshape aborted\n", devname); goto release; @@ -2414,12 +3138,8 @@ nrdisks = max(reshape.before.data_disks, reshape.after.data_disks) + reshape.parity + sra->array.spare_disks; - fdlist = malloc((1+nrdisks) * sizeof(int)); - offsets = malloc((1+nrdisks) * sizeof(offsets[0])); - if (!fdlist || !offsets) { - fprintf(stderr, Name ": malloc failed: grow aborted\n"); - goto release; - } + fdlist = xcalloc((1+nrdisks), sizeof(int)); + offsets = xcalloc((1+nrdisks), sizeof(offsets[0])); odisks = reshape.before.data_disks + reshape.parity; d = reshape_prepare_fdlist(devname, sra, odisks, @@ -2433,11 +3153,12 @@ if (backup_file == NULL) { if (reshape.after.data_disks <= reshape.before.data_disks) { - fprintf(stderr, Name 
": %s: Cannot grow - " - "need backup-file\n", devname); + pr_err("%s: Cannot grow - need backup-file\n", + devname); + pr_err(" Please provide one with \"--backup=...\"\n"); goto release; } else if (sra->array.spare_disks == 0) { - fprintf(stderr, Name ": %s: Cannot grow - " + pr_err("%s: Cannot grow - " "need a spare or backup-file to backup " "critical section\n", devname); goto release; @@ -2453,23 +3174,9 @@ } } - /* lastly, check that the internal stripe cache is - * large enough, or it won't work. - * It must hold at least 4 stripes of the larger - * chunk size - */ - cache = max(info->array.chunk_size, info->new_chunk); - cache *= 4; /* 4 stripes minimum */ - cache /= 512; /* convert to sectors */ - disks = min(reshape.before.data_disks, reshape.after.data_disks); - /* make sure there is room for 'blocks' with a bit to spare */ - if (cache < 16 + blocks / disks) - cache = 16 + blocks / disks; - cache /= (4096/512); /* Covert from sectors to pages */ - - if (sra->cache_size < cache) - subarray_set_num(container, sra, "stripe_cache_size", - cache+1); + update_cache_size(container, sra, info, + min(reshape.before.data_disks, reshape.after.data_disks), + blocks); /* Right, everything seems fine. Let's kick things off. * If only changing raid_disks, use ioctl, else use @@ -2477,81 +3184,16 @@ */ sync_metadata(st); - sra->new_chunk = info->new_chunk; - - if (restart) { - /* for external metadata checkpoint saved by mdmon can be lost - * or missed /due to e.g. crash/. Check if md is not during - * restart farther than metadata points to. - * If so, this means metadata information is obsolete. 
- */ - if (st->ss->external) - verify_reshape_position(info, reshape.level); - sra->reshape_progress = info->reshape_progress; - } else { - sra->reshape_progress = 0; - if (reshape.after.data_disks < reshape.before.data_disks) - /* start from the end of the new array */ - sra->reshape_progress = (sra->component_size - * reshape.after.data_disks); - } - - if (info->array.chunk_size == info->new_chunk && - reshape.before.layout == reshape.after.layout && - st->ss->external == 0) { - /* use SET_ARRAY_INFO but only if reshape hasn't started */ - ioctl(fd, GET_ARRAY_INFO, &array); - array.raid_disks = reshape.after.data_disks + reshape.parity; - if (!restart && - ioctl(fd, SET_ARRAY_INFO, &array) != 0) { - int err = errno; - - fprintf(stderr, - Name ": Cannot set device shape for %s: %s\n", - devname, strerror(errno)); - - if (err == EBUSY && - (array.state & (1<new_chunk) < 0) - err = errno; - if (!err && sysfs_set_num(sra, NULL, "layout", - reshape.after.layout) < 0) - err = errno; - if (!err && subarray_set_num(container, sra, "raid_disks", - reshape.after.data_disks + - reshape.parity) < 0) - err = errno; - if (err) { - fprintf(stderr, Name ": Cannot set device shape for %s\n", - devname); - - if (err == EBUSY && - (array.state & (1<reshape_progress); return 1; @@ -2572,7 +3214,7 @@ */ switch(forked ? 0 : fork()) { case -1: - fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n", + pr_err("Cannot run child to monitor reshape: %s\n", strerror(errno)); abort_reshape(sra); goto release; @@ -2586,6 +3228,39 @@ break; } + /* If another array on the same devices is busy, the + * reshape will wait for them. This would mean that + * the first section that we suspend will stay suspended + * for a long time. 
So check on that possibility + * by looking for "DELAYED" in /proc/mdstat, and if found, + * wait a while + */ + do { + struct mdstat_ent *mds, *m; + delayed = 0; + mds = mdstat_read(1, 0); + for (m = mds; m; m = m->next) + if (strcmp(m->devnm, sra->sys_name) == 0) { + if (m->resync && + m->percent == RESYNC_DELAYED) + delayed = 1; + if (m->resync == 0) + /* Haven't started the reshape thread + * yet, wait a bit + */ + delayed = 2; + break; + } + free_mdstat(mds); + if (delayed == 1 && get_linux_version() < 3007000) { + pr_err("Reshape is delayed, but cannot wait carefully with this kernel.\n" + " You might experience problems until other reshapes complete.\n"); + delayed = 0; + } + if (delayed) + mdstat_wait(30 - (delayed-1) * 25); + } while (delayed); + mdstat_close(); close(fd); if (check_env("MDADM_GROW_VERIFY")) fd = open(devname, O_RDONLY | O_DIRECT); @@ -2593,6 +3268,8 @@ fd = -1; mlockall(MCL_FUTURE); + signal(SIGTERM, catch_term); + if (st->ss->external) { /* metadata handler takes it from here */ done = st->ss->manage_reshape( @@ -2632,7 +3309,7 @@ if (st->ss->external) { /* Re-load the metadata as much could have changed */ - int cfd = open_dev(st->container_dev); + int cfd = open_dev(st->container_devnm); if (cfd >= 0) { flush_mdmon(container); st->ss->free_super(st); @@ -2650,15 +3327,10 @@ set_array_size(st, info, info->text_version); if (info->new_level != reshape.level) { - - c = map_num(pers, info->new_level); - if (c) { - err = sysfs_set_str(sra, NULL, "level", c); - if (err) - fprintf(stderr, Name\ - ": %s: could not set level " - "to %s\n", devname, c); - } + if (fd < 0) + fd = open(devname, O_RDONLY); + impose_level(fd, info->new_level, devname, verbose); + close(fd); if (info->new_level == 0) st->update_tail = NULL; } @@ -2675,7 +3347,7 @@ if (orig_level != UnSet && sra) { c = map_num(pers, orig_level); if (c && sysfs_set_str(sra, NULL, "level", c) == 0) - fprintf(stderr, Name ": aborting level change\n"); + pr_err("aborting level change\n"); 
} sysfs_free(sra); if (!forked) @@ -2691,21 +3363,21 @@ struct mdinfo *info, int force, char *backup_file, - int quiet, int restart, int freeze_reshape) + int verbose, int restart, int freeze_reshape) { struct mdinfo *cc = NULL; int rv = restart; - int last_devnum = -1; + char last_devnm[32] = ""; /* component_size is not meaningful for a container, - * so pass '-1' meaning 'no change' + * so pass '0' meaning 'no change' */ if (!restart && - reshape_super(st, -1, info->new_level, + reshape_super(st, 0, info->new_level, info->new_layout, info->new_chunk, info->array.raid_disks, info->delta_disks, backup_file, devname, APPLY_METADATA_CHANGES, - quiet)) { + verbose)) { unfreeze(st); return 1; } @@ -2752,6 +3424,7 @@ int fd; struct mdstat_ent *mdstat; char *adev; + int devid; sysfs_free(cc); @@ -2763,13 +3436,12 @@ continue; subarray = strchr(content->text_version+1, '/')+1; - mdstat = mdstat_by_subdev(subarray, - devname2devnum(container)); + mdstat = mdstat_by_subdev(subarray, container); if (!mdstat) continue; if (mdstat->active == 0) { - fprintf(stderr, Name ": Skipping inactive " - "array md%i.\n", mdstat->devnum); + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); free_mdstat(mdstat); mdstat = NULL; continue; @@ -2779,20 +3451,19 @@ if (!content) break; - adev = map_dev(dev2major(mdstat->devnum), - dev2minor(mdstat->devnum), - 0); + devid = devnm2devid(mdstat->devnm); + adev = map_dev(major(devid), minor(devid), 0); if (!adev) adev = content->text_version; - fd = open_dev(mdstat->devnum); + fd = open_dev(mdstat->devnm); if (fd < 0) { printf(Name ": Device %s cannot be opened for reshape.", adev); break; } - if (last_devnum == mdstat->devnum) { + if (strcmp(last_devnm, mdstat->devnm) == 0) { /* Do not allow for multiple reshape_array() calls for * the same array. 
* It can happen when reshape_array() returns without @@ -2808,16 +3479,16 @@ close(fd); break; } - last_devnum = mdstat->devnum; + strcpy(last_devnm, mdstat->devnm); - sysfs_init(content, fd, mdstat->devnum); + sysfs_init(content, fd, mdstat->devnm); - if (mdmon_running(devname2devnum(container))) + if (mdmon_running(container)) flush_mdmon(container); rv = reshape_array(container, fd, adev, st, - content, force, NULL, - backup_file, quiet, 1, restart, + content, force, NULL, INVALID_SECTORS, + backup_file, verbose, 1, restart, freeze_reshape); close(fd); @@ -2830,7 +3501,7 @@ if (rv) break; - if (mdmon_running(devname2devnum(container))) + if (mdmon_running(container)) flush_mdmon(container); } if (!rv) @@ -2867,7 +3538,7 @@ unsigned long long backup_point, unsigned long long wait_point, unsigned long long *suspend_point, - unsigned long long *reshape_completed) + unsigned long long *reshape_completed, int *frozen) { /* This function is called repeatedly by the reshape manager. * It determines how much progress can safely be made and allows @@ -3084,7 +3755,8 @@ wait_point = info->component_size - wait_point; } - sysfs_set_num(info, NULL, "sync_max", max_progress); + if (!*frozen) + sysfs_set_num(info, NULL, "sync_max", max_progress); /* Now wait. If we have already reached the point that we were * asked to wait to, don't wait at all, else wait for any change. @@ -3104,7 +3776,6 @@ * waiting forever on a dead array */ char action[20]; - fd_set rfds; if (sysfs_get_str(info, NULL, "sync_action", action, 20) <= 0 || strncmp(action, "reshape", 7) != 0) @@ -3120,9 +3791,7 @@ && info->reshape_progress < (info->component_size * reshape->after.data_disks)) break; - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - select(fd+1, NULL, NULL, &rfds, NULL); + sysfs_wait(fd, NULL); if (sysfs_fd_get_ll(fd, &completed) < 0) goto check_progress; } @@ -3167,23 +3836,24 @@ /* The abort might only be temporary. Wait up to 10 * seconds for fd to contain a valid number again. 
*/ - struct timeval tv; + int wait = 10000; int rv = -2; - tv.tv_sec = 10; - tv.tv_usec = 0; - while (fd >= 0 && rv < 0 && tv.tv_sec > 0) { - fd_set rfds; - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - if (select(fd+1, NULL, NULL, &rfds, &tv) != 1) + unsigned long long new_sync_max; + while (fd >= 0 && rv < 0 && wait > 0) { + if (sysfs_wait(fd, &wait) != 1) break; switch (sysfs_fd_get_ll(fd, &completed)) { case 0: /* all good again */ rv = 1; + /* If "sync_max" is no longer max_progress + * we need to freeze things + */ + sysfs_get_ll(info, NULL, "sync_max", &new_sync_max); + *frozen = (new_sync_max != max_progress); break; case -2: /* read error - abort */ - tv.tv_sec = 0; + wait = 0; break; } } @@ -3433,8 +4103,8 @@ free(abuf); free(bbuf); abuflen = len; - abuf = malloc(abuflen); - bbuf = malloc(abuflen); + abuf = xmalloc(abuflen); + bbuf = xmalloc(abuflen); } lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0); @@ -3475,6 +4145,7 @@ struct mdinfo *sd; unsigned long stripes; int uuid[4]; + int frozen = 0; /* set up the backup-super-block. This requires the * uuid from the array. 
@@ -3496,7 +4167,7 @@ break; } if (!sd) { - fprintf(stderr, Name ": Cannot find a superblock\n"); + pr_err("Cannot find a superblock\n"); return 0; } @@ -3552,9 +4223,11 @@ wait_point = __le64_to_cpu(bsb.arraystart2); } + reshape_completed = sra->reshape_progress; rv = progress_reshape(sra, reshape, backup_point, wait_point, - &suspend_point, &reshape_completed); + &suspend_point, &reshape_completed, + &frozen); /* external metadata would need to ping_monitor here */ sra->reshape_progress = reshape_completed; @@ -3580,7 +4253,8 @@ forget_backup(dests, destfd, destoffsets, 1); } - + if (sigterm) + rv = -2; if (rv < 0) { if (rv == -1) done = 1; @@ -3588,6 +4262,7 @@ } if (rv == 0 && increasing && !st->ss->external) { /* No longer need to monitor this reshape */ + sysfs_set_str(sra, NULL, "sync_max", "max"); done = 1; break; } @@ -3691,7 +4366,7 @@ if (i == old_disks-1) { fd = open(backup_file, O_RDONLY); if (fd<0) { - fprintf(stderr, Name ": backup file %s inaccessible: %s\n", + pr_err("backup file %s inaccessible: %s\n", backup_file, strerror(errno)); continue; } @@ -3709,7 +4384,7 @@ if (lseek64(fd, (dinfo.data_offset + dinfo.component_size - 8) <<9, 0) < 0) { - fprintf(stderr, Name ": Cannot seek on device %d\n", i); + pr_err("Cannot seek on device %d\n", i); continue; /* Cannot seek */ } sprintf(namebuf, "device-%d", i); @@ -3717,29 +4392,29 @@ } if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) { if (verbose) - fprintf(stderr, Name ": Cannot read from %s\n", devname); + pr_err("Cannot read from %s\n", devname); continue; /* Cannot read */ } if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 && memcmp(bsb.magic, "md_backup_data-2", 16) != 0) { if (verbose) - fprintf(stderr, Name ": No backup metadata on %s\n", devname); + pr_err("No backup metadata on %s\n", devname); continue; } if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) { if (verbose) - fprintf(stderr, Name ": Bad backup-metadata checksum on %s\n", devname); + pr_err("Bad 
backup-metadata checksum on %s\n", devname); continue; /* bad checksum */ } if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 && bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) { if (verbose) - fprintf(stderr, Name ": Bad backup-metadata checksum2 on %s\n", devname); + pr_err("Bad backup-metadata checksum2 on %s\n", devname); continue; /* Bad second checksum */ } if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) { if (verbose) - fprintf(stderr, Name ": Wrong uuid on backup-metadata on %s\n", devname); + pr_err("Wrong uuid on backup-metadata on %s\n", devname); continue; /* Wrong uuid */ } @@ -3750,14 +4425,13 @@ if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 || info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) { if (check_env("MDADM_GROW_ALLOW_OLD")) { - fprintf(stderr, Name ": accepting backup with timestamp %lu " + pr_err("accepting backup with timestamp %lu " "for array with timestamp %lu\n", (unsigned long)__le64_to_cpu(bsb.mtime), (unsigned long)info->array.utime); } else { - if (verbose) - fprintf(stderr, Name ": too-old timestamp on " - "backup-metadata on %s\n", devname); + pr_err("too-old timestamp on backup-metadata on %s\n", devname); + pr_err("If you think it is should be safe, try 'export MDADM_GROW_ALLOW_OLD=1'\n"); continue; /* time stamp is too bad */ } } @@ -3772,8 +4446,7 @@ < info->reshape_progress) { nonew: if (verbose) - fprintf(stderr, Name - ": backup-metadata found on %s but is not needed\n", devname); + pr_err("backup-metadata found on %s but is not needed\n", devname); continue; /* No new data here */ } } else { @@ -3807,9 +4480,8 @@ if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) { second_fail: if (verbose) - fprintf(stderr, Name - ": Failed to verify secondary backup-metadata block on %s\n", - devname); + pr_err("Failed to verify secondary backup-metadata block on %s\n", + devname); continue; /* Cannot seek */ } /* There should be a duplicate backup superblock 4k before here */ 
@@ -3824,7 +4496,7 @@ goto second_fail; /* Cannot find leading superblock */ /* Now need the data offsets for all devices. */ - offsets = malloc(sizeof(*offsets)*info->array.raid_disks); + offsets = xmalloc(sizeof(*offsets)*info->array.raid_disks); for(j=0; jarray.raid_disks; j++) { if (fdlist[j] < 0) continue; @@ -3847,7 +4519,7 @@ __le64_to_cpu(bsb.length)*512, NULL)) { /* didn't succeed, so giveup */ if (verbose) - fprintf(stderr, Name ": Error restoring backup from %s\n", + pr_err("Error restoring backup from %s\n", devname); free(offsets); return 1; @@ -3865,7 +4537,7 @@ __le64_to_cpu(bsb.length2)*512, NULL)) { /* didn't succeed, so giveup */ if (verbose) - fprintf(stderr, Name ": Error restoring second backup from %s\n", + pr_err("Error restoring second backup from %s\n", devname); free(offsets); return 1; @@ -3914,7 +4586,8 @@ } } for (j=0; jarray.raid_disks; j++) { - if (fdlist[j] < 0) continue; + if (fdlist[j] < 0) + continue; if (st->ss->load_super(st, fdlist[j], NULL)) continue; st->ss->getinfo_super(st, &dinfo, NULL); @@ -3964,7 +4637,7 @@ } /* needed to recover critical section! 
*/ if (verbose) - fprintf(stderr, Name ": Failed to find backup of critical section\n"); + pr_err("Failed to find backup of critical section\n"); return 1; } @@ -3978,7 +4651,6 @@ char *subarray = NULL; struct mdinfo *cc = NULL; struct mdstat_ent *mdstat = NULL; - char buf[40]; int cfd = -1; int fd2 = -1; @@ -3987,51 +4659,84 @@ st = super_by_fd(fd, &subarray); if (!st || !st->ss) { - fprintf(stderr, - Name ": Unable to determine metadata format for %s\n", - devname); + pr_err("Unable to determine metadata format for %s\n", + devname); return 1; } dprintf("Grow continue is run for "); if (st->ss->external == 0) { + int d; dprintf("native array (%s)\n", devname); - if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) { - fprintf(stderr, Name ": %s is not an active md array -" + if (ioctl(fd, GET_ARRAY_INFO, &array.array) < 0) { + pr_err("%s is not an active md array -" " aborting\n", devname); ret_val = 1; goto Grow_continue_command_exit; } content = &array; - sysfs_init(content, fd, st->devnum); + /* Need to load a superblock. 
+ * FIXME we should really get what we need from + * sysfs + */ + for (d = 0; d < MAX_DISKS; d++) { + mdu_disk_info_t disk; + char *dv; + int err; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + if ((disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + /* invalidate fd2 to avoid possible double close() */ + fd2 = -1; + if (err) + continue; + break; + } + if (d == MAX_DISKS) { + pr_err("Unable to load metadata for %s\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + st->ss->getinfo_super(st, content, NULL); } else { - int container_dev; + char *container; if (subarray) { dprintf("subarray (%s)\n", subarray); - container_dev = st->container_dev; - cfd = open_dev_excl(st->container_dev); + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); } else { - container_dev = st->devnum; + container = st->devnm; close(fd); - cfd = open_dev_excl(st->devnum); - dprintf("container (%i)\n", container_dev); + cfd = open_dev_excl(st->devnm); + dprintf("container (%s)\n", container); fd = cfd; } if (cfd < 0) { - fprintf(stderr, Name ": Unable to open container " + pr_err("Unable to open container " "for %s\n", devname); ret_val = 1; goto Grow_continue_command_exit; } - fmt_devname(buf, container_dev); /* find in container array under reshape */ ret_val = st->ss->load_container(st, cfd, NULL); if (ret_val) { - fprintf(stderr, - Name ": Cannot read superblock for %s\n", - devname); + pr_err("Cannot read superblock for %s\n", + devname); ret_val = 1; goto Grow_continue_command_exit; } @@ -4057,22 +4762,21 @@ allow_reshape = 0; if (!allow_reshape) { - fprintf(stderr, Name - ": cannot continue reshape of an array" - " in container with unsupported" - " metadata: %s(%s)\n", - 
devname, buf); + pr_err("cannot continue reshape of an array" + " in container with unsupported" + " metadata: %s(%s)\n", + devname, container); ret_val = 1; goto Grow_continue_command_exit; } array = strchr(content->text_version+1, '/')+1; - mdstat = mdstat_by_subdev(array, container_dev); + mdstat = mdstat_by_subdev(array, container); if (!mdstat) continue; if (mdstat->active == 0) { - fprintf(stderr, Name ": Skipping inactive " - "array md%i.\n", mdstat->devnum); + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); free_mdstat(mdstat); mdstat = NULL; continue; @@ -4080,32 +4784,30 @@ break; } if (!content) { - fprintf(stderr, - Name ": Unable to determine reshaped " - "array for %s\n", devname); + pr_err("Unable to determine reshaped " + "array for %s\n", devname); ret_val = 1; goto Grow_continue_command_exit; } - fd2 = open_dev(mdstat->devnum); + fd2 = open_dev(mdstat->devnm); if (fd2 < 0) { - fprintf(stderr, Name ": cannot open (md%i)\n", - mdstat->devnum); + pr_err("cannot open (%s)\n", mdstat->devnm); ret_val = 1; goto Grow_continue_command_exit; } - sysfs_init(content, fd2, mdstat->devnum); + sysfs_init(content, fd2, mdstat->devnm); /* start mdmon in case it is not running */ - if (!mdmon_running(container_dev)) - start_mdmon(container_dev); - ping_monitor(buf); + if (!mdmon_running(container)) + start_mdmon(container); + ping_monitor(container); - if (mdmon_running(container_dev)) + if (mdmon_running(container)) st->update_tail = &st->updates; else { - fprintf(stderr, Name ": No mdmon found. " + pr_err("No mdmon found. 
" "Grow cannot continue.\n"); ret_val = 1; goto Grow_continue_command_exit; @@ -4115,8 +4817,7 @@ /* verify that array under reshape is started from * correct position */ - if (verify_reshape_position(content, - map_name(pers, mdstat->level)) < 0) { + if (verify_reshape_position(content, content->array.level) < 0) { ret_val = 1; goto Grow_continue_command_exit; } @@ -4147,21 +4848,23 @@ return ret_val; if (st->ss->external) { - char container[40]; - int cfd = open_dev(st->container_dev); + int cfd = open_dev(st->container_devnm); if (cfd < 0) return 1; - fmt_devname(container, st->container_dev); - st->ss->load_container(st, cfd, container); + st->ss->load_container(st, cfd, st->container_devnm); close(cfd); - ret_val = reshape_container(container, NULL, mdfd, + ret_val = reshape_container(st->container_devnm, NULL, mdfd, st, info, 0, backup_file, - 0, 1, freeze_reshape); + 0, + 1 | info->reshape_active, + freeze_reshape); } else ret_val = reshape_array(NULL, mdfd, "array", st, info, 1, - NULL, backup_file, 0, 0, 1, + NULL, INVALID_SECTORS, + backup_file, 0, 0, + 1 | info->reshape_active, freeze_reshape); return ret_val; diff -Nru mdadm-3.2.5/Incremental.c mdadm-3.3/Incremental.c --- mdadm-3.2.5/Incremental.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Incremental.c 2013-09-03 04:47:47.000000000 +0000 @@ -2,7 +2,7 @@ * Incremental.c - support --incremental. Part of: * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2006-2009 Neil Brown + * Copyright (C) 2006-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -29,6 +29,7 @@ */ #include "mdadm.h" +#include #include #include @@ -43,13 +44,10 @@ struct supertype *st, int verbose); static int Incremental_container(struct supertype *st, char *devname, - char *homehost, - int verbose, int runstop, int autof, - int freeze_reshape); - -int Incremental(char *devname, int verbose, int runstop, - struct supertype *st, char *homehost, int require_homehost, - int autof, int freeze_reshape) + struct context *c); + +int Incremental(char *devname, struct context *c, + struct supertype *st) { /* Add this device to an array, creating the array if necessary * and starting the array if sensible or - if runstop>0 - if possible. @@ -108,21 +106,21 @@ struct createinfo *ci = conf_get_create_info(); if (stat(devname, &stb) < 0) { - if (verbose >= 0) - fprintf(stderr, Name ": stat failed for %s: %s.\n", + if (c->verbose >= 0) + pr_err("stat failed for %s: %s.\n", devname, strerror(errno)); return rv; } if ((stb.st_mode & S_IFMT) != S_IFBLK) { - if (verbose >= 0) - fprintf(stderr, Name ": %s is not a block device.\n", + if (c->verbose >= 0) + pr_err("%s is not a block device.\n", devname); return rv; } - dfd = dev_open(devname, O_RDONLY|O_EXCL); + dfd = dev_open(devname, O_RDONLY); if (dfd < 0) { - if (verbose >= 0) - fprintf(stderr, Name ": cannot open %s: %s.\n", + if (c->verbose >= 0) + pr_err("cannot open %s: %s.\n", devname, strerror(errno)); return rv; } @@ -130,22 +128,22 @@ if (must_be_container(dfd)) { if (!st) st = super_by_fd(dfd, NULL); + if (st) + st->ignore_hw_compat = 1; if (st && st->ss->load_container) rv = st->ss->load_container(st, dfd, NULL); close(dfd); if (!rv && st->ss->container_content) { if (map_lock(&map)) - fprintf(stderr, Name ": failed to get " - "exclusive lock on mapfile\n"); - rv = Incremental_container(st, devname, homehost, - verbose, runstop, autof, - 
freeze_reshape); + pr_err("failed to get " + "exclusive lock on mapfile\n"); + rv = Incremental_container(st, devname, c); map_unlock(&map); return rv; } - fprintf(stderr, Name ": %s is not part of an md array.\n", + pr_err("%s is not part of an md array.\n", devname); return rv; } @@ -153,10 +151,9 @@ /* 1/ Check if device is permitted by mdadm.conf */ if (!conf_test_dev(devname)) { - if (verbose >= 0) - fprintf(stderr, Name - ": %s not permitted by mdadm.conf.\n", - devname); + if (c->verbose >= 0) + pr_err("%s not permitted by mdadm.conf.\n", + devname); goto out; } @@ -164,14 +161,14 @@ * version/name from args) */ if (fstat(dfd, &stb) < 0) { - if (verbose >= 0) - fprintf(stderr, Name ": fstat failed for %s: %s.\n", + if (c->verbose >= 0) + pr_err("fstat failed for %s: %s.\n", devname, strerror(errno)); goto out; } if ((stb.st_mode & S_IFMT) != S_IFBLK) { - if (verbose >= 0) - fprintf(stderr, Name ": %s is not a block device.\n", + if (c->verbose >= 0) + pr_err("%s is not a block device.\n", devname); goto out; } @@ -183,23 +180,23 @@ have_target = policy_check_path(&dinfo, &target_array); if (st == NULL && (st = guess_super(dfd)) == NULL) { - if (verbose >= 0) - fprintf(stderr, Name - ": no recognisable superblock on %s.\n", - devname); + if (c->verbose >= 0) + pr_err("no recognisable superblock on %s.\n", + devname); rv = try_spare(devname, &dfd, policy, have_target ? &target_array : NULL, - st, verbose); + st, c->verbose); goto out; } + st->ignore_hw_compat = 1; if (st->ss->compare_super == NULL || st->ss->load_super(st, dfd, NULL)) { - if (verbose >= 0) - fprintf(stderr, Name ": no RAID superblock on %s.\n", + if (c->verbose >= 0) + pr_err("no RAID superblock on %s.\n", devname); rv = try_spare(devname, &dfd, policy, have_target ? 
&target_array : NULL, - st, verbose); + st, c->verbose); free(st); goto out; } @@ -208,14 +205,14 @@ st->ss->getinfo_super(st, &info, NULL); /* 3/ Check if there is a match in mdadm.conf */ - match = conf_match(st, &info, devname, verbose, &rv); + match = conf_match(st, &info, devname, c->verbose, &rv); if (!match && rv == 2) goto out; if (match && match->devname && strcasecmp(match->devname, "") == 0) { - if (verbose >= 0) - fprintf(stderr, Name ": array containing %s is explicitly" + if (c->verbose >= 0) + pr_err("array containing %s is explicitly" " ignored by mdadm.conf\n", devname); goto out; @@ -227,21 +224,19 @@ * on that. */ if (match) trustworthy = LOCAL; - else if (st->ss->match_home(st, homehost) == 1) + else if (st->ss->match_home(st, c->homehost) == 1) trustworthy = LOCAL; else if (st->ss->match_home(st, "any") == 1) trustworthy = LOCAL_ANY; else trustworthy = FOREIGN; - if (!match && !conf_test_metadata(st->ss->name, policy, (trustworthy == LOCAL))) { - if (verbose >= 1) - fprintf(stderr, Name - ": %s has metadata type %s for which " - "auto-assembly is disabled\n", - devname, st->ss->name); + if (c->verbose >= 1) + pr_err("%s has metadata type %s for which " + "auto-assembly is disabled\n", + devname, st->ss->name); goto out; } if (trustworthy == LOCAL_ANY) @@ -253,9 +248,9 @@ * CREATE. */ if (match && match->autof) - autof = match->autof; - if (autof == 0) - autof = ci->autof; + c->autof = match->autof; + if (c->autof == 0) + c->autof = ci->autof; name_to_use = info.name; if (name_to_use[0] == 0 && @@ -264,7 +259,7 @@ trustworthy = METADATA; } if (name_to_use[0] && trustworthy != LOCAL && - ! require_homehost && + ! c->require_homehost && conf_name_is_free(name_to_use)) trustworthy = LOCAL; @@ -277,11 +272,29 @@ /* 4/ Check if array exists. */ if (map_lock(&map)) - fprintf(stderr, Name ": failed to get exclusive lock on " + pr_err("failed to get exclusive lock on " "mapfile\n"); + /* Now check we can get O_EXCL. 
If not, probably "mdadm -A" has + * taken over + */ + dfd = dev_open(devname, O_RDONLY|O_EXCL); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot reopen %s: %s.\n", + devname, strerror(errno)); + goto out_unlock; + } + /* Cannot hold it open while we add the device to the array, + * so we must release the O_EXCL and depend on the map_lock() + * So now is the best time to remove any partitions. + */ + remove_partitions(dfd); + close(dfd); + dfd = -1; + mp = map_by_uuid(&map, info.uuid); if (mp) - mdfd = open_dev(mp->devnum); + mdfd = open_dev(mp->devnm); else mdfd = -1; @@ -289,15 +302,15 @@ /* Couldn't find an existing array, maybe make a new one */ mdfd = create_mddev(match ? match->devname : NULL, - name_to_use, autof, trustworthy, chosen_name); + name_to_use, c->autof, trustworthy, chosen_name); if (mdfd < 0) goto out_unlock; - sysfs_init(&info, mdfd, 0); + sysfs_init(&info, mdfd, NULL); if (set_array_info(mdfd, st, &info) != 0) { - fprintf(stderr, Name ": failed to set array info for %s: %s\n", + pr_err("failed to set array info for %s: %s\n", chosen_name, strerror(errno)); rv = 2; goto out_unlock; @@ -307,30 +320,29 @@ dinfo.disk.major = major(stb.st_rdev); dinfo.disk.minor = minor(stb.st_rdev); if (add_disk(mdfd, st, &info, &dinfo) != 0) { - fprintf(stderr, Name ": failed to add %s to %s: %s.\n", + pr_err("failed to add %s to new array %s: %s.\n", devname, chosen_name, strerror(errno)); ioctl(mdfd, STOP_ARRAY, 0); rv = 2; goto out_unlock; } - sra = sysfs_read(mdfd, -1, (GET_DEVS | GET_STATE | - GET_OFFSET | GET_SIZE)); - + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) { /* It really should be 'none' - must be old buggy * kernel, and mdadm -I may not be able to complete. * So reject it. */ ioctl(mdfd, STOP_ARRAY, NULL); - fprintf(stderr, Name - ": You have an old buggy kernel which cannot support\n" - " --incremental reliably. 
Aborting.\n"); + pr_err("You have an old buggy kernel which cannot support\n" + " --incremental reliably. Aborting.\n"); rv = 2; goto out_unlock; } info.array.working_disks = 1; /* 6/ Make sure /var/run/mdadm.map contains this array. */ - map_update(&map, fd2devnum(mdfd), + map_update(&map, fd2devnm(mdfd), info.text_version, info.uuid, chosen_name); } else { @@ -344,13 +356,13 @@ struct supertype *st2; struct mdinfo info2, *d; - sra = sysfs_read(mdfd, -1, (GET_DEVS | GET_STATE | + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | GET_OFFSET | GET_SIZE)); - + if (mp->path) strcpy(chosen_name, mp->path); else - strcpy(chosen_name, devnum2devname(mp->devnum)); + strcpy(chosen_name, mp->devnm); /* It is generally not OK to add non-spare drives to a * running array as they are probably missing because @@ -367,11 +379,10 @@ && (info.disk.state & (1<ss->name, act_re_add) - && runstop < 1) { + && c->runstop < 1) { if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) { - fprintf(stderr, Name - ": not adding %s to active array (without --run) %s\n", - devname, chosen_name); + pr_err("not adding %s to active array (without --run) %s\n", + devname, chosen_name); rv = 2; goto out_unlock; } @@ -385,18 +396,16 @@ sra->devs->disk.minor); dfd2 = dev_open(dn, O_RDONLY); if (dfd2 < 0) { - fprintf(stderr, Name - ": unable to open %s\n", devname); + pr_err("unable to open %s\n", devname); rv = 2; goto out_unlock; } st2 = dup_super(st); if (st2->ss->load_super(st2, dfd2, NULL) || st->ss->compare_super(st, st2) != 0) { - fprintf(stderr, Name - ": metadata mismatch between %s and " - "chosen array %s\n", - devname, chosen_name); + pr_err("metadata mismatch between %s and " + "chosen array %s\n", + devname, chosen_name); close(dfd2); rv = 2; goto out_unlock; @@ -407,9 +416,8 @@ if (info.array.level != info2.array.level || memcmp(info.uuid, info2.uuid, 16) != 0 || info.array.raid_disks != info2.array.raid_disks) { - fprintf(stderr, Name - ": unexpected difference between %s and %s.\n", - 
chosen_name, devname); + pr_err("unexpected difference between %s and %s.\n", + chosen_name, devname); rv = 2; goto out_unlock; } @@ -425,11 +433,23 @@ * disk.number. Find and reject any such */ find_reject(mdfd, st, sra, info.disk.number, - info.events, verbose, chosen_name); + info.events, c->verbose, chosen_name); err = add_disk(mdfd, st, sra, &info); } + if (err < 0 && errno == EINVAL && + info.disk.state & (1<ss->name, + act_force_spare)) { + info.disk.state &= ~(1<verbose >= 0) + pr_err("can only add %s to %s as a spare, and force-spare is not set.\n", + devname, chosen_name); + } if (err < 0) { - fprintf(stderr, Name ": failed to add %s to %s: %s.\n", + pr_err("failed to add %s to existing array %s: %s.\n", devname, chosen_name, strerror(errno)); rv = 2; goto out_unlock; @@ -437,31 +457,28 @@ info.array.working_disks = 0; for (d = sra->devs; d; d=d->next) info.array.working_disks ++; - + } /* 7/ Is there enough devices to possibly start the array? */ /* 7a/ if not, finish with success. 
*/ if (info.array.level == LEVEL_CONTAINER) { - int devnum = devnum; /* defined and used iff ->external */ + char devnm[32]; /* Try to assemble within the container */ sysfs_uevent(sra, "change"); - if (verbose >= 0) - fprintf(stderr, Name - ": container %s now has %d device%s\n", - chosen_name, info.array.working_disks, - info.array.working_disks==1?"":"s"); + if (c->verbose >= 0) + pr_err("container %s now has %d device%s\n", + chosen_name, info.array.working_disks, + info.array.working_disks == 1?"":"s"); wait_for(chosen_name, mdfd); if (st->ss->external) - devnum = fd2devnum(mdfd); + strcpy(devnm, fd2devnm(mdfd)); if (st->ss->load_container) rv = st->ss->load_container(st, mdfd, NULL); close(mdfd); sysfs_free(sra); if (!rv) - rv = Incremental_container(st, chosen_name, homehost, - verbose, runstop, autof, - freeze_reshape); + rv = Incremental_container(st, chosen_name, c); map_unlock(&map); if (rv == 1) /* Don't fail the whole -I if a subarray didn't @@ -471,7 +488,7 @@ /* after spare is added, ping monitor for external metadata * so that it can eg. try to rebuild degraded array */ if (st->ss->external) - ping_monitor_by_id(devnum); + ping_monitor(devnm); return rv; } @@ -480,16 +497,15 @@ * things change. */ sysfs_free(sra); - sra = sysfs_read(mdfd, -1, (GET_DEVS | GET_STATE | + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | GET_OFFSET | GET_SIZE)); active_disks = count_active(st, sra, mdfd, &avail, &info); if (enough(info.array.level, info.array.raid_disks, info.array.layout, info.array.state & 1, avail) == 0) { - if (verbose >= 0) - fprintf(stderr, Name - ": %s attached to %s, not enough to start (%d).\n", - devname, chosen_name, active_disks); + if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start (%d).\n", + devname, chosen_name, active_disks); rv = 0; goto out_unlock; } @@ -501,31 +517,35 @@ /* + start the array (auto-readonly). 
*/ if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) { - if (verbose >= 0) - fprintf(stderr, Name - ": %s attached to %s which is already active.\n", - devname, chosen_name); + if (c->verbose >= 0) + pr_err("%s attached to %s which is already active.\n", + devname, chosen_name); rv = 0; goto out_unlock; } map_unlock(&map); - if (runstop > 0 || active_disks >= info.array.working_disks) { + if (c->runstop > 0 || active_disks >= info.array.working_disks) { struct mdinfo *dsk; /* Let's try to start it */ + + if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) { + pr_err("%s: This array is being reshaped and cannot be started\n", + chosen_name); + cont_err("by --incremental. Please use --assemble\n"); + goto out; + } if (match && match->bitmap_file) { int bmfd = open(match->bitmap_file, O_RDWR); if (bmfd < 0) { - fprintf(stderr, Name - ": Could not open bitmap file %s.\n", - match->bitmap_file); + pr_err("Could not open bitmap file %s.\n", + match->bitmap_file); goto out; } if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { close(bmfd); - fprintf(stderr, Name - ": Failed to set bitmapfile for %s.\n", - chosen_name); + pr_err("Failed to set bitmapfile for %s.\n", + chosen_name); goto out; } close(bmfd); @@ -544,10 +564,9 @@ rv = sysfs_set_str(sra, NULL, "array_state", "read-auto"); if (rv == 0) { - if (verbose >= 0) - fprintf(stderr, Name - ": %s attached to %s, which has been started.\n", - devname, chosen_name); + if (c->verbose >= 0) + pr_err("%s attached to %s, which has been started.\n", + devname, chosen_name); rv = 0; wait_for(chosen_name, mdfd); /* We just started the array, so some devices @@ -559,21 +578,18 @@ for (dsk = sra->devs; dsk ; dsk = dsk->next) { if (disk_action_allows(dsk, st->ss->name, act_re_add) && add_disk(mdfd, st, sra, dsk) == 0) - fprintf(stderr, Name - ": %s re-added to %s\n", - dsk->sys_name, chosen_name); + pr_err("%s re-added to %s\n", + dsk->sys_name, chosen_name); } } else { - fprintf(stderr, Name - ": %s attached to %s, but 
failed to start: %s.\n", - devname, chosen_name, strerror(errno)); + pr_err("%s attached to %s, but failed to start: %s.\n", + devname, chosen_name, strerror(errno)); rv = 1; } } else { - if (verbose >= 0) - fprintf(stderr, Name - ": %s attached to %s, not enough to start safely.\n", - devname, chosen_name); + if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start safely.\n", + devname, chosen_name); rv = 0; } out: @@ -630,9 +646,8 @@ sysfs_set_str(sra, d, "slot", "none"); if (sysfs_set_str(sra, d, "state", "remove") == 0) if (verbose >= 0) - fprintf(stderr, Name - ": removing old device %s from %s\n", - d->sys_name+4, array_name); + pr_err("removing old device %s from %s\n", + d->sys_name+4, array_name); } } @@ -643,6 +658,7 @@ /* count how many devices in sra think they are active */ struct mdinfo *d; int cnt = 0; + int replcnt = 0; __u64 max_events = 0; char *avail = NULL; int *best = NULL; @@ -657,7 +673,7 @@ for (d = sra->devs ; d ; d = d->next) numdevs++; - for (d = sra->devs, devnum=0 ; d ; d = d->next, devnum++) { + for (d = sra->devs, devnum = 0 ; d ; d = d->next, devnum++) { char dn[30]; int dfd; int ok; @@ -675,15 +691,11 @@ st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum); if (!avail) { raid_disks = info.array.raid_disks; - avail = calloc(raid_disks, 1); - if (!avail) { - fprintf(stderr, Name ": out of memory.\n"); - exit(1); - } + avail = xcalloc(raid_disks, 1); *availp = avail; - best = calloc(raid_disks, sizeof(int)); - devmap = calloc(raid_disks * numdevs, 1); + best = xcalloc(raid_disks, sizeof(int)); + devmap = xcalloc(raid_disks, numdevs); st->ss->getinfo_super(st, &info, devmap); } @@ -709,7 +721,7 @@ else if (info.events == max_events+1) { int i; max_events = info.events; - for (i=0; i < raid_disks; i++) + for (i = 0; i < raid_disks; i++) if (avail[i]) avail[i]--; avail[info.disk.raid_disk] = 2; @@ -722,7 +734,8 @@ best[info.disk.raid_disk] = devnum; st->ss->getinfo_super(st, bestinfo, NULL); } - } + } else if 
(info.disk.state & (1<ss->free_super(st); } if (!avail) @@ -747,9 +760,17 @@ if (avail[i]) cnt++; } + /* Also need to reject any spare device with an event count that + * is too high + */ + for (d = sra->devs; d; d = d->next) { + if (!(d->disk.state & (1<events > max_events) + d->disk.state |= (1 << MD_DISK_REMOVED); + } free(best); free(devmap); - return cnt; + return cnt + replcnt; } /* test if container has degraded member(s) */ @@ -760,10 +781,9 @@ int max_degraded = 0; for(; map; map = map->next) { - if (!is_subarray(map->metadata) || - devname2devnum(map->metadata+1) != me->devnum) + if (!metadata_container_matches(map->metadata, me->devnm)) continue; - afd = open_dev(map->devnum); + afd = open_dev(map->devnm); if (afd < 0) continue; /* most accurate information regarding array degradation */ @@ -811,7 +831,7 @@ */ if (map_lock(&map)) { - fprintf(stderr, Name ": failed to get exclusive lock on " + pr_err("failed to get exclusive lock on " "mapfile\n"); return 1; } @@ -830,19 +850,19 @@ (st->minor_version >= 0 && st->minor_version != st2->minor_version)) { if (verbose > 1) - fprintf(stderr, Name ": not adding %s to %s as metadata type doesn't match\n", + pr_err("not adding %s to %s as metadata type doesn't match\n", devname, mp->path); free(st2); continue; } free(st2); } - sra = sysfs_read(-1, mp->devnum, + sra = sysfs_read(-1, mp->devnm, GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| GET_DEGRADED|GET_COMPONENT|GET_VERSION); if (!sra) { /* Probably a container - no degraded info */ - sra = sysfs_read(-1, mp->devnum, + sra = sysfs_read(-1, mp->devnm, GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| GET_COMPONENT|GET_VERSION); if (sra) @@ -853,12 +873,12 @@ if (st == NULL) { int i; st2 = NULL; - for(i=0; !st2 && superlist[i]; i++) + for(i = 0; !st2 && superlist[i]; i++) st2 = superlist[i]->match_metadata_desc( sra->text_version); if (!st2) { if (verbose > 1) - fprintf(stderr, Name ": not adding %s to %s" + pr_err("not adding %s to %s" " as metadata not recognised.\n", devname, 
mp->path); goto next; @@ -883,7 +903,7 @@ /* true for containers, here we must read superblock * to obtain minimum spare size */ struct supertype *st3 = dup_super(st2); - int mdfd = open_dev(mp->devnum); + int mdfd = open_dev(mp->devnm); if (mdfd < 0) { free(st3); goto next; @@ -897,11 +917,15 @@ close(mdfd); } if ((sra->component_size > 0 && - st2->ss->avail_size(st2, devsize) < sra->component_size) + st2->ss->avail_size(st2, devsize, + sra->devs + ? sra->devs->data_offset + : INVALID_SECTORS) + < sra->component_size) || (sra->component_size == 0 && devsize < component_size)) { if (verbose > 1) - fprintf(stderr, Name ": not adding %s to %s as it is too small\n", + pr_err("not adding %s to %s as it is too small\n", devname, mp->path); goto next; } @@ -937,7 +961,7 @@ if (domain_test(dl, pol, st2->ss->name) != 1) { /* domain test fails */ if (verbose > 1) - fprintf(stderr, Name ": not adding %s to %s as" + pr_err("not adding %s to %s as" " it is not in a compatible domain\n", devname, mp->path); @@ -962,13 +986,12 @@ } if (chosen) { /* add current device to chosen array as a spare */ - int mdfd = open_dev(devname2devnum(chosen->sys_name)); + int mdfd = open_dev(chosen->sys_name); if (mdfd >= 0) { struct mddev_dev devlist; char devname[20]; devlist.next = NULL; devlist.used = 0; - devlist.re_add = 0; devlist.writemostly = 0; devlist.devname = devname; sprintf(devname, "%d:%d", major(stb.st_rdev), @@ -982,10 +1005,10 @@ } if (verbose > 0) { if (rv == 0) - fprintf(stderr, Name ": added %s as spare for %s\n", + pr_err("added %s as spare for %s\n", devname, chosen->sys_name); else - fprintf(stderr, Name ": failed to add %s as spare for %s\n", + pr_err("failed to add %s as spare for %s\n", devname, chosen->sys_name); } sysfs_free(chosen); @@ -1211,7 +1234,7 @@ !policy_action_allows(pol, st?st->ss->name:NULL, act_spare_same_slot)) { if (verbose > 1) - fprintf(stderr, Name ": %s is not bare, so not " + pr_err("%s is not bare, so not " "considering as a spare\n", devname); 
return 1; @@ -1255,7 +1278,7 @@ return rv; } -int IncrementalScan(int verbose) +int IncrementalScan(int verbose, char *devnm) { /* look at every device listed in the 'map' file. * If one is found that is not running then: @@ -1275,7 +1298,11 @@ mdu_array_info_t array; mdu_bitmap_file_t bmf; struct mdinfo *sra; - int mdfd = open_dev(me->devnum); + int mdfd; + + if (devnm && strcmp(devnm, me->devnm) != 0) + continue; + mdfd = open_dev(me->devnm); if (mdfd < 0) continue; @@ -1305,28 +1332,27 @@ } if (verbose >= 0) { if (added == 0) - fprintf(stderr, Name - ": Added bitmap %s to %s\n", - mddev->bitmap_file, me->path); + pr_err("Added bitmap %s to %s\n", + mddev->bitmap_file, me->path); else if (errno != EEXIST) - fprintf(stderr, Name - ": Failed to add bitmap to %s: %s\n", - me->path, strerror(errno)); + pr_err("Failed to add bitmap to %s: %s\n", + me->path, strerror(errno)); } } - sra = sysfs_read(mdfd, 0, 0); + /* FIXME check for reshape_active and consider not + * starting array. + */ + sra = sysfs_read(mdfd, NULL, 0); if (sra) { if (sysfs_set_str(sra, NULL, "array_state", "read-auto") == 0) { if (verbose >= 0) - fprintf(stderr, Name - ": started array %s\n", - me->path ?: devnum2devname(me->devnum)); + pr_err("started array %s\n", + me->path ?: me->devnm); } else { - fprintf(stderr, Name - ": failed to start array %s: %s\n", - me->path ?: devnum2devname(me->devnum), - strerror(errno)); + pr_err("failed to start array %s: %s\n", + me->path ?: me->devnm, + strerror(errno)); rv = 1; } sysfs_free(sra); @@ -1342,18 +1368,18 @@ if (devname[0] == '/') { int fd = open(devname, O_RDONLY); if (fd >= 0) { - mdname = devnum2devname(fd2devnum(fd)); + mdname = xstrdup(fd2devnm(fd)); close(fd); } } else { int uuid[4]; struct map_ent *mp, *map = NULL; - + if (!parse_uuid(devname, uuid)) return mdname; mp = map_by_uuid(&map, uuid); if (mp) - mdname = devnum2devname(mp->devnum); + mdname = xstrdup(mp->devnm); map_free(map); } @@ -1361,8 +1387,7 @@ } static int 
Incremental_container(struct supertype *st, char *devname, - char *homehost, int verbose, - int runstop, int autof, int freeze_reshape) + struct context *c) { /* Collect the contents of this container and for each * array, choose a device name and assemble the array. @@ -1384,23 +1409,23 @@ st->ss->getinfo_super(st, &info, NULL); - if ((runstop > 0 && info.container_enough >= 0) || + if ((c->runstop > 0 && info.container_enough >= 0) || info.container_enough > 0) /* pass */; else { - if (verbose) - fprintf(stderr, Name ": not enough devices to start the container\n"); + if (c->verbose) + pr_err("not enough devices to start the container\n"); return 0; } - match = conf_match(st, &info, devname, verbose, &rv); + match = conf_match(st, &info, devname, c->verbose, &rv); if (match == NULL && rv == 2) return rv; /* Need to compute 'trustworthy' */ if (match) trustworthy = LOCAL; - else if (st->ss->match_home(st, homehost) == 1) + else if (st->ss->match_home(st, c->homehost) == 1) trustworthy = LOCAL; else if (st->ss->match_home(st, "any") == 1) trustworthy = LOCAL; @@ -1420,7 +1445,7 @@ ra_all++; /* do not activate arrays blocked by metadata handler */ if (ra->array.state & (1 << MD_SB_BLOCK_VOLUME)) { - fprintf(stderr, Name ": Cannot activate array %s in %s.\n", + pr_err("Cannot activate array %s in %s.\n", ra->text_version, devname); ra_blocked++; continue; @@ -1428,11 +1453,11 @@ mp = map_by_uuid(&map, ra->uuid); if (mp) { - mdfd = open_dev(mp->devnum); + mdfd = open_dev(mp->devnm); if (mp->path) strcpy(chosen_name, mp->path); else - strcpy(chosen_name, devnum2devname(mp->devnum)); + strcpy(chosen_name, mp->devnm); } else { /* Check in mdadm.conf for container == devname and @@ -1467,18 +1492,18 @@ free(dn); /* we have a match */ match = array_list; - if (verbose>0) - fprintf(stderr, Name ": match found for member %s\n", + if (c->verbose>0) + pr_err("match found for member %s\n", array_list->member); break; } if (match && match->devname && strcasecmp(match->devname, 
"") == 0) { - if (verbose > 0) - fprintf(stderr, Name ": array %s/%s is " - "explicitly ignored by mdadm.conf\n", - match->container, match->member); + if (c->verbose > 0) + pr_err("array %s/%s is " + "explicitly ignored by mdadm.conf\n", + match->container, match->member); return 2; } if (match) @@ -1486,20 +1511,19 @@ mdfd = create_mddev(match ? match->devname : NULL, ra->name, - autof, + c->autof, trustworthy, chosen_name); } if (mdfd < 0) { - fprintf(stderr, Name ": failed to open %s: %s.\n", + pr_err("failed to open %s: %s.\n", chosen_name, strerror(errno)); return 2; } - assemble_container_content(st, mdfd, ra, runstop, - chosen_name, verbose, NULL, - freeze_reshape); + assemble_container_content(st, mdfd, ra, c, + chosen_name); close(mdfd); } @@ -1544,9 +1568,8 @@ disks = disks->next; } if (count) - fprintf(stderr, Name - ": Added %d spare%s to %s\n", - count, count>1?"s":"", devname); + pr_err("Added %d spare%s to %s\n", + count, count>1?"s":"", devname); } sysfs_free(sinfo); } else @@ -1556,6 +1579,19 @@ return 0; } +static void run_udisks(char *arg1, char *arg2) +{ + int pid = fork(); + int status; + if (pid == 0) { + execl("/usr/bin/udisks", "udisks", arg1, arg2, NULL); + execl("/bin/udisks", "udisks", arg1, arg2, NULL); + exit(1); + } + while (pid > 0 && wait(&status) != pid) + ; +} + /* * IncrementalRemove - Attempt to see if the passed in device belongs to any * raid arrays, and if so first fail (if needed) and then remove the device. 
@@ -1569,9 +1605,11 @@ int IncrementalRemove(char *devname, char *id_path, int verbose) { int mdfd; - int rv; + int rv = 0; struct mdstat_ent *ent; struct mddev_dev devlist; + struct mdinfo mdi; + char buf[32]; if (!id_path) dprintf(Name ": incremental removal without --path " @@ -1579,26 +1617,36 @@ "port\n"); if (strchr(devname, '/')) { - fprintf(stderr, Name ": incremental removal requires a " + pr_err("incremental removal requires a " "kernel device name, not a file: %s\n", devname); return 1; } ent = mdstat_by_component(devname); if (!ent) { - fprintf(stderr, Name ": %s does not appear to be a component " - "of any array\n", devname); + if (verbose >= 0) + pr_err("%s does not appear to be a component " + "of any array\n", devname); return 1; } - mdfd = open_dev(ent->devnum); + sysfs_init(&mdi, -1, ent->devnm); + if (sysfs_get_str(&mdi, NULL, "array_state", + buf, sizeof(buf)) > 0) { + if (strncmp(buf, "active", 6) == 0 || + strncmp(buf, "clean", 5) == 0) + sysfs_set_str(&mdi, NULL, + "array_state", "read-auto"); + } + mdfd = open_dev(ent->devnm); if (mdfd < 0) { - fprintf(stderr, Name ": Cannot open array %s!!\n", ent->dev); + if (verbose >= 0) + pr_err("Cannot open array %s!!\n", ent->dev); free_mdstat(ent); return 1; } if (id_path) { struct map_ent *map = NULL, *me; - me = map_by_devnum(&map, ent->devnum); + me = map_by_devnm(&map, ent->devnm); if (me) policy_save_path(id_path, me); map_free(map); @@ -1614,19 +1662,38 @@ struct mdstat_ent *memb; for (memb = mdstat ; memb ; memb = memb->next) if (is_container_member(memb, ent->dev)) { - int subfd = open_dev(memb->devnum); + int subfd = open_dev(memb->devnm); if (subfd >= 0) { - Manage_subdevs(memb->dev, subfd, - &devlist, verbose, 0, - NULL, 0); + rv |= Manage_subdevs( + memb->dev, subfd, + &devlist, verbose, 0, + NULL, 0); close(subfd); } } free_mdstat(mdstat); } else - Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0, NULL, 0); - devlist.disposition = 'r'; - rv = Manage_subdevs(ent->dev, mdfd, &devlist, 
verbose, 0, NULL, 0); + rv |= Manage_subdevs(ent->dev, mdfd, &devlist, + verbose, 0, NULL, 0); + if (rv & 2) { + /* Failed due to EBUSY, try to stop the array. + * Give udisks a chance to unmount it first. + */ + int devid = devnm2devid(ent->devnm); + run_udisks("--unmount", map_dev(major(devid),minor(devid), 0)); + rv = Manage_stop(ent->dev, mdfd, verbose, 1); + if (rv) + /* At least we can try to trigger a 'remove' */ + sysfs_uevent(&mdi, "remove"); + if (verbose) { + if (rv) + pr_err("Fail to stop %s too.\n", ent->devnm); + } + } else { + devlist.disposition = 'r'; + rv = Manage_subdevs(ent->dev, mdfd, &devlist, + verbose, 0, NULL, 0); + } close(mdfd); free_mdstat(ent); return rv; diff -Nru mdadm-3.2.5/inventory mdadm-3.3/inventory --- mdadm-3.2.5/inventory 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/inventory 2013-09-03 04:47:47.000000000 +0000 @@ -16,12 +16,15 @@ ANNOUNCE-3.2.3 ANNOUNCE-3.2.4 ANNOUNCE-3.2.5 +ANNOUNCE-3.2.6 +ANNOUNCE-3.3 Assemble.c Build.c COPYING ChangeLog Create.c Detail.c +Dump.c Examine.c Grow.c INSTALL @@ -99,10 +102,13 @@ super1.c swap_super.c sysfs.c +systemd/ +systemd/mdmon@.service test tests/ tests/00linear tests/00multipath +tests/00names tests/00raid0 tests/00raid1 tests/00raid10 @@ -113,16 +119,25 @@ tests/01r5fail tests/01r5integ tests/01raid6integ +tests/01replace tests/02lineargrow tests/02r1add tests/02r1grow tests/02r5grow tests/02r6grow +tests/03assem-incr tests/03r0assem tests/03r5assem tests/03r5assemV1 tests/04r0update tests/04r1update +tests/04r5swap +tests/04update-metadata +tests/04update-uuid +tests/05r1-add-internalbitmap +tests/05r1-add-internalbitmap-v1a +tests/05r1-add-internalbitmap-v1b +tests/05r1-add-internalbitmap-v1c tests/05r1-bitmapfile tests/05r1-grow-external tests/05r1-grow-internal @@ -134,13 +149,16 @@ tests/05r1-n3-bitmapfile tests/05r1-re-add tests/05r1-re-add-nosuper +tests/05r1-remove-internalbitmap +tests/05r1-remove-internalbitmap-v1a +tests/05r1-remove-internalbitmap-v1b 
+tests/05r1-remove-internalbitmap-v1c tests/05r5-bitmapfile tests/05r5-internalbitmap tests/05r6-bitmapfile +tests/05r6tor0 tests/06name -tests/06r5swap tests/06sysfs -tests/06update-uuid tests/06wrmostly tests/07autoassemble tests/07autodetect @@ -148,11 +166,20 @@ tests/07changelevels tests/07layouts tests/07reshape5intr +tests/07revert-grow +tests/07revert-inplace +tests/07revert-shrink tests/07testreshape5 -tests/08imsm-overlap tests/09imsm-assemble tests/09imsm-create-fail-rebuild +tests/09imsm-overlap tests/10ddf-create +tests/10ddf-create-fail-rebuild +tests/10ddf-fail-create-race +tests/10ddf-fail-spare +tests/10ddf-fail-twice +tests/10ddf-fail-two-spares +tests/10ddf-geometry tests/11spare-migration tests/12imsm-r0_2d-grow-r0_3d tests/12imsm-r0_2d-grow-r0_4d @@ -190,11 +217,17 @@ tests/18imsm-r0_2d-takeover-r10_4d tests/18imsm-r10_4d-takeover-r0_2d tests/18imsm-r1_2d-takeover-r0_1d +tests/19raid6auto-repair +tests/19raid6repair +tests/19repair-does-not-destroy tests/ToTest tests/check +tests/env-ddf-template tests/env-imsm-template tests/imsm-grow-template tests/testdev tests/utils -udev-md-raid.rules +udev-md-raid-arrays.rules +udev-md-raid-assembly.rules util.c +xmalloc.c diff -Nru mdadm-3.2.5/Kill.c mdadm-3.3/Kill.c --- mdadm-3.2.5/Kill.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Kill.c 2013-09-03 04:47:47.000000000 +0000 @@ -29,7 +29,7 @@ #include "md_u.h" #include "md_p.h" -int Kill(char *dev, struct supertype *st, int force, int quiet, int noexcl) +int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl) { /* * Nothing fancy about Kill. It just zeroes out a superblock @@ -46,16 +46,16 @@ noexcl = 1; fd = open(dev, O_RDWR|(noexcl ? 
0 : O_EXCL)); if (fd < 0) { - if (!quiet) - fprintf(stderr, Name ": Couldn't open %s for write - not zeroing\n", + if (verbose >= 0) + pr_err("Couldn't open %s for write - not zeroing\n", dev); return 2; } if (st == NULL) st = guess_super(fd); if (st == NULL || st->ss->init_super == NULL) { - if (!quiet) - fprintf(stderr, Name ": Unrecognised md component device - %s\n", dev); + if (verbose >= 0) + pr_err("Unrecognised md component device - %s\n", dev); close(fd); return 2; } @@ -63,15 +63,16 @@ rv = st->ss->load_super(st, fd, dev); if (rv == 0 || (force && rv >= 2)) { st->ss->free_super(st); - st->ss->init_super(st, NULL, 0, "", NULL, NULL); + st->ss->init_super(st, NULL, 0, "", NULL, NULL, + INVALID_SECTORS); if (st->ss->store_super(st, fd)) { - if (!quiet) - fprintf(stderr, Name ": Could not zero superblock on %s\n", + if (verbose >= 0) + pr_err("Could not zero superblock on %s\n", dev); rv = 1; } else if (rv) { - if (!quiet) - fprintf(stderr, Name ": superblock zeroed anyway\n"); + if (verbose >= 0) + pr_err("superblock zeroed anyway\n"); rv = 0; } } @@ -79,7 +80,7 @@ return rv; } -int Kill_subarray(char *dev, char *subarray, int quiet) +int Kill_subarray(char *dev, char *subarray, int verbose) { /* Delete a subarray out of a container, the subarry must be * inactive. 
The subarray string must be a subarray index @@ -95,36 +96,33 @@ memset(st, 0, sizeof(*st)); - fd = open_subarray(dev, subarray, st, quiet); + fd = open_subarray(dev, subarray, st, verbose < 0); if (fd < 0) return 2; if (!st->ss->kill_subarray) { - if (!quiet) - fprintf(stderr, - Name ": Operation not supported for %s metadata\n", - st->ss->name); + if (verbose >= 0) + pr_err("Operation not supported for %s metadata\n", + st->ss->name); goto free_super; } - if (is_subarray_active(subarray, st->devname)) { - if (!quiet) - fprintf(stderr, - Name ": Subarray-%s still active, aborting\n", - subarray); + if (is_subarray_active(subarray, st->devnm)) { + if (verbose >= 0) + pr_err("Subarray-%s still active, aborting\n", + subarray); goto free_super; } - if (mdmon_running(st->devnum)) + if (mdmon_running(st->devnm)) st->update_tail = &st->updates; /* ok we've found our victim, drop the axe */ rv = st->ss->kill_subarray(st); if (rv) { - if (!quiet) - fprintf(stderr, - Name ": Failed to delete subarray-%s from %s\n", - subarray, dev); + if (verbose >= 0) + pr_err("Failed to delete subarray-%s from %s\n", + subarray, dev); goto free_super; } @@ -134,10 +132,9 @@ else st->ss->sync_metadata(st); - if (!quiet) - fprintf(stderr, - Name ": Deleted subarray-%s from %s, UUIDs may have changed\n", - subarray, dev); + if (verbose >= 0) + pr_err("Deleted subarray-%s from %s, UUIDs may have changed\n", + subarray, dev); rv = 0; diff -Nru mdadm-3.2.5/lib.c mdadm-3.3/lib.c --- mdadm-3.2.5/lib.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/lib.c 2013-09-03 04:47:47.000000000 +0000 @@ -23,6 +23,7 @@ */ #include "mdadm.h" +#include "dlink.h" #include /* This fill contains various 'library' style function. 
They @@ -57,85 +58,96 @@ return mdp_major; } - -void fmt_devname(char *name, int num) -{ - if (num >= 0) - sprintf(name, "md%d", num); - else - sprintf(name, "md_d%d", -1-num); -} - -char *devnum2devname(int num) +char *devid2kname(int devid) { - char name[100]; - fmt_devname(name,num); - return strdup(name); -} + char path[30]; + char link[200]; + static char devnm[32]; + char *cp; + int n; -int devname2devnum(char *name) -{ - char *ep; - int num; - if (strncmp(name, "md_d", 4)==0) - num = -1-strtoul(name+4, &ep, 10); - else - num = strtoul(name+2, &ep, 10); - return num; + /* Look at the + * /sys/dev/block/%d:%d link which must look like + * and take the last component. + */ + sprintf(path, "/sys/dev/block/%d:%d", major(devid), + minor(devid)); + n = readlink(path, link, sizeof(link)-1); + if (n > 0) { + link[n] = 0; + cp = strrchr(link, '/'); + if (cp) { + strcpy(devnm, cp+1); + return devnm; + } + } + return NULL; } -int stat2devnum(struct stat *st) +char *devid2devnm(int devid) { char path[30]; char link[200]; - char *cp; + static char devnm[32]; + char *cp, *ep; int n; - if ((S_IFMT & st->st_mode) == S_IFBLK) { - if (major(st->st_rdev) == MD_MAJOR) - return minor(st->st_rdev); - else if (major(st->st_rdev) == (unsigned)get_mdp_major()) - return -1- (minor(st->st_rdev)>>MdpMinorShift); - - /* must be an extended-minor partition. Look at the - * /sys/dev/block/%d:%d link which must look like - * ../../block/mdXXX/mdXXXpYY - */ - sprintf(path, "/sys/dev/block/%d:%d", major(st->st_rdev), - minor(st->st_rdev)); - n = readlink(path, link, sizeof(link)-1); - if (n <= 0) - return NoMdDev; + /* Might be an extended-minor partition or a + * named md device. 
Look at the + * /sys/dev/block/%d:%d link which must look like + * ../../block/mdXXX/mdXXXpYY + * or + * ...../block/md_FOO + */ + sprintf(path, "/sys/dev/block/%d:%d", major(devid), + minor(devid)); + n = readlink(path, link, sizeof(link)-1); + if (n > 0) { link[n] = 0; - cp = strrchr(link, '/'); - if (cp) *cp = 0; - cp = strrchr(link, '/'); - if (cp && strncmp(cp, "/md", 3) == 0) - return devname2devnum(cp+1); + cp = strstr(link, "/block/"); + if (cp) { + cp += 7; + ep = strchr(cp, '/'); + if (ep) + *ep = 0; + strcpy(devnm, cp); + return devnm; + } } - return NoMdDev; + if (major(devid) == MD_MAJOR) + sprintf(devnm,"md%d", minor(devid)); + else if (major(devid) == (unsigned)get_mdp_major()) + sprintf(devnm,"md_d%d", + (minor(devid)>>MdpMinorShift)); + else + return NULL; + return devnm; +} +char *stat2devnm(struct stat *st) +{ + if ((S_IFMT & st->st_mode) != S_IFBLK) + return NULL; + return devid2devnm(st->st_rdev); } -int fd2devnum(int fd) +char *fd2devnm(int fd) { struct stat stb; if (fstat(fd, &stb) == 0) - return stat2devnum(&stb); - return NoMdDev; + return stat2devnm(&stb); + return NULL; } - - /* * convert a major/minor pair for a block device into a name in /dev, if possible. * On the first call, walk /dev collecting name. * Put them in a simple linked listfor now. */ struct devmap { - int major, minor; - char *name; - struct devmap *next; + int major, minor; + char *name; + struct devmap *next; } *devlist = NULL; int devlist_ready = 0; @@ -150,8 +162,8 @@ } if ((stb->st_mode&S_IFMT)== S_IFBLK) { - char *n = strdup(name); - struct devmap *dm = malloc(sizeof(*dm)); + char *n = xstrdup(name); + struct devmap *dm = xmalloc(sizeof(*dm)); if (strncmp(n, "/dev/./", 7)==0) strcpy(n+4, name+6); if (dm) { @@ -246,8 +258,6 @@ return preferred ? preferred : regular; } - - /* conf_word gets one word from the conf file. * if "allow_key", then accept words at the start of a line, * otherwise stop when such a word is found. 
@@ -262,9 +272,7 @@ int c; int quote; int wordfound = 0; - char *word = malloc(wsize); - - if (!word) abort(); + char *word = xmalloc(wsize); while (wordfound==0) { /* at the end of a word.. */ @@ -294,8 +302,7 @@ else { if (len == wsize-1) { wsize += 100; - word = realloc(word, wsize); - if (!word) abort(); + word = xrealloc(word, wsize); } word[len++] = c; } @@ -325,3 +332,144 @@ } return word; } + +void print_quoted(char *str) +{ + /* Printf the string with surrounding quotes + * iff needed. + * If no space, tab, or quote - leave unchanged. + * Else print surrounded by " or ', swapping quotes + * when we find one that will cause confusion. + */ + + char first_quote = 0, q; + char *c; + + for (c = str; *c; c++) { + switch(*c) { + case '\'': + case '"': + first_quote = *c; + break; + case ' ': + case '\t': + first_quote = *c; + continue; + default: + continue; + } + break; + } + if (!first_quote) { + printf("%s", str); + return; + } + + if (first_quote == '"') + q = '\''; + else + q = '"'; + putchar(q); + for (c = str; *c; c++) { + if (*c == q) { + putchar(q); + q ^= '"' ^ '\''; + putchar(q); + } + putchar(*c); + } + putchar(q); +} + +void print_escape(char *str) +{ + /* print str, but change space and tab to '_' + * as is suitable for device names + */ + for (; *str ; str++) { + switch (*str) { + case ' ': + case '\t': + putchar('_'); + break; + case '/': + putchar('-'); + break; + default: + putchar(*str); + } + } +} + +int check_env(char *name) +{ + char *val = getenv(name); + + if (val && atoi(val) == 1) + return 1; + + return 0; +} + +int use_udev(void) +{ + static int use = -1; + struct stat stb; + + if (use < 0) { + use = ((stat("/dev/.udev", &stb) == 0 + || stat("/run/udev", &stb) == 0) + && check_env("MDADM_NO_UDEV") == 0); + } + return use; +} + +unsigned long GCD(unsigned long a, unsigned long b) +{ + while (a != b) { + if (a < b) + b -= a; + if (b < a) + a -= b; + } + return a; +} + +/* + * conf_line reads one logical line from the conffile or mdstat. 
+ * It skips comments and continues until it finds a line that starts + * with a non blank/comment. This character is pushed back for the next call + * A doubly linked list of words is returned. + * the first word will be a keyword. Other words will have had quotes removed. + */ + +char *conf_line(FILE *file) +{ + char *w; + char *list; + + w = conf_word(file, 1); + if (w == NULL) return NULL; + + list = dl_strdup(w); + free(w); + dl_init(list); + + while ((w = conf_word(file,0))){ + char *w2 = dl_strdup(w); + free(w); + dl_add(list, w2); + } +/* printf("got a line\n");*/ + return list; +} + +void free_line(char *line) +{ + char *w; + for (w=dl_next(line); w != line; w=dl_next(line)) { + dl_del(w); + dl_free(w); + } + dl_free(line); +} diff -Nru mdadm-3.2.5/makedist mdadm-3.3/makedist --- mdadm-3.2.5/makedist 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/makedist 2013-09-03 04:47:47.000000000 +0000 @@ -14,8 +14,8 @@ else echo $target is not a directory exit 2 fi -set `grep '^char Version' ReadMe.c ` -version=`echo $7 | sed 's/v//'` +set `grep '^#define VERSION' ReadMe.c ` +version=`echo $3 | sed -e 's/"//g'` grep "^.TH MDADM 8 .. v$version" mdadm.8.in > /dev/null 2>&1 || { echo mdadm.8.in does not mention version $version. @@ -47,8 +47,8 @@ base=mdadm-$version.tar.gz if [ " $arg" != " diff" ] then - if [ -f $target/$base ] - then + if [ -f $target/$base ] + then echo $target/$base exists. exit 1 fi @@ -83,14 +83,14 @@ fi fi else - if [ ! -f $target/$base ] - then + if [ ! -f $target/$base ] + then echo $target/$base does not exist. exit 1 fi ( cd .. 
; ln -s mdadm.v2 mdadm-$version ; tar chf - --exclude=.git --exclude="TAGS" --exclude='*,v' --exclude='*~' --exclude='*.o' --exclude mdadm --exclude=mdadm'.[^ch0-9]' --exclude=RCS mdadm-$version ; rm mdadm-$version ) | gzip --best > /var/tmp/mdadm-new.tgz mkdir /var/tmp/mdadm-old ; zcat $target/$base | ( cd /var/tmp/mdadm-old ; tar xf - ) mkdir /var/tmp/mdadm-new ; zcat /var/tmp/mdadm-new.tgz | ( cd /var/tmp/mdadm-new ; tar xf - ) - diff -ru /var/tmp/mdadm-old /var/tmp/mdadm-new + diff -ru /var/tmp/mdadm-old /var/tmp/mdadm-new rm -rf /var/tmp/mdadm-old /var/tmp/mdadm-new /var/tmp/mdadm-new.tgz fi diff -Nru mdadm-3.2.5/Makefile mdadm-3.3/Makefile --- mdadm-3.2.5/Makefile 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Makefile 2013-09-03 04:47:47.000000000 +0000 @@ -2,6 +2,7 @@ # mdadm - manage Linux "md" devices aka RAID arrays. # # Copyright (C) 2001-2002 Neil Brown +# Copyright (C) 2013 Neil Brown # # # This program is free software; you can redistribute it and/or modify @@ -32,7 +33,7 @@ TCC = tcc UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found ) #DIET_GCC = diet gcc -# sorry, but diet-libc doesn't know about posix_memalign, +# sorry, but diet-libc doesn't know about posix_memalign, # so we cannot use it any more. 
DIET_GCC = gcc -DHAVE_STDINT_H @@ -41,24 +42,24 @@ KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 CC = $(CROSS_COMPILE)gcc -CXFLAGS = -ggdb +CXFLAGS ?= -ggdb CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter ifdef WARN_UNUSED -CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O +CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3 endif ifdef DEBIAN -CPPFLAGS := -DDEBIAN -else -CPPFLAGS := +CPPFLAGS += -DDEBIAN endif ifdef DEFAULT_OLD_METADATA - CPPFLAG += -DDEFAULT_OLD_METADATA + CPPFLAGS += -DDEFAULT_OLD_METADATA DEFAULT_METADATA=0.90 else DEFAULT_METADATA=1.2 endif +PKG_CONFIG ?= pkg-config + SYSCONFDIR = /etc CONFFILE = $(SYSCONFDIR)/mdadm.conf CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf @@ -66,18 +67,28 @@ CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" # Both MAP_DIR and MDMON_DIR should be somewhere that persists across the # pivotroot from early boot to late boot. -# /run is best, but for distros that don't support that, /dev can work. -MAP_DIR=/run/mdadm +# /run is best, but for distros that don't support that. 
+# /dev can work, in which case you probably want /dev/.mdadm +RUN_DIR=/run/mdadm +CHECK_RUN_DIR=1 +MAP_DIR=$(RUN_DIR) MAP_FILE = map MAP_PATH = $(MAP_DIR)/$(MAP_FILE) -MDMON_DIR = $(MAP_DIR) +MDMON_DIR = $(RUN_DIR) # place for autoreplace cookies -FAILED_SLOTS_DIR = /run/mdadm/failed-slots +FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots +SYSTEMD_DIR=/lib/systemd/system DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\" DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\" DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\" CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) +VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//') +VERS_DATE = $(shell [ -d .git ] && date --date="`git log -n1 --format=format:%cd --date=short`" '+%0dth %B %Y' | sed -e 's/1th/1st/' -e 's/2th/2nd/' -e 's/11st/11th/' -e 's/12nd/12th/') +DVERS = $(if $(VERSION),-DVERSION=\"$(VERSION)\",) +DDATE = $(if $(VERS_DATE),-DVERS_DATE="\"$(VERS_DATE)\"",) +CFLAGS += $(DVERS) $(DDATE) + # The glibc TLS ABI requires applications that call clone(2) to set up # TLS data structures, use pthreads until mdmon implements this support USE_PTHREADS = 1 @@ -91,33 +102,38 @@ # STRIP = -s INSTALL = /usr/bin/install -DESTDIR = +DESTDIR = BINDIR = /sbin MANDIR = /usr/share/man MAN4DIR = $(MANDIR)/man4 MAN5DIR = $(MANDIR)/man5 MAN8DIR = $(MANDIR)/man8 +UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null) +ifndef UDEVDIR + UDEVDIR = /lib/udev +endif + OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o util.o maps.o lib.o \ Manage.o Assemble.o Build.o \ Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \ - Incremental.o \ + Incremental.o Dump.o \ mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \ super-mbr.o super-gpt.o \ - restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o \ + restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \ platform-intel.o probe_roms.o -CHECK_OBJS = restripe.o sysfs.o 
maps.o lib.o +CHECK_OBJS = restripe.o sysfs.o maps.o lib.o xmalloc.o dlink.o SRCS = $(patsubst %.o,%.c,$(OBJS)) INCL = mdadm.h part.h bitmap.h MON_OBJS = mdmon.o monitor.o managemon.o util.o maps.o mdstat.o sysfs.o \ - config.o policy.o lib.o \ + policy.o lib.o \ Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \ super-mbr.o super-gpt.o \ - super-ddf.o sha1.o crc32.o msg.o bitmap.o \ + super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \ platform-intel.o probe_roms.o MON_SRCS = $(patsubst %.o,%.c,$(MON_OBJS)) @@ -126,7 +142,7 @@ STATICOBJS = pwgr.o ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c policy.c dlink.c util.c \ - maps.c lib.c \ + maps.c lib.c xmalloc.c \ super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \ platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c ASSEMBLE_AUTO_SRCS := mdopen.c @@ -136,10 +152,17 @@ ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO endif -all : mdadm mdmon +all : check_rundir mdadm mdmon man : mdadm.man md.man mdadm.conf.man mdmon.man raid6check.man -everything: all mdadm.static swap_super test_stripe \ +check_rundir: + @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" == 1 ]; then \ + echo "***** Parent of $(RUN_DIR) does not exist. Maybe set different RUN_DIR="; \ + echo "***** e.g. make RUN_DIR=/dev/.mdadm" ; \ + echo "***** or set CHECK_RUN_DIR=0"; exit 1; \ + fi + +everything: all mdadm.static swap_super test_stripe raid6check \ mdassemble mdassemble.auto mdassemble.static mdassemble.man \ mdadm.Os mdadm.O2 man everything-test: all mdadm.static swap_super test_stripe \ @@ -148,7 +171,7 @@ # mdadm.uclibc and mdassemble.uclibc don't work on x86-64 # mdadm.tcc doesn't work.. 
-mdadm : $(OBJS) +mdadm : check_rundir $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS) mdadm.static : $(OBJS) $(STATICOBJS) @@ -158,7 +181,7 @@ $(TCC) -o mdadm.tcc $(SRCS) mdadm.klibc : $(SRCS) $(INCL) - rm -f $(OBJS) + rm -f $(OBJS) $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS) mdadm.Os : $(SRCS) $(INCL) @@ -171,17 +194,20 @@ $(CC) -o mdmon.O2 $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) # use '-z now' to guarantee no dynamic linker interactions with the monitor thread -mdmon : $(MON_OBJS) +mdmon : check_rundir $(MON_OBJS) $(CC) $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -Wl,-z,now -o mdmon $(MON_OBJS) $(LDLIBS) msg.o: msg.c msg.h -test_stripe : restripe.c mdadm.h - $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c +test_stripe : restripe.c xmalloc.o mdadm.h + $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c raid6check : raid6check.o mdadm.h $(CHECK_OBJS) $(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS) mdassemble : $(ASSEMBLE_SRCS) $(INCL) + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) $(STATICSRC) + +mdassemble.diet : $(ASSEMBLE_SRCS) $(INCL) rm -f $(OBJS) $(DIET_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) $(STATICSRC) @@ -253,16 +279,20 @@ $(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4 $(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 -install-udev: udev-md-raid.rules - $(INSTALL) -D -m 644 udev-md-raid.rules $(DESTDIR)/lib/udev/rules.d/64-md-raid.rules +install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules + $(INSTALL) -D -m 644 udev-md-raid-arrays.rules $(DESTDIR)$(UDEVDIR)/rules.d/63-md-raid-arrays.rules + $(INSTALL) -D -m 644 udev-md-raid-assembly.rules $(DESTDIR)$(UDEVDIR)/rules.d/64-md-raid-assembly.rules + +install-systemd: 
systemd/mdmon@.service + $(INSTALL) -D -m 644 systemd/mdmon@.service $(DESTDIR)$(SYSTEMD_DIR)/mdmon@.service uninstall: rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm -test: mdadm mdmon test_stripe swap_super - @echo "Please run 'sh ./test' as root" +test: mdadm mdmon test_stripe swap_super raid6check + @echo "Please run './test' as root" -clean : +clean : rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \ mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt .merge_file_* \ mdadm.Os mdadm.O2 mdmon.O2 \ @@ -284,4 +314,3 @@ ifdef DISTRO_MAKEFILE include $(DISTRO_MAKEFILE) endif - diff -Nru mdadm-3.2.5/Manage.c mdadm-3.3/Manage.c --- mdadm-3.2.5/Manage.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Manage.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -27,9 +27,9 @@ #include "md_p.h" #include -#define REGISTER_DEV _IO (MD_MAJOR, 1) -#define START_MD _IO (MD_MAJOR, 2) -#define STOP_MD _IO (MD_MAJOR, 3) +#define REGISTER_DEV _IO (MD_MAJOR, 1) +#define START_MD _IO (MD_MAJOR, 2) +#define STOP_MD _IO (MD_MAJOR, 3) int Manage_ro(char *devname, int fd, int readonly) { @@ -47,14 +47,14 @@ int rv = 0; if (md_get_version(fd) < 9000) { - fprintf(stderr, Name ": need md driver version 0.90.0 or later\n"); + pr_err("need md driver version 0.90.0 or later\n"); return 1; } #ifndef MDASSEMBLE - /* If this is an externally-manage array, we need to modify the + /* If this is an externally-managed array, we need to modify the * metadata_version so that mdmon doesn't undo our change. 
*/ - mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION); + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION); if (mdi && mdi->array.major_version == -1 && is_subarray(mdi->text_version)) { @@ -71,7 +71,7 @@ rv = sysfs_set_str(mdi, NULL, "array_state", "readonly"); if (rv < 0) { - fprintf(stderr, Name ": failed to set readonly for %s: %s\n", + pr_err("failed to set readonly for %s: %s\n", devname, strerror(errno)); vers[9] = mdi->text_version[0]; @@ -96,22 +96,22 @@ } #endif if (ioctl(fd, GET_ARRAY_INFO, &array)) { - fprintf(stderr, Name ": %s does not appear to be active.\n", + pr_err("%s does not appear to be active.\n", devname); rv = 1; goto out; } - if (readonly>0) { + if (readonly > 0) { if (ioctl(fd, STOP_ARRAY_RO, NULL)) { - fprintf(stderr, Name ": failed to set readonly for %s: %s\n", + pr_err("failed to set readonly for %s: %s\n", devname, strerror(errno)); rv = 1; goto out; } } else if (readonly < 0) { if (ioctl(fd, RESTART_ARRAY_RW, NULL)) { - fprintf(stderr, Name ": failed to set writable for %s: %s\n", + pr_err("failed to set writable for %s: %s\n", devname, strerror(errno)); rv = 1; goto out; @@ -127,12 +127,12 @@ #ifndef MDASSEMBLE -static void remove_devices(int devnum, char *path) +static void remove_devices(char *devnm, char *path) { - /* + /* * Remove names at 'path' - possibly with * partition suffixes - which link to the 'standard' - * name for devnum. These were probably created + * name for devnm. These were probably created * by mdadm when the array was assembled. 
*/ char base[40]; @@ -146,16 +146,13 @@ if (!path) return; - if (devnum >= 0) - sprintf(base, "/dev/md%d", devnum); - else - sprintf(base, "/dev/md_d%d", -1-devnum); + sprintf(base, "/dev/%s", devnm); be = base + strlen(base); - path2 = malloc(strlen(path)+20); + path2 = xmalloc(strlen(path)+20); strcpy(path2, path); pe = path2 + strlen(path2); - + for (part = 0; part < 16; part++) { if (part) { sprintf(be, "p%d", part); @@ -172,261 +169,1116 @@ } free(path2); } - -int Manage_runstop(char *devname, int fd, int runstop, int quiet) +int Manage_run(char *devname, int fd, int verbose) { - /* Run or stop the array. array must already be configured - * required >= 0.90.0 - * Only print failure messages if quiet == 0; - * quiet > 0 means really be quiet - * quiet < 0 means we will try again if it fails. + /* Run the array. Array must already be configured + * Requires >= 0.90.0 */ - mdu_param_t param; /* unused */ - int rv = 0; + char nm[32], *nmp; - if (runstop == -1 && md_get_version(fd) < 9000) { - if (ioctl(fd, STOP_MD, 0)) { - if (quiet == 0) fprintf(stderr, - Name ": stopping device %s " - "failed: %s\n", - devname, strerror(errno)); - return 1; - } + if (md_get_version(fd) < 9000) { + pr_err("need md driver version 0.90.0 or later\n"); + return 1; } + nmp = fd2devnm(fd); + if (!nmp) { + pr_err("Cannot find %s in sysfs!!\n", devname); + return 1; + } + strcpy(nm, nmp); + return IncrementalScan(verbose, nm); +} + +int Manage_stop(char *devname, int fd, int verbose, int will_retry) +{ + /* Stop the array. Array must already be configured + * 'will_retry' means that error messages are not wanted. 
+ */ + int rv = 0; + struct map_ent *map = NULL; + struct mdinfo *mdi; + char devnm[32]; + char container[32]; + int err; + int count; + char buf[32]; + unsigned long long rd1, rd2; + + if (will_retry && verbose == 0) + verbose = -1; if (md_get_version(fd) < 9000) { - fprintf(stderr, Name ": need md driver version 0.90.0 or later\n"); + if (ioctl(fd, STOP_MD, 0) == 0) + return 0; + pr_err("stopping device %s " + "failed: %s\n", + devname, strerror(errno)); return 1; } - /* - if (ioctl(fd, GET_ARRAY_INFO, &array)) { - fprintf(stderr, Name ": %s does not appear to be active.\n", - devname); + + /* If this is an mdmon managed array, just write 'inactive' + * to the array state and let mdmon clear up. + */ + strcpy(devnm, fd2devnm(fd)); + /* Get EXCL access first. If this fails, then attempting + * to stop is probably a bad idea. + */ + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION); + if (mdi && is_subarray(mdi->text_version)) { + char *sl; + strncpy(container, mdi->text_version+1, sizeof(container)); + container[sizeof(container)-1] = 0; + sl = strchr(container, '/'); + if (sl) + *sl = 0; + } else + container[0] = 0; + close(fd); + count = 5; + while (((fd = ((devnm[0] == '/') + ?open(devname, O_RDONLY|O_EXCL) + :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0 + || strcmp(fd2devnm(fd), devnm) != 0) + && container[0] + && mdmon_running(container) + && count) { + if (fd >= 0) + close(fd); + flush_mdmon(container); + count--; + } + if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) { + if (fd >= 0) + close(fd); + if (verbose >= 0) + pr_err("Cannot get exclusive access to %s:" + "Perhaps a running " + "process, mounted filesystem " + "or active volume group?\n", + devname); return 1; } - */ - if (runstop>0) { - if (ioctl(fd, RUN_ARRAY, ¶m)) { - fprintf(stderr, Name ": failed to run array %s: %s\n", - devname, strerror(errno)); - return 1; - } - if (quiet <= 0) - fprintf(stderr, Name ": started %s\n", devname); - } else if (runstop < 0){ - struct map_ent *map = 
NULL; - struct stat stb; - struct mdinfo *mdi; - int devnum; + if (mdi && + mdi->array.level > 0 && + is_subarray(mdi->text_version)) { int err; - int count; - /* If this is an mdmon managed array, just write 'inactive' - * to the array state and let mdmon clear up. - */ - devnum = fd2devnum(fd); - /* Get EXCL access first. If this fails, then attempting - * to stop is probably a bad idea. - */ + /* This is mdmon managed. */ close(fd); - fd = open(devname, O_RDONLY|O_EXCL); - if (fd < 0 || fd2devnum(fd) != devnum) { - if (fd >= 0) - close(fd); - fprintf(stderr, - Name ": Cannot get exclusive access to %s:" - "Perhaps a running " - "process, mounted filesystem " - "or active volume group?\n", - devname); - return 1; + + /* As we have an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. + */ + count = 25; + while (count && + (err = sysfs_set_str(mdi, NULL, + "array_state", + "inactive")) < 0 + && errno == EBUSY) { + usleep(200000); + count--; + } + if (err) { + if (verbose >= 0) + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; } - mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION); - if (mdi && - mdi->array.level > 0 && - is_subarray(mdi->text_version)) { - int err; - /* This is mdmon managed. 
*/ - close(fd); - count = 25; - while (count && - (err = sysfs_set_str(mdi, NULL, - "array_state", - "inactive")) < 0 - && errno == EBUSY) { - usleep(200000); - count--; - } - if (err && !quiet) { - fprintf(stderr, Name - ": failed to stop array %s: %s\n", - devname, strerror(errno)); - rv = 1; - goto out; - } + /* Give monitor a chance to act */ + ping_monitor(mdi->text_version); - /* Give monitor a chance to act */ - ping_monitor(mdi->text_version); + fd = open_dev_excl(devnm); + if (fd < 0) { + if (verbose >= 0) + pr_err("failed to completely stop %s" + ": Device is busy\n", + devname); + rv = 1; + goto out; + } + } else if (mdi && + mdi->array.major_version == -1 && + mdi->array.minor_version == -2 && + !is_subarray(mdi->text_version)) { + struct mdstat_ent *mds, *m; + /* container, possibly mdmon-managed. + * Make sure mdmon isn't opening it, which + * would interfere with the 'stop' + */ + ping_monitor(mdi->sys_name); - fd = open_dev_excl(devnum); - if (fd < 0) { - fprintf(stderr, Name - ": failed to completely stop %s" - ": Device is busy\n", - devname); + /* now check that there are no existing arrays + * which are members of this array + */ + mds = mdstat_read(0, 0); + for (m = mds; m; m = m->next) + if (m->metadata_version && + strncmp(m->metadata_version, "external:", 9)==0 && + metadata_container_matches(m->metadata_version+9, + devnm)) { + if (verbose >= 0) + pr_err("Cannot stop container %s: " + "member %s still active\n", + devname, m->dev); + free_mdstat(mds); rv = 1; goto out; } - } else if (mdi && - mdi->array.major_version == -1 && - mdi->array.minor_version == -2 && - !is_subarray(mdi->text_version)) { - struct mdstat_ent *mds, *m; - /* container, possibly mdmon-managed. 
- * Make sure mdmon isn't opening it, which - * would interfere with the 'stop' - */ - ping_monitor(mdi->sys_name); + } - /* now check that there are no existing arrays - * which are members of this array + /* If the array is undergoing a reshape which changes the number + * of devices, then it would be nice to stop it at a point where + * it has completed a full number of stripes in both old and + * new layouts as this will allow the reshape to be reverted. + * So if 'sync_action' is "reshape" and 'raid_disks' shows two + * different numbers, then + * - freeze reshape + * - set sync_max to next multiple of both data_disks and + * chunk sizes (or next but one) + * - unfreeze reshape + * - wait on 'sync_completed' for that point to be reached. + */ + if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) && + sysfs_attribute_available(mdi, NULL, "sync_action") && + sysfs_attribute_available(mdi, NULL, "reshape_direction") && + sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "reshape\n") == 0 && + sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2 && + sysfs_set_str(mdi, NULL, "sync_action", "frozen") == 0) { + /* Array is frozen */ + unsigned long long position, curr; + unsigned long long chunk1, chunk2; + unsigned long long rddiv, chunkdiv; + unsigned long long sectors; + unsigned long long sync_max, old_sync_max; + unsigned long long completed; + int backwards = 0; + int delay; + int scfd; + + rd1 -= mdi->array.level == 6 ? 2 : 1; + rd2 -= mdi->array.level == 6 ? 
2 : 1; + sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf)); + if (strncmp(buf, "back", 4) == 0) + backwards = 1; + sysfs_get_ll(mdi, NULL, "reshape_position", &position); + sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2); + chunk1 /= 512; + chunk2 /= 512; + rddiv = GCD(rd1, rd2); + chunkdiv = GCD(chunk1, chunk2); + sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2; + + if (backwards) { + /* Need to subtract 'reshape_position' from + * array size to get equivalent of sync_max. + * Size calculation based on raid5_size in kernel. */ - mds = mdstat_read(0, 0); - for (m=mds; m; m=m->next) - if (m->metadata_version && - strncmp(m->metadata_version, "external:", 9)==0 && - is_subarray(m->metadata_version+9) && - devname2devnum(m->metadata_version+10) == devnum) { - if (!quiet) - fprintf(stderr, Name - ": Cannot stop container %s: " - "member %s still active\n", - devname, m->dev); - free_mdstat(mds); - rv = 1; - goto out; + unsigned long long size = mdi->component_size; + size &= ~(chunk1-1); + size &= ~(chunk2-1); + /* rd1 must be smaller */ + position = (position / sectors - 1) * sectors; + sync_max = size - position/rd1; + } else { + position = (position / sectors + 2) * sectors; + sync_max = position/rd1; + } + if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0) + old_sync_max = mdi->component_size; + /* Must not advance sync_max as that could confuse + * the reshape monitor */ + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + + /* That should have set things going again. Now we + * wait a little while (3 second max) for sync_completed + * to reach the target. + * The reshape process can block for 500msec if + * the sync speed limit is hit, so we need to wait + * a lot longer than that. 1 second is usually + * enough. 3 is safe. 
+ */ + delay = 3000; + scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed"); + while (scfd >= 0 && delay > 0 && old_sync_max > 0) { + sysfs_get_ll(mdi, NULL, "reshape_position", &curr); + sysfs_fd_get_str(scfd, buf, sizeof(buf)); + if (strncmp(buf, "none", 4) == 0) { + /* Either reshape has aborted, or hasn't + * quite started yet. Wait a bit and + * check 'sync_action' to see. + */ + usleep(10000); + sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf)); + if (strncmp(buf, "reshape", 7) != 0) + break; + } + + if (sysfs_fd_get_ll(scfd, &completed) == 0 && + (completed > sync_max || + (completed == sync_max && curr != position))) { + while (completed > sync_max) { + sync_max += sectors / rd1; + if (backwards) + position -= sectors; + else + position += sectors; } + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + } + + if (!backwards && curr >= position) + break; + if (backwards && curr <= position) + break; + sysfs_wait(scfd, &delay); } + if (scfd >= 0) + close(scfd); - /* As we have an O_EXCL open, any use of the device - * which blocks STOP_ARRAY is probably a transient use, - * so it is reasonable to retry for a while - 5 seconds. + } + + /* As we have an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. + */ + count = 25; err = 0; + while (count && fd >= 0 + && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 + && errno == EBUSY) { + usleep(200000); + count --; + } + if (fd >= 0 && err) { + if (verbose >= 0) { + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + if (errno == EBUSY) + cont_err("Perhaps a running " + "process, mounted filesystem " + "or active volume group?\n"); + } + rv = 1; + goto out; + } + /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array + * was stopped, so We'll do it here just to be sure. Drop any + * partitions as well... 
+ */ + if (fd >= 0) + ioctl(fd, BLKRRPART, 0); + if (mdi) + sysfs_uevent(mdi, "change"); + + if (devnm[0] && use_udev()) { + struct map_ent *mp = map_by_devnm(&map, devnm); + remove_devices(devnm, mp ? mp->path : NULL); + } + + if (verbose >= 0) + pr_err("stopped %s\n", devname); + map_lock(&map); + map_remove(&map, devnm); + map_unlock(&map); +out: + if (mdi) + sysfs_free(mdi); + + return rv; +} + +static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp) +{ + struct mddev_dev *new; + new = xmalloc(sizeof(*new)); + memset(new, 0, sizeof(*new)); + new->devname = xstrdup(name); + new->disposition = disp; + new->next = dv->next; + dv->next = new; + return new; +} + +static void add_faulty(struct mddev_dev *dv, int fd, char disp) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int i; + + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + disk.number = i; + if (ioctl(fd, GET_DISK_INFO, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + if ((disk.state & 1) == 0) /* not faulty */ + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + dv = add_one(dv, buf, disp); + } +} + +static void add_detached(struct mddev_dev *dv, int fd, char disp) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int i; + + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + int sfd; + disk.number = i; + if (ioctl(fd, GET_DISK_INFO, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + if (disp == 'f' && (disk.state & 1) != 0) /* already faulty */ + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + sfd = dev_open(buf, O_RDONLY); + if (sfd >= 0) { + /* Not detached */ + 
close(sfd); + continue; + } + if (errno != ENXIO) + /* Probably not detached */ + continue; + dv = add_one(dv, buf, disp); + } +} + +static void add_set(struct mddev_dev *dv, int fd, char set_char) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int copies, set; + int i; + + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) + return; + if (array.level != 10) + return; + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + disk.number = i; + if (ioctl(fd, GET_DISK_INFO, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + set = disk.raid_disk % copies; + if (set_char != set + 'A') + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + dv = add_one(dv, buf, dv->disposition); + } +} + +int attempt_re_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *dev_st, struct supertype *tst, + unsigned long rdev, + char *update, char *devname, int verbose, + mdu_array_info_t *array) +{ + struct mdinfo mdi; + int duuid[4]; + int ouuid[4]; + + dev_st->ss->getinfo_super(dev_st, &mdi, NULL); + dev_st->ss->uuid_from_super(dev_st, ouuid); + if (tst->sb) + tst->ss->uuid_from_super(tst, duuid); + else + /* Assume uuid matches: kernel will check */ + memcpy(duuid, ouuid, sizeof(ouuid)); + if ((mdi.disk.state & (1<= 0 - && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 - && errno == EBUSY) { - usleep(200000); - count --; + mdu_disk_info_t disc; + /* re-add doesn't work for version-1 superblocks + * before 2.6.18 :-( + */ + if (array->major_version == 1 && + get_linux_version() <= 2006018) + goto skip_re_add; + disc.number = mdi.disk.number; + if (ioctl(fd, GET_DISK_INFO, &disc) != 0 + || disc.major != 0 || disc.minor != 0 + ) + goto skip_re_add; + disc.major = major(rdev); + disc.minor = minor(rdev); + disc.number = mdi.disk.number; + 
disc.raid_disk = mdi.disk.raid_disk; + disc.state = mdi.disk.state; + if (dv->writemostly == 1) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + if (dv->writemostly == 2) + disc.state &= ~(1 << MD_DISK_WRITEMOSTLY); + remove_partitions(tfd); + if (update || dv->writemostly > 0) { + int rv = -1; + tfd = dev_open(dv->devname, O_RDWR); + if (tfd < 0) { + pr_err("failed to open %s for" + " superblock update during re-add\n", dv->devname); + return -1; + } + + if (dv->writemostly == 1) + rv = dev_st->ss->update_super( + dev_st, NULL, "writemostly", + devname, verbose, 0, NULL); + if (dv->writemostly == 2) + rv = dev_st->ss->update_super( + dev_st, NULL, "readwrite", + devname, verbose, 0, NULL); + if (update) + rv = dev_st->ss->update_super( + dev_st, NULL, update, + devname, verbose, 0, NULL); + if (rv == 0) + rv = dev_st->ss->store_super(dev_st, tfd); + close(tfd); + if (rv != 0) { + pr_err("failed to update" + " superblock during re-add\n"); + return -1; + } } - if (fd >= 0 && err) { - if (quiet == 0) { - fprintf(stderr, Name - ": failed to stop array %s: %s\n", - devname, strerror(errno)); - if (errno == EBUSY) - fprintf(stderr, "Perhaps a running " - "process, mounted filesystem " - "or active volume group?\n"); + /* don't even try if disk is marked as faulty */ + errno = 0; + if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) { + if (verbose >= 0) + pr_err("re-added %s\n", dv->devname); + return 1; + } + if (errno == ENOMEM || errno == EROFS) { + pr_err("add new device failed for %s: %s\n", + dv->devname, strerror(errno)); + if (dv->disposition == 'M') + return 0; + return -1; + } + } +skip_re_add: + return 0; +} + +int Manage_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *tst, mdu_array_info_t *array, + int force, int verbose, char *devname, + char *update, unsigned long rdev, unsigned long long array_size) +{ + unsigned long long ldsize; + struct supertype *dev_st = NULL; + int j; + mdu_disk_info_t disc; + + if (!get_dev_size(tfd, dv->devname, &ldsize)) { + if 
(dv->disposition == 'M') + return 0; + else + return -1; + } + + if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) { + /* More than 4TB is wasted on v0.90 */ + if (!force) { + pr_err("%s is larger than %s can " + "effectively use.\n" + " Add --force is you " + "really want to add this device.\n", + dv->devname, devname); + return -1; + } + pr_err("%s is larger than %s can " + "effectively use.\n" + " Adding anyway as --force " + "was given.\n", + dv->devname, devname); + } + if (!tst->ss->external && + array->major_version == 0 && + md_get_version(fd)%100 < 2) { + if (ioctl(fd, HOT_ADD_DISK, rdev)==0) { + if (verbose >= 0) + pr_err("hot added %s\n", + dv->devname); + return 1; + } + + pr_err("hot add failed for %s: %s\n", + dv->devname, strerror(errno)); + return -1; + } + + if (array->not_persistent == 0 || tst->ss->external) { + + /* need to find a sample superblock to copy, and + * a spare slot to use. + * For 'external' array (well, container based), + * We can just load the metadata for the array-> + */ + int array_failed; + if (tst->sb) + /* already loaded */; + else if (tst->ss->external) { + tst->ss->load_container(tst, fd, NULL); + } else for (j = 0; j < tst->max_devs; j++) { + char *dev; + int dfd; + disc.number = j; + if (ioctl(fd, GET_DISK_INFO, &disc)) + continue; + if (disc.major==0 && disc.minor==0) + continue; + if ((disc.state & 4)==0) /* sync */ + continue; + /* Looks like a good device to try */ + dev = map_dev(disc.major, disc.minor, 1); + if (!dev) + continue; + dfd = dev_open(dev, O_RDONLY); + if (dfd < 0) + continue; + if (tst->ss->load_super(tst, dfd, + NULL)) { + close(dfd); + continue; + } + close(dfd); + break; } - rv = 1; - goto out; + /* FIXME this is a bad test to be using */ + if (!tst->sb && dv->disposition != 'a') { + /* we are re-adding a device to a + * completely dead array - have to depend + * on kernel to check + */ + } else if (!tst->sb) { + pr_err("cannot load array metadata from %s\n", devname); + return -1; + } + 
+ /* Make sure device is large enough */ + if (tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) < + array_size) { + if (dv->disposition == 'M') + return 0; + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; } - /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array - * was stopped, so We'll do it here just to be sure. Drop any - * partitions as well... + + /* Possibly this device was recently part of + * the array and was temporarily removed, and + * is now being re-added. If so, we can + * simply re-add it. */ - if (fd >= 0) - ioctl(fd, BLKRRPART, 0); - if (mdi) - sysfs_uevent(mdi, "change"); - - - if (devnum != NoMdDev && - (stat("/dev/.udev", &stb) != 0 || - check_env("MDADM_NO_UDEV"))) { - struct map_ent *mp = map_by_devnum(&map, devnum); - remove_devices(devnum, mp ? mp->path : NULL); + + if (array->not_persistent==0) { + dev_st = dup_super(tst); + dev_st->ss->load_super(dev_st, tfd, NULL); + } + if (dev_st && dev_st->sb) { + int rv = attempt_re_add(fd, tfd, dv, + dev_st, tst, + rdev, + update, devname, + verbose, + array); + dev_st->ss->free_super(dev_st); + if (rv) + return rv; } + if (dv->disposition == 'M') { + if (verbose > 0) + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return 0; + } + if (dv->disposition == 'A') { + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return -1; + } + if (array->active_disks < array->raid_disks) { + char *avail = xcalloc(array->raid_disks, 1); + int d; + int found = 0; + for (d = 0; d < MAX_DISKS && found < array->active_disks; d++) { + disc.number = d; + if (ioctl(fd, GET_DISK_INFO, &disc)) + continue; + if (disc.major == 0 && disc.minor == 0) + continue; + if (!(disc.state & (1<level, array->raid_disks, + array->layout, 1, avail); + } else + array_failed = 0; + if (array_failed) { + pr_err("%s has failed so using --add cannot work and might destroy\n", + devname); + pr_err("data on %s. 
You should stop the array and re-assemble it.\n", + dv->devname); + return -1; + } + } else { + /* non-persistent. Must ensure that new drive + * is at least array->size big. + */ + if (ldsize/512 < array_size) { + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; + } + } + /* committed to really trying this device now*/ + remove_partitions(tfd); - if (quiet <= 0) - fprintf(stderr, Name ": stopped %s\n", devname); - map_lock(&map); - map_remove(&map, devnum); - map_unlock(&map); - out: - if (mdi) - sysfs_free(mdi); + /* in 2.6.17 and earlier, version-1 superblocks won't + * use the number we write, but will choose a free number. + * we must choose the same free number, which requires + * starting at 'raid_disks' and counting up + */ + for (j = array->raid_disks; j < tst->max_devs; j++) { + disc.number = j; + if (ioctl(fd, GET_DISK_INFO, &disc)) + break; + if (disc.major==0 && disc.minor==0) + break; + if (disc.state & 8) /* removed */ + break; } - return rv; + disc.major = major(rdev); + disc.minor = minor(rdev); + disc.number =j; + disc.state = 0; + if (array->not_persistent==0) { + int dfd; + if (dv->writemostly == 1) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) + return -1; + if (tst->ss->write_init_super(tst)) + return -1; + } else if (dv->disposition == 'A') { + /* this had better be raid1. + * As we are "--re-add"ing we must find a spare slot + * to fill. 
+ */ + char *used = xcalloc(array->raid_disks, 1); + for (j = 0; j < tst->max_devs; j++) { + mdu_disk_info_t disc2; + disc2.number = j; + if (ioctl(fd, GET_DISK_INFO, &disc2)) + continue; + if (disc2.major==0 && disc2.minor==0) + continue; + if (disc2.state & 8) /* removed */ + continue; + if (disc2.raid_disk < 0) + continue; + if (disc2.raid_disk > array->raid_disks) + continue; + used[disc2.raid_disk] = 1; + } + for (j = 0 ; j < array->raid_disks; j++) + if (!used[j]) { + disc.raid_disk = j; + disc.state |= (1<writemostly == 1) + disc.state |= (1 << MD_DISK_WRITEMOSTLY); + if (tst->ss->external) { + /* add a disk + * to an external metadata container */ + struct mdinfo new_mdi; + struct mdinfo *sra; + int container_fd; + char devnm[32]; + int dfd; + + strcpy(devnm, fd2devnm(fd)); + + container_fd = open_dev_excl(devnm); + if (container_fd < 0) { + pr_err("add failed for %s:" + " could not get exclusive access to container\n", + dv->devname); + tst->ss->free_super(tst); + return -1; + } + + Kill(dv->devname, NULL, 0, -1, 0); + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (mdmon_running(tst->container_devnm)) + tst->update_tail = &tst->updates; + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) { + close(dfd); + close(container_fd); + return -1; + } + if (tst->update_tail) + flush_metadata_updates(tst); + else + tst->ss->sync_metadata(tst); + + sra = sysfs_read(container_fd, NULL, 0); + if (!sra) { + pr_err("add failed for %s: sysfs_read failed\n", + dv->devname); + close(container_fd); + tst->ss->free_super(tst); + return -1; + } + sra->array.level = LEVEL_CONTAINER; + /* Need to set data_offset and component_size */ + tst->ss->getinfo_super(tst, &new_mdi, NULL); + new_mdi.disk.major = disc.major; + new_mdi.disk.minor = disc.minor; + new_mdi.recovery_start = 0; + /* Make sure fds are closed as they are O_EXCL which + * would block add_disk */ + tst->ss->free_super(tst); + if (sysfs_add_disk(sra, &new_mdi, 0) != 0) { + 
pr_err("add new device to external metadata" + " failed for %s\n", dv->devname); + close(container_fd); + sysfs_free(sra); + return -1; + } + ping_monitor(devnm); + sysfs_free(sra); + close(container_fd); + } else { + tst->ss->free_super(tst); + if (ioctl(fd, ADD_NEW_DISK, &disc)) { + pr_err("add new device failed for %s as %d: %s\n", + dv->devname, j, strerror(errno)); + return -1; + } + } + if (verbose >= 0) + pr_err("added %s\n", dv->devname); + return 1; } -int Manage_resize(char *devname, int fd, long long size, int raid_disks) +int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv, + int sysfd, unsigned long rdev, int verbose, char *devname) { - mdu_array_info_t info; - if (ioctl(fd, GET_ARRAY_INFO, &info) != 0) { - fprintf(stderr, Name ": Cannot get array information for %s: %s\n", - devname, strerror(errno)); + int lfd = -1; + int err; + + if (tst->ss->external) { + /* To remove a device from a container, we must + * check that it isn't in use in an array. + * This involves looking in the 'holders' + * directory - there must be just one entry, + * the container. + * To ensure that it doesn't get used as a + * hot spare while we are checking, we + * get an O_EXCL open on the container + */ + int ret; + char devnm[32]; + strcpy(devnm, fd2devnm(fd)); + lfd = open_dev_excl(devnm); + if (lfd < 0) { + pr_err("Cannot get exclusive access " + " to container - odd\n"); + return -1; + } + /* We may not be able to check on holders in + * sysfs, either because we don't have the dev num + * (rdev == 0) or because the device has been detached + * and the 'holders' directory no longer exists + * (ret == -1). In that case, assume it is OK to + * remove. 
+ */ + if (rdev == 0) + ret = -1; + else + ret = sysfs_unique_holder(devnm, rdev); + if (ret == 0) { + pr_err("%s is not a member, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + if (ret >= 2) { + pr_err("%s is still in use, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + } + /* FIXME check that it is a current member */ + if (sysfd >= 0) { + /* device has been removed and we don't know + * the major:minor number + */ + int n = write(sysfd, "remove", 6); + if (n != 6) + err = -1; + else + err = 0; + } else { + err = ioctl(fd, HOT_REMOVE_DISK, rdev); + if (err && errno == ENODEV) { + /* Old kernels rejected this if no personality + * is registered */ + struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS); + struct mdinfo *dv = NULL; + if (sra) + dv = sra->devs; + for ( ; dv ; dv=dv->next) + if (dv->disk.major == (int)major(rdev) && + dv->disk.minor == (int)minor(rdev)) + break; + if (dv) + err = sysfs_set_str(sra, dv, + "state", "remove"); + else + err = -1; + if (sra) + sysfs_free(sra); + } + } + if (err) { + pr_err("hot remove failed " + "for %s: %s\n", dv->devname, + strerror(errno)); + if (lfd >= 0) + close(lfd); + return -1; + } + if (tst->ss->external) { + /* + * Before dropping our exclusive open we make an + * attempt at preventing mdmon from seeing an + * 'add' event before reconciling this 'remove' + * event. 
+ */ + char *devnm = fd2devnm(fd); + + if (!devnm) { + pr_err("unable to get container name\n"); + return -1; + } + + ping_manager(devnm); + } + if (lfd >= 0) + close(lfd); + if (verbose >= 0) + pr_err("hot removed %s from %s\n", + dv->devname, devname); + return 1; +} + +int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv, + unsigned long rdev, int verbose, char *devname) +{ + struct mdinfo *mdi, *di; + if (tst->ss->external) { + pr_err("--replace only supported for native metadata (0.90 or 1.x)\n"); + return -1; + } + /* Need to find the device in sysfs and add 'want_replacement' to the + * status. + */ + mdi = sysfs_read(fd, NULL, GET_DEVS); + if (!mdi || !mdi->devs) { + pr_err("Cannot find status of %s to enable replacement - strange\n", + devname); + return -1; + } + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (int)major(rdev) && + di->disk.minor == (int)minor(rdev)) + break; + if (di) { + int rv; + if (di->disk.raid_disk < 0) { + pr_err("%s is not active and so cannot be replaced.\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + rv = sysfs_set_str(mdi, di, + "state", "want_replacement"); + if (rv) { + sysfs_free(mdi); + pr_err("Failed to request replacement for %s\n", + dv->devname); + return -1; + } + if (verbose >= 0) + pr_err("Marked %s (device %d in %s) for replacement\n", + dv->devname, di->disk.raid_disk, devname); + /* If there is a matching 'with', we need to tell it which + * raid disk + */ + while (dv && dv->disposition != 'W') + dv = dv->next; + if (dv) { + dv->disposition = 'w'; + dv->used = di->disk.raid_disk; + } return 1; } - if (size >= 0) - info.size = size; - if (raid_disks > 0) - info.raid_disks = raid_disks; - if (ioctl(fd, SET_ARRAY_INFO, &info) != 0) { - fprintf(stderr, Name ": Cannot set device size/shape for %s: %s\n", - devname, strerror(errno)); + sysfs_free(mdi); + pr_err("%s not found in %s so cannot --replace it\n", + dv->devname, devname); + return -1; +} + +int Manage_with(struct 
supertype *tst, int fd, struct mddev_dev *dv, + unsigned long rdev, int verbose, char *devname) +{ + struct mdinfo *mdi, *di; + /* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */ + mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE); + if (!mdi || !mdi->devs) { + pr_err("Cannot find status of %s to enable replacement - strange\n", + devname); + return -1; + } + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (int)major(rdev) && + di->disk.minor == (int)minor(rdev)) + break; + if (di) { + int rv; + if (di->disk.state & (1<devname); + sysfs_free(mdi); + return -1; + } + if (di->disk.raid_disk >= 0) { + pr_err("%s is active and cannot be a replacement\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + rv = sysfs_set_num(mdi, di, + "slot", dv->used); + if (rv) { + sysfs_free(mdi); + pr_err("Failed to set %s as preferred replacement.\n", + dv->devname); + return -1; + } + if (verbose >= 0) + pr_err("Marked %s in %s as replacement for device %d\n", + dv->devname, devname, dv->used); return 1; } - return 0; + sysfs_free(mdi); + pr_err("%s not found in %s so cannot make it preferred replacement\n", + dv->devname, devname); + return -1; } int Manage_subdevs(char *devname, int fd, struct mddev_dev *devlist, int verbose, int test, char *update, int force) { - /* do something to each dev. + /* Do something to each dev. * devmode can be * 'a' - add the device * try HOT_ADD_DISK * If that fails EINVAL, try ADD_NEW_DISK - * 'r' - remove the device HOT_REMOVE_DISK + * 'A' - re-add the device + * 'r' - remove the device: HOT_REMOVE_DISK * device can be 'faulty' or 'detached' in which case all * matching devices are removed. * 'f' - set the device faulty SET_DISK_FAULTY * device can be 'detached' in which case any device that * is inaccessible will be marked faulty. + * 'R' - mark this device as wanting replacement. + * 'W' - this device is added if necessary and activated as + * a replacement for a previous 'R' device. 
+ * ----- + * 'w' - 'W' will be changed to 'w' when it is paired with + * a 'R' device. If a 'W' is found while walking the list + * it must be unpaired, and is an error. + * 'M' - this is created by a 'missing' target. It is a slight + * variant on 'A' + * 'F' - Another variant of 'A', where the device was faulty + * so must be removed from the array first. + * * For 'f' and 'r', the device can also be a kernel-internal * name such as 'sdb'. */ - struct mddev_dev *add_devlist = NULL; mdu_array_info_t array; - mdu_disk_info_t disc; unsigned long long array_size; - struct mddev_dev *dv, *next = NULL; + struct mddev_dev *dv; struct stat stb; - int j, jnext = 0; int tfd = -1; - struct supertype *st, *tst; + struct supertype *tst; char *subarray = NULL; - int duuid[4]; - int ouuid[4]; - int lfd = -1; int sysfd = -1; int count = 0; /* number of actions taken */ struct mdinfo info; int frozen = 0; + int busy = 0; if (ioctl(fd, GET_ARRAY_INFO, &array)) { - fprintf(stderr, Name ": cannot get array info for %s\n", + pr_err("Cannot get array info for %s\n", devname); goto abort; } - sysfs_init(&info, fd, 0); + sysfs_init(&info, fd, NULL); - /* array.size is only 32 bit and may be truncated. + /* array.size is only 32 bits and may be truncated. 
* So read from sysfs if possible, and record number of sectors */ @@ -436,132 +1288,104 @@ tst = super_by_fd(fd, &subarray); if (!tst) { - fprintf(stderr, Name ": unsupport array - version %d.%d\n", + pr_err("unsupport array - version %d.%d\n", array.major_version, array.minor_version); goto abort; } stb.st_rdev = 0; - for (dv = devlist, j=0 ; dv; dv = next, j = jnext) { - unsigned long long ldsize; - char dvname[20]; - char *dnprintable = dv->devname; - char *add_dev = dv->devname; - int err; - int array_failed; - - next = dv->next; - jnext = 0; + for (dv = devlist; dv; dv = dv->next) { + int rv; - if (strcmp(dv->devname, "failed")==0 || - strcmp(dv->devname, "faulty")==0) { - int remaining_disks = array.nr_disks; - if (dv->disposition != 'r') { - fprintf(stderr, Name ": %s only meaningful " - "with -r, not -%c\n", + if (strcmp(dv->devname, "failed") == 0 || + strcmp(dv->devname, "faulty") == 0) { + if (dv->disposition != 'A' + && dv->disposition != 'r') { + pr_err("%s only meaningful " + "with -r or --re-add, not -%c\n", dv->devname, dv->disposition); goto abort; } - for (; j < MAX_DISKS && remaining_disks > 0; j++) { - unsigned dev; - disc.number = j; - if (ioctl(fd, GET_DISK_INFO, &disc)) - continue; - if (disc.major == 0 && disc.minor == 0) - continue; - remaining_disks --; - if ((disc.state & 1) == 0) /* faulty */ - continue; - dev = makedev(disc.major, disc.minor); - if (stb.st_rdev == dev) - /* already did that one */ - continue; - stb.st_rdev = dev; - next = dv; - /* same slot again next time - things might - * have reshuffled */ - jnext = j; - sprintf(dvname,"%d:%d", disc.major, disc.minor); - dnprintable = dvname; - break; - } - if (next != dv) - continue; - } else if (strcmp(dv->devname, "detached") == 0) { - int remaining_disks = array.nr_disks; + add_faulty(dv, fd, (dv->disposition == 'A' + ? 
'F' : 'r')); + continue; + } + if (strcmp(dv->devname, "detached") == 0) { if (dv->disposition != 'r' && dv->disposition != 'f') { - fprintf(stderr, Name ": %s only meaningful " + pr_err("%s only meaningful " "with -r of -f, not -%c\n", dv->devname, dv->disposition); goto abort; } - for (; j < MAX_DISKS && remaining_disks > 0; j++) { - int sfd; - unsigned dev; - disc.number = j; - if (ioctl(fd, GET_DISK_INFO, &disc)) - continue; - if (disc.major == 0 && disc.minor == 0) - continue; - remaining_disks --; - sprintf(dvname,"%d:%d", disc.major, disc.minor); - sfd = dev_open(dvname, O_RDONLY); - if (sfd >= 0) { - close(sfd); - continue; - } - if (dv->disposition == 'f' && - (disc.state & 1) == 1) /* already faulty */ - continue; - if (errno != ENXIO) - continue; - dev = makedev(disc.major, disc.minor); - if (stb.st_rdev == dev) - /* already did that one */ - continue; - stb.st_rdev = dev; - next = dv; - /* same slot again next time - things might - * have reshuffled */ - jnext = j; - dnprintable = dvname; - break; - } - if (next != dv) - continue; - } else if (strcmp(dv->devname, "missing") == 0) { - if (dv->disposition != 'a' || dv->re_add == 0) { - fprintf(stderr, Name ": 'missing' only meaningful " - "with --re-add\n"); + add_detached(dv, fd, dv->disposition); + continue; + } + + if (strcmp(dv->devname, "missing") == 0) { + struct mddev_dev *add_devlist = NULL; + struct mddev_dev **dp; + if (dv->disposition != 'A') { + pr_err("'missing' only meaningful " + "with --re-add\n"); goto abort; } - if (add_devlist == NULL) - add_devlist = conf_get_devs(); + add_devlist = conf_get_devs(); if (add_devlist == NULL) { - fprintf(stderr, Name ": no devices to scan for missing members."); + pr_err("no devices to scan for missing members."); continue; } - add_dev = add_devlist->devname; - add_devlist = add_devlist->next; - if (add_devlist != NULL) - next = dv; - if (stat(add_dev, &stb) < 0) - continue; - } else if (strchr(dv->devname, '/') == NULL && - strchr(dv->devname, ':') == 
NULL && - strlen(dv->devname) < 50) { + for (dp = &add_devlist; *dp; dp = & (*dp)->next) + /* 'M' (for 'missing') is like 'A' without errors */ + (*dp)->disposition = 'M'; + *dp = dv->next; + dv->next = add_devlist; + continue; + } + + if (strncmp(dv->devname, "set-", 4) == 0 && + strlen(dv->devname) == 5) { + int copies; + + if (dv->disposition != 'r' && + dv->disposition != 'f') { + pr_err("'%s' only meaningful with -r or -f\n", + dv->devname); + goto abort; + } + if (array.level != 10) { + pr_err("'%s' only meaningful with RAID10 arrays\n", + dv->devname); + goto abort; + } + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies != 0 || + dv->devname[4] < 'A' || + dv->devname[4] >= 'A' + copies || + copies > 26) { + pr_err("'%s' not meaningful with this array\n", + dv->devname); + goto abort; + } + add_set(dv, fd, dv->devname[4]); + continue; + } + + if (strchr(dv->devname, '/') == NULL && + strchr(dv->devname, ':') == NULL && + strlen(dv->devname) < 50) { /* Assume this is a kernel-internal name like 'sda1' */ int found = 0; char dname[55]; if (dv->disposition != 'r' && dv->disposition != 'f') { - fprintf(stderr, Name ": %s only meaningful " + pr_err("%s only meaningful " "with -r or -f, not -%c\n", dv->devname, dv->disposition); goto abort; } sprintf(dname, "dev-%s", dv->devname); - sysfd = sysfs_open(fd2devnum(fd), dname, "block/dev"); + sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev"); if (sysfd >= 0) { char dn[20]; int mj,mn; @@ -574,61 +1398,82 @@ sysfd = -1; } if (!found) { - sysfd = sysfs_open(fd2devnum(fd), dname, "state"); + sysfd = sysfs_open(fd2devnm(fd), dname, "state"); if (sysfd < 0) { - fprintf(stderr, Name ": %s does not appear " + pr_err("%s does not appear " "to be a component of %s\n", dv->devname, devname); goto abort; } } } else { - j = 0; - tfd = dev_open(dv->devname, O_RDONLY); - if (tfd < 0 && dv->disposition == 'r' && - lstat(dv->devname, &stb) == 0) - /* Be happy, the lstat worked, that 
is - * enough for --remove - */ - ; + if (tfd >= 0) + fstat(tfd, &stb); else { - if (tfd < 0 || fstat(tfd, &stb) != 0) { - fprintf(stderr, Name ": cannot find %s: %s\n", - dv->devname, strerror(errno)); - if (tfd >= 0) - close(tfd); + int open_err = errno; + if (stat(dv->devname, &stb) != 0) { + pr_err("Cannot find %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + if (dv->disposition == 'M') + /* non-fatal. Also improbable */ + continue; + pr_err("%s is not a block device.\n", + dv->devname); + goto abort; + } + if (dv->disposition == 'r') + /* Be happy, the stat worked, that is + * enough for --remove + */ + ; + else { + if (dv->disposition == 'M') + /* non-fatal */ + continue; + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(open_err)); goto abort; } - close(tfd); - tfd = -1; - } - if ((stb.st_mode & S_IFMT) != S_IFBLK) { - fprintf(stderr, Name ": %s is not a " - "block device.\n", - dv->devname); - goto abort; } } switch(dv->disposition){ default: - fprintf(stderr, Name ": internal error - devmode[%s]=%d\n", + pr_err("internal error - devmode[%s]=%d\n", dv->devname, dv->disposition); goto abort; case 'a': + case 'A': + case 'M': /* --re-add missing */ + case 'F': /* --re-add faulty */ /* add the device */ if (subarray) { - fprintf(stderr, Name ": Cannot add disks to a" + pr_err("Cannot add disks to a" " \'member\' array, perform this" " operation on the parent container\n"); goto abort; } + if (dv->disposition == 'F') + /* Need to remove first */ + ioctl(fd, HOT_REMOVE_DISK, + (unsigned long)stb.st_rdev); /* Make sure it isn't in use (in 2.6 or later) */ - tfd = dev_open(add_dev, O_RDONLY|O_EXCL|O_DIRECT); - if (tfd < 0 && add_dev != dv->devname) - continue; + tfd = dev_open(dv->devname, O_RDONLY|O_EXCL); + if (tfd >= 0) { + /* We know no-one else is using it. We'll + * need non-exclusive access to add it, so + * do that now. 
+ */ + close(tfd); + tfd = dev_open(dv->devname, O_RDONLY); + } if (tfd < 0) { - fprintf(stderr, Name ": Cannot open %s: %s\n", + if (dv->disposition == 'M') + continue; + pr_err("Cannot open %s: %s\n", dv->devname, strerror(errno)); goto abort; } @@ -638,537 +1483,35 @@ else frozen = -1; } - - st = dup_super(tst); - - if (array.not_persistent==0) - st->ss->load_super(st, tfd, NULL); - - if (add_dev == dv->devname) { - if (!get_dev_size(tfd, dv->devname, &ldsize)) { - st->ss->free_super(st); - close(tfd); - goto abort; - } - } else if (!get_dev_size(tfd, NULL, &ldsize)) { - st->ss->free_super(st); - close(tfd); - tfd = -1; - continue; - } - - if (tst->ss->validate_geometry( - tst, array.level, array.layout, - array.raid_disks, NULL, - ldsize >> 9, NULL, NULL, 0) == 0) { - if (!force) { - fprintf(stderr, Name - ": %s is larger than %s can " - "effectively use.\n" - " Add --force is you " - "really want to add this device.\n", - add_dev, devname); - st->ss->free_super(st); - close(tfd); - goto abort; - } - fprintf(stderr, Name - ": %s is larger than %s can " - "effectively use.\n" - " Adding anyway as --force " - "was given.\n", - add_dev, devname); - } - if (!tst->ss->external && - array.major_version == 0 && - md_get_version(fd)%100 < 2) { - close(tfd); - st->ss->free_super(st); - tfd = -1; - if (ioctl(fd, HOT_ADD_DISK, - (unsigned long)stb.st_rdev)==0) { - if (verbose >= 0) - fprintf(stderr, Name ": hot added %s\n", - add_dev); - continue; - } - - fprintf(stderr, Name ": hot add failed for %s: %s\n", - add_dev, strerror(errno)); + rv = Manage_add(fd, tfd, dv, tst, &array, + force, verbose, devname, update, + stb.st_rdev, array_size); + close(tfd); + tfd = -1; + if (rv < 0) goto abort; - } - - if (array.not_persistent == 0 || tst->ss->external) { - - /* need to find a sample superblock to copy, and - * a spare slot to use. - * For 'external' array (well, container based), - * We can just load the metadata for the array. 
- */ - if (tst->sb) - /* already loaded */; - else if (tst->ss->external) { - tst->ss->load_container(tst, fd, NULL); - } else for (j = 0; j < tst->max_devs; j++) { - char *dev; - int dfd; - disc.number = j; - if (ioctl(fd, GET_DISK_INFO, &disc)) - continue; - if (disc.major==0 && disc.minor==0) - continue; - if ((disc.state & 4)==0) continue; /* sync */ - /* Looks like a good device to try */ - dev = map_dev(disc.major, disc.minor, 1); - if (!dev) continue; - dfd = dev_open(dev, O_RDONLY); - if (dfd < 0) continue; - if (tst->ss->load_super(tst, dfd, - NULL)) { - close(dfd); - continue; - } - close(dfd); - break; - } - /* FIXME this is a bad test to be using */ - if (!tst->sb && - dv->re_add) { - /* we are re-adding a device to a - * completely dead array - have to depend - * on kernel to check - */ - } else if (!tst->sb) { - close(tfd); - st->ss->free_super(st); - fprintf(stderr, Name ": cannot load array metadata from %s\n", devname); - goto abort; - } - - /* Make sure device is large enough */ - if (tst->ss->avail_size(tst, ldsize/512) < - array_size) { - close(tfd); - tfd = -1; - st->ss->free_super(st); - if (add_dev != dv->devname) - continue; - fprintf(stderr, Name ": %s not large enough to join array\n", - dv->devname); - goto abort; - } - - /* Possibly this device was recently part of the array - * and was temporarily removed, and is now being re-added. - * If so, we can simply re-add it. 
- */ - - if (st->sb) { - struct mdinfo mdi; - st->ss->getinfo_super(st, &mdi, NULL); - st->ss->uuid_from_super(st, ouuid); - if (tst->sb) - tst->ss->uuid_from_super(tst, duuid); - else - /* Assume uuid matches: kernel will check */ - memcpy(duuid, ouuid, sizeof(ouuid)); - if ((mdi.disk.state & (1<writemostly == 1) - disc.state |= 1 << MD_DISK_WRITEMOSTLY; - if (dv->writemostly == 2) - disc.state &= ~(1 << MD_DISK_WRITEMOSTLY); - remove_partitions(tfd); - close(tfd); - tfd = -1; - if (update || dv->writemostly > 0) { - int rv = -1; - tfd = dev_open(dv->devname, O_RDWR); - if (tfd < 0) { - fprintf(stderr, Name ": failed to open %s for" - " superblock update during re-add\n", dv->devname); - st->ss->free_super(st); - goto abort; - } - - if (dv->writemostly == 1) - rv = st->ss->update_super( - st, NULL, "writemostly", - devname, verbose, 0, NULL); - if (dv->writemostly == 2) - rv = st->ss->update_super( - st, NULL, "readwrite", - devname, verbose, 0, NULL); - if (update) - rv = st->ss->update_super( - st, NULL, update, - devname, verbose, 0, NULL); - if (rv == 0) - rv = st->ss->store_super(st, tfd); - close(tfd); - tfd = -1; - if (rv != 0) { - fprintf(stderr, Name ": failed to update" - " superblock during re-add\n"); - st->ss->free_super(st); - goto abort; - } - } - /* don't even try if disk is marked as faulty */ - errno = 0; - if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) { - if (verbose >= 0) - fprintf(stderr, Name ": re-added %s\n", add_dev); - count++; - st->ss->free_super(st); - continue; - } - if (errno == ENOMEM || errno == EROFS) { - fprintf(stderr, Name ": add new device failed for %s: %s\n", - add_dev, strerror(errno)); - st->ss->free_super(st); - if (add_dev != dv->devname) - continue; - goto abort; - } - } - skip_re_add: - st->ss->free_super(st); - } - if (add_dev != dv->devname) { - if (verbose > 0) - fprintf(stderr, Name - ": --re-add for %s to %s is not possible\n", - add_dev, devname); - if (tfd >= 0) { - close(tfd); - tfd = -1; - } - continue; - } - if 
(dv->re_add) { - if (tfd >= 0) - close(tfd); - fprintf(stderr, Name - ": --re-add for %s to %s is not possible\n", - dv->devname, devname); - goto abort; - } - if (array.active_disks < array.raid_disks) { - char *avail = calloc(array.raid_disks, 1); - int d; - int found = 0; - - for (d = 0; d < MAX_DISKS && found < array.active_disks; d++) { - disc.number = d; - if (ioctl(fd, GET_DISK_INFO, &disc)) - continue; - if (disc.major == 0 && disc.minor == 0) - continue; - if (!(disc.state & (1<devname); - if (tfd >= 0) - close(tfd); - goto abort; - } - } else { - /* non-persistent. Must ensure that new drive - * is at least array.size big. - */ - if (ldsize/512 < array_size) { - fprintf(stderr, Name ": %s not large enough to join array\n", - dv->devname); - if (tfd >= 0) - close(tfd); - goto abort; - } - } - /* committed to really trying this device now*/ - if (tfd >= 0) { - remove_partitions(tfd); - close(tfd); - tfd = -1; - } - /* in 2.6.17 and earlier, version-1 superblocks won't - * use the number we write, but will choose a free number. - * we must choose the same free number, which requires - * starting at 'raid_disks' and counting up - */ - for (j = array.raid_disks; j< tst->max_devs; j++) { - disc.number = j; - if (ioctl(fd, GET_DISK_INFO, &disc)) - break; - if (disc.major==0 && disc.minor==0) - break; - if (disc.state & 8) /* removed */ - break; - } - disc.major = major(stb.st_rdev); - disc.minor = minor(stb.st_rdev); - disc.number =j; - disc.state = 0; - if (array.not_persistent==0) { - int dfd; - if (dv->writemostly == 1) - disc.state |= 1 << MD_DISK_WRITEMOSTLY; - dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); - if (tst->ss->add_to_super(tst, &disc, dfd, - dv->devname)) { - close(dfd); - goto abort; - } - if (tst->ss->write_init_super(tst)) { - close(dfd); - goto abort; - } - } else if (dv->re_add) { - /* this had better be raid1. - * As we are "--re-add"ing we must find a spare slot - * to fill. 
- */ - char *used = malloc(array.raid_disks); - memset(used, 0, array.raid_disks); - for (j=0; j< tst->max_devs; j++) { - mdu_disk_info_t disc2; - disc2.number = j; - if (ioctl(fd, GET_DISK_INFO, &disc2)) - continue; - if (disc2.major==0 && disc2.minor==0) - continue; - if (disc2.state & 8) /* removed */ - continue; - if (disc2.raid_disk < 0) - continue; - if (disc2.raid_disk > array.raid_disks) - continue; - used[disc2.raid_disk] = 1; - } - for (j=0 ; jwritemostly == 1) - disc.state |= (1 << MD_DISK_WRITEMOSTLY); - if (tst->ss->external) { - /* add a disk - * to an external metadata container */ - struct mdinfo new_mdi; - struct mdinfo *sra; - int container_fd; - int devnum = fd2devnum(fd); - int dfd; - - container_fd = open_dev_excl(devnum); - if (container_fd < 0) { - fprintf(stderr, Name ": add failed for %s:" - " could not get exclusive access to container\n", - dv->devname); - tst->ss->free_super(tst); - goto abort; - } - - dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); - if (mdmon_running(tst->container_dev)) - tst->update_tail = &tst->updates; - if (tst->ss->add_to_super(tst, &disc, dfd, - dv->devname)) { - close(dfd); - close(container_fd); - goto abort; - } - if (tst->update_tail) - flush_metadata_updates(tst); - else - tst->ss->sync_metadata(tst); - - sra = sysfs_read(container_fd, -1, 0); - if (!sra) { - fprintf(stderr, Name ": add failed for %s: sysfs_read failed\n", - dv->devname); - close(container_fd); - tst->ss->free_super(tst); - goto abort; - } - sra->array.level = LEVEL_CONTAINER; - /* Need to set data_offset and component_size */ - tst->ss->getinfo_super(tst, &new_mdi, NULL); - new_mdi.disk.major = disc.major; - new_mdi.disk.minor = disc.minor; - new_mdi.recovery_start = 0; - /* Make sure fds are closed as they are O_EXCL which - * would block add_disk */ - tst->ss->free_super(tst); - if (sysfs_add_disk(sra, &new_mdi, 0) != 0) { - fprintf(stderr, Name ": add new device to external metadata" - " failed for %s\n", dv->devname); - 
close(container_fd); - sysfs_free(sra); - goto abort; - } - ping_monitor_by_id(devnum); - sysfs_free(sra); - close(container_fd); - } else { - tst->ss->free_super(tst); - if (ioctl(fd, ADD_NEW_DISK, &disc)) { - fprintf(stderr, Name ": add new device failed for %s as %d: %s\n", - dv->devname, j, strerror(errno)); - goto abort; - } - } - if (verbose >= 0) - fprintf(stderr, Name ": added %s\n", dv->devname); + if (rv > 0) + count++; break; case 'r': /* hot remove */ if (subarray) { - fprintf(stderr, Name ": Cannot remove disks from a" + pr_err("Cannot remove disks from a" " \'member\' array, perform this" " operation on the parent container\n"); - if (sysfd >= 0) - close(sysfd); - goto abort; - } - if (tst->ss->external) { - /* To remove a device from a container, we must - * check that it isn't in use in an array. - * This involves looking in the 'holders' - * directory - there must be just one entry, - * the container. - * To ensure that it doesn't get used as a - * hold spare while we are checking, we - * get an O_EXCL open on the container - */ - int dnum = fd2devnum(fd); - lfd = open_dev_excl(dnum); - if (lfd < 0) { - fprintf(stderr, Name - ": Cannot get exclusive access " - " to container - odd\n"); - if (sysfd >= 0) - close(sysfd); - goto abort; - } - /* in the detached case it is not possible to - * check if we are the unique holder, so just - * rely on the 'detached' checks - */ - if (strcmp(dv->devname, "detached") == 0 || - sysfd >= 0 || - sysfs_unique_holder(dnum, stb.st_rdev)) - /* pass */; - else { - fprintf(stderr, Name - ": %s is %s, cannot remove.\n", - dnprintable, - errno == EEXIST ? 
"still in use": - "not a member"); - close(lfd); - goto abort; - } - } - /* FIXME check that it is a current member */ - if (sysfd >= 0) { - /* device has been removed and we don't know - * the major:minor number - */ - int n = write(sysfd, "remove", 6); - if (n != 6) - err = -1; - else - err = 0; + rv = -1; + } else + rv = Manage_remove(tst, fd, dv, sysfd, + stb.st_rdev, verbose, + devname); + if (sysfd >= 0) close(sysfd); - sysfd = -1; - } else { - err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev); - if (err && errno == ENODEV) { - /* Old kernels rejected this if no personality - * registered */ - struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS); - struct mdinfo *dv = NULL; - if (sra) - dv = sra->devs; - for ( ; dv ; dv=dv->next) - if (dv->disk.major == (int)major(stb.st_rdev) && - dv->disk.minor == (int)minor(stb.st_rdev)) - break; - if (dv) - err = sysfs_set_str(sra, dv, - "state", "remove"); - else - err = -1; - if (sra) - sysfs_free(sra); - } - } - if (err) { - fprintf(stderr, Name ": hot remove failed " - "for %s: %s\n", dnprintable, - strerror(errno)); - if (lfd >= 0) - close(lfd); + sysfd = -1; + if (rv < 0) goto abort; - } - if (tst->ss->external) { - /* - * Before dropping our exclusive open we make an - * attempt at preventing mdmon from seeing an - * 'add' event before reconciling this 'remove' - * event. 
- */ - char *name = devnum2devname(fd2devnum(fd)); - - if (!name) { - fprintf(stderr, Name ": unable to get container name\n"); - goto abort; - } - - ping_manager(name); - free(name); - } - if (lfd >= 0) - close(lfd); - count++; - if (verbose >= 0) - fprintf(stderr, Name ": hot removed %s from %s\n", - dnprintable, devname); + if (rv > 0) + count++; break; case 'f': /* set faulty */ @@ -1176,8 +1519,10 @@ if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) || (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY, (unsigned long) stb.st_rdev))) { - fprintf(stderr, Name ": set device faulty failed for %s: %s\n", - dnprintable, strerror(errno)); + if (errno == EBUSY) + busy = 1; + pr_err("set device faulty failed for %s: %s\n", + dv->devname, strerror(errno)); if (sysfd >= 0) close(sysfd); goto abort; @@ -1187,8 +1532,40 @@ sysfd = -1; count++; if (verbose >= 0) - fprintf(stderr, Name ": set %s faulty in %s\n", - dnprintable, devname); + pr_err("set %s faulty in %s\n", + dv->devname, devname); + break; + case 'R': /* Mark as replaceable */ + if (subarray) { + pr_err("Cannot replace disks in a" + " \'member\' array, perform this" + " operation on the parent container\n"); + rv = -1; + } else { + if (!frozen) { + if (sysfs_freeze_array(&info) == 1) + frozen = 1; + else + frozen = -1; + } + rv = Manage_replace(tst, fd, dv, + stb.st_rdev, verbose, + devname); + } + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + case 'W': /* --with device that doesn't match */ + pr_err("No matching --replace device for --with %s\n", + dv->devname); + goto abort; + case 'w': /* --with device which was matched */ + rv = Manage_with(tst, fd, dv, + stb.st_rdev, verbose, devname); + if (rv < 0) + goto abort; break; } } @@ -1201,7 +1578,7 @@ abort: if (frozen > 0) sysfs_set_str(&info, NULL, "sync_action","idle"); - return 1; + return !test && busy ? 
2 : 1; } int autodetect(void) @@ -1217,43 +1594,41 @@ return rv; } -int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet) +int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int verbose) { struct supertype supertype, *st = &supertype; int fd, rv = 2; memset(st, 0, sizeof(*st)); - fd = open_subarray(dev, subarray, st, quiet); + fd = open_subarray(dev, subarray, st, verbose < 0); if (fd < 0) return 2; if (!st->ss->update_subarray) { - if (!quiet) - fprintf(stderr, - Name ": Operation not supported for %s metadata\n", - st->ss->name); + if (verbose >= 0) + pr_err("Operation not supported for %s metadata\n", + st->ss->name); goto free_super; } - if (mdmon_running(st->devnum)) + if (mdmon_running(st->devnm)) st->update_tail = &st->updates; rv = st->ss->update_subarray(st, subarray, update, ident); if (rv) { - if (!quiet) - fprintf(stderr, Name ": Failed to update %s of subarray-%s in %s\n", + if (verbose >= 0) + pr_err("Failed to update %s of subarray-%s in %s\n", update, subarray, dev); } else if (st->update_tail) flush_metadata_updates(st); else st->ss->sync_metadata(st); - if (rv == 0 && strcmp(update, "name") == 0 && !quiet) - fprintf(stderr, - Name ": Updated subarray-%s name from %s, UUIDs may have changed\n", - subarray, dev); + if (rv == 0 && strcmp(update, "name") == 0 && verbose >= 0) + pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n", + subarray, dev); free_super: st->ss->free_super(st); @@ -1262,9 +1637,8 @@ return rv; } -/* Move spare from one array to another - * If adding to destination array fails - * add back to original array +/* Move spare from one array to another If adding to destination array fails + * add back to original array. 
* Returns 1 on success, 0 on failure */ int move_spare(char *from_devname, char *to_devname, dev_t devid) { @@ -1283,7 +1657,6 @@ devlist.next = NULL; devlist.used = 0; - devlist.re_add = 0; devlist.writemostly = 0; devlist.devname = devname; sprintf(devname, "%d:%d", major(devid), minor(devid)); diff -Nru mdadm-3.2.5/managemon.c mdadm-3.3/managemon.c --- mdadm-3.2.5/managemon.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/managemon.c 2013-09-03 04:47:47.000000000 +0000 @@ -134,7 +134,7 @@ /* Note that this doesn't close fds if they are being used * by a clone. ->container will be set for a clone */ - dprintf("%s: devnum: %d\n", __func__, aa->devnum); + dprintf("%s: sys_name: %s\n", __func__, aa->info.sys_name); if (!aa->container) close_aa(aa); while (aa->info.devs) { @@ -147,7 +147,7 @@ static struct active_array *duplicate_aa(struct active_array *aa) { - struct active_array *newa = malloc(sizeof(*newa)); + struct active_array *newa = xmalloc(sizeof(*newa)); struct mdinfo **dp1, **dp2; *newa = *aa; @@ -162,7 +162,7 @@ if ((*dp1)->state_fd < 0) continue; - d = malloc(sizeof(*d)); + d = xmalloc(sizeof(*d)); *d = **dp1; *dp2 = d; dp2 = & d->next; @@ -289,7 +289,7 @@ */ st2 = dup_super(st); if (st2->ss->load_super(st2, dfd, NULL) == 0) { - st2->ss->getinfo_super(st, &info, NULL); + st2->ss->getinfo_super(st2, &info, NULL); if (st->ss->compare_super(st, st2) == 0 && info.disk.raid_disk >= 0) { /* Looks like a good member of array. @@ -304,7 +304,7 @@ st2->ss->free_super(st2); st->update_tail = &update; - st->ss->add_to_super(st, &dk, dfd, NULL); + st->ss->add_to_super(st, &dk, dfd, NULL, INVALID_SECTORS); st->ss->write_init_super(st); queue_metadata_update(update); st->update_tail = NULL; @@ -343,7 +343,7 @@ struct supertype *container) { /* Of interest here are: - * - if a new device has been added to the container, we + * - if a new device has been added to the container, we * add it to the array ignoring any metadata on it. 
* - if a device has been removed from the container, we * remove it from the device list and update the metadata. @@ -359,7 +359,7 @@ * To see what is removed and what is added. * These need to be remove from, or added to, the array */ - mdi = sysfs_read(-1, mdstat->devnum, GET_DEVS); + mdi = sysfs_read(-1, mdstat->devnm, GET_DEVS); if (!mdi) { /* invalidate the current count so we can try again */ container->devcnt = -1; @@ -391,12 +391,8 @@ di->disk.minor == cd->disk.minor) break; if (!cd) { - struct mdinfo *newd = malloc(sizeof(*newd)); + struct mdinfo *newd = xmalloc(sizeof(*newd)); - if (!newd) { - container->devcnt = -1; - continue; - } *newd = *di; add_disk_to_container(container, newd); } @@ -413,10 +409,10 @@ return -1; *disk = *clone; - disk->recovery_fd = sysfs_open(aa->devnum, disk->sys_name, "recovery_start"); + disk->recovery_fd = sysfs_open(aa->info.sys_name, disk->sys_name, "recovery_start"); if (disk->recovery_fd < 0) return -1; - disk->state_fd = sysfs_open(aa->devnum, disk->sys_name, "state"); + disk->state_fd = sysfs_open(aa->info.sys_name, disk->sys_name, "state"); if (disk->state_fd < 0) { close(disk->recovery_fd); return -1; @@ -448,14 +444,20 @@ char buf[64]; int frozen; struct supertype *container = a->container; + unsigned long long int component_size = 0; if (container == NULL) /* Raced with something */ return; - // FIXME - a->info.array.raid_disks = mdstat->raid_disks; - // MORE + if (mdstat->active) { + // FIXME + a->info.array.raid_disks = mdstat->raid_disks; + // MORE + } + + if (sysfs_get_ll(&a->info, NULL, "component_size", &component_size) >= 0) + a->info.component_size = component_size << 1; /* honor 'frozen' */ if (sysfs_get_str(&a->info, NULL, "metadata_version", buf, sizeof(buf)) > 0) @@ -492,6 +494,11 @@ if (a->container == NULL) return; + if (sigterm && a->info.safe_mode_delay != 1) { + sysfs_set_safemode(&a->info, 1); + a->info.safe_mode_delay = 1; + } + /* We don't check the array while any update is pending, as it * might 
container a change (such as a spare assignment) which * could affect our decisions. @@ -518,6 +525,7 @@ /* prevent the kernel from activating the disk(s) before we * finish adding them */ + dprintf("%s: freezing %s\n", __func__, a->info.sys_name); sysfs_set_str(&a->info, NULL, "sync_action", "frozen"); /* Add device to array and set offset/size/slot. @@ -525,9 +533,7 @@ for (d = newdev; d ; d = d->next) { struct mdinfo *newd; - newd = malloc(sizeof(*newd)); - if (!newd) - continue; + newd = xmalloc(sizeof(*newd)); if (sysfs_add_disk(&newa->info, d, 0) < 0) { free(newd); continue; @@ -536,8 +542,16 @@ } queue_metadata_update(updates); updates = NULL; + while (update_queue_pending || update_queue) { + check_update_queue(container); + usleep(15*1000); + } replace_array(container, a, newa); - sysfs_set_str(&a->info, NULL, "sync_action", "recover"); + if (sysfs_set_str(&a->info, NULL, "sync_action", "recover") + == 0) + newa->prev_action = recover; + dprintf("%s: recovery started on %s\n", __func__, + a->info.sys_name); out: while (newdev) { d = newdev->next; @@ -558,7 +572,7 @@ unsigned long long array_size; struct active_array *newa = NULL; a->check_reshape = 0; - info = sysfs_read(-1, mdstat->devnum, + info = sysfs_read(-1, mdstat->devnm, GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE); if (!info) goto out2; @@ -577,9 +591,7 @@ if (!newa) break; } - newd = malloc(sizeof(*newd)); - if (!newd) - continue; + newd = xmalloc(sizeof(*newd)); disk_init_and_add(newd, d, newa); } if (sysfs_get_ll(info, NULL, "array_size", &array_size) == 0 @@ -639,36 +651,29 @@ strcmp(mdstat->level, "linear") == 0) return; - mdi = sysfs_read(-1, mdstat->devnum, + mdi = sysfs_read(-1, mdstat->devnm, GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT| - GET_DEGRADED|GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE); - - new = malloc(sizeof(*new)); + GET_DEGRADED|GET_SAFEMODE| + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|GET_LAYOUT); - if (!new || !mdi) { - if (mdi) - sysfs_free(mdi); - if (new) - free(new); + if (!mdi) 
return; - } - memset(new, 0, sizeof(*new)); + new = xcalloc(1, sizeof(*new)); - new->devnum = mdstat->devnum; - strcpy(new->info.sys_name, devnum2devname(new->devnum)); + strcpy(new->info.sys_name, mdstat->devnm); new->prev_state = new->curr_state = new->next_state = inactive; new->prev_action= new->curr_action= new->next_action= idle; new->container = container; - inst = to_subarray(mdstat, container->devname); + inst = to_subarray(mdstat, container->devnm); new->info.array = mdi->array; new->info.component_size = mdi->component_size; for (i = 0; i < new->info.array.raid_disks; i++) { - struct mdinfo *newd = malloc(sizeof(*newd)); + struct mdinfo *newd = xmalloc(sizeof(*newd)); for (di = mdi->devs; di; di = di->next) if (i == di->disk.raid_disk) @@ -687,14 +692,24 @@ } } - new->action_fd = sysfs_open(new->devnum, NULL, "sync_action"); - new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state"); - new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start"); - new->metadata_fd = sysfs_open(new->devnum, NULL, "metadata_version"); - new->sync_completed_fd = sysfs_open(new->devnum, NULL, "sync_completed"); + new->action_fd = sysfs_open(new->info.sys_name, NULL, "sync_action"); + new->info.state_fd = sysfs_open(new->info.sys_name, NULL, "array_state"); + new->resync_start_fd = sysfs_open(new->info.sys_name, NULL, "resync_start"); + new->metadata_fd = sysfs_open(new->info.sys_name, NULL, "metadata_version"); + new->sync_completed_fd = sysfs_open(new->info.sys_name, NULL, "sync_completed"); dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst), new->action_fd, new->info.state_fd); + if (sigterm) + new->info.safe_mode_delay = 1; + else if (mdi->safe_mode_delay >= 50) + /* Normal start, mdadm set this. 
*/ + new->info.safe_mode_delay = mdi->safe_mode_delay; + else + /* Restart, just pick a number */ + new->info.safe_mode_delay = 5000; + sysfs_set_safemode(&new->info, new->info.safe_mode_delay); + /* reshape_position is set by mdadm in sysfs * read this information for new arrays only (empty victim) */ @@ -724,7 +739,7 @@ * manage this instance */ if (!aa_ready(new) || container->ss->open_new(container, new, inst) < 0) { - fprintf(stderr, "mdmon: failed to monitor %s\n", + pr_err("failed to monitor %s\n", mdstat->metadata_version); new->container = NULL; free_aa(new); @@ -746,16 +761,16 @@ for ( ; mdstat ; mdstat = mdstat->next) { struct active_array *a; - if (mdstat->devnum == container->devnum) { + if (strcmp(mdstat->devnm, container->devnm) == 0) { manage_container(mdstat, container); continue; } - if (!is_container_member(mdstat, container->devname)) + if (!is_container_member(mdstat, container->devnm)) /* Not for this array */ continue; /* Looks like a member of this container */ for (a = container->arrays; a; a = a->next) { - if (mdstat->devnum == a->devnum) { + if (strcmp(mdstat->devnm, a->info.sys_name) == 0) { if (a->container && a->to_remove == 0) manage_member(mdstat, a); break; @@ -780,7 +795,7 @@ if (msg->len == 0) { /* ping_monitor */ int cnt; - + cnt = monitor_loop_cnt; if (cnt & 1) cnt += 2; /* wait until next pselect */ @@ -796,7 +811,7 @@ manage(mdstat, container); free_mdstat(mdstat); } else if (!sigterm) { - mu = malloc(sizeof(*mu)); + mu = xmalloc(sizeof(*mu)); mu->len = msg->len; mu->buf = msg->buf; msg->buf = NULL; diff -Nru mdadm-3.2.5/mapfile.c mdadm-3.3/mapfile.c --- mdadm-3.2.5/mapfile.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mapfile.c 2013-09-03 04:47:47.000000000 +0000 @@ -46,7 +46,6 @@ #include #include - #define MAP_READ 0 #define MAP_NEW 1 #define MAP_LOCK 2 @@ -88,10 +87,7 @@ for (; mel; mel = mel->next) { if (mel->bad) continue; - if (mel->devnum < 0) - fprintf(f, "mdp%d ", -1-mel->devnum); - else - fprintf(f, "md%d ", 
mel->devnum); + fprintf(f, "%s ", mel->devnm); fprintf(f, "%s ", mel->metadata); fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0], mel->uuid[1], mel->uuid[2], mel->uuid[3]); @@ -108,7 +104,6 @@ mapname[0]) == 0; } - static FILE *lf = NULL; int map_lock(struct map_ent **melp) { @@ -164,14 +159,14 @@ } void map_add(struct map_ent **melp, - int devnum, char *metadata, int uuid[4], char *path) + char * devnm, char *metadata, int uuid[4], char *path) { - struct map_ent *me = malloc(sizeof(*me)); + struct map_ent *me = xmalloc(sizeof(*me)); - me->devnum = devnum; + strcpy(me->devnm, devnm); strcpy(me->metadata, metadata); memcpy(me->uuid, uuid, 16); - me->path = path ? strdup(path) : NULL; + me->path = path ? xstrdup(path) : NULL; me->next = *melp; me->bad = 0; *melp = me; @@ -182,9 +177,9 @@ FILE *f; char buf[8192]; char path[200]; - int devnum, uuid[4]; + int uuid[4]; + char devnm[32]; char metadata[30]; - char nam[4]; *melp = NULL; @@ -198,14 +193,10 @@ while (fgets(buf, sizeof(buf), f)) { path[0] = 0; - if (sscanf(buf, " %3[mdp]%d %s %x:%x:%x:%x %200s", - nam, &devnum, metadata, uuid, uuid+1, + if (sscanf(buf, " %s %s %x:%x:%x:%x %200s", + devnm, metadata, uuid, uuid+1, uuid+2, uuid+3, path) >= 7) { - if (strncmp(nam, "md", 2) != 0) - continue; - if (nam[2] == 'p') - devnum = -1 - devnum; - map_add(melp, devnum, metadata, uuid, path); + map_add(melp, devnm, metadata, uuid, path); } } fclose(f); @@ -221,7 +212,7 @@ } } -int map_update(struct map_ent **mpp, int devnum, char *metadata, +int map_update(struct map_ent **mpp, char *devnm, char *metadata, int *uuid, char *path) { struct map_ent *map, *mp; @@ -233,16 +224,16 @@ map_read(&map); for (mp = map ; mp ; mp=mp->next) - if (mp->devnum == devnum) { + if (strcmp(mp->devnm, devnm) == 0) { strcpy(mp->metadata, metadata); memcpy(mp->uuid, uuid, 16); free(mp->path); - mp->path = path ? strdup(path) : NULL; + mp->path = path ? 
xstrdup(path) : NULL; mp->bad = 0; break; } if (!mp) - map_add(&map, devnum, metadata, uuid, path); + map_add(&map, devnm, metadata, uuid, path); if (mpp) *mpp = NULL; rv = map_write(map); @@ -250,7 +241,7 @@ return rv; } -void map_delete(struct map_ent **mapp, int devnum) +void map_delete(struct map_ent **mapp, char *devnm) { struct map_ent *mp; @@ -258,7 +249,7 @@ map_read(mapp); for (mp = *mapp; mp; mp = *mapp) { - if (mp->devnum == devnum) { + if (strcmp(mp->devnm, devnm) == 0) { *mapp = mp->next; free(mp->path); free(mp); @@ -267,12 +258,12 @@ } } -void map_remove(struct map_ent **mapp, int devnum) +void map_remove(struct map_ent **mapp, char *devnm) { - if (devnum == NoMdDev) + if (devnm[0] == 0) return; - map_delete(mapp, devnum); + map_delete(mapp, devnm); map_write(*mapp); map_free(*mapp); } @@ -286,7 +277,7 @@ for (mp = *map ; mp ; mp = mp->next) { if (memcmp(uuid, mp->uuid, 16) != 0) continue; - if (!mddev_busy(mp->devnum)) { + if (!mddev_busy(mp->devnm)) { mp->bad = 1; continue; } @@ -295,16 +286,16 @@ return NULL; } -struct map_ent *map_by_devnum(struct map_ent **map, int devnum) +struct map_ent *map_by_devnm(struct map_ent **map, char *devnm) { struct map_ent *mp; if (!*map) map_read(map); for (mp = *map ; mp ; mp = mp->next) { - if (mp->devnum != devnum) + if (strcmp(mp->devnm, devnm) != 0) continue; - if (!mddev_busy(mp->devnum)) { + if (!mddev_busy(mp->devnm)) { mp->bad = 1; continue; } @@ -326,7 +317,7 @@ continue; if (strcmp(mp->path+8, name) != 0) continue; - if (!mddev_busy(mp->devnum)) { + if (!mddev_busy(mp->devnm)) { mp->bad = 1; continue; } @@ -360,7 +351,6 @@ struct mdstat_ent *mdstat = mdstat_read(0, 0); struct mdstat_ent *md; struct map_ent *map = NULL; - int mdp = get_mdp_major(); int require_homehost; char sys_hostname[256]; char *homehost = conf_get_homehost(&require_homehost); @@ -373,7 +363,7 @@ } for (md = mdstat ; md ; md = md->next) { - struct mdinfo *sra = sysfs_read(-1, md->devnum, GET_DEVS); + struct mdinfo *sra = 
sysfs_read(-1, md->devnm, GET_DEVS); struct mdinfo *sd; if (!sra) @@ -384,6 +374,7 @@ char dn[30]; int dfd; int ok; + int devid; struct supertype *st; char *subarray = NULL; char *path; @@ -403,28 +394,31 @@ close(dfd); if (ok != 0) continue; - info = st->ss->container_content(st, subarray); + if (subarray) + info = st->ss->container_content(st, subarray); + else { + info = xmalloc(sizeof(*info)); + st->ss->getinfo_super(st, info, NULL); + } if (!info) continue; - if (md->devnum >= 0) - path = map_dev(MD_MAJOR, md->devnum, 0); - else - path = map_dev(mdp, (-1-md->devnum)<< 6, 0); + devid = devnm2devid(md->devnm); + path = map_dev(major(devid), minor(devid), 0); if (path == NULL || strncmp(path, "/dev/md/", 8) != 0) { /* We would really like a name that provides * an MD_DEVNAME for udev. * The name needs to be unique both in /dev/md/ * and in this mapfile. - * It needs to match watch -I or -As would come + * It needs to match what -I or -As would come * up with. * That means: - * Check if array is in mdadm.conf + * Check if array is in mdadm.conf * - if so use that. * determine trustworthy from homehost etc * find a unique name based on metadata name. 
- * + * */ struct mddev_ident *match = conf_match(st, info, NULL, 0, @@ -484,7 +478,7 @@ path = namebuf; } } - map_add(&map, md->devnum, + map_add(&map, md->devnm, info->text_version, info->uuid, path); st->ss->free_super(st); @@ -496,7 +490,7 @@ /* Only trigger a change if we wrote a new map file */ if (map_write(map)) for (md = mdstat ; md ; md = md->next) { - struct mdinfo *sra = sysfs_read(-1, md->devnum, + struct mdinfo *sra = sysfs_read(-1, md->devnm, GET_VERSION); if (sra) sysfs_uevent(sra, "change"); diff -Nru mdadm-3.2.5/maps.c mdadm-3.3/maps.c --- mdadm-3.2.5/maps.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/maps.c 2013-09-03 04:47:47.000000000 +0000 @@ -24,7 +24,6 @@ #include "mdadm.h" - /* name/number mappings */ mapping_t r5layout[] = { @@ -97,7 +96,6 @@ { NULL, 0} }; - mapping_t modes[] = { { "assemble", ASSEMBLE}, { "build", BUILD}, @@ -150,4 +148,3 @@ } return UnSet; } - diff -Nru mdadm-3.2.5/md.4 mdadm-3.3/md.4 --- mdadm-3.2.5/md.4 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/md.4 2013-09-03 04:47:47.000000000 +0000 @@ -78,7 +78,7 @@ .TP LEVEL The manner in which the devices are arranged into the array -(linear, raid0, raid1, raid4, raid5, raid10, multipath). +(LINEAR, RAID0, RAID1, RAID4, RAID5, RAID10, MULTIPATH). .TP UUID a 128 bit Universally Unique Identifier that identifies the array that @@ -101,7 +101,7 @@ LEGACY ARRAYS Early versions of the .B md -driver only supported Linear and Raid0 configurations and did not use +driver only supported LINEAR and RAID0 configurations and did not use a superblock (which is less critical with these configurations). While such arrays should be rebuilt with superblocks if possible, .B md @@ -118,7 +118,7 @@ a MULTIPATH array with no superblock makes sense. .TP RAID1 -In some configurations it might be desired to create a raid1 +In some configurations it might be desired to create a RAID1 configuration that does not use a superblock, and to maintain the state of the array elsewhere. 
While not encouraged for general use, it does have special-purpose uses and is supported. @@ -159,7 +159,7 @@ .SS LINEAR -A linear array simply catenates the available space on each +A LINEAR array simply catenates the available space on each drive to form one large virtual drive. One advantage of this arrangement over the more common RAID0 @@ -317,7 +317,7 @@ A MULTIPATH array is composed of a number of logically different devices, often fibre channel interfaces, that all refer the the same real device. If one of these interfaces fails (e.g. due to cable -problems), the multipath driver will attempt to redirect requests to +problems), the MULTIPATH driver will attempt to redirect requests to another interface. The MULTIPATH drive is not receiving any ongoing development and @@ -325,7 +325,7 @@ multipath drivers should be preferred for new installations. .SS FAULTY -The FAULTY md module is provided for testing purposes. A faulty array +The FAULTY md module is provided for testing purposes. A FAULTY array has exactly one component device and is normally assembled without a superblock, so the md array created provides direct access to all of the data in the component device. @@ -551,6 +551,34 @@ In 2.6.13, intent bitmaps are only supported with RAID1. Other levels with redundancy are supported from 2.6.15. +.SS BAD BLOCK LOG + +From Linux 3.5 each device in an +.I md +array can store a list of known-bad-blocks. This list is 4K in size +and usually positioned at the end of the space between the superblock +and the data. + +When a block cannot be read and cannot be repaired by writing data +recovered from other devices, the address of the block is stored in +the bad block log. Similarly if an attempt to write a block fails, +the address will be recorded as a bad block. If attempting to record +the bad block fails, the whole device will be marked faulty. + +Attempting to read from a known bad block will cause a read error. 
+Attempting to write to a known bad block will be ignored if any write +errors have been reported by the device. If there have been no write +errors then the data will be written to the known bad block and if +that succeeds, the address will be removed from the list. + +This allows an array to fail more gracefully - a few blocks on +different devices can be faulty without taking the whole array out of +action. + +The log is particularly useful when recovering to a spare. If a few blocks +cannot be read from the other devices, the bulk of the recovery can +complete and those few bad blocks will be recorded in the bad block log. + .SS WRITE-BEHIND From Linux 2.6.14, @@ -582,7 +610,7 @@ new layout. This might involve changing the number of devices in the array (so the stripes are wider), changing the chunk size (so stripes are deeper or shallower), or changing the arrangement of data and -parity (possibly changing the raid level, e.g. 1 to 5 or 5 to 6). +parity (possibly changing the RAID level, e.g. 1 to 5 or 5 to 6). As of Linux 2.6.35, md can reshape a RAID4, RAID5, or RAID6 array to have a different number of devices (more or fewer) and to have a @@ -593,7 +621,7 @@ During any stripe process there is a 'critical section' during which live data is being overwritten on disk. For the operation of -increasing the number of drives in a raid5, this critical section +increasing the number of drives in a RAID5, this critical section covers the first few stripes (the number being the product of the old and new number of devices). After this critical section is passed, data is only written to areas of the array which no longer hold live @@ -773,7 +801,7 @@ .I n gives the md device number, .I l -gives the level, 0 for RAID0 or -1 for LINEAR, +gives the level, 0 for RAID0 or \-1 for LINEAR, .I c gives the chunk size as a base-2 logarithm offset by twelve, so 0 means 4K, 1 means 8K. @@ -800,4 +828,3 @@ .SH SEE ALSO .BR mdadm (8), -.BR mkraid (8). 
diff -Nru mdadm-3.2.5/mdadm.8.in mdadm-3.3/mdadm.8.in --- mdadm-3.2.5/mdadm.8.in 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdadm.8.in 2013-09-03 04:47:47.000000000 +0000 @@ -5,7 +5,7 @@ .\" the Free Software Foundation; either version 2 of the License, or .\" (at your option) any later version. .\" See file COPYING in distribution for details. -.TH MDADM 8 "" v3.2.5 +.TH MDADM 8 "" v3.3 .SH NAME mdadm \- manage MD devices .I aka @@ -125,7 +125,7 @@ of component devices and changing the number of active devices in Linear and RAID levels 0/1/4/5/6, changing the RAID level between 0, 1, 5, and 6, and between 0 and 10, -changing the chunk size and layout for RAID 0,4,5,6, as well as adding or +changing the chunk size and layout for RAID 0,4,5,6,10 as well as adding or removing a write-intent bitmap. .TP @@ -216,8 +216,9 @@ If a device is given before any options, or if the first option is .BR \-\-add , .BR \-\-fail , -or .BR \-\-remove , +or +.BR \-\-replace , then the MANAGE mode is assumed. Anything other than these will cause the .B Misc @@ -254,17 +255,6 @@ .I mdadm will be silent unless there is something really important to report. -.TP -.BR \-\-offroot -Set first character of argv[0] to @ to indicate mdadm was launched -from initrd/initramfs and should not be shutdown by systemd as part of -the regular shutdown process. This option is normally only used by -the system's initscripts. Please see here for more details on how -systemd handled argv[0]: -.IP -.B http://www.freedesktop.org/wiki/Software/systemd/RootStorageDaemons -.PP - .TP .BR \-f ", " \-\-force @@ -273,16 +263,22 @@ .TP .BR \-c ", " \-\-config= -Specify the config file. Default is to use -.BR /etc/mdadm.conf , -or if that is missing then -.BR /etc/mdadm/mdadm.conf . +Specify the config file or directory. Default is to use +.B /etc/mdadm.conf +and +.BR /etc/mdadm.conf.d , +or if those are missing then +.B /etc/mdadm/mdadm.conf +and +.BR /etc/mdadm/mdadm.conf.d . 
If the config file given is .B "partitions" then nothing will be read, but .I mdadm will act as though the config file contained exactly -.B "DEVICE partitions containers" +.br +.B " DEVICE partitions containers" +.br and will read .B /proc/partitions to find a list of devices to scan, and @@ -294,6 +290,13 @@ .I mdadm will act as though the config file were empty. +If the name given is of a directory, then +.I mdadm +will collect all the files contained in the directory with a name ending +in +.BR .conf , +sort them lexically, and process all of those files as config files. + .TP .BR \-s ", " \-\-scan Scan config file or @@ -332,7 +335,6 @@ .IP "0, 0.90, default" .el .IP "0, 0.90" -.. Use the original 0.90 format superblock. This format limits arrays to 28 component devices and limits component devices of levels 1 and greater to 2 terabytes. It is also possible for there to be confusion @@ -342,7 +344,6 @@ .IP "1, 1.0, 1.1, 1.2" .el .IP "1, 1.0, 1.1, 1.2 default" -.. Use the new version-1 format superblock. This has fewer restrictions. It can easily be moved between hosts with different endian-ness, and a recovery operation can be checkpointed and restarted. The different @@ -698,6 +699,12 @@ Note: external bitmaps are only known to work on ext2 and ext3. Storing bitmap files on other filesystems may result in serious problems. +When creating an array on devices which are 100G or larger, +.I mdadm +automatically adds an internal bitmap as it will usually be +beneficial. This can be suppressed with +.B "\-\-bitmap=none". + .TP .BR \-\-bitmap\-chunk= Set the chunksize of the bitmap. Each bit corresponds to that many @@ -768,6 +775,45 @@ being reshaped. .TP +.B \-\-data\-offset= +Arrays with 1.x metadata can leave a gap between the start of the +device and the start of array data. This gap can be used for various +metadata. The start of data is known as the +.IR data\-offset . +Normally an appropriate data offset is computed automatically. 
+However it can be useful to set it explicitly such as when re-creating +an array which was originally created using a different version of +.I mdadm +which computed a different offset. + +Setting the offset explicitly over-rides the default. The value given +is in Kilobytes unless an 'M' or 'G' suffix is given. + +Since Linux 3.4, +.B \-\-data\-offset +can also be used with +.B \-\-grow +for some RAID levels (initially on RAID10). This allows the +data\-offset to be changed as part of the reshape process. When the +data offset is changed, no backup file is required as the difference +in offsets is used to provide the same functionality. + +When the new offset is earlier than the old offset, the number of +devices in the array cannot shrink. When it is after the old offset, +the number of devices in the array cannot increase. + +When creating an array, +.B \-\-data\-offset +can be specified as +.BR variable . +In this case each member device is expected to have an offset appended +to the name, separated by a colon. This makes it possible to recreate +exactly an array which has varying data offsets (as can happen when +different versions of +.I mdadm +are used to add different devices). + +.TP .BR \-\-continue This option is complementary to the .B \-\-freeze-reshape @@ -832,6 +878,13 @@ will not try to be so clever. .TP +.BR \-o ", " \-\-readonly +Start the array +.B read only +rather than read-write as normal. No writes will be allowed to the +array, and no resync, recovery, or reshape will be started. + +.TP .BR \-a ", " "\-\-auto{=yes,md,mdp,part,p}{NN}" Instruct mdadm how to create the device file if needed, possibly allocating an unused minor number. "md" causes a non-partitionable array @@ -888,30 +941,6 @@ or .BR \-\-build . 
-.ig XX -.\".TP -.\".BR \-\-symlink = no -.\"Normally when -.\".B \-\-auto -.\"causes -.\".I mdadm -.\"to create devices in -.\".B /dev/md/ -.\"it will also create symlinks from -.\".B /dev/ -.\"with names starting with -.\".B md -.\"or -.\".BR md_ . -.\"Use -.\".B \-\-symlink=no -.\"to suppress this, or -.\".B \-\-symlink=yes -.\"to enforce this even if it is suppressing -.\".IR mdadm.conf . -.\" -.XX - .TP .BR \-a ", " "\-\-add" This option can be used in Grow mode in two cases. @@ -1053,6 +1082,9 @@ .BR byteorder , .BR devicesize , .BR no\-bitmap , +.BR bbl , +.BR no\-bbl , +.BR metadata , or .BR super\-minor . @@ -1150,11 +1182,36 @@ update the relevant field in the metadata. The +.B metadata +option only works on v0.90 metadata arrays and will convert them to +v1.0 metadata. The array must not be dirty (i.e. it must not need a +sync) and it must not have a write-intent bitmap. + +The old metadata will remain on the devices, but will appear older +than the new metadata and so will usually be ignored. The old metadata +(or indeed the new metadata) can be removed by giving the appropriate +.B \-\-metadata= +option to +.BR \-\-zero\-superblock . + +The .B no\-bitmap option can be used when an array has an internal bitmap which is corrupt in some way so that assembling the array normally fails. It will cause any internal bitmap to be ignored. +The +.B bbl +option will reserve space in each device for a bad block list. This +will be 4K in size and positioned near the end of any free space +between the superblock and the data. + +The +.B no\-bbl +option will cause any reservation of space for a bad block list to be +removed. If the bad block list contains entries, this will fail, as +removing the list could cause data corruption. + .TP .BR \-\-freeze\-reshape Option is intended to be used in start-up scripts during initrd boot phase. @@ -1200,7 +1257,7 @@ .TP .BR \-\-re\-add -re\-add a device that was previous removed from an array. 
+re\-add a device that was previously removed from an array. If the metadata on the device reports that it is a member of the array, and the slot that it used is still vacant, then the device will be added back to the array in the same position. This will normally @@ -1214,51 +1271,104 @@ it will be assumed that bitmap-based recovery is enough to make the device fully consistent with the array. -When +When used with v1.x metadata, .B \-\-re\-add can be accompanied by -.BR \-\-update=devicesize . -See the description of this option when used in Assemble mode for an -explanation of its use. +.BR \-\-update=devicesize , +.BR \-\-update=bbl ", or" +.BR \-\-update=no\-bbl . +See the description of these options when used in Assemble mode for an +explanation of their use. If the device name given is .B missing -then mdadm will try to find any device that looks like it should be +then +.I mdadm +will try to find any device that looks like it should be part of the array but isn't and will try to re\-add all such devices. +If the device name given is +.B faulty +then +.I mdadm +will find all devices in the array that are marked +.BR faulty , +remove them and attempt to immediately re\-add them. This can be +useful if you are certain that the reason for failure has been +resolved. + .TP .BR \-r ", " \-\-remove remove listed devices. They must not be active. i.e. they should -be failed or spare devices. As well as the name of a device file +be failed or spare devices. + +As well as the name of a device file (e.g. .BR /dev/sda1 ) the words -.B failed -and +.BR failed , .B detached +and names like +.B set\-A can be given to .BR \-\-remove . The first causes all failed device to be removed. The second causes any device which is no longer connected to the system (i.e an 'open' returns .BR ENXIO ) -to be removed. This will only succeed for devices that are spares or -have already been marked as failed. 
+The third will remove a set as described below under +.BR \-\-fail . .TP .BR \-f ", " \-\-fail -mark listed devices as faulty. +Mark listed devices as faulty. As well as the name of a device file, the word .B detached -can be given. This will cause any device that has been detached from +or a set name like +.B set\-A +can be given. The former will cause any device that has been detached from the system to be marked as failed. It can then be removed. +For RAID10 arrays where the number of copies evenly divides the number +of devices, the devices can be conceptually divided into sets where +each set contains a single complete copy of the data on the array. +Sometimes a RAID10 array will be configured so that these sets are on +separate controllers. In this case all the devices in one set can be +failed by giving a name like +.B set\-A +or +.B set\-B +to +.BR \-\-fail . +The appropriate set names are reported by +.BR \-\-detail . + .TP .BR \-\-set\-faulty same as .BR \-\-fail . .TP +.B \-\-replace +Mark listed devices as requiring replacement. As soon as a spare is +available, it will be rebuilt and will replace the marked device. +This is similar to marking a device as faulty, but the device remains +in service during the recovery process to increase resilience against +multiple failures. When the replacement process finishes, the +replaced device will be marked as faulty. + +.TP +.B \-\-with +This can follow a list of +.B \-\-replace +devices. The devices listed after +.B \-\-with +will be preferentially used to replace the devices listed after +.BR \-\-replace . +These devices must already be spare devices in the array. + +.TP .BR \-\-write\-mostly Subsequent devices that are added or re\-added will have the 'write-mostly' flag set. This is only valid for RAID1 and means that the 'md' driver @@ -1309,12 +1419,16 @@ .TP .BR \-\-detail\-platform Print details of the platform's RAID capabilities (firmware / hardware -topology) for a given metadata format. 
+topology) for a given metadata format. If used without argument, mdadm +will scan all controllers looking for their capabilities. Otherwise, mdadm +will only look at the controller specified by the argument in form of an +absolute filepath or a link, e.g. +.IR /sys/devices/pci0000:00/0000:00:1f.2 . .TP .BR \-Y ", " \-\-export When used with -.B \-\-detail +.B \-\-detail , \-\-detail-platform or .BR \-\-examine , output will be formatted as @@ -1355,6 +1469,19 @@ does not report the bitmap for that array. .TP +.B \-\-examine\-badblocks +List the bad-blocks recorded for the device, if a bad-blocks list has +been configured. Currently only +.B 1.x +metadata supports bad-blocks lists. + +.TP +.BI \-\-dump= directory +.TP +.BI \-\-restore= directory +Save metadata from listed devices, or restore metadata to listed devices. + +.TP .BR \-R ", " \-\-run start a partially assembled array. If .B \-\-assemble @@ -1388,7 +1515,9 @@ If the device is a container and the argument to \-\-kill\-subarray specifies an inactive subarray in the container, then the subarray is deleted. Deleting all subarrays will leave an 'empty-container' or -spare superblock on the drives. See \-\-zero\-superblock for completely +spare superblock on the drives. See +.B \-\-zero\-superblock +for completely removing a superblock. Note that some formats depend on the subarray index for generating a UUID, this command will fail if it would change the UUID of an active subarray. @@ -1864,6 +1993,20 @@ .\".B \-\-size .\"is given, the apparent size of the smallest drive given is used. +If the array type supports a write-intent bitmap, and if the devices +in the array exceed 100G in size, an internal write-intent bitmap +will automatically be added unless some other option is explicitly +requested with the +.B \-\-bitmap +option. In any case space for a bitmap will be reserved so that one +can be added later with +.BR "\-\-grow \-\-bitmap=internal" . 
+ +If the metadata type supports it (currently only 1.x metadata), space +will be allocated to store a bad block list. This allows a modest +number of bad blocks to be recorded, allowing the drive to remain in +service while only partially functional. + When creating an array within a .B CONTAINER .I mdadm @@ -2028,6 +2171,49 @@ config file to be examined. .TP +.BI \-\-dump= directory +If the device contains RAID metadata, a file will be created in the +.I directory +and the metadata will be written to it. The file will be the same +size as the device and have the metadata written in the file at the +same location that it exists in the device. However the file will be "sparse" so +that only those blocks containing metadata will be allocated. The +total space used will be small. + +The file name used in the +.I directory +will be the base name of the device. Further if any links appear in +.I /dev/disk/by-id +which point to the device, then hard links to the file will be created +in +.I directory +based on these +.I by-id +names. + +Multiple devices can be listed and their metadata will all be stored +in the one directory. + +.TP +.BI \-\-restore= directory +This is the reverse of +.BR \-\-dump . +.I mdadm +will locate a file in the directory that has a name appropriate for +the given device and will restore metadata from it. Names that match +.I /dev/disk/by-id +names are preferred, however if two of those refer to different files, +.I mdadm +will not choose between them but will abort the operation. + +If a file name is given instead of a +.I directory +then +.I mdadm +will restore from that file to a single device, always provided the +size of the file matches that of the device, and the file contains +valid metadata. +.TP .B \-\-stop The devices should be active md arrays which will be deactivated, as long as they are not currently in use. @@ -2290,7 +2476,7 @@ increase or decrease the "raid\-devices" attribute of RAID0, RAID1, RAID4, RAID5, and RAID6. 
.IP \(bu 4 -change the chunk-size and layout of RAID0, RAID4, RAID5 and RAID6. +change the chunk-size and layout of RAID0, RAID4, RAID5, RAID6 and RAID10. .IP \(bu 4 convert between RAID1 and RAID5, between RAID5 and RAID6, between RAID0, RAID4, and RAID5, and between RAID0 and RAID10 (in the near-2 mode). @@ -2637,6 +2823,43 @@ .I mdadm will create and devices that are needed. +.TP +.B IMSM_NO_PLATFORM +A key value of IMSM metadata is that it allows interoperability with +boot ROMs on Intel platforms, and with other major operating systems. +Consequently, +.I mdadm +will only allow an IMSM array to be created or modified if it detects +that it is running on an Intel platform which supports IMSM, and +supports the particular configuration of IMSM that is being requested +(some functionality requires newer OROM support). + +These checks can be suppressed by setting IMSM_NO_PLATFORM=1 in the +environment. This can be useful for testing or for disaster +recovery. You should be aware that interoperability may be +compromised by setting this value. + +.TP +.B MDADM_CONF_AUTO +Any string given in this variable is added to the start of the +.B AUTO +line in the config file, or treated as the whole +.B AUTO +line if none is given. It can be used to disable certain metadata +types when +.I mdadm +is called from a boot script. For example +.br +.B " export MDADM_CONF_AUTO='-ddf -imsm'" +.br +will make sure that +.I mdadm +does not automatically assemble any DDF or +IMSM arrays that are found. This can be useful on systems configured +to manage such arrays with +.BR dmraid . + + .SH EXAMPLES .B " mdadm \-\-query /dev/name-of-device" @@ -2788,6 +3011,11 @@ .BR mdadm.conf (5) for more details. +.SS /etc/mdadm.conf.d + +A directory containing configuration files which are read in lexical +order. 
+ .SS {MAP_PATH} When .B \-\-incremental @@ -2834,31 +3062,38 @@ The standard names for non-partitioned arrays (the only sort of md array available in 2.4 and earlier) are of the form .IP -/dev/mdNN +.RB /dev/md NN .PP where NN is a number. The standard names for partitionable arrays (as available from 2.6 -onwards) are of the form +onwards) are of the form: .IP -/dev/md_dNN +.RB /dev/md_d NN +.PP +Partition numbers should be indicated by adding "pMM" to these, thus "/dev/md/d1p2". +.PP +From kernel version 2.6.28 the "non-partitioned array" can actually +be partitioned. So the "md_d\fBNN\fP" +names are no longer needed, and +partitions such as "/dev/md\fBNN\fPp\fBXX\fP" +are possible. .PP -Partition numbers should be indicated by added "pMM" to these, thus "/dev/md/d1p2". +From kernel version 2.6.29 standard names can be non-numeric following +the form: +.IP +.RB /dev/md_ XXX .PP -From kernel version, 2.6.28 the "non-partitioned array" can actually -be partitioned. So the "md_dNN" names are no longer needed, and -partitions such as "/dev/mdNNpXX" are possible. +where +.B XXX +is any string. These names are supported by +.I mdadm +since version 3.3 provided they are enabled in +.IR mdadm.conf . .SH NOTE .I mdadm was previously known as .IR mdctl . -.P -.I mdadm -is completely separate from the -.I raidtools -package, and does not use the -.I /etc/raidtab -configuration file at all. 
.SH SEE ALSO For further information on mdadm usage, MD and the various levels of @@ -2867,19 +3102,6 @@ .B http://raid.wiki.kernel.org/ .PP (based upon Jakob \(/Ostergaard's Software\-RAID.HOWTO) -.\".PP -.\"for new releases of the RAID driver check out: -.\" -.\".IP -.\".UR ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches -.\"ftp://ftp.kernel.org/pub/linux/kernel/people/mingo/raid-patches -.\".UE -.\".PP -.\"or -.\".IP -.\".UR http://www.cse.unsw.edu.au/~neilb/patches/linux-stable/ -.\"http://www.cse.unsw.edu.au/~neilb/patches/linux-stable/ -.\".UE .PP The latest version of .I mdadm @@ -2892,8 +3114,3 @@ .IR mdmon (8), .IR mdadm.conf (5), .IR md (4). -.PP -.IR raidtab (5), -.IR raid0run (8), -.IR raidstop (8), -.IR mkraid (8). diff -Nru mdadm-3.2.5/mdadm.c mdadm-3.3/mdadm.c --- mdadm-3.2.5/mdadm.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdadm.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -29,51 +29,34 @@ #include "md_p.h" #include +static int scan_assemble(struct supertype *ss, + struct context *c, + struct mddev_ident *ident); +static int misc_scan(char devmode, struct context *c); +static int stop_scan(int verbose); +static int misc_list(struct mddev_dev *devlist, + struct mddev_ident *ident, + char *dump_directory, + struct supertype *ss, struct context *c); int main(int argc, char *argv[]) { int mode = 0; int opt; int option_index; - char *c; int rv; int i; - int chunk = 0; - long long size = -1; - long long array_size = -1; - int level = UnSet; - int layout = UnSet; - char *layout_str = NULL; - int raiddisks = 0; - int sparedisks = 0; + unsigned long long array_size = 0; + unsigned long long data_offset = INVALID_SECTORS; struct mddev_ident ident; char *configfile = NULL; - char *cp; - char *update = NULL; - int 
scan = 0; int devmode = 0; - int runstop = 0; - int readonly = 0; - int write_behind = 0; int bitmap_fd = -1; - char *bitmap_file = NULL; - char *backup_file = NULL; - int invalid_backup = 0; - int bitmap_chunk = UnSet; - int SparcAdjust = 0; struct mddev_dev *devlist = NULL; struct mddev_dev **devlistend = & devlist; struct mddev_dev *dv; int devs_found = 0; - int verbose = 0; - int quiet = 0; - int brief = 0; - int force = 0; - int test = 0; - int export = 0; - int assume_clean = 0; - char *prefer = NULL; char *symlinks = NULL; int grow_continue = 0; /* autof indicates whether and how to create device node. @@ -86,36 +69,37 @@ * 5 - default to md if not is_standard (md in config file) * 6 - default to mdp if not is_standard (part, or mdp in config file) */ - int autof = 0; + struct context c = { + .require_homehost = 1, + }; + struct shape s = { + .level = UnSet, + .layout = UnSet, + .bitmap_chunk = UnSet, + }; - char *homehost = NULL; char sys_hostname[256]; - int require_homehost = 1; char *mailaddr = NULL; char *program = NULL; int increments = 20; - int delay = 0; int daemonise = 0; char *pidfile = NULL; int oneshot = 0; int spare_sharing = 1; struct supertype *ss = NULL; int writemostly = 0; - int re_add = 0; char *shortopt = short_options; int dosyslog = 0; int rebuild_map = 0; - char *subarray = NULL; char *remove_path = NULL; char *udev_filename = NULL; + char *dump_directory = NULL; int print_help = 0; FILE *outf; int mdfd = -1; - int freeze_reshape = 0; - srandom(time(0) ^ getpid()); ident.uuid_set=0; @@ -150,10 +134,10 @@ fputs(Version, stderr); exit(0); - case 'v': verbose++; + case 'v': c.verbose++; continue; - case 'q': quiet++; + case 'q': c.verbose--; continue; case 'b': @@ -162,34 +146,28 @@ || mode == MANAGE) break; /* b means bitmap */ case Brief: - brief = 1; + c.brief = 1; continue; - case 'Y': export++; + case 'Y': c.export++; continue; case HomeHost: if (strcasecmp(optarg, "") == 0) - require_homehost = 0; + c.require_homehost = 0; else - 
homehost = optarg; + c.homehost = optarg; continue; - /* - * --offroot sets first char of argv[0] to @. This is used - * by systemd to signal that the tast was launched from - * initrd/initramfs and should be preserved during shutdown - */ case OffRootOpt: - argv[0][0] = '@'; - __offroot = 1; + /* Silently ignore old option */ continue; case Prefer: - if (prefer) - free(prefer); - if (asprintf(&prefer, "/%s/", optarg) <= 0) - prefer = NULL; + if (c.prefer) + free(c.prefer); + if (asprintf(&c.prefer, "/%s/", optarg) <= 0) + c.prefer = NULL; continue; case ':': @@ -211,6 +189,8 @@ case Add: case 'r': case Remove: + case Replace: + case With: case 'f': case Fail: case ReAdd: /* re-add */ @@ -220,15 +200,23 @@ } break; - case 'A': newmode = ASSEMBLE; shortopt = short_bitmap_auto_options; break; - case 'B': newmode = BUILD; shortopt = short_bitmap_auto_options; break; - case 'C': newmode = CREATE; shortopt = short_bitmap_auto_options; break; - case 'F': newmode = MONITOR;break; + case 'A': newmode = ASSEMBLE; + shortopt = short_bitmap_auto_options; + break; + case 'B': newmode = BUILD; + shortopt = short_bitmap_auto_options; + break; + case 'C': newmode = CREATE; + shortopt = short_bitmap_auto_options; + break; + case 'F': newmode = MONITOR; + break; case 'G': newmode = GROW; shortopt = short_bitmap_options; break; case 'I': newmode = INCREMENTAL; - shortopt = short_bitmap_auto_options; break; + shortopt = short_bitmap_auto_options; + break; case AutoDetect: newmode = AUTODETECT; break; @@ -238,6 +226,9 @@ case 'E': case 'X': case 'Q': + case ExamineBB: + case Dump: + case Restore: newmode = MISC; break; @@ -252,7 +243,7 @@ case KillSubarray: case UpdateSubarray: case UdevRules: - case 'K': + case KillOpt: if (!mode) newmode = MISC; break; @@ -265,7 +256,7 @@ /* everybody happy ! */ } else if (mode && newmode != mode) { /* not allowed.. 
*/ - fprintf(stderr, Name ": "); + pr_err(""); if (option_index >= 0) fprintf(stderr, "--%s", long_options[option_index].name); else @@ -277,9 +268,9 @@ } else if (!mode && newmode) { mode = newmode; if (mode == MISC && devs_found) { - fprintf(stderr, Name ": No action given for %s in --misc mode\n", + pr_err("No action given for %s in --misc mode\n", devlist->devname); - fprintf(stderr," Action options must come before device names\n"); + cont_err("Action options must come before device names\n"); exit(2); } } else { @@ -294,15 +285,10 @@ /* If first option is a device, don't force the mode yet */ if (opt == 1) { if (devs_found == 0) { - dv = malloc(sizeof(*dv)); - if (dv == NULL) { - fprintf(stderr, Name ": malloc failed\n"); - exit(3); - } + dv = xmalloc(sizeof(*dv)); dv->devname = optarg; dv->disposition = devmode; dv->writemostly = writemostly; - dv->re_add = re_add; dv->used = 0; dv->next = NULL; *devlistend = dv; @@ -312,14 +298,14 @@ continue; } /* No mode yet, and this is the second device ... */ - fprintf(stderr, Name ": An option must be given to set the mode before a second device\n" + pr_err("An option must be given to set the mode before a second device\n" " (%s) is listed\n", optarg); exit(2); } if (option_index >= 0) - fprintf(stderr, Name ": --%s", long_options[option_index].name); + pr_err("--%s", long_options[option_index].name); else - fprintf(stderr, Name ": -%c", opt); + pr_err("-%c", opt); fprintf(stderr, " does not set the mode, and so cannot be the first option.\n"); exit(2); } @@ -338,27 +324,29 @@ continue; } if (opt == 1) { - /* an undecorated option - must be a device name. + /* an undecorated option - must be a device name. */ + + if (devs_found > 0 && devmode == DetailPlatform) { + pr_err("controller may only be specified once. 
%s ignored\n", + optarg); + continue; + } + if (devs_found > 0 && mode == MANAGE && !devmode) { - fprintf(stderr, Name ": Must give one of -a/-r/-f" + pr_err("Must give one of -a/-r/-f" " for subsequent devices at %s\n", optarg); exit(2); } if (devs_found > 0 && mode == GROW && !devmode) { - fprintf(stderr, Name ": Must give -a/--add for" - " devices to add: %s\n", optarg); + pr_err("Must give -a/--add for" + " devices to add: %s\n", optarg); exit(2); } - dv = malloc(sizeof(*dv)); - if (dv == NULL) { - fprintf(stderr, Name ": malloc failed\n"); - exit(3); - } + dv = xmalloc(sizeof(*dv)); dv->devname = optarg; dv->disposition = devmode; dv->writemostly = writemostly; - dv->re_add = re_add; dv->used = 0; dv->next = NULL; *devlistend = dv; @@ -378,19 +366,20 @@ case O(CREATE,ChunkSize): case O(BUILD,'c'): /* chunk or rounding */ case O(BUILD,ChunkSize): /* chunk or rounding */ - if (chunk) { - fprintf(stderr, Name ": chunk/rounding may only be specified once. " + if (s.chunk) { + pr_err("chunk/rounding may only be specified once. 
" "Second value is %s.\n", optarg); exit(2); } - chunk = parse_size(optarg); - if (chunk < 8 || (chunk&1)) { - fprintf(stderr, Name ": invalid chunk/rounding value: %s\n", + s.chunk = parse_size(optarg); + if (s.chunk == INVALID_SECTORS || + s.chunk < 8 || (s.chunk&1)) { + pr_err("invalid chunk/rounding value: %s\n", optarg); exit(2); } /* Convert sectors to K */ - chunk /= 2; + s.chunk /= 2; continue; case O(INCREMENTAL, 'e'): @@ -398,14 +387,14 @@ case O(ASSEMBLE,'e'): case O(MISC,'e'): /* set metadata (superblock) information */ if (ss) { - fprintf(stderr, Name ": metadata information already given\n"); + pr_err("metadata information already given\n"); exit(2); } for(i=0; !ss && superlist[i]; i++) ss = superlist[i]->match_metadata_desc(optarg); if (!ss) { - fprintf(stderr, Name ": unrecognised metadata identifier: %s\n", optarg); + pr_err("unrecognised metadata identifier: %s\n", optarg); exit(2); } continue; @@ -425,85 +414,105 @@ writemostly = 2; continue; - case O(GROW,'z'): case O(CREATE,'z'): case O(BUILD,'z'): /* size */ - if (size >= 0) { - fprintf(stderr, Name ": size may only be specified once. " + if (s.size > 0) { + pr_err("size may only be specified once. " "Second value is %s.\n", optarg); exit(2); } if (strcmp(optarg, "max")==0) - size = 0; + s.size = MAX_SIZE; else { - size = parse_size(optarg); - if (size < 8) { - fprintf(stderr, Name ": invalid size: %s\n", + s.size = parse_size(optarg); + if (s.size == INVALID_SECTORS || + s.size < 8) { + pr_err("invalid size: %s\n", optarg); exit(2); } /* convert sectors to K */ - size /= 2; + s.size /= 2; } continue; case O(GROW,'Z'): /* array size */ - if (array_size >= 0) { - fprintf(stderr, Name ": array-size may only be specified once. " + if (array_size > 0) { + pr_err("array-size may only be specified once. 
" "Second value is %s.\n", optarg); exit(2); } if (strcmp(optarg, "max") == 0) - array_size = 0; + array_size = MAX_SIZE; else { array_size = parse_size(optarg); - if (array_size <= 0) { - fprintf(stderr, Name ": invalid array size: %s\n", + if (array_size == 0 || + array_size == INVALID_SECTORS) { + pr_err("invalid array size: %s\n", optarg); exit(2); } } continue; + case O(CREATE,DataOffset): + case O(GROW,DataOffset): + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset may only be specified one. " + "Second value is %s.\n", optarg); + exit(2); + } + if (mode == CREATE && + strcmp(optarg, "variable") == 0) + data_offset = VARIABLE_OFFSET; + else + data_offset = parse_size(optarg); + if (data_offset == INVALID_SECTORS) { + pr_err("invalid data-offset: %s\n", + optarg); + exit(2); + } + continue; + case O(GROW,'l'): case O(CREATE,'l'): case O(BUILD,'l'): /* set raid level*/ - if (level != UnSet) { - fprintf(stderr, Name ": raid level may only be set once. " + if (s.level != UnSet) { + pr_err("raid level may only be set once. 
" "Second value is %s.\n", optarg); exit(2); } - level = map_name(pers, optarg); - if (level == UnSet) { - fprintf(stderr, Name ": invalid raid level: %s\n", + s.level = map_name(pers, optarg); + if (s.level == UnSet) { + pr_err("invalid raid level: %s\n", optarg); exit(2); } - if (level != 0 && level != LEVEL_LINEAR && level != 1 && - level != LEVEL_MULTIPATH && level != LEVEL_FAULTY && - level != 10 && + if (s.level != 0 && s.level != LEVEL_LINEAR && s.level != 1 && + s.level != LEVEL_MULTIPATH && s.level != LEVEL_FAULTY && + s.level != 10 && mode == BUILD) { - fprintf(stderr, Name ": Raid level %s not permitted with --build.\n", + pr_err("Raid level %s not permitted with --build.\n", optarg); exit(2); } - if (sparedisks > 0 && level < 1 && level >= -1) { - fprintf(stderr, Name ": raid level %s is incompatible with spare-devices setting.\n", + if (s.sparedisks > 0 && s.level < 1 && s.level >= -1) { + pr_err("raid level %s is incompatible with spare-devices setting.\n", optarg); exit(2); } - ident.level = level; + ident.level = s.level; continue; case O(GROW, 'p'): /* new layout */ case O(GROW, Layout): - if (layout_str) { - fprintf(stderr,Name ": layout may only be sent once. " - "Second value was %s\n", optarg); + if (s.layout_str) { + pr_err("layout may only be sent once. " + "Second value was %s\n", optarg); exit(2); } - layout_str = optarg; + s.layout_str = optarg; /* 'Grow' will parse the value */ continue; @@ -511,41 +520,41 @@ case O(CREATE,Layout): case O(BUILD,'p'): /* faulty layout */ case O(BUILD,Layout): - if (layout != UnSet) { - fprintf(stderr,Name ": layout may only be sent once. " - "Second value was %s\n", optarg); + if (s.layout != UnSet) { + pr_err("layout may only be sent once. 
" + "Second value was %s\n", optarg); exit(2); } - switch(level) { + switch(s.level) { default: - fprintf(stderr, Name ": layout not meaningful for %s arrays.\n", - map_num(pers, level)); + pr_err("layout not meaningful for %s arrays.\n", + map_num(pers, s.level)); exit(2); case UnSet: - fprintf(stderr, Name ": raid level must be given before layout.\n"); + pr_err("raid level must be given before layout.\n"); exit(2); case 5: - layout = map_name(r5layout, optarg); - if (layout==UnSet) { - fprintf(stderr, Name ": layout %s not understood for raid5.\n", + s.layout = map_name(r5layout, optarg); + if (s.layout==UnSet) { + pr_err("layout %s not understood for raid5.\n", optarg); exit(2); } break; case 6: - layout = map_name(r6layout, optarg); - if (layout==UnSet) { - fprintf(stderr, Name ": layout %s not understood for raid6.\n", + s.layout = map_name(r6layout, optarg); + if (s.layout==UnSet) { + pr_err("layout %s not understood for raid6.\n", optarg); exit(2); } break; case 10: - layout = parse_layout_10(optarg); - if (layout < 0) { - fprintf(stderr, Name ": layout for raid10 must be 'nNN', 'oNN' or 'fNN' where NN is a number, not %s\n", optarg); + s.layout = parse_layout_10(optarg); + if (s.layout < 0) { + pr_err("layout for raid10 must be 'nNN', 'oNN' or 'fNN' where NN is a number, not %s\n", optarg); exit(2); } break; @@ -553,9 +562,9 @@ /* Faulty * modeNNN */ - layout = parse_layout_faulty(optarg); - if (layout == -1) { - fprintf(stderr, Name ": layout %s not understood for faulty.\n", + s.layout = parse_layout_faulty(optarg); + if (s.layout == -1) { + pr_err("layout %s not understood for faulty.\n", optarg); exit(2); } @@ -566,40 +575,40 @@ case O(CREATE,AssumeClean): case O(BUILD,AssumeClean): /* assume clean */ case O(GROW,AssumeClean): - assume_clean = 1; + s.assume_clean = 1; continue; case O(GROW,'n'): case O(CREATE,'n'): case O(BUILD,'n'): /* number of raid disks */ - if (raiddisks) { - fprintf(stderr, Name ": raid-devices set twice: %d and %s\n", - 
raiddisks, optarg); + if (s.raiddisks) { + pr_err("raid-devices set twice: %d and %s\n", + s.raiddisks, optarg); exit(2); } - raiddisks = strtol(optarg, &c, 10); - if (!optarg[0] || *c || raiddisks<=0) { - fprintf(stderr, Name ": invalid number of raid devices: %s\n", + s.raiddisks = parse_num(optarg); + if (s.raiddisks <= 0) { + pr_err("invalid number of raid devices: %s\n", optarg); exit(2); } - ident.raid_disks = raiddisks; + ident.raid_disks = s.raiddisks; continue; - case O(CREATE,'x'): /* number of spare (eXtra) discs */ - if (sparedisks) { - fprintf(stderr,Name ": spare-devices set twice: %d and %s\n", - sparedisks, optarg); + case O(CREATE,'x'): /* number of spare (eXtra) disks */ + if (s.sparedisks) { + pr_err("spare-devices set twice: %d and %s\n", + s.sparedisks, optarg); exit(2); } - if (level != UnSet && level <= 0 && level >= -1) { - fprintf(stderr, Name ": spare-devices setting is incompatible with raid level %d\n", - level); + if (s.level != UnSet && s.level <= 0 && s.level >= -1) { + pr_err("spare-devices setting is incompatible with raid level %d\n", + s.level); exit(2); } - sparedisks = strtol(optarg, &c, 10); - if (!optarg[0] || *c || sparedisks < 0) { - fprintf(stderr, Name ": invalid number of spare-devices: %s\n", + s.sparedisks = parse_num(optarg); + if (s.sparedisks < 0) { + pr_err("invalid number of spare-devices: %s\n", optarg); exit(2); } @@ -613,7 +622,7 @@ case O(INCREMENTAL,Auto): case O(ASSEMBLE,'a'): case O(ASSEMBLE,Auto): /* auto-creation of device node */ - autof = parse_auto(optarg, "--auto flag", 0); + c.autof = parse_auto(optarg, "--auto flag", 0); continue; case O(CREATE,Symlinks): @@ -633,25 +642,25 @@ case O(MISC,'f'): /* force zero */ case O(MISC,Force): /* force zero */ case O(MANAGE,Force): /* add device which is too large */ - force=1; + c.force=1; continue; /* now for the Assemble options */ case O(ASSEMBLE, FreezeReshape): /* Freeze reshape during * initrd phase */ case O(INCREMENTAL, FreezeReshape): - freeze_reshape = 
1; + c.freeze_reshape = 1; continue; case O(CREATE,'u'): /* uuid of array */ case O(ASSEMBLE,'u'): /* uuid of array */ if (ident.uuid_set) { - fprintf(stderr, Name ": uuid cannot be set twice. " + pr_err("uuid cannot be set twice. " "Second value %s.\n", optarg); exit(2); } if (parse_uuid(optarg, ident.uuid)) ident.uuid_set = 1; else { - fprintf(stderr,Name ": Bad uuid: %s\n", optarg); + pr_err("Bad uuid: %s\n", optarg); exit(2); } continue; @@ -660,16 +669,16 @@ case O(ASSEMBLE,'N'): case O(MISC,'N'): if (ident.name[0]) { - fprintf(stderr, Name ": name cannot be set twice. " + pr_err("name cannot be set twice. " "Second value %s.\n", optarg); exit(2); } - if (mode == MISC && !subarray) { - fprintf(stderr, Name ": -N/--name only valid with --update-subarray in misc mode\n"); + if (mode == MISC && !c.subarray) { + pr_err("-N/--name only valid with --update-subarray in misc mode\n"); exit(2); } if (strlen(optarg) > 32) { - fprintf(stderr, Name ": name '%s' is too long, 32 chars max.\n", + pr_err("name '%s' is too long, 32 chars max.\n", optarg); exit(2); } @@ -679,113 +688,128 @@ case O(ASSEMBLE,'m'): /* super-minor for array */ case O(ASSEMBLE,SuperMinor): if (ident.super_minor != UnSet) { - fprintf(stderr, Name ": super-minor cannot be set twice. " + pr_err("super-minor cannot be set twice. 
" "Second value: %s.\n", optarg); exit(2); } if (strcmp(optarg, "dev")==0) ident.super_minor = -2; else { - ident.super_minor = strtoul(optarg, &cp, 10); - if (!optarg[0] || *cp) { - fprintf(stderr, Name ": Bad super-minor number: %s.\n", optarg); + ident.super_minor = parse_num(optarg); + if (ident.super_minor < 0) { + pr_err("Bad super-minor number: %s.\n", optarg); exit(2); } } continue; + case O(ASSEMBLE,'o'): + case O(MANAGE,'o'): + case O(CREATE,'o'): + c.readonly = 1; + continue; + case O(ASSEMBLE,'U'): /* update the superblock */ case O(MISC,'U'): - if (update) { - fprintf(stderr, Name ": Can only update one aspect" + if (c.update) { + pr_err("Can only update one aspect" " of superblock, both %s and %s given.\n", - update, optarg); + c.update, optarg); exit(2); } - if (mode == MISC && !subarray) { - fprintf(stderr, Name ": Only subarrays can be" + if (mode == MISC && !c.subarray) { + pr_err("Only subarrays can be" " updated in misc mode\n"); exit(2); } - update = optarg; - if (strcmp(update, "sparc2.2")==0) + c.update = optarg; + if (strcmp(c.update, "sparc2.2")==0) continue; - if (strcmp(update, "super-minor") == 0) + if (strcmp(c.update, "super-minor") == 0) continue; - if (strcmp(update, "summaries")==0) + if (strcmp(c.update, "summaries")==0) continue; - if (strcmp(update, "resync")==0) + if (strcmp(c.update, "resync")==0) continue; - if (strcmp(update, "uuid")==0) + if (strcmp(c.update, "uuid")==0) continue; - if (strcmp(update, "name")==0) + if (strcmp(c.update, "name")==0) continue; - if (strcmp(update, "homehost")==0) + if (strcmp(c.update, "homehost")==0) continue; - if (strcmp(update, "devicesize")==0) + if (strcmp(c.update, "devicesize")==0) continue; - if (strcmp(update, "no-bitmap")==0) + if (strcmp(c.update, "no-bitmap")==0) continue; - if (strcmp(update, "byteorder")==0) { + if (strcmp(c.update, "bbl") == 0) + continue; + if (strcmp(c.update, "no-bbl") == 0) + continue; + if (strcmp(c.update, "metadata") == 0) + continue; + if 
(strcmp(c.update, "revert-reshape") == 0) + continue; + if (strcmp(c.update, "byteorder")==0) { if (ss) { - fprintf(stderr, - Name ": must not set metadata" - " type with --update=byteorder.\n"); + pr_err("must not set metadata" + " type with --update=byteorder.\n"); exit(2); } for(i=0; !ss && superlist[i]; i++) ss = superlist[i]->match_metadata_desc( "0.swap"); if (!ss) { - fprintf(stderr, Name ": INTERNAL ERROR" + pr_err("INTERNAL ERROR" " cannot find 0.swap\n"); exit(2); } continue; } - if (strcmp(update,"?") == 0 || - strcmp(update, "help") == 0) { + if (strcmp(c.update,"?") == 0 || + strcmp(c.update, "help") == 0) { outf = stdout; fprintf(outf, Name ": "); } else { outf = stderr; fprintf(outf, Name ": '--update=%s' is invalid. ", - update); + c.update); } fprintf(outf, "Valid --update options are:\n" " 'sparc2.2', 'super-minor', 'uuid', 'name', 'resync',\n" " 'summaries', 'homehost', 'byteorder', 'devicesize',\n" - " 'no-bitmap'\n"); + " 'no-bitmap', 'metadata', 'revert-reshape'\n"); exit(outf == stdout ? 
0 : 2); case O(MANAGE,'U'): /* update=devicesize is allowed with --re-add */ - if (devmode != 'a' || re_add != 1) { - fprintf(stderr, Name "--update in Manage mode only" + if (devmode != 'A') { + pr_err("--update in Manage mode only" " allowed with --re-add.\n"); exit(1); } - if (update) { - fprintf(stderr, Name ": Can only update one aspect" + if (c.update) { + pr_err("Can only update one aspect" " of superblock, both %s and %s given.\n", - update, optarg); + c.update, optarg); exit(2); } - update = optarg; - if (strcmp(update, "devicesize") != 0) { - fprintf(stderr, Name ": only 'devicesize' can be" + c.update = optarg; + if (strcmp(c.update, "devicesize") != 0 && + strcmp(c.update, "bbl") != 0 && + strcmp(c.update, "no-bbl") != 0) { + pr_err("only 'devicesize', 'bbl' and 'no-bbl' can be" " updated with --re-add\n"); exit(2); } continue; case O(INCREMENTAL,NoDegraded): - fprintf(stderr, Name ": --no-degraded is deprecated in Incremental mode\n"); + pr_err("--no-degraded is deprecated in Incremental mode\n"); case O(ASSEMBLE,NoDegraded): /* --no-degraded */ - runstop = -1; /* --stop isn't allowed for --assemble, - * so we overload slightly */ + c.runstop = -1; /* --stop isn't allowed for --assemble, + * so we overload slightly */ continue; case O(ASSEMBLE,'c'): @@ -796,8 +820,9 @@ case O(MISC, ConfigFile): case O(MONITOR,'c'): case O(MONITOR,ConfigFile): + case O(CREATE,ConfigFile): if (configfile) { - fprintf(stderr, Name ": configfile cannot be set twice. " + pr_err("configfile cannot be set twice. " "Second value is %s.\n", optarg); exit(2); } @@ -809,13 +834,13 @@ case O(MISC,'s'): case O(MONITOR,'s'): case O(INCREMENTAL,'s'): - scan = 1; + c.scan = 1; continue; case O(MONITOR,'m'): /* mail address */ case O(MONITOR,EMail): if (mailaddr) - fprintf(stderr, Name ": only specify one mailaddress. %s ignored.\n", + pr_err("only specify one mailaddress. 
%s ignored.\n", optarg); else mailaddr = optarg; @@ -824,7 +849,7 @@ case O(MONITOR,'p'): /* alert program */ case O(MONITOR,ProgramOpt): /* alert program */ if (program) - fprintf(stderr, Name ": only specify one alter program. %s ignored.\n", + pr_err("only specify one alter program. %s ignored.\n", optarg); else program = optarg; @@ -833,8 +858,8 @@ case O(MONITOR,'r'): /* rebuild increments */ case O(MONITOR,Increment): increments = atoi(optarg); - if (increments>99 || increments<1) { - fprintf(stderr, Name ": please specify positive integer between 1 and 99 as rebuild increments.\n"); + if (increments > 99 || increments < 1) { + pr_err("please specify positive integer between 1 and 99 as rebuild increments.\n"); exit(2); } continue; @@ -843,13 +868,13 @@ case O(GROW, 'd'): case O(BUILD,'d'): /* delay for bitmap updates */ case O(CREATE,'d'): - if (delay) - fprintf(stderr, Name ": only specify delay once. %s ignored.\n", + if (c.delay) + pr_err("only specify delay once. %s ignored.\n", optarg); else { - delay = strtol(optarg, &c, 10); - if (!optarg[0] || *c || delay<1) { - fprintf(stderr, Name ": invalid delay: %s\n", + c.delay = parse_num(optarg); + if (c.delay < 1) { + pr_err("invalid delay: %s\n", optarg); exit(2); } @@ -861,7 +886,7 @@ continue; case O(MONITOR,'i'): /* pid */ if (pidfile) - fprintf(stderr, Name ": only specify one pid file. %s ignored.\n", + pr_err("only specify one pid file. %s ignored.\n", optarg); else pidfile = optarg; @@ -871,7 +896,7 @@ spare_sharing = 0; continue; case O(MONITOR,'t'): /* test */ - test = 1; + c.test = 1; continue; case O(MONITOR,'y'): /* log messages to syslog */ openlog("mdadm", LOG_PID, SYSLOG_FACILITY); @@ -880,6 +905,7 @@ case O(MONITOR, NoSharing): spare_sharing = 0; continue; + /* now the general management options. Some are applicable * to other modes. None have arguments. 
*/ @@ -888,11 +914,9 @@ case O(MANAGE,'a'): case O(MANAGE,Add): /* add a drive */ devmode = 'a'; - re_add = 0; continue; case O(MANAGE,ReAdd): - devmode = 'a'; - re_add = 1; + devmode = 'A'; continue; case O(MANAGE,'r'): /* remove a drive */ case O(MANAGE,Remove): @@ -903,39 +927,52 @@ case O(INCREMENTAL,'f'): case O(INCREMENTAL,Remove): case O(INCREMENTAL,Fail): /* r for incremental is taken, use f - * even though we will both fail and - * remove the device */ + * even though we will both fail and + * remove the device */ devmode = 'f'; continue; + case O(MANAGE,Replace): + /* Mark these devices for replacement */ + devmode = 'R'; + continue; + case O(MANAGE,With): + /* These are the replacements to use */ + if (devmode != 'R') { + pr_err("--with must follow --replace\n"); + exit(2); + } + devmode = 'W'; + continue; case O(INCREMENTAL,'R'): case O(MANAGE,'R'): case O(ASSEMBLE,'R'): case O(BUILD,'R'): case O(CREATE,'R'): /* Run the array */ - if (runstop < 0) { - fprintf(stderr, Name ": Cannot both Stop and Run an array\n"); + if (c.runstop < 0) { + pr_err("Cannot both Stop and Run an array\n"); exit(2); } - runstop = 1; + c.runstop = 1; continue; case O(MANAGE,'S'): - if (runstop > 0) { - fprintf(stderr, Name ": Cannot both Run and Stop an array\n"); + if (c.runstop > 0) { + pr_err("Cannot both Run and Stop an array\n"); exit(2); } - runstop = -1; + c.runstop = -1; continue; case O(MANAGE,'t'): - test = 1; + c.test = 1; continue; case O(MISC,'Q'): case O(MISC,'D'): case O(MISC,'E'): - case O(MISC,'K'): + case O(MISC,KillOpt): case O(MISC,'R'): case O(MISC,'S'): case O(MISC,'X'): + case O(MISC, ExamineBB): case O(MISC,'o'): case O(MISC,'w'): case O(MISC,'W'): @@ -944,17 +981,19 @@ case O(MISC, DetailPlatform): case O(MISC, KillSubarray): case O(MISC, UpdateSubarray): + case O(MISC, Dump): + case O(MISC, Restore): if (opt == KillSubarray || opt == UpdateSubarray) { - if (subarray) { - fprintf(stderr, Name ": subarray can only" + if (c.subarray) { + pr_err("subarray 
can only" " be specified once\n"); exit(2); } - subarray = optarg; + c.subarray = optarg; } if (devmode && devmode != opt && (devmode == 'E' || (opt == 'E' && devmode != 'Q'))) { - fprintf(stderr, Name ": --examine/-E cannot be given with "); + pr_err("--examine/-E cannot be given with "); if (devmode == 'E') { if (option_index >= 0) fprintf(stderr, "--%s\n", @@ -968,46 +1007,54 @@ exit(2); } devmode = opt; + if (opt == Dump || opt == Restore) { + if (dump_directory != NULL) { + pr_err("dump/restore directory specified twice: %s and %s\n", + dump_directory, optarg); + exit(2); + } + dump_directory = optarg; + } continue; - case O(MISC, UdevRules): - if (devmode && devmode != opt) { - fprintf(stderr, Name ": --udev-rules must" + case O(MISC, UdevRules): + if (devmode && devmode != opt) { + pr_err("--udev-rules must" " be the only option.\n"); - } else { - if (udev_filename) - fprintf(stderr, Name ": only specify one udev " - "rule filename. %s ignored.\n", - optarg); - else - udev_filename = optarg; - } - devmode = opt; - continue; + } else { + if (udev_filename) + pr_err("only specify one udev " + "rule filename. 
%s ignored.\n", + optarg); + else + udev_filename = optarg; + } + devmode = opt; + continue; case O(MISC,'t'): - test = 1; + c.test = 1; continue; case O(MISC, Sparc22): if (devmode != 'E') { - fprintf(stderr, Name ": --sparc2.2 only allowed with --examine\n"); + pr_err("--sparc2.2 only allowed with --examine\n"); exit(2); } - SparcAdjust = 1; + c.SparcAdjust = 1; continue; case O(ASSEMBLE,'b'): /* here we simply set the bitmap file */ case O(ASSEMBLE,Bitmap): if (!optarg) { - fprintf(stderr, Name ": bitmap file needed with -b in --assemble mode\n"); + pr_err("bitmap file needed with -b in --assemble mode\n"); exit(2); } if (strcmp(optarg, "internal")==0) { - fprintf(stderr, Name ": there is no need to specify --bitmap when assembling arrays with internal bitmaps\n"); + pr_err("there is no need to specify --bitmap when assembling arrays with internal bitmaps\n"); continue; } bitmap_fd = open(optarg, O_RDWR); if (!*optarg || bitmap_fd < 0) { - fprintf(stderr, Name ": cannot open bitmap file %s: %s\n", optarg, strerror(errno)); + pr_err("cannot open bitmap file %s: %s\n", optarg, strerror(errno)); exit(2); } ident.bitmap_fd = bitmap_fd; /* for Assemble */ @@ -1018,11 +1065,11 @@ /* Specify a file into which grow might place a backup, * or from which assemble might recover a backup */ - if (backup_file) { - fprintf(stderr, Name ": backup file already specified, rejecting %s\n", optarg); + if (c.backup_file) { + pr_err("backup file already specified, rejecting %s\n", optarg); exit(2); } - backup_file = optarg; + c.backup_file = optarg; continue; case O(GROW, Continue): @@ -1034,55 +1081,49 @@ /* Acknowledge that the backupfile is invalid, but ask * to continue anyway */ - invalid_backup = 1; + c.invalid_backup = 1; continue; case O(BUILD,'b'): case O(BUILD,Bitmap): case O(CREATE,'b'): case O(CREATE,Bitmap): /* here we create the bitmap */ - if (strcmp(optarg, "none") == 0) { - fprintf(stderr, Name ": '--bitmap none' only" - " support for --grow\n"); - exit(2); - } - /* 
FALL THROUGH */ case O(GROW,'b'): case O(GROW,Bitmap): if (strcmp(optarg, "internal")== 0 || strcmp(optarg, "none")== 0 || strchr(optarg, '/') != NULL) { - bitmap_file = optarg; + s.bitmap_file = optarg; continue; } /* probable typo */ - fprintf(stderr, Name ": bitmap file must contain a '/', or be 'internal', or 'none'\n" + pr_err("bitmap file must contain a '/', or be 'internal', or 'none'\n" " not '%s'\n", optarg); exit(2); case O(GROW,BitmapChunk): case O(BUILD,BitmapChunk): case O(CREATE,BitmapChunk): /* bitmap chunksize */ - bitmap_chunk = parse_size(optarg); - if (bitmap_chunk <= 0 || - bitmap_chunk & (bitmap_chunk - 1)) { - fprintf(stderr, - Name ": invalid bitmap chunksize: %s\n", - optarg); + s.bitmap_chunk = parse_size(optarg); + if (s.bitmap_chunk == 0 || + s.bitmap_chunk == INVALID_SECTORS || + s.bitmap_chunk & (s.bitmap_chunk - 1)) { + pr_err("invalid bitmap chunksize: %s\n", + optarg); exit(2); } - bitmap_chunk = bitmap_chunk * 512; + s.bitmap_chunk = s.bitmap_chunk * 512; continue; case O(GROW, WriteBehind): case O(BUILD, WriteBehind): case O(CREATE, WriteBehind): /* write-behind mode */ - write_behind = DEFAULT_MAX_WRITE_BEHIND; + s.write_behind = DEFAULT_MAX_WRITE_BEHIND; if (optarg) { - write_behind = strtol(optarg, &c, 10); - if (write_behind < 0 || *c || - write_behind > 16383) { - fprintf(stderr, Name ": Invalid value for maximum outstanding write-behind writes: %s.\n\tMust be between 0 and 16383.\n", optarg); + s.write_behind = parse_num(optarg); + if (s.write_behind < 0 || + s.write_behind > 16383) { + pr_err("Invalid value for maximum outstanding write-behind writes: %s.\n\tMust be between 0 and 16383.\n", optarg); exit(2); } } @@ -1100,31 +1141,24 @@ * an error */ if (option_index > 0) - fprintf(stderr, Name ":option --%s not valid in %s mode\n", + pr_err(":option --%s not valid in %s mode\n", long_options[option_index].name, map_num(modes, mode)); else - fprintf(stderr, Name ": option -%c not valid in %s mode\n", + pr_err("option -%c not 
valid in %s mode\n", opt, map_num(modes, mode)); exit(2); } if (print_help) { - char *help_text = Help; + char *help_text; if (print_help == 2) help_text = OptionHelp; else - switch (mode) { - case ASSEMBLE : help_text = Help_assemble; break; - case BUILD : help_text = Help_build; break; - case CREATE : help_text = Help_create; break; - case MANAGE : help_text = Help_manage; break; - case MISC : help_text = Help_misc; break; - case MONITOR : help_text = Help_monitor; break; - case GROW : help_text = Help_grow; break; - case INCREMENTAL:help_text= Help_incr; break; - } + help_text = mode_help[mode]; + if (help_text == NULL) + help_text = Help; fputs(help_text,stdout); exit(0); } @@ -1148,7 +1182,7 @@ else if (strcasecmp(symlinks, "no") == 0) ci->symlinks = 0; else { - fprintf(stderr, Name ": option --symlinks must be 'no' or 'yes'\n"); + pr_err("option --symlinks must be 'no' or 'yes'\n"); exit(2); } } @@ -1158,18 +1192,19 @@ * * That is mosty checked in the per-mode stuff but... * - * For @,B,C and A without -s, the first device listed must be an md device - * we check that here and open it. + * For @,B,C and A without -s, the first device listed must be + * an md device. We check that here and open it. */ - if (mode==MANAGE || mode == BUILD || mode == CREATE || mode == GROW || - (mode == ASSEMBLE && ! scan)) { + if (mode == MANAGE || mode == BUILD || mode == CREATE + || mode == GROW + || (mode == ASSEMBLE && ! 
c.scan)) { if (devs_found < 1) { - fprintf(stderr, Name ": an md device must be given in this mode\n"); + pr_err("an md device must be given in this mode\n"); exit(2); } - if ((int)ident.super_minor == -2 && autof) { - fprintf(stderr, Name ": --super-minor=dev is incompatible with --auto\n"); + if ((int)ident.super_minor == -2 && c.autof) { + pr_err("--super-minor=dev is incompatible with --auto\n"); exit(2); } if (mode == MANAGE || mode == GROW) { @@ -1180,14 +1215,14 @@ /* non-existent device is OK */ mdfd = open_mddev(devlist->devname, 0); if (mdfd == -2) { - fprintf(stderr, Name ": device %s exists but is not an " + pr_err("device %s exists but is not an " "md array.\n", devlist->devname); exit(1); } if ((int)ident.super_minor == -2) { struct stat stb; if (mdfd < 0) { - fprintf(stderr, Name ": --super-minor=dev given, and " + pr_err("--super-minor=dev given, and " "listed device %s doesn't exist.\n", devlist->devname); exit(1); @@ -1204,442 +1239,220 @@ } } - if (raiddisks) { - if (raiddisks == 1 && !force && level != -5) { - fprintf(stderr, Name ": '1' is an unusual number of drives for an array, so it is probably\n" + if (s.raiddisks) { + if (s.raiddisks == 1 && !c.force && s.level != LEVEL_FAULTY) { + pr_err("'1' is an unusual number of drives for an array, so it is probably\n" " a mistake. 
If you really mean it you will need to specify --force before\n" " setting the number of drives.\n"); exit(2); } } - if (homehost == NULL) - homehost = conf_get_homehost(&require_homehost); - if (homehost == NULL || strcasecmp(homehost, "")==0) { + if (c.homehost == NULL) + c.homehost = conf_get_homehost(&c.require_homehost); + if (c.homehost == NULL || strcasecmp(c.homehost, "")==0) { if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) { sys_hostname[sizeof(sys_hostname)-1] = 0; - homehost = sys_hostname; + c.homehost = sys_hostname; } } - if (homehost && (!homehost[0] || strcasecmp(homehost, "") == 0)) { - homehost = NULL; - require_homehost = 0; + if (c.homehost && (!c.homehost[0] || strcasecmp(c.homehost, "") == 0)) { + c.homehost = NULL; + c.require_homehost = 0; } - if (!((mode == MISC && devmode == 'E') - || (mode == MONITOR && spare_sharing == 0)) && - geteuid() != 0) { - fprintf(stderr, Name ": must be super-user to perform this action\n"); + if (c.backup_file && data_offset != INVALID_SECTORS) { + pr_err("--backup-file and --data-offset are incompatible\n"); + exit(2); + } + + if ((mode == MISC && devmode == 'E') + || (mode == MONITOR && spare_sharing == 0)) + /* Anyone may try this */; + else if (geteuid() != 0) { + pr_err("must be super-user to perform this action\n"); exit(1); } - ident.autof = autof; + ident.autof = c.autof; + + if (c.scan && c.verbose < 2) + /* --scan implied --brief unless -vv */ + c.brief = 1; rv = 0; switch(mode) { case MANAGE: /* readonly, add/remove, readwrite, runstop */ - if (readonly>0) - rv = Manage_ro(devlist->devname, mdfd, readonly); + if (c.readonly > 0) + rv = Manage_ro(devlist->devname, mdfd, c.readonly); if (!rv && devs_found>1) rv = Manage_subdevs(devlist->devname, mdfd, - devlist->next, verbose-quiet, test, - update, force); - if (!rv && readonly < 0) - rv = Manage_ro(devlist->devname, mdfd, readonly); - if (!rv && runstop) - rv = Manage_runstop(devlist->devname, mdfd, runstop, quiet); + devlist->next, 
c.verbose, c.test, + c.update, c.force); + if (!rv && c.readonly < 0) + rv = Manage_ro(devlist->devname, mdfd, c.readonly); + if (!rv && c.runstop > 0) + rv = Manage_run(devlist->devname, mdfd, c.verbose); + if (!rv && c.runstop < 0) + rv = Manage_stop(devlist->devname, mdfd, c.verbose, 0); break; case ASSEMBLE: if (devs_found == 1 && ident.uuid_set == 0 && - ident.super_minor == UnSet && ident.name[0] == 0 && !scan ) { + ident.super_minor == UnSet && ident.name[0] == 0 && !c.scan ) { /* Only a device has been given, so get details from config file */ struct mddev_ident *array_ident = conf_get_ident(devlist->devname); if (array_ident == NULL) { - fprintf(stderr, Name ": %s not identified in config file.\n", + pr_err("%s not identified in config file.\n", devlist->devname); rv |= 1; if (mdfd >= 0) close(mdfd); } else { if (array_ident->autof == 0) - array_ident->autof = autof; + array_ident->autof = c.autof; rv |= Assemble(ss, devlist->devname, array_ident, - NULL, backup_file, invalid_backup, - readonly, runstop, update, - homehost, require_homehost, - verbose-quiet, force, - freeze_reshape); + NULL, &c); } - } else if (!scan) + } else if (!c.scan) rv = Assemble(ss, devlist->devname, &ident, - devlist->next, backup_file, invalid_backup, - readonly, runstop, update, - homehost, require_homehost, - verbose-quiet, force, - freeze_reshape); - else if (devs_found>0) { - if (update && devs_found > 1) { - fprintf(stderr, Name ": can only update a single array at a time\n"); + devlist->next, &c); + else if (devs_found > 0) { + if (c.update && devs_found > 1) { + pr_err("can only update a single array at a time\n"); exit(1); } - if (backup_file && devs_found > 1) { - fprintf(stderr, Name ": can only assemble a single array when providing a backup file.\n"); + if (c.backup_file && devs_found > 1) { + pr_err("can only assemble a single array when providing a backup file.\n"); exit(1); } for (dv = devlist ; dv ; dv=dv->next) { struct mddev_ident *array_ident = 
conf_get_ident(dv->devname); if (array_ident == NULL) { - fprintf(stderr, Name ": %s not identified in config file.\n", + pr_err("%s not identified in config file.\n", dv->devname); rv |= 1; continue; } if (array_ident->autof == 0) - array_ident->autof = autof; + array_ident->autof = c.autof; rv |= Assemble(ss, dv->devname, array_ident, - NULL, backup_file, invalid_backup, - readonly, runstop, update, - homehost, require_homehost, - verbose-quiet, force, - freeze_reshape); + NULL, &c); } } else { - struct mddev_ident *a, *array_list = conf_get_ident(NULL); - struct mddev_dev *devlist = conf_get_devs(); - struct map_ent *map = NULL; - int cnt = 0; - int failures, successes; - - if (conf_verify_devnames(array_list)) { - fprintf(stderr, Name - ": Duplicate MD device names in " - "conf file were found.\n"); - exit(1); - } - if (devlist == NULL) { - fprintf(stderr, Name ": No devices listed in conf file were found.\n"); - exit(1); - } - if (update) { - fprintf(stderr, Name ": --update not meaningful with a --scan assembly.\n"); + if (c.update) { + pr_err("--update not meaningful with a --scan assembly.\n"); exit(1); } - if (backup_file) { - fprintf(stderr, Name ": --backup_file not meaningful with a --scan assembly.\n"); + if (c.backup_file) { + pr_err("--backup_file not meaningful with a --scan assembly.\n"); exit(1); } - for (a = array_list; a ; a = a->next) { - a->assembled = 0; - if (a->autof == 0) - a->autof = autof; - } - if (map_lock(&map)) - fprintf(stderr, Name " %s: failed to get " - "exclusive lock on mapfile\n", - __func__); - do { - failures = 0; - successes = 0; - rv = 0; - for (a = array_list; a ; a = a->next) { - int r; - if (a->assembled) - continue; - if (a->devname && - strcasecmp(a->devname, "") == 0) - continue; - - r = Assemble(ss, a->devname, - a, - NULL, NULL, 0, - readonly, runstop, NULL, - homehost, require_homehost, - verbose-quiet, force, - freeze_reshape); - if (r == 0) { - a->assembled = 1; - successes++; - } else - failures++; - rv |= r; - 
cnt++; - } - } while (failures && successes); - if (homehost && cnt == 0) { - /* Maybe we can auto-assemble something. - * Repeatedly call Assemble in auto-assemble mode - * until it fails - */ - int rv2; - int acnt; - ident.autof = autof; - do { - struct mddev_dev *devlist = conf_get_devs(); - acnt = 0; - do { - rv2 = Assemble(ss, NULL, - &ident, - devlist, NULL, 0, - readonly, - runstop, NULL, - homehost, - require_homehost, - verbose-quiet, - force, - freeze_reshape); - if (rv2==0) { - cnt++; - acnt++; - } - } while (rv2!=2); - /* Incase there are stacked devices, we need to go around again */ - } while (acnt); - if (cnt == 0 && rv == 0) { - fprintf(stderr, Name ": No arrays found in config file or automatically\n"); - rv = 1; - } else if (cnt) - rv = 0; - } else if (cnt == 0 && rv == 0) { - fprintf(stderr, Name ": No arrays found in config file\n"); - rv = 1; - } - map_unlock(&map); + rv = scan_assemble(ss, &c, &ident); } + break; case BUILD: - if (delay == 0) delay = DEFAULT_BITMAP_DELAY; - if (write_behind && !bitmap_file) { - fprintf(stderr, Name ": write-behind mode requires a bitmap.\n"); + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + if (s.write_behind && !s.bitmap_file) { + pr_err("write-behind mode requires a bitmap.\n"); rv = 1; break; } - if (raiddisks == 0) { - fprintf(stderr, Name ": no raid-devices specified.\n"); + if (s.raiddisks == 0) { + pr_err("no raid-devices specified.\n"); rv = 1; break; } - if (bitmap_file) { - if (strcmp(bitmap_file, "internal")==0) { - fprintf(stderr, Name ": 'internal' bitmaps not supported with --build\n"); + if (s.bitmap_file) { + if (strcmp(s.bitmap_file, "internal")==0) { + pr_err("'internal' bitmaps not supported with --build\n"); rv |= 1; break; } } - rv = Build(devlist->devname, chunk, level, layout, - raiddisks, devlist->next, assume_clean, - bitmap_file, bitmap_chunk, write_behind, - delay, verbose-quiet, autof, size); + rv = Build(devlist->devname, devlist->next, &s, &c); break; case CREATE: - if 
(delay == 0) delay = DEFAULT_BITMAP_DELAY; - if (write_behind && !bitmap_file) { - fprintf(stderr, Name ": write-behind mode requires a bitmap.\n"); + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + if (s.write_behind && !s.bitmap_file) { + pr_err("write-behind mode requires a bitmap.\n"); rv = 1; break; } - if (raiddisks == 0) { - fprintf(stderr, Name ": no raid-devices specified.\n"); + if (s.raiddisks == 0) { + pr_err("no raid-devices specified.\n"); rv = 1; break; } - rv = Create(ss, devlist->devname, chunk, level, layout, size<0 ? 0 : size, - raiddisks, sparedisks, ident.name, homehost, - ident.uuid_set ? ident.uuid : NULL, - devs_found-1, devlist->next, runstop, verbose-quiet, force, assume_clean, - bitmap_file, bitmap_chunk, write_behind, delay, autof); + rv = Create(ss, devlist->devname, + ident.name, ident.uuid_set ? ident.uuid : NULL, + devs_found-1, devlist->next, + &s, &c, data_offset); break; case MISC: if (devmode == 'E') { - if (devlist == NULL && !scan) { - fprintf(stderr, Name ": No devices to examine\n"); + if (devlist == NULL && !c.scan) { + pr_err("No devices to examine\n"); exit(2); } if (devlist == NULL) devlist = conf_get_devs(); if (devlist == NULL) { - fprintf(stderr, Name ": No devices listed in %s\n", configfile?configfile:DefaultConfFile); + pr_err("No devices listed in %s\n", configfile?configfile:DefaultConfFile); exit(1); } - if (brief && verbose) - brief = 2; - rv = Examine(devlist, scan?(verbose>1?0:verbose+1):brief, - export, scan, - SparcAdjust, ss, homehost); + rv = Examine(devlist, &c, ss); } else if (devmode == DetailPlatform) { - rv = Detail_Platform(ss ? ss->ss : NULL, ss ? 
scan : 1, verbose); - } else { - if (devlist == NULL) { - if ((devmode=='D' || devmode == Waitclean) && scan) { - /* apply --detail or --wait-clean to - * all devices in /proc/mdstat - */ - struct mdstat_ent *ms = mdstat_read(0, 1); - struct mdstat_ent *e; - struct map_ent *map = NULL; - int members; - int v = verbose>1?0:verbose+1; - - for (members = 0; members <= 1; members++) { - for (e=ms ; e ; e=e->next) { - char *name; - struct map_ent *me; - int member = e->metadata_version && - strncmp(e->metadata_version, - "external:/", 10) == 0; - if (members != member) - continue; - me = map_by_devnum(&map, e->devnum); - if (me && me->path - && strcmp(me->path, "/unknown") != 0) - name = me->path; - else - name = get_md_name(e->devnum); - - if (!name) { - fprintf(stderr, Name ": cannot find device file for %s\n", - e->dev); - continue; - } - if (devmode == 'D') - rv |= Detail(name, v, - export, test, - homehost, prefer); - else - rv |= WaitClean(name, -1, v); - put_md_name(name); - } - } - free_mdstat(ms); - } else if (devmode == 'S' && scan) { - /* apply --stop to all devices in /proc/mdstat */ - /* Due to possible stacking of devices, repeat until - * nothing more can be stopped - */ - int progress=1, err; - int last = 0; - do { - struct mdstat_ent *ms = mdstat_read(0, 0); - struct mdstat_ent *e; - - if (!progress) last = 1; - progress = 0; err = 0; - for (e=ms ; e ; e=e->next) { - char *name = get_md_name(e->devnum); - - if (!name) { - fprintf(stderr, Name ": cannot find device file for %s\n", - e->dev); - continue; - } - mdfd = open_mddev(name, 1); - if (mdfd >= 0) { - if (Manage_runstop(name, mdfd, -1, quiet?1:last?0:-1)) - err = 1; - else - progress = 1; - close(mdfd); - } - - put_md_name(name); - } - free_mdstat(ms); - } while (!last && err); - if (err) rv |= 1; - } else if (devmode == UdevRules) { - rv = Write_rules(udev_filename); - } else { - fprintf(stderr, Name ": No devices given.\n"); - exit(2); - } - } - for (dv=devlist ; dv; dv=dv->next) { - 
switch(dv->disposition) { - case 'D': - rv |= Detail(dv->devname, - brief?1+verbose:0, - export, test, homehost, prefer); - continue; - case 'K': /* Zero superblock */ - if (ss) - rv |= Kill(dv->devname, ss, force, quiet,0); - else { - int q = quiet; - do { - rv |= Kill(dv->devname, NULL, force, q, 0); - q = 1; - } while (rv == 0); - rv &= ~2; - } - continue; - case 'Q': - rv |= Query(dv->devname); continue; - case 'X': - rv |= ExamineBitmap(dv->devname, brief, ss); continue; - case 'W': - case WaitOpt: - rv |= Wait(dv->devname); continue; - case Waitclean: - rv |= WaitClean(dv->devname, -1, verbose-quiet); continue; - case KillSubarray: - rv |= Kill_subarray(dv->devname, subarray, quiet); - continue; - case UpdateSubarray: - if (update == NULL) { - fprintf(stderr, - Name ": -U/--update must be specified with --update-subarray\n"); - rv |= 1; - continue; - } - rv |= Update_subarray(dv->devname, subarray, update, &ident, quiet); - continue; - } - mdfd = open_mddev(dv->devname, 1); - if (mdfd>=0) { - switch(dv->disposition) { - case 'R': - rv |= Manage_runstop(dv->devname, mdfd, 1, quiet); break; - case 'S': - rv |= Manage_runstop(dv->devname, mdfd, -1, quiet); break; - case 'o': - rv |= Manage_ro(dv->devname, mdfd, 1); break; - case 'w': - rv |= Manage_ro(dv->devname, mdfd, -1); break; - } - close(mdfd); - } else - rv |= 1; + rv = Detail_Platform(ss ? ss->ss : NULL, ss ? c.scan : 1, + c.verbose, c.export, + devlist ? 
devlist->devname : NULL); + } else if (devlist == NULL) { + if (devmode == 'S' && c.scan) + rv = stop_scan(c.verbose); + else if ((devmode == 'D' || devmode == Waitclean) && c.scan) + rv = misc_scan(devmode, &c); + else if (devmode == UdevRules) + rv = Write_rules(udev_filename); + else { + pr_err("No devices given.\n"); + exit(2); } - } + } else + rv = misc_list(devlist, &ident, dump_directory, ss, &c); break; case MONITOR: - if (!devlist && !scan) { - fprintf(stderr, Name ": Cannot monitor: need --scan or at least one device\n"); + if (!devlist && !c.scan) { + pr_err("Cannot monitor: need --scan or at least one device\n"); rv = 1; break; } if (pidfile && !daemonise) { - fprintf(stderr, Name ": Cannot write a pid file when not in daemon mode\n"); + pr_err("Cannot write a pid file when not in daemon mode\n"); rv = 1; break; } - if (delay == 0) { + if (c.delay == 0) { if (get_linux_version() > 2006016) /* mdstat responds to poll */ - delay = 1000; + c.delay = 1000; else - delay = 60; + c.delay = 60; } + if (c.delay == 0) + c.delay = 60; rv= Monitor(devlist, mailaddr, program, - delay?delay:60, daemonise, scan, oneshot, - dosyslog, test, pidfile, increments, - spare_sharing, prefer); + &c, daemonise, oneshot, + dosyslog, pidfile, increments, + spare_sharing); break; case GROW: - if (array_size >= 0) { + if (array_size > 0) { /* alway impose array size first, independent of * anything else * Do not allow level or raid_disks changes at the @@ -1647,33 +1460,33 @@ */ struct mdinfo sra; int err; - if (raiddisks || level != UnSet) { - fprintf(stderr, Name ": cannot change array size in same operation " + if (s.raiddisks || s.level != UnSet) { + pr_err("cannot change array size in same operation " "as changing raiddisks or level.\n" " Change size first, then check that data is still intact.\n"); rv = 1; break; } - sysfs_init(&sra, mdfd, 0); - if (array_size == 0) + sysfs_init(&sra, mdfd, NULL); + if (array_size == MAX_SIZE) err = sysfs_set_str(&sra, NULL, "array_size", 
"default"); else err = sysfs_set_num(&sra, NULL, "array_size", array_size / 2); if (err < 0) { if (errno == E2BIG) - fprintf(stderr, Name ": --array-size setting" + pr_err("--array-size setting" " is too large.\n"); else - fprintf(stderr, Name ": current kernel does" + pr_err("current kernel does" " not support setting --array-size\n"); rv = 1; break; } } - if (devs_found > 1 && raiddisks == 0) { + if (devs_found > 1 && s.raiddisks == 0 && s.level == UnSet) { /* must be '-a'. */ - if (size >= 0 || chunk || layout_str != NULL || bitmap_file) { - fprintf(stderr, Name ": --add cannot be used with " + if (s.size > 0 || s.chunk || s.layout_str != NULL || s.bitmap_file) { + pr_err("--add cannot be used with " "other geometry changes in --grow mode\n"); rv = 1; break; @@ -1684,71 +1497,63 @@ if (rv) break; } - } else if (bitmap_file) { - if (size >= 0 || raiddisks || chunk || - layout_str != NULL || devs_found > 1) { - fprintf(stderr, Name ": --bitmap changes cannot be " + } else if (s.bitmap_file) { + if (s.size > 0 || s.raiddisks || s.chunk || + s.layout_str != NULL || devs_found > 1) { + pr_err("--bitmap changes cannot be " "used with other geometry changes " "in --grow mode\n"); rv = 1; break; } - if (delay == 0) - delay = DEFAULT_BITMAP_DELAY; - rv = Grow_addbitmap(devlist->devname, mdfd, bitmap_file, - bitmap_chunk, delay, write_behind, force); + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + rv = Grow_addbitmap(devlist->devname, mdfd, &c, &s); } else if (grow_continue) rv = Grow_continue_command(devlist->devname, - mdfd, backup_file, - verbose); - else if (size >= 0 || raiddisks != 0 || layout_str != NULL - || chunk != 0 || level != UnSet) { - rv = Grow_reshape(devlist->devname, mdfd, quiet, backup_file, - size, level, layout_str, chunk, raiddisks, + mdfd, c.backup_file, + c.verbose); + else if (s.size > 0 || s.raiddisks || s.layout_str != NULL + || s.chunk != 0 || s.level != UnSet + || data_offset != INVALID_SECTORS) { + rv = Grow_reshape(devlist->devname, 
mdfd, devlist->next, - assume_clean, force); - } else if (array_size < 0) - fprintf(stderr, Name ": no changes to --grow\n"); + data_offset, &c, &s); + } else if (array_size == 0) + pr_err("no changes to --grow\n"); break; case INCREMENTAL: if (rebuild_map) { RebuildMap(); } - if (scan) { - if (runstop <= 0) { - fprintf(stderr, Name - ": --incremental --scan meaningless without --run.\n"); + if (c.scan) { + if (c.runstop <= 0) { + pr_err("--incremental --scan meaningless without --run.\n"); break; } if (devmode == 'f') { - fprintf(stderr, Name - ": --incremental --scan --fail not supported.\n"); + pr_err("--incremental --scan --fail not supported.\n"); break; } - rv = IncrementalScan(verbose); + rv = IncrementalScan(c.verbose, NULL); } if (!devlist) { - if (!rebuild_map && !scan) { - fprintf(stderr, Name - ": --incremental requires a device.\n"); + if (!rebuild_map && !c.scan) { + pr_err("--incremental requires a device.\n"); rv = 1; } break; } if (devlist->next) { - fprintf(stderr, Name - ": --incremental can only handle one device.\n"); + pr_err("--incremental can only handle one device.\n"); rv = 1; break; } if (devmode == 'f') rv = IncrementalRemove(devlist->devname, remove_path, - verbose-quiet); + c.verbose); else - rv = Incremental(devlist->devname, verbose-quiet, - runstop, ss, homehost, - require_homehost, autof, - freeze_reshape); + rv = Incremental(devlist->devname, &c, ss); break; case AUTODETECT: autodetect(); @@ -1756,3 +1561,260 @@ } exit(rv); } + +static int scan_assemble(struct supertype *ss, + struct context *c, + struct mddev_ident *ident) +{ + struct mddev_ident *a, *array_list = conf_get_ident(NULL); + struct mddev_dev *devlist = conf_get_devs(); + struct map_ent *map = NULL; + int cnt = 0; + int rv = 0; + int failures, successes; + + if (conf_verify_devnames(array_list)) { + pr_err("Duplicate MD device names in " + "conf file were found.\n"); + return 1; + } + if (devlist == NULL) { + pr_err("No devices listed in conf file were found.\n"); + 
return 1; + } + for (a = array_list; a ; a = a->next) { + a->assembled = 0; + if (a->autof == 0) + a->autof = c->autof; + } + if (map_lock(&map)) + pr_err("%s: failed to get " + "exclusive lock on mapfile\n", + __func__); + do { + failures = 0; + successes = 0; + rv = 0; + for (a = array_list; a ; a = a->next) { + int r; + if (a->assembled) + continue; + if (a->devname && + strcasecmp(a->devname, "") == 0) + continue; + + r = Assemble(ss, a->devname, + a, NULL, c); + if (r == 0) { + a->assembled = 1; + successes++; + } else + failures++; + rv |= r; + cnt++; + } + } while (failures && successes); + if (c->homehost && cnt == 0) { + /* Maybe we can auto-assemble something. + * Repeatedly call Assemble in auto-assemble mode + * until it fails + */ + int rv2; + int acnt; + ident->autof = c->autof; + do { + struct mddev_dev *devlist = conf_get_devs(); + acnt = 0; + do { + rv2 = Assemble(ss, NULL, + ident, + devlist, c); + if (rv2==0) { + cnt++; + acnt++; + } + } while (rv2!=2); + /* In case there are stacked devices, we need to go around again */ + } while (acnt); + if (cnt == 0 && rv == 0) { + pr_err("No arrays found in config file or automatically\n"); + rv = 1; + } else if (cnt) + rv = 0; + } else if (cnt == 0 && rv == 0) { + pr_err("No arrays found in config file\n"); + rv = 1; + } + map_unlock(&map); + return rv; +} + +static int misc_scan(char devmode, struct context *c) +{ + /* apply --detail or --wait-clean to + * all devices in /proc/mdstat + */ + struct mdstat_ent *ms = mdstat_read(0, 1); + struct mdstat_ent *e; + struct map_ent *map = NULL; + int members; + int rv = 0; + + for (members = 0; members <= 1; members++) { + for (e=ms ; e ; e=e->next) { + char *name = NULL; + struct map_ent *me; + struct stat stb; + int member = e->metadata_version && + strncmp(e->metadata_version, + "external:/", 10) == 0; + if (members != member) + continue; + me = map_by_devnm(&map, e->devnm); + if (me && me->path + && strcmp(me->path, "/unknown") != 0) + name = me->path; + if
(name == NULL || + stat(name, &stb) != 0) + name = get_md_name(e->devnm); + + if (!name) { + pr_err("cannot find device file for %s\n", + e->dev); + continue; + } + if (devmode == 'D') + rv |= Detail(name, c); + else + rv |= WaitClean(name, -1, c->verbose); + put_md_name(name); + } + } + free_mdstat(ms); + return rv; +} + +static int stop_scan(int verbose) +{ + /* apply --stop to all devices in /proc/mdstat */ + /* Due to possible stacking of devices, repeat until + * nothing more can be stopped + */ + int progress=1, err; + int last = 0; + int rv = 0; + do { + struct mdstat_ent *ms = mdstat_read(0, 0); + struct mdstat_ent *e; + + if (!progress) last = 1; + progress = 0; err = 0; + for (e=ms ; e ; e=e->next) { + char *name = get_md_name(e->devnm); + int mdfd; + + if (!name) { + pr_err("cannot find device file for %s\n", + e->dev); + continue; + } + mdfd = open_mddev(name, 1); + if (mdfd >= 0) { + if (Manage_stop(name, mdfd, verbose, !last)) + err = 1; + else + progress = 1; + close(mdfd); + } + + put_md_name(name); + } + free_mdstat(ms); + } while (!last && err); + if (err) + rv |= 1; + return rv; +} + +static int misc_list(struct mddev_dev *devlist, + struct mddev_ident *ident, + char *dump_directory, + struct supertype *ss, struct context *c) +{ + struct mddev_dev *dv; + int rv = 0; + + for (dv=devlist ; dv; dv=(rv & 16) ? 
NULL : dv->next) { + int mdfd; + + switch(dv->disposition) { + case 'D': + rv |= Detail(dv->devname, c); + continue; + case KillOpt: /* Zero superblock */ + if (ss) + rv |= Kill(dv->devname, ss, c->force, c->verbose,0); + else { + int v = c->verbose; + do { + rv |= Kill(dv->devname, NULL, c->force, v, 0); + v = -1; + } while (rv == 0); + rv &= ~2; + } + continue; + case 'Q': + rv |= Query(dv->devname); continue; + case 'X': + rv |= ExamineBitmap(dv->devname, c->brief, ss); continue; + case ExamineBB: + rv |= ExamineBadblocks(dv->devname, c->brief, ss); continue; + case 'W': + case WaitOpt: + rv |= Wait(dv->devname); continue; + case Waitclean: + rv |= WaitClean(dv->devname, -1, c->verbose); continue; + case KillSubarray: + rv |= Kill_subarray(dv->devname, c->subarray, c->verbose); + continue; + case UpdateSubarray: + if (c->update == NULL) { + pr_err("-U/--update must be specified with --update-subarray\n"); + rv |= 1; + continue; + } + rv |= Update_subarray(dv->devname, c->subarray, + c->update, ident, c->verbose); + continue; + case Dump: + rv |= Dump_metadata(dv->devname, dump_directory, c, ss); + continue; + case Restore: + rv |= Restore_metadata(dv->devname, dump_directory, c, ss, + (dv == devlist && dv->next == NULL)); + continue; + } + if (dv->devname[0] == '/') + mdfd = open_mddev(dv->devname, 1); + else { + mdfd = open_dev(dv->devname); + if (mdfd < 0) + pr_err("Cannot open %s\n", dv->devname); + } + if (mdfd>=0) { + switch(dv->disposition) { + case 'R': + rv |= Manage_run(dv->devname, mdfd, c->verbose); break; + case 'S': + rv |= Manage_stop(dv->devname, mdfd, c->verbose, 0); break; + case 'o': + rv |= Manage_ro(dv->devname, mdfd, 1); break; + case 'w': + rv |= Manage_ro(dv->devname, mdfd, -1); break; + } + close(mdfd); + } else + rv |= 1; + } + return rv; +} diff -Nru mdadm-3.2.5/mdadm.conf.5 mdadm-3.3/mdadm.conf.5 --- mdadm-3.2.5/mdadm.conf.5 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdadm.conf.5 2013-09-03 04:47:47.000000000 +0000 @@ -25,6 
+25,16 @@ Any word that beings with a hash sign (#) starts a comment and that word together with the remainder of the line is ignored. +Spaces can be included in a word using quotation characters. Either +single quotes +.RB ( ' ) +or double quotes (\fB"\fP) +may be used. All the characters from one quotation character to the +next identical character are protected and will not be used to +separate words or to start new quoted strings. To include a single quote +it must be between double quotes. To include a double quote it must +be between single quotes. + Any line that starts with white space (space or tab) is treated as though it were a continuation of the previous line. @@ -138,11 +148,11 @@ The value is a comma separated list of device names or device name patterns. Only devices with names which match one entry in the list will be used -to assemble the array. Note that the devices +to assemble the array. Note that the devices listed there must also be listed on a DEVICE line. .TP .B level= -The value is a raid level. This is not normally used to +The value is a RAID level. This is not normally used to identify an array, but is supported so that the output of .B "mdadm \-\-examine \-\-scan" @@ -246,8 +256,8 @@ .B \-\-scan option). There should only be one .B MAILADDR -line and it should have only one address. - +line and it should have only one address. Any subsequent addresses +are silently ignored. .TP .B MAILFROM @@ -329,6 +339,32 @@ Give .B symlinks=no to suppress this symlink creation. + +.TP +.B names=yes +Since Linux 2.6.29 it has been possible to create +.B md +devices with a name like +.B md_home +rather than just a number, like +.BR md3 . +.I mdadm +will use the numeric alternative by default as other tools that interact +with md arrays may expect only numbers. +If +.B names=yes +is given in +.I mdadm.conf +then +.I mdadm +will use a name when appropriate.
+If +.B names=no +is given, then non-numeric +.I md +device names will not be used even if the default changes in a future +release of +.IR mdadm . .RE .TP @@ -361,7 +397,9 @@ used when creating arrays. This is the only case when there can be more that one other word on the .B HOMEHOST -line. +line. If there are other words, or other +.B HOMEHOST +lines, they are silently ignored. If .B @@ -425,9 +463,14 @@ .BR ddf , .BR imsm . +.B AUTO +should be given at most once. Subsequent lines are silently ignored. +Thus an earlier config file in a config directory will over-ride +the setting in a later config file. + .TP .B POLICY -This is used to specify what automatic behavior is allowed on devices +This is used to specify what automatic behavior is allowed on devices newly appearing in the system and provides a way of marking spares that can be moved to other arrays as well as the migration domains. .I Domain @@ -465,7 +508,7 @@ .B /dev/disk/by-path .TP .B type= -either +either .B disk or .BR part . @@ -490,9 +533,12 @@ .B include allows adding a disk to an array if metadata on that disk matches that array .TP -.B re-add +.B re\-add will include the device in the array if it appears to be a current member -or a member that was recently removed +or a member that was recently removed and the array has a +write-intent-bitmap to allow the +.B re\-add +functionality. .TP .B spare as above and additionally: if the device is bare it can @@ -504,7 +550,7 @@ degraded recently and the device plugged in has no metadata then it will be automatically added to that array (or it's container) .TP -.B force-spare +.B force\-spare as above and the disk will become a spare in remaining cases .RE @@ -547,6 +593,10 @@ .br auto=part .br +# The name of this array contains a space. 
+.br +ARRAY /dev/md9 name='Data Storage' +.sp POLICY domain=domain1 metadata=imsm path=pci-0000:00:1f.2-scsi-* .br action=spare @@ -573,9 +623,8 @@ .br HOMEHOST .br -AUTO +1.x homehost -all +AUTO +1.x homehost \-all .SH SEE ALSO .BR mdadm (8), .BR md (4). - diff -Nru mdadm-3.2.5/mdadm.h mdadm-3.3/mdadm.h --- mdadm-3.2.5/mdadm.h 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdadm.h 2013-09-03 04:47:47.000000000 +0000 @@ -25,9 +25,9 @@ #define _GNU_SOURCE #define _FILE_OFFSET_BITS 64 #include -#if !defined(__dietlibc__) && !defined(__KLIBC__) +#ifdef __GLIBC__ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); -#else +#elif !defined(lseek64) # if defined(__NO_STAT64) || __WORDSIZE != 32 # define lseek64 lseek # endif @@ -51,7 +51,6 @@ #define srandom srand #endif - #include /*#include */ #include @@ -162,7 +161,6 @@ #endif #endif /* __KLIBC__ */ - /* * min()/max()/clamp() macros that also do * strict type-checking.. See the @@ -188,6 +186,7 @@ int uuid[4]; char name[33]; unsigned long long data_offset; + unsigned long long new_data_offset; unsigned long long component_size; /* same as array.size, except in * sectors and up to 64bits. */ @@ -197,6 +196,7 @@ #define NO_RESHAPE 0 #define VOLUME_RESHAPE 1 #define CONTAINER_RESHAPE 2 +#define RESHAPE_NO_BACKUP 16 /* Mask 'or'ed in */ int reshape_active; unsigned long long reshape_progress; int recovery_blocked; /* for external metadata it @@ -206,6 +206,12 @@ * for native metadata it is * reshape_active field mirror */ + /* During reshape we can sometimes change the data_offset to avoid + * over-writing still-valid data. We need to know if there is space. + * So getinfo_super will fill in space_before and space_after in sectors. + * data_offset can be increased or decreased by this amount. 
+ */ + unsigned long long space_before, space_after; union { unsigned long long resync_start; /* per-array resync position */ unsigned long long recovery_start; /* per-device rebuild position */ @@ -225,7 +231,7 @@ int container_enough; /* flag external handlers can set to * indicate that subarrays have not enough (-1), * enough to start (0), or all expected disks (1) */ - char sys_name[20]; + char sys_name[20]; struct mdinfo *devs; struct mdinfo *next; @@ -249,6 +255,7 @@ int autof; int mode; int symlinks; + int names; struct supertype *supertype; }; @@ -264,6 +271,7 @@ GROW, INCREMENTAL, AUTODETECT, + mode_count }; extern char short_options[]; @@ -271,6 +279,7 @@ extern char short_bitmap_auto_options[]; extern struct option long_options[]; extern char Version[], Usage[], Help[], OptionHelp[], + *mode_help[], Help_create[], Help_build[], Help_assemble[], Help_grow[], Help_incr[], Help_manage[], Help_misc[], Help_monitor[], Help_config[]; @@ -302,6 +311,8 @@ Add, Remove, Fail, + Replace, + With, MiscOpt, WaitOpt, ConfigFile, @@ -323,6 +334,16 @@ Continue, OffRootOpt, Prefer, + KillOpt, + DataOffset, + ExamineBB, + Dump, + Restore, +}; + +enum prefix_standard { + JEDEC, + IEC }; /* structures read from config file */ @@ -371,15 +392,52 @@ }; }; +struct context { + int readonly; + int runstop; + int verbose; + int brief; + int force; + char *homehost; + int require_homehost; + char *prefer; + int export; + int test; + char *subarray; + char *update; + int scan; + int SparcAdjust; + int autof; + int delay; + int freeze_reshape; + char *backup_file; + int invalid_backup; +}; + +struct shape { + int raiddisks; + int sparedisks; + int level; + int layout; + char *layout_str; + int chunk; + int bitmap_chunk; + char *bitmap_file; + int assume_clean; + int write_behind; + unsigned long long size; +}; + /* List of device names - wildcards expanded */ struct mddev_dev { char *devname; - int disposition; /* 'a' for add, 'r' for remove, 'f' for fail. 
+ int disposition; /* 'a' for add, 'r' for remove, 'f' for fail, + * 'A' for re_add. * Not set for names read from .config */ char writemostly; /* 1 for 'set writemostly', 2 for 'clear writemostly' */ - char re_add; - char used; /* set when used */ + int used; /* set when used */ + long long data_offset; struct mddev_dev *next; }; @@ -388,10 +446,9 @@ int num; } mapping_t; - struct mdstat_ent { char *dev; - int devnum; + char devnm[32]; int active; char *level; char *pattern; /* U or up, _ for down */ @@ -403,38 +460,49 @@ struct dev_member { char *name; struct dev_member *next; - } *members; + } *members; struct mdstat_ent *next; }; extern struct mdstat_ent *mdstat_read(int hold, int start); +extern void mdstat_close(void); extern void free_mdstat(struct mdstat_ent *ms); extern void mdstat_wait(int seconds); extern void mdstat_wait_fd(int fd, const sigset_t *sigmask); -extern int mddev_busy(int devnum); +extern int mddev_busy(char *devnm); extern struct mdstat_ent *mdstat_by_component(char *name); -extern struct mdstat_ent *mdstat_by_subdev(char *subdev, int container); +extern struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container); struct map_ent { struct map_ent *next; - int devnum; + char devnm[32]; char metadata[20]; int uuid[4]; int bad; char *path; }; -extern int map_update(struct map_ent **mpp, int devnum, char *metadata, +extern int map_update(struct map_ent **mpp, char *devnm, char *metadata, int uuid[4], char *path); -extern void map_remove(struct map_ent **map, int devnum); +extern void map_remove(struct map_ent **map, char *devnm); extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]); -extern struct map_ent *map_by_devnum(struct map_ent **map, int devnum); +#ifdef MDASSEMBLE +static inline struct map_ent *map_by_devnm(struct map_ent **map, char *name) +{ + return NULL; +} +static inline void map_free(struct map_ent *map) +{ +} +#else +extern struct map_ent *map_by_devnm(struct map_ent **map, char *devnm); +extern void 
map_free(struct map_ent *map); +#endif extern struct map_ent *map_by_name(struct map_ent **map, char *name); extern void map_read(struct map_ent **melp); extern int map_write(struct map_ent *mel); -extern void map_delete(struct map_ent **mapp, int devnum); -extern void map_free(struct map_ent *map); +extern void map_delete(struct map_ent **mapp, char *devnm); extern void map_add(struct map_ent **melp, - int devnum, char *metadata, int uuid[4], char *path); + char *devnm, char *metadata, int uuid[4], char *path); extern int map_lock(struct map_ent **melp); extern void map_unlock(struct map_ent **melp); extern void map_fork(void); @@ -461,12 +529,12 @@ }; /* If fd >= 0, get the array it is open on, - * else use devnum. >=0 -> major9. <0..... + * else use devnm. */ -extern int sysfs_open(int devnum, char *devname, char *attr); -extern void sysfs_init(struct mdinfo *mdi, int fd, int devnum); +extern int sysfs_open(char *devnm, char *devname, char *attr); +extern void sysfs_init(struct mdinfo *mdi, int fd, char *devnm); extern void sysfs_free(struct mdinfo *sra); -extern struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options); +extern struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options); extern int sysfs_attr_match(const char *attr, const char *str); extern int sysfs_match_word(const char *word, char **list); extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, @@ -481,6 +549,9 @@ extern int sysfs_fd_get_ll(int fd, unsigned long long *val); extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, char *name, unsigned long long *val); +extern int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2); +extern int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *v1, unsigned long long *v2); extern int sysfs_fd_get_str(int fd, char *val, int size); extern int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name); @@ -490,8 +561,9 @@ extern int 
sysfs_set_array(struct mdinfo *info, int vers); extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume); extern int sysfs_disk_to_scsi_id(int fd, __u32 *id); -extern int sysfs_unique_holder(int devnum, long rdev); +extern int sysfs_unique_holder(char *devnm, long rdev); extern int sysfs_freeze_array(struct mdinfo *sra); +extern int sysfs_wait(int fd, int *msec); extern int load_sys(char *path, char *buf); extern int reshape_prepare_fdlist(char *devname, struct mdinfo *sra, @@ -545,8 +617,7 @@ struct active_array; struct metadata_update; - -/* 'struct reshape' records the intermediate states +/* 'struct reshape' records the intermediate states of * a general reshape. * The starting geometry is converted to the 'before' geometry * by at most an atomic level change. They could be the same. @@ -557,6 +628,10 @@ * This will be a multiple of the stripe size in each of the * 'before' and 'after' geometries. * If 'blocks' is 0, no restriping is necessary. + * 'min_offset_change' is the minimum change to data_offset to + * allow the reshape to happen. It is at least the larger of + * the old and new chunk sizes, and typically the same as 'blocks' + * divided by number of data disks. */ struct reshape { int level; @@ -566,13 +641,14 @@ int data_disks; } before, after; unsigned long long backup_blocks; + unsigned long long min_offset_change; unsigned long long stripes; /* number of old stripes that comprise 'blocks'*/ unsigned long long new_size; /* New size of array in sectors */ }; /* A superswitch provides entry point the a metadata handler. * - * The super_switch primarily operates on some "metadata" that + * The superswitch primarily operates on some "metadata" that * is accessed via the 'supertype'. * This metadata has one of three possible sources. * 1/ It is read from a single device. 
In this case it may not completely @@ -606,6 +682,8 @@ void (*brief_examine_super)(struct supertype *st, int verbose); void (*brief_examine_subarrays)(struct supertype *st, int verbose); void (*export_examine_super)(struct supertype *st); + int (*examine_badblocks)(struct supertype *st, int fd, char *devname); + int (*copy_metadata)(struct supertype *st, int from, int to); /* Used to report details of an active array. * ->load_super was possibly given a 'component' string. @@ -615,7 +693,8 @@ void (*export_detail_super)(struct supertype *st); /* Optional: platform hardware / firmware details */ - int (*detail_platform)(int verbose, int enumerate_only); + int (*detail_platform)(int verbose, int enumerate_only, char *controller_path); + int (*export_detail_platform)(int verbose, char *controller_path); /* Used: * to get uuid to storing in bitmap metadata @@ -668,6 +747,11 @@ * linear-grow-update - now change the size of the array. * writemostly - set the WriteMostly1 bit in the superblock devflags * readwrite - clear the WriteMostly1 bit in the superblock devflags + * no-bitmap - clear any record that a bitmap is present. + * bbl - add a bad-block-log if possible + * no-bbl - remove any bad-block-log if it is empty. + * revert-reshape - If a reshape is in progress, modify metadata so + * it will resume going in the opposite direction. */ int (*update_super)(struct supertype *st, struct mdinfo *info, char *update, @@ -681,13 +765,15 @@ */ int (*init_super)(struct supertype *st, mdu_array_info_t *info, unsigned long long size, char *name, - char *homehost, int *uuid); + char *homehost, int *uuid, + unsigned long long data_offset); /* update the metadata to include new device, either at create or * when hot-adding a spare. */ int (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo, - int fd, char *devname); + int fd, char *devname, + unsigned long long data_offset); /* update the metadata to delete a device, * when hot-removing.
*/ @@ -701,22 +787,62 @@ /* Write all metadata for this array. */ int (*write_init_super)(struct supertype *st); + /* Check if metadata read from one device is compatible with an array, + * used when assembling an array, or pseudo-assembling as with + * "--examine --brief". + * If "st" has not yet had a superblock loaded, "tst" is + * moved in, otherwise the superblock in 'st' is compared with + * 'tst'. + */ int (*compare_super)(struct supertype *st, struct supertype *tst); + /* Load metadata from a single device. If 'devname' is not NULL + * print error messages as appropriate */ int (*load_super)(struct supertype *st, int fd, char *devname); + /* 'fd' is a 'container' md array - load array metadata from the + * whole container. + */ int (*load_container)(struct supertype *st, int fd, char *devname); + /* If 'arg' is a valid name of this metadata type, allocate and + * return a 'supertype' for the particular minor version */ struct supertype * (*match_metadata_desc)(char *arg); - __u64 (*avail_size)(struct supertype *st, __u64 size); + /* If a device has the given size, and the data_offset has been + * requested - work out how much space is available for data. + * This involves adjusting for reserved space (e.g. bitmaps) + * and for any rounding. + * 'mdadm' only calls this for existing arrays where a possible + * spare is being added. However some super-handlers call it + * internally from validate_geometry when creating an array. + */ + __u64 (*avail_size)(struct supertype *st, __u64 size, + unsigned long long data_offset); + /* This is similar to 'avail_size' in purpose, but is used for + * containers for which there is no 'component size' to compare. + * This reports the minimum acceptable whole-device size. + */ unsigned long long (*min_acceptable_spare_size)(struct supertype *st); + /* Find somewhere to put a bitmap - possibly auto-size it - and + * update the metadata to record this.
The array may be newly + * created, in which case data_size may be updated, or it might + * already exist. Metadata handler can know if init_super + * has been called, but not write_init_super. + */ int (*add_internal_bitmap)(struct supertype *st, int *chunkp, int delay, int write_behind, unsigned long long size, int may_change, int major); + /* Seek 'fd' to start of write-intent-bitmap. Must be an + * md-native format bitmap + */ void (*locate_bitmap)(struct supertype *st, int fd); + /* if add_internal_bitmap succeeded for existing array, this + * writes it out. + */ int (*write_bitmap)(struct supertype *st, int fd); + /* Free the superblock and any other allocated data */ void (*free_super)(struct supertype *st); /* validate_geometry is called with an st returned by * match_metadata_desc. - * It should check that the geometry described in compatible with + * It should check that the geometry described is compatible with * the metadata type. It will be called repeatedly as devices * added to validate changing size and new devices. If there are * inter-device dependencies, it should record sufficient details @@ -726,15 +852,19 @@ * 1: everything is OK * 0: not OK for some reason - if 'verbose', then error was reported. * -1: st->sb was NULL, 'subdev' is a member of a container of this - * types, but array is not acceptable for some reason + * type, but array is not acceptable for some reason * message was reported even if verbose is 0. */ int (*validate_geometry)(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, char *subdev, unsigned long long *freesize, int verbose); + /* Return a linked list of 'mdinfo' structures for all arrays + * in the container. 
For non-containers, it is like + * getinfo_super with an allocated mdinfo.*/ struct mdinfo *(*container_content)(struct supertype *st, char *subarray); /* query the supertype for default geometry */ void (*default_geometry)(struct supertype *st, int *level, int *layout, int *chunk); /* optional */ @@ -751,7 +881,8 @@ #define APPLY_METADATA_CHANGES 1 #define ROLLBACK_METADATA_CHANGES 0 - int (*reshape_super)(struct supertype *st, long long size, int level, + int (*reshape_super)(struct supertype *st, + unsigned long long size, int level, int layout, int chunksize, int raid_disks, int delta_disks, char *backup, char *dev, int direction, @@ -772,7 +903,7 @@ * (in a->resync_start). * resync status is really irrelevant if the array is not consistent, * but some metadata (DDF!) have a place to record the distinction. - * If 'consistent' is '2', then the array can mark it dirty if a + * If 'consistent' is '2', then the array can mark it dirty if a * resync/recovery/whatever is required, or leave it clean if not. * Return value is 0 dirty (not consistent) and 1 if clean. * it is only really important if consistent is passed in as '2'. @@ -861,9 +992,12 @@ struct superswitch *ss; int minor_version; int max_devs; - int container_dev; /* devnum of container */ + char container_devnm[32]; /* devnm of container */ void *sb; void *info; + void *other; /* Hack used to convert v0.90 to v1.0 */ + unsigned long long devsize; + unsigned long long data_offset; /* used by v1.x only */ int ignore_hw_compat; /* used to inform metadata handlers that it should ignore HW/firmware related incompatability to load metadata. Used when examining metadata to display content of disk @@ -875,10 +1009,9 @@ /* extra stuff used by mdmon */ struct active_array *arrays; int sock; /* listen to external programs */ - int devnum; - char *devname; /* e.g. md0. This appears in metadata_verison: - * external:/md0/12 - */ + char devnm[32]; /* e.g. md0. 
This appears in metadata_version: + * external:/md0/12 + */ int devcnt; int retry_soon; @@ -896,8 +1029,6 @@ extern int get_dev_size(int fd, char *dname, unsigned long long *sizep); extern int must_be_container(int fd); extern int dev_size_from_id(dev_t id, unsigned long long *size); -extern void get_one_disk(int mdfd, mdu_array_info_t *ainf, - mdu_disk_info_t *disk); void wait_for(char *dev, int fd); /* @@ -955,7 +1086,7 @@ extern struct dev_policy *path_policy(char *path, char *type); extern struct dev_policy *disk_policy(struct mdinfo *disk); -extern struct dev_policy *devnum_policy(int dev); +extern struct dev_policy *devid_policy(int devid); extern void dev_policy_free(struct dev_policy *p); //extern void pol_new(struct dev_policy **pol, char *name, char *val, char *metadata); @@ -987,7 +1118,7 @@ const char *metadata); extern struct domainlist *domain_from_array(struct mdinfo *mdi, const char *metadata); -extern void domainlist_add_dev(struct domainlist **dom, int devnum, +extern void domainlist_add_dev(struct domainlist **dom, int devid, const char *metadata); extern void domain_free(struct domainlist *dl); extern void domain_merge(struct domainlist **domp, struct dev_policy *pol, @@ -1037,21 +1168,21 @@ extern int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s); - extern int Manage_ro(char *devname, int fd, int readonly); -extern int Manage_runstop(char *devname, int fd, int runstop, int quiet); -extern int Manage_resize(char *devname, int fd, long long size, int raid_disks); +extern int Manage_run(char *devname, int fd, int quiet); +extern int Manage_stop(char *devname, int fd, int quiet, + int will_retry); extern int Manage_subdevs(char *devname, int fd, struct mddev_dev *devlist, int verbose, int test, char *update, int force); extern int autodetect(void); extern int Grow_Add_device(char *devname, int fd, char *newdev); -extern int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int 
force); -extern int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, - long long size, - int level, char *layout_str, int chunksize, int raid_disks, +extern int Grow_addbitmap(char *devname, int fd, + struct context *c, struct shape *s); +extern int Grow_reshape(char *devname, int fd, struct mddev_dev *devlist, - int assume_clean, int force); + unsigned long long data_offset, + struct context *c, struct shape *s); extern int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt, char *backup_file, int verbose); extern int Grow_continue(int mdfd, struct supertype *st, @@ -1070,46 +1201,41 @@ extern int Assemble(struct supertype *st, char *mddev, struct mddev_ident *ident, struct mddev_dev *devlist, - char *backup_file, int invalid_backup, - int readonly, int runstop, - char *update, char *homehost, int require_homehost, - int verbose, int force, int freeze_reshape); - -extern int Build(char *mddev, int chunk, int level, int layout, - int raiddisks, struct mddev_dev *devlist, int assume_clean, - char *bitmap_file, int bitmap_chunk, int write_behind, - int delay, int verbose, int autof, unsigned long long size); + struct context *c); +extern int Build(char *mddev, struct mddev_dev *devlist, + struct shape *s, struct context *c); extern int Create(struct supertype *st, char *mddev, - int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks, - char *name, char *homehost, int *uuid, + char *name, int *uuid, int subdevs, struct mddev_dev *devlist, - int runstop, int verbose, int force, int assume_clean, - char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof); + struct shape *s, + struct context *c, + unsigned long long data_offset); -extern int Detail(char *dev, int brief, int export, int test, char *homehost, char *prefer); -extern int Detail_Platform(struct superswitch *ss, int scan, int verbose); +extern int Detail(char *dev, struct context *c); +extern int 
Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path); extern int Query(char *dev); -extern int Examine(struct mddev_dev *devlist, int brief, int export, int scan, - int SparcAdjust, struct supertype *forcest, char *homehost); +extern int ExamineBadblocks(char *devname, int brief, struct supertype *forcest); +extern int Examine(struct mddev_dev *devlist, struct context *c, + struct supertype *forcest); extern int Monitor(struct mddev_dev *devlist, char *mailaddr, char *alert_cmd, - int period, int daemonise, int scan, int oneshot, - int dosyslog, int test, char *pidfile, int increments, - int share, char *prefer); + struct context *c, + int daemonise, int oneshot, + int dosyslog, char *pidfile, int increments, + int share); -extern int Kill(char *dev, struct supertype *st, int force, int quiet, int noexcl); -extern int Kill_subarray(char *dev, char *subarray, int quiet); +extern int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl); +extern int Kill_subarray(char *dev, char *subarray, int verbose); extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet); extern int Wait(char *dev); extern int WaitClean(char *dev, int sock, int verbose); -extern int Incremental(char *devname, int verbose, int runstop, - struct supertype *st, char *homehost, int require_homehost, - int autof, int freeze_reshape); +extern int Incremental(char *devname, struct context *c, + struct supertype *st); extern void RebuildMap(void); -extern int IncrementalScan(int verbose); +extern int IncrementalScan(int verbose, char *devnm); extern int IncrementalRemove(char *devname, char *path, int verbose); extern int CreateBitmap(char *filename, int force, char uuid[16], unsigned long chunksize, unsigned long daemon_sleep, @@ -1120,14 +1246,19 @@ extern int Write_rules(char *rule_name); extern int bitmap_update_uuid(int fd, int *uuid, int swap); extern unsigned long 
bitmap_sectors(struct bitmap_super_s *bsb); +extern int Dump_metadata(char *dev, char *dir, struct context *c, + struct supertype *st); +extern int Restore_metadata(char *dev, char *dir, struct context *c, + struct supertype *st, int only); extern int md_get_version(int fd); extern int get_linux_version(void); extern int mdadm_version(char *version); -extern long long parse_size(char *size); +extern unsigned long long parse_size(char *size); extern int parse_uuid(char *str, int uuid[4]); extern int parse_layout_10(char *layout); extern int parse_layout_faulty(char *layout); +extern long parse_num(char *num); extern int check_ext2(int fd, char *name); extern int check_reiser(int fd, char *name); extern int check_raid(int fd, char *name); @@ -1137,11 +1268,13 @@ extern int get_mdp_major(void); extern int dev_open(char *dev, int flags); -extern int open_dev(int devnum); -extern int open_dev_flags(int devnum, int flags); -extern int open_dev_excl(int devnum); +extern int open_dev(char *devnm); +extern int open_dev_flags(char *devnm, int flags); +extern int open_dev_excl(char *devnm); extern int is_standard(char *dev, int *nump); extern int same_dev(char *one, char *two); +extern int compare_paths (char* path1,char* path2); +extern void enable_fds(int devices); extern int parse_auto(char *str, char *msg, int config); extern struct mddev_ident *conf_get_ident(char *dev); @@ -1156,6 +1289,10 @@ extern char *conf_get_homehost(int *require_homehostp); extern char *conf_line(FILE *file); extern char *conf_word(FILE *file, int allow_key); +extern void print_quoted(char *str); +extern void print_escape(char *str); +extern int use_udev(void); +extern unsigned long GCD(unsigned long a, unsigned long b); extern int conf_name_is_free(char *name); extern int conf_verify_devnames(struct mddev_ident *array_list); extern int devname_matches(char *name, char *match); @@ -1189,9 +1326,9 @@ extern int flush_metadata_updates(struct supertype *st); extern void append_metadata_update(struct 
supertype *st, void *buf, int len); extern int assemble_container_content(struct supertype *st, int mdfd, - struct mdinfo *content, int runstop, - char *chosen_name, int verbose, - char *backup_file, int freeze_reshape); + struct mdinfo *content, + struct context *c, + char *chosen_name); extern struct mdinfo *container_choose_spares(struct supertype *st, unsigned long long min_size, struct domainlist *domlist, @@ -1206,14 +1343,16 @@ unsigned long long min_recovery_start(struct mdinfo *array); extern char *human_size(long long bytes); -extern char *human_size_brief(long long bytes); +extern char *human_size_brief(long long bytes, int prefix); extern void print_r10_layout(int layout); -#define NoMdDev (1<<23) -extern int find_free_devnum(int use_partitions); +extern char *find_free_devnm(int use_partitions); extern void put_md_name(char *name); -extern char *get_md_name(int dev); +extern char *devid2kname(int devid); +extern char *devid2devnm(int devid); +extern int devnm2devid(char *devnm); +extern char *get_md_name(char *devnm); extern char DefaultConfFile[]; @@ -1226,16 +1365,18 @@ #define METADATA 3 extern int open_mddev(char *dev, int report_errors); extern int open_container(int fd); +extern int metadata_container_matches(char *metadata, char *devnm); +extern int metadata_subdev_matches(char *metadata, char *devnm); extern int is_container_member(struct mdstat_ent *ent, char *devname); extern int is_subarray_active(char *subarray, char *devname); extern int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet); extern struct superswitch *version_to_superswitch(char *vers); -extern int mdmon_running(int devnum); -extern int mdmon_pid(int devnum); +extern int mdmon_running(char *devnm); +extern int mdmon_pid(char *devnm); extern int check_env(char *name); extern __u32 random32(void); -extern int start_mdmon(int devnum); +extern int start_mdmon(char *devnm); extern int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, struct 
supertype *st, unsigned long stripes, @@ -1243,26 +1384,13 @@ int dests, int *destfd, unsigned long long *destoffsets); void abort_reshape(struct mdinfo *sra); -extern char *devnum2devname(int num); -extern void fmt_devname(char *name, int num); -extern int devname2devnum(char *name); -extern int stat2devnum(struct stat *st); -extern int fd2devnum(int fd); +void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0); -static inline int dev2major(int d) -{ - if (d >= 0) - return MD_MAJOR; - else - return get_mdp_major(); -} +extern void fmt_devname(char *name, int num); +extern char *stat2devnm(struct stat *st); +extern char *fd2devnm(int fd); -static inline int dev2minor(int d) -{ - if (d >= 0) - return d; - return (-1-d) << MdpMinorShift; -} +extern int in_initrd(void); #define _ROUND_UP(val, base) (((val) + (base) - 1) & ~(base - 1)) #define ROUND_UP(val, base) _ROUND_UP(val, (typeof(val))(base)) @@ -1272,7 +1400,7 @@ static inline int is_subarray(char *vers) { /* The version string for a 'subarray' (an array in a container) - * is + * is * /containername/componentname for normal read-write arrays * -containername/componentname for arrays which mdmon must not * reconfigure. They might be read-only @@ -1307,6 +1435,14 @@ return ret; } +#define pr_err(fmt ...) fprintf(stderr, Name ": " fmt) +#define cont_err(fmt ...) 
fprintf(stderr, " " fmt) + +void *xmalloc(size_t len); +void *xrealloc(void *ptr, size_t len); +void *xcalloc(size_t num, size_t size); +char *xstrdup(const char *str); + #define LEVEL_MULTIPATH (-4) #define LEVEL_LINEAR (-1) #define LEVEL_FAULTY (-5) @@ -1315,7 +1451,6 @@ #define LEVEL_CONTAINER (-100) #define LEVEL_UNSUPPORTED (-200) - /* faulty stuff */ #define WriteTransient 0 @@ -1335,7 +1470,6 @@ #define ModeMask 0x1f #define ModeShift 5 - #ifdef __TINYC__ #undef minor #undef major @@ -1373,7 +1507,6 @@ #define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ #define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ - /* For every RAID5 algorithm we define a RAID6 algorithm * with exactly the same layout for data and parity, and * with the Q block always on the last device (N-1). @@ -1393,8 +1526,10 @@ #define PATH_MAX 4096 #endif -#define PROCESS_DELAYED -2 -#define PROCESS_PENDING -3 +#define RESYNC_NONE -1 +#define RESYNC_DELAYED -2 +#define RESYNC_PENDING -3 +#define RESYNC_UNKNOWN -4 /* When using "GET_DISK_INFO" it isn't certain how high * we need to check. So we impose an absolute limit of @@ -1404,4 +1539,14 @@ */ #define MAX_DISKS 4096 -extern int __offroot; +/* Sometimes the 'size' value passed needs to mean "Maximum". + * In those cases with use MAX_SIZE + */ +#define MAX_SIZE 1 + +/* We want to use unsigned numbers for sector counts, but need + * a value for 'invalid'. Use '1'. 
+ */ +#define INVALID_SECTORS 1 +/* And another special number needed for --data_offset=variable */ +#define VARIABLE_OFFSET 3 diff -Nru mdadm-3.2.5/mdadm.spec mdadm-3.3/mdadm.spec --- mdadm-3.2.5/mdadm.spec 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdadm.spec 2013-09-03 04:47:47.000000000 +0000 @@ -1,6 +1,6 @@ Summary: mdadm is used for controlling Linux md devices (aka RAID arrays) Name: mdadm -Version: 3.2.5 +Version: 3.3 Release: 1 Source: http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tar.gz URL: http://neil.brown.name/blog/mdadm @@ -9,14 +9,9 @@ BuildRoot: %{_tmppath}/%{name}-root Obsoletes: mdctl -%description +%description mdadm is a program that can be used to create, manage, and monitor Linux MD (Software RAID) devices. -As such is provides similar functionality to the raidtools packages. -The particular differences to raidtools is that mdadm is a single -program, and it can perform (almost) all functions without a -configuration file (that a config file can be used to help with -some common tasks). %prep %setup -q @@ -42,37 +37,9 @@ %doc TODO ChangeLog mdadm.conf-example COPYING %{_sbindir}/mdadm %{_sbindir}/mdmon -/lib/udev/rules.d/64-md-raid.rules +/usr/lib/udev/rules.d/63-md-raid-arrays.rules +/usr/lib/udev/rules.d/64-md-raid-assembly.rules %config(noreplace,missingok)/%{_sysconfdir}/mdadm.conf %{_mandir}/man*/md* %changelog -* Fri May 10 2002 -- update to 1.0.0 -- Set CXFLAGS instead of CFLAGS - -* Sat Apr 6 2002 -- change install to use "make install" - -* Fri Mar 15 2002 -- beautification -- made mdadm.conf non-replaceable config -- renamed Copyright to License in the header -- added missing license file -- used macros for file paths - -* Fri Mar 15 2002 Luca Berra -- Added Obsoletes: mdctl -- missingok for configfile - -* Wed Mar 12 2002 NeilBrown -- Add md.4 and mdadm.conf.5 man pages - -* Fri Mar 08 2002 Chris Siebenmann -- builds properly as non-root. 
- -* Fri Mar 08 2002 Derek Vadala -- updated for 0.7, fixed /usr/share/doc and added manpage - -* Tue Aug 07 2001 Danilo Godec -- initial RPM build diff -Nru mdadm-3.2.5/mdassemble.8 mdadm-3.3/mdassemble.8 --- mdadm-3.2.5/mdassemble.8 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdassemble.8 2013-09-03 04:47:47.000000000 +0000 @@ -1,5 +1,5 @@ .\" -*- nroff -*- -.TH MDASSEMBLE 8 "" v3.2.5 +.TH MDASSEMBLE 8 "" v3.3 .SH NAME mdassemble \- assemble MD devices .I aka @@ -9,11 +9,11 @@ .BI mdassemble -.SH DESCRIPTION +.SH DESCRIPTION .B mdassemble is a tiny program that can be used to assemble MD devices inside an initial ramdisk (initrd) or initramfs; it is meant to replace the in-kernel -automatic raid detection and activation. +automatic RAID detection and activation. It can be built statically and linked against lightweight libc alternatives, like .B dietlibc, .B klibc diff -Nru mdadm-3.2.5/mdassemble.c mdadm-3.3/mdassemble.c --- mdadm-3.2.5/mdassemble.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdassemble.c 2013-09-03 04:47:47.000000000 +0000 @@ -32,10 +32,10 @@ { int mdfd = open(dev, O_RDWR); if (mdfd < 0) - fprintf(stderr, Name ": error opening %s: %s\n", + pr_err("error opening %s: %s\n", dev, strerror(errno)); else if (md_get_version(mdfd) <= 0) { - fprintf(stderr, Name ": %s does not appear to be an md device\n", + pr_err("%s does not appear to be an md device\n", dev); close(mdfd); mdfd = -1; @@ -48,7 +48,7 @@ return open_mddev(dev, 0); } #endif -int map_update(struct map_ent **mpp, int devnum, char *metadata, +int map_update(struct map_ent **mpp, char *devnm, char *metadata, int *uuid, char *path) { return 0; @@ -57,18 +57,19 @@ { return NULL; } +int map_lock(struct map_ent **melp){return 0;} +void map_unlock(struct map_ent **melp){} +struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]){return NULL;} int rv; int mdfd = -1; -int runstop = 0; -int readonly = 0; -int verbose = 0; -int force = 0; -int main(int argc, char *argv[]) { +int 
main(int argc, char *argv[]) +{ struct mddev_ident *array_list = conf_get_ident(NULL); + struct context c = { .freeze_reshape = 1 }; if (!array_list) { - fprintf(stderr, Name ": No arrays found in config file\n"); + pr_err("No arrays found in config file\n"); rv = 1; } else for (; array_list; array_list = array_list->next) { @@ -83,9 +84,7 @@ if (mdfd >= 0) close(mdfd); rv |= Assemble(array_list->st, array_list->devname, - array_list, NULL, NULL, 0, - readonly, runstop, NULL, NULL, 0, - verbose, force, 1); + array_list, NULL, &c); } return rv; } diff -Nru mdadm-3.2.5/mdmon.8 mdadm-3.3/mdmon.8 --- mdadm-3.2.5/mdmon.8 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdmon.8 2013-09-03 04:47:47.000000000 +0000 @@ -1,11 +1,11 @@ .\" See file COPYING in distribution for details. -.TH MDMON 8 "" v3.2.5 +.TH MDMON 8 "" v3.3 .SH NAME mdmon \- monitor MD external metadata arrays .SH SYNOPSIS -.BI mdmon " [--all] [--takeover] [--offroot] CONTAINER" +.BI mdmon " [--all] [--takeover] [--foreground] CONTAINER" .SH OVERVIEW The 2.6.27 kernel brings the ability to support external metadata arrays. @@ -101,8 +101,8 @@ External metadata formats, like DDF, differ from the native MD metadata formats in that they define a set of disks and a series of sub-arrays within those disks. MD metadata in comparison defines a 1:1 -relationship between a set of block devices and a raid array. For -example to create 2 arrays at different raid levels on a single +relationship between a set of block devices and a RAID array. For +example to create 2 arrays at different RAID levels on a single set of disks, MD metadata requires the disks be partitioned and then each array can be created with a subset of those partitions. The supported external formats perform this disk carving internally. @@ -131,6 +131,14 @@ device to monitor. It can be a full path like /dev/md/container, or a simple md device name like md127. .TP +.B \-\-foreground +Normally, +.I mdmon +will fork and continue in the background. 
Adding this option will +skip that step and run +.I mdmon +in the foreground. +.TP .B \-\-takeover This instructs .I mdmon @@ -166,15 +174,6 @@ arbitrarily extended, e.g. to .BR \-\-all-active-arrays . .TP -.BR \-\-offroot -Set first character of argv[0] to @ to indicate mdmon was launched -from initrd/initramfs and should not be shutdown by systemd as part of -the regular shutdown process. This option is normally only used by -the system's initscripts. Please see here for more details on how -systemd handled argv[0]: -.IP -.B http://www.freedesktop.org/wiki/Software/systemd/RootStorageDaemons -.PP .PP Note that diff -Nru mdadm-3.2.5/mdmon.c mdadm-3.3/mdmon.c --- mdadm-3.2.5/mdmon.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdmon.c 2013-09-03 04:47:47.000000000 +0000 @@ -184,7 +184,8 @@ buf[sizeof(buf)-1] = 0; close(fd); - if (n < 0 || !strstr(buf, "mdmon")) + if (n < 0 || !(strstr(buf, "mdmon") || + strstr(buf, "@dmon"))) return; kill(pid, SIGTERM); @@ -199,7 +200,7 @@ fcntl(sock, F_SETFL, fl); n = read(sock, buf, 100); /* Ignore result, it is just the wait that - * matters + * matters */ } @@ -270,47 +271,56 @@ "\n" "Options are:\n" " --help -h : This message\n" -" --all : All devices\n" +" --all -a : All devices\n" +" --foreground -F : Run in foreground (do not fork)\n" " --takeover -t : Takeover container\n" -" --offroot : Set first character of argv[0] to @ to indicate the\n" -" application was launched from initrd/initramfs and\n" -" should not be shutdown by systemd as part of the\n" -" regular shutdown process.\n" ); exit(2); } -static int mdmon(char *devname, int devnum, int must_fork, int takeover); +static int mdmon(char *devnm, int must_fork, int takeover); int main(int argc, char *argv[]) { char *container_name = NULL; - int devnum; - char *devname; + char *devnm = NULL; int status = 0; int opt; int all = 0; int takeover = 0; + int dofork = 1; static struct option options[] = { {"all", 0, NULL, 'a'}, {"takeover", 0, NULL, 't'}, {"help", 0, 
NULL, 'h'}, {"offroot", 0, NULL, OffRootOpt}, + {"foreground", 0, NULL, 'F'}, {NULL, 0, NULL, 0} }; - while ((opt = getopt_long(argc, argv, "th", options, NULL)) != -1) { + if (in_initrd()) { + /* + * set first char of argv[0] to @. This is used by + * systemd to signal that the task was launched from + * initrd/initramfs and should be preserved during shutdown + */ + argv[0][0] = '@'; + } + + while ((opt = getopt_long(argc, argv, "thaF", options, NULL)) != -1) { switch (opt) { case 'a': container_name = argv[optind-1]; all = 1; break; case 't': - container_name = optarg; takeover = 1; break; + case 'F': + dofork = 0; + break; case OffRootOpt: - argv[0][0] = '@'; + /* silently ignore old option */ break; case 'h': default: @@ -343,47 +353,39 @@ if (e->metadata_version && strncmp(e->metadata_version, "external:", 9) == 0 && !is_subarray(&e->metadata_version[9])) { - devname = devnum2devname(e->devnum); /* update cmdline so this mdmon instance can be * distinguished from others in a call to ps(1) */ - if (strlen(devname) <= (unsigned)container_len) { + if (strlen(e->devnm) <= (unsigned)container_len) { memset(container_name, 0, container_len); - sprintf(container_name, "%s", devname); + sprintf(container_name, "%s", e->devnm); } - status |= mdmon(devname, e->devnum, 1, - takeover); + status |= mdmon(e->devnm, 1, takeover); } } free_mdstat(mdstat); return status; } else if (strncmp(container_name, "md", 2) == 0) { - devnum = devname2devnum(container_name); - devname = devnum2devname(devnum); - if (strcmp(container_name, devname) != 0) - devname = NULL; + int id = devnm2devid(container_name); + if (id) + devnm = container_name; } else { struct stat st; - devnum = NoMdDev; if (stat(container_name, &st) == 0) - devnum = stat2devnum(&st); - if (devnum == NoMdDev) - devname = NULL; - else - devname = devnum2devname(devnum); + devnm = xstrdup(stat2devnm(&st)); } - if (!devname) { - fprintf(stderr, "mdmon: %s is not a valid md device name\n", + if (!devnm) { + pr_err("%s is 
not a valid md device name\n", container_name); exit(1); } - return mdmon(devname, devnum, do_fork(), takeover); + return mdmon(devnm, dofork && do_fork(), takeover); } -static int mdmon(char *devname, int devnum, int must_fork, int takeover) +static int mdmon(char *devnm, int must_fork, int takeover) { int mdfd; struct mdinfo *mdi, *di; @@ -396,30 +398,27 @@ pid_t victim = -1; int victim_sock = -1; - dprintf("starting mdmon for %s\n", devname); + dprintf("starting mdmon for %s\n", devnm); - mdfd = open_dev(devnum); + mdfd = open_dev(devnm); if (mdfd < 0) { - fprintf(stderr, "mdmon: %s: %s\n", devname, - strerror(errno)); + pr_err("%s: %s\n", devnm, strerror(errno)); return 1; } if (md_get_version(mdfd) < 0) { - fprintf(stderr, "mdmon: %s: Not an md device\n", - devname); + pr_err("%s: Not an md device\n", devnm); return 1; } /* Fork, and have the child tell us when they are ready */ if (must_fork) { if (pipe(pfd) != 0) { - fprintf(stderr, "mdmon: failed to create pipe\n"); + pr_err("failed to create pipe\n"); return 1; } switch(fork()) { case -1: - fprintf(stderr, "mdmon: failed to fork: %s\n", - strerror(errno)); + pr_err("failed to fork: %s\n", strerror(errno)); return 1; case 0: /* child */ close(pfd[0]); @@ -435,46 +434,38 @@ } else pfd[0] = pfd[1] = -1; - container = calloc(1, sizeof(*container)); - container->devnum = devnum; - container->devname = devname; + container = xcalloc(1, sizeof(*container)); + strcpy(container->devnm, devnm); container->arrays = NULL; container->sock = -1; - if (!container->devname) { - fprintf(stderr, "mdmon: failed to allocate container name string\n"); - exit(3); - } - - mdi = sysfs_read(mdfd, container->devnum, GET_VERSION|GET_LEVEL|GET_DEVS); + mdi = sysfs_read(mdfd, container->devnm, GET_VERSION|GET_LEVEL|GET_DEVS); if (!mdi) { - fprintf(stderr, "mdmon: failed to load sysfs info for %s\n", - container->devname); + pr_err("failed to load sysfs info for %s\n", container->devnm); exit(3); } if (mdi->array.level != UnSet) { - 
fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n", - devname); + pr_err("%s is not a container - cannot monitor\n", devnm); exit(3); } if (mdi->array.major_version != -1 || mdi->array.minor_version != -2) { - fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n", - devname); + pr_err("%s does not use external metadata - cannot monitor\n", + devnm); exit(3); } container->ss = version_to_superswitch(mdi->text_version); if (container->ss == NULL) { - fprintf(stderr, "mdmon: %s uses unsupported metadata: %s\n", - devname, mdi->text_version); + pr_err("%s uses unsupported metadata: %s\n", + devnm, mdi->text_version); exit(3); } container->devs = NULL; for (di = mdi->devs; di; di = di->next) { - struct mdinfo *cd = malloc(sizeof(*cd)); + struct mdinfo *cd = xmalloc(sizeof(*cd)); *cd = *di; cd->next = container->devs; container->devs = cd; @@ -496,23 +487,21 @@ act.sa_handler = SIG_IGN; sigaction(SIGPIPE, &act, NULL); - victim = mdmon_pid(container->devnum); + victim = mdmon_pid(container->devnm); if (victim >= 0) - victim_sock = connect_monitor(container->devname); + victim_sock = connect_monitor(container->devnm); ignore = chdir("/"); if (!takeover && victim > 0 && victim_sock >= 0) { if (fping_monitor(victim_sock) == 0) { - fprintf(stderr, "mdmon: %s already managed\n", - container->devname); + pr_err("%s already managed\n", container->devnm); exit(3); } close(victim_sock); victim_sock = -1; } - if (container->ss->load_container(container, mdfd, devname)) { - fprintf(stderr, "mdmon: Cannot load metadata for %s\n", - devname); + if (container->ss->load_container(container, mdfd, devnm)) { + pr_err("Cannot load metadata for %s\n", devnm); exit(3); } close(mdfd); @@ -520,28 +509,28 @@ /* Ok, this is close enough. We can say goodbye to our parent now. 
*/ if (victim > 0) - remove_pidfile(devname); - if (make_pidfile(devname) < 0) { + remove_pidfile(devnm); + if (make_pidfile(devnm) < 0) { exit(3); } - container->sock = make_control_sock(devname); + container->sock = make_control_sock(devnm); status = 0; if (write(pfd[1], &status, sizeof(status)) < 0) - fprintf(stderr, "mdmon: failed to notify our parent: %d\n", + pr_err("failed to notify our parent: %d\n", getppid()); close(pfd[1]); mlockall(MCL_CURRENT | MCL_FUTURE); if (clone_monitor(container) < 0) { - fprintf(stderr, "mdmon: failed to start monitor process: %s\n", + pr_err("failed to start monitor process: %s\n", strerror(errno)); exit(2); } if (victim > 0) { - try_kill_monitor(victim, container->devname, victim_sock); + try_kill_monitor(victim, container->devnm, victim_sock); if (victim_sock >= 0) close(victim_sock); } diff -Nru mdadm-3.2.5/mdmon.h mdadm-3.3/mdmon.h --- mdadm-3.2.5/mdmon.h 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdmon.h 2013-09-03 04:47:47.000000000 +0000 @@ -18,12 +18,14 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ +#undef pr_err +#define pr_err(fmt ...) 
fprintf(stderr, "mdmon: " fmt) + enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, write_pending, active_idle, bad_word}; enum sync_action { idle, reshape, resync, recover, check, repair, bad_action }; - struct active_array { struct mdinfo info; struct supertype *container; @@ -48,8 +50,6 @@ int check_degraded; /* flag set by mon, read by manage */ int check_reshape; /* flag set by mon, read by manage */ - - int devnum; }; /* @@ -72,7 +72,6 @@ extern struct active_array *pending_discard; extern struct md_generic_cmd *active_cmd; - void remove_pidfile(char *devname); void do_monitor(struct supertype *container); void do_manager(struct supertype *container); @@ -92,8 +91,21 @@ */ static inline int is_resync_complete(struct mdinfo *array) { - if (array->resync_start >= array->component_size) - return 1; - return 0; + unsigned long long sync_size = 0; + int ncopies, l; + switch(array->array.level) { + case 1: + case 4: + case 5: + case 6: + sync_size = array->component_size; + break; + case 10: + l = array->array.layout; + ncopies = (l & 0xff) * ((l >> 8) & 0xff); /* near copies * far copies */ + sync_size = array->component_size * array->array.raid_disks; + sync_size /= ncopies; + break; + } + return array->resync_start >= sync_size; } - diff -Nru mdadm-3.2.5/mdopen.c mdadm-3.3/mdopen.c --- mdadm-3.2.5/mdopen.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdopen.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays.
* - * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -26,7 +26,6 @@ #include "md_p.h" #include - void make_parts(char *dev, int cnt) { /* make 'cnt' partition devices for 'dev' @@ -49,7 +48,8 @@ char sym[1024]; int err; - if (cnt==0) cnt=4; + if (cnt == 0) + cnt = 4; if (lstat(dev, &stb)!= 0) return; @@ -67,11 +67,11 @@ minor_num = -1; } else return; - name = malloc(nlen); - for (i=1; i <= cnt ; i++) { + name = xmalloc(nlen); + for (i = 1; i <= cnt ; i++) { struct stat stb2; snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i); - if (stat(name, &stb2)==0) { + if (stat(name, &stb2) == 0) { if (!S_ISBLK(stb2.st_mode) || !S_ISBLK(stb.st_mode)) continue; if (stb2.st_rdev == makedev(major_num, minor_num+i)) @@ -100,7 +100,6 @@ free(name); } - /* * We need a new md device to assemble/build/create an array. * 'dev' is a name given us by the user (command line or mdadm.conf) @@ -125,9 +124,9 @@ * supported by 'dev', we add a "_%d" suffix based on the minor number * use that. * - * If udev is configured, we create a temporary device, open it, and + * If udev is configured, we create a temporary device, open it, and * unlink it. - * If not, we create the /dev/mdXX device, and is name is usable, + * If not, we create the /dev/mdXX device, and if name is usable, * /dev/md/name * In any case we return /dev/md/name or (if that isn't available) * /dev/mdXX in 'chosen'. 
@@ -146,11 +145,11 @@ int parts; char *cname; char devname[20]; + char devnm[32]; char cbuf[400]; if (chosen == NULL) chosen = cbuf; - if (autof == 0) autof = ci->autof; @@ -160,7 +159,6 @@ strcpy(chosen, "/dev/md/"); cname = chosen + strlen(chosen); - if (dev) { if (strncmp(dev, "/dev/md/", 8) == 0) { strcpy(cname, dev+8); @@ -175,7 +173,7 @@ /* name *must* be mdXX or md_dXX in this context */ if (num < 0 || (strcmp(cname, "md") != 0 && strcmp(cname, "md_d") != 0)) { - fprintf(stderr, Name ": %s is an invalid name " + pr_err("%s is an invalid name " "for an md device. Try /dev/md/%s\n", dev, dev+5); return -1; @@ -193,12 +191,12 @@ * empty. */ if (strchr(cname, '/') != NULL) { - fprintf(stderr, Name ": %s is an invalid name " + pr_err("%s is an invalid name " "for an md device.\n", dev); return -1; } if (cname[0] == 0) { - fprintf(stderr, Name ": %s is an invalid name " + pr_err("%s is an invalid name " "for an md device (empty!).", dev); return -1; } @@ -210,7 +208,10 @@ char *ep; if (cname[0] == 'd') sp++; - num = strtoul(sp, &ep, 10); + if (isdigit(sp[0])) + num = strtoul(sp, &ep, 10); + else + ep = sp; if (ep == sp || *ep || num < 0) num = -1; else if (cname[0] == 'd') @@ -225,7 +226,7 @@ if (name && name[0] == 0) name = NULL; if (name && trustworthy == METADATA && use_mdp == 1) { - fprintf(stderr, Name ": %s is not allowed for a %s container. " + pr_err("%s is not allowed for a %s container. " "Consider /dev/md%d.\n", dev, name, num); return -1; } @@ -238,7 +239,7 @@ use_mdp = 0; } if (num < 0 && trustworthy == LOCAL && name) { - /* if name is numeric, possibly prefixed by + /* if name is numeric, possibly prefixed by * 'md' or '/dev/md', use that for num * if it is not already in use */ char *ep; @@ -252,31 +253,13 @@ num = strtoul(n2, &ep, 10); if (ep == n2 || *ep) num = -1; - else if (mddev_busy(use_mdp ? (-1-num) : num)) - num = -1; - } - - if (num < 0) { - /* need to choose a free number. 
*/ - num = find_free_devnum(use_mdp); - if (num == NoMdDev) { - fprintf(stderr, Name ": No avail md devices - aborting\n"); - return -1; - } - } else { - num = use_mdp ? (-1-num) : num; - if (mddev_busy(num)) { - fprintf(stderr, Name ": %s is already in use.\n", - dev); - return -1; + else { + sprintf(devnm, "md%s%d", use_mdp ? "_d":"", num); + if (mddev_busy(devnm)) + num = -1; } } - if (num < 0) - sprintf(devname, "/dev/md_d%d", -1-num); - else - sprintf(devname, "/dev/md%d", num); - if (cname[0] == 0 && name) { /* Need to find a name if we can * We don't completely trust 'name'. Truncate to @@ -289,8 +272,17 @@ int cnlen; strncpy(cname, name, 200); cname[200] = 0; - while ((cp = strchr(cname, '/')) != NULL) - *cp = '-'; + for (cp = cname; *cp ; cp++) + switch (*cp) { + case '/': + *cp = '-'; + break; + case ' ': + case '\t': + *cp = '_'; + break; + } + if (trustworthy == LOCAL || (trustworthy == FOREIGN && strchr(cname, ':') != NULL)) { /* Only need suffix if there is a conflict */ @@ -302,7 +294,7 @@ if (trustworthy == METADATA && !isdigit(cname[cnlen-1])) sprintf(cname+cnlen, "%d", unum); else - /* add _%d to FOREIGN array that don't + /* add _%d to FOREIGN array that don't * a 'host:' prefix */ sprintf(cname+cnlen, "_%d", unum); @@ -312,6 +304,40 @@ } } + devnm[0] = 0; + if (num < 0 && cname && ci->names) { + int fd; + int n = -1; + sprintf(devnm, "md_%s", cname); + fd = open("/sys/module/md_mod/parameters/new_array", O_WRONLY); + if (fd >= 0) { + n = write(fd, devnm, strlen(devnm)); + close(fd); + } + if (n < 0) + devnm[0] = 0; + } + if (devnm[0]) + ; + else if (num < 0) { + /* need to choose a free number. 
*/ + char *_devnm = find_free_devnm(use_mdp); + if (_devnm == NULL) { + pr_err("No avail md devices - aborting\n"); + return -1; + } + strcpy(devnm, _devnm); + } else { + sprintf(devnm, "%s%d", use_mdp?"md_d":"md", num); + if (mddev_busy(devnm)) { + pr_err("%s is already in use.\n", + dev); + return -1; + } + } + + sprintf(devname, "/dev/%s", devnm); + if (dev && dev[0] == '/') strcpy(chosen, dev); else if (cname[0] == 0) @@ -321,21 +347,20 @@ * If we cannot detect udev, we need to make * devices and links ourselves. */ - if ((stat("/dev/.udev", &stb) != 0 && stat("/run/udev", &stb) != 0) || - check_env("MDADM_NO_UDEV")) { + if (!use_udev()) { /* Make sure 'devname' exists and 'chosen' is a symlink to it */ if (lstat(devname, &stb) == 0) { /* Must be the correct device, else error */ if ((stb.st_mode&S_IFMT) != S_IFBLK || - stb.st_rdev != makedev(dev2major(num),dev2minor(num))) { - fprintf(stderr, Name ": %s exists but looks wrong, please fix\n", + stb.st_rdev != (dev_t)devnm2devid(devnm)) { + pr_err("%s exists but looks wrong, please fix\n", devname); return -1; } } else { if (mknod(devname, S_IFBLK|0600, - makedev(dev2major(num),dev2minor(num))) != 0) { - fprintf(stderr, Name ": failed to create %s\n", + devnm2devid(devnm)) != 0) { + pr_err("failed to create %s\n", devname); return -1; } @@ -348,9 +373,9 @@ } if (use_mdp == 1) make_parts(devname, parts); - if (strcmp(chosen, devname) != 0) { - if (mkdir("/dev/md",0700)==0) { + if (strcmp(chosen, devname) != 0) { + if (mkdir("/dev/md",0700) == 0) { if (chown("/dev/md", ci->uid, ci->gid)) perror("chown /dev/md"); if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111))) @@ -370,25 +395,24 @@ if ((stb.st_mode & S_IFMT) != S_IFLNK || link_len < 0 || strcmp(buf, devname) != 0) { - fprintf(stderr, Name ": %s exists - ignoring\n", + pr_err("%s exists - ignoring\n", chosen); strcpy(chosen, devname); } } else if (symlink(devname, chosen) != 0) - fprintf(stderr, Name ": failed to create %s: %s\n", + pr_err("failed to create
%s: %s\n", chosen, strerror(errno)); if (use_mdp && strcmp(chosen, devname) != 0) make_parts(chosen, parts); } } - mdfd = open_dev_excl(num); + mdfd = open_dev_excl(devnm); if (mdfd < 0) - fprintf(stderr, Name ": unexpected failure opening %s\n", + pr_err("unexpected failure opening %s\n", devname); return mdfd; } - /* Open this and check that it is an md device. * On success, return filedescriptor. * On failure, return -1 if it doesn't exist, @@ -401,16 +425,49 @@ mdfd = open(dev, O_RDONLY); if (mdfd < 0) { if (report_errors) - fprintf(stderr, Name ": error opening %s: %s\n", + pr_err("error opening %s: %s\n", dev, strerror(errno)); return -1; } if (md_get_version(mdfd) <= 0) { close(mdfd); if (report_errors) - fprintf(stderr, Name ": %s does not appear to be " + pr_err("%s does not appear to be " "an md device\n", dev); return -2; } return mdfd; } + +char *find_free_devnm(int use_partitions) +{ + static char devnm[32]; + int devnum; + for (devnum = 127; devnum != 128; + devnum = devnum ? devnum-1 : (1<<20)-1) { + + if (use_partitions) + sprintf(devnm, "md_d%d", devnum); + else + sprintf(devnm, "md%d", devnum); + if (mddev_busy(devnm)) + continue; + if (!conf_name_is_free(devnm)) + continue; + if (!use_udev()) { + /* make sure it is new to /dev too, at least as a + * non-standard */ + int devid = devnm2devid(devnm); + if (devid) { + char *dn = map_dev(major(devid), + minor(devid), 0); + if (dn && ! 
is_standard(dn, NULL)) + continue; + } + } + break; + } + if (devnum == 128) + return NULL; + return devnm; +} diff -Nru mdadm-3.2.5/md_p.h mdadm-3.3/md_p.h --- mdadm-3.2.5/md_p.h 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/md_p.h 2013-09-03 04:47:47.000000000 +0000 @@ -84,6 +84,8 @@ * dire need */ +#define MD_DISK_REPLACEMENT 17 + typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ __u32 major; /* 1 Device major number */ @@ -110,7 +112,7 @@ /* * Constant generic information */ - __u32 md_magic; /* 0 MD identifier */ + __u32 md_magic; /* 0 MD identifier */ __u32 major_version; /* 1 major version to which the set conforms */ __u32 minor_version; /* 2 minor version ... */ __u32 patch_version; /* 3 patchlevel version ... */ @@ -194,4 +196,3 @@ } #endif - diff -Nru mdadm-3.2.5/mdstat.c mdadm-3.3/mdstat.c --- mdadm-3.2.5/mdstat.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mdstat.c 2013-09-03 04:47:47.000000000 +0000 @@ -103,7 +103,7 @@ /* not a device */ return 0; - new = malloc(sizeof(*new)); + new = xmalloc(sizeof(*new)); new->name = strndup(name, t - name); new->next = *m; *m = new; @@ -152,9 +152,8 @@ for (; (line = conf_line(f)) ; free_line(line)) { struct mdstat_ent *ent; char *w; - int devnum; + char devnm[32]; int in_devs = 0; - char *ep; if (strcmp(line, "Personalities")==0) continue; @@ -164,28 +163,15 @@ continue; insert_here = NULL; /* Better be an md line.. 
*/ - if (strncmp(line, "md", 2)!= 0) + if (strncmp(line, "md", 2)!= 0 || strlen(line) >= 32 + || (line[2] != '_' && !isdigit(line[2]))) continue; - if (strncmp(line, "md_d", 4) == 0) - devnum = -1-strtoul(line+4, &ep, 10); - else if (strncmp(line, "md", 2) == 0) - devnum = strtoul(line+2, &ep, 10); - else - continue; - if (ep == NULL || *ep ) { - /* fprintf(stderr, Name ": bad /proc/mdstat line starts: %s\n", line); */ - continue; - } + strcpy(devnm, line); - ent = malloc(sizeof(*ent)); - if (!ent) { - fprintf(stderr, Name ": malloc failed reading /proc/mdstat.\n"); - free_line(line); - break; - } + ent = xmalloc(sizeof(*ent)); ent->dev = ent->level = ent->pattern= NULL; ent->next = NULL; - ent->percent = -1; + ent->percent = RESYNC_NONE; ent->active = -1; ent->resync = 0; ent->metadata_version = NULL; @@ -193,8 +179,8 @@ ent->devcnt = 0; ent->members = NULL; - ent->dev = strdup(line); - ent->devnum = devnum; + ent->dev = xstrdup(line); + strcpy(ent->devnm, devnm); for (w=dl_next(line); w!= line ; w=dl_next(w)) { int l = strlen(w); @@ -207,40 +193,41 @@ } else if (ent->active > 0 && ent->level == NULL && w[0] != '(' /*readonly*/) { - ent->level = strdup(w); + ent->level = xstrdup(w); in_devs = 1; } else if (in_devs && strcmp(w, "blocks")==0) in_devs = 0; else if (in_devs) { + char *ep = strchr(w, '['); ent->devcnt += add_member_devname(&ent->members, w); - if (strncmp(w, "md", 2)==0) { + if (ep && strncmp(w, "md", 2)==0) { /* This has an md device as a component. * If that device is already in the * list, make sure we insert before * there. 
*/ struct mdstat_ent **ih; - int dn2 = devname2devnum(w); ih = &all; while (ih != insert_here && *ih && - (*ih)->devnum != dn2) + ((int)strlen((*ih)->devnm) != ep-w + || strncmp((*ih)->devnm, w, ep-w) != 0)) ih = & (*ih)->next; insert_here = ih; } } else if (strcmp(w, "super") == 0 && dl_next(w) != line) { w = dl_next(w); - ent->metadata_version = strdup(w); + ent->metadata_version = xstrdup(w); } else if (w[0] == '[' && isdigit(w[1])) { ent->raid_disks = atoi(w+1); } else if (!ent->pattern && w[0] == '[' && (w[1] == 'U' || w[1] == '_')) { - ent->pattern = strdup(w+1); + ent->pattern = xstrdup(w+1); if (ent->pattern[l-2]==']') ent->pattern[l-2] = '\0'; - } else if (ent->percent == -1 && + } else if (ent->percent == RESYNC_NONE && strncmp(w, "re", 2)== 0 && w[l-1] == '%' && (eq=strchr(w, '=')) != NULL ) { @@ -251,7 +238,7 @@ ent->resync = 2; else ent->resync = 0; - } else if (ent->percent == -1 && + } else if (ent->percent == RESYNC_NONE && (w[0] == 'r' || w[0] == 'c')) { if (strncmp(w, "resync", 4)==0) ent->resync = 1; @@ -263,10 +250,10 @@ ent->resync = 3; if (l > 8 && strcmp(w+l-8, "=DELAYED") == 0) - ent->percent = PROCESS_DELAYED; + ent->percent = RESYNC_DELAYED; if (l > 8 && strcmp(w+l-8, "=PENDING") == 0) - ent->percent = PROCESS_PENDING; - } else if (ent->percent == -1 && + ent->percent = RESYNC_PENDING; + } else if (ent->percent == RESYNC_NONE && w[0] >= '0' && w[0] <= '9' && w[l-1] == '%') { @@ -302,6 +289,13 @@ return rv; } +void mdstat_close(void) +{ + if (mdstat_fd >= 0) + close(mdstat_fd); + mdstat_fd = -1; +} + void mdstat_wait(int seconds) { fd_set fds; @@ -350,13 +344,13 @@ NULL, sigmask); } -int mddev_busy(int devnum) +int mddev_busy(char *devnm) { struct mdstat_ent *mdstat = mdstat_read(0, 0); struct mdstat_ent *me; for (me = mdstat ; me ; me = me->next) - if (me->devnum == devnum) + if (strcmp(me->devnm, devnm) == 0) break; free_mdstat(mdstat); return me != NULL; @@ -389,35 +383,34 @@ return NULL; } -struct mdstat_ent *mdstat_by_subdev(char 
*subdev, int container) +struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container) { struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent = NULL; while (mdstat) { - struct mdstat_ent *ent; - char *pos; /* metadata version must match: - * external:[/-]md%d/%s - * where %d is 'container' and %s is 'subdev' + * external:[/-]%s/%s + * where first %s is 'container' and second %s is 'subdev' */ - if (mdstat->metadata_version && - strncmp(mdstat->metadata_version, "external:", 9) == 0 && - strchr("/-", mdstat->metadata_version[9]) != NULL && - strncmp(mdstat->metadata_version+10, "md", 2) == 0 && - strtoul(mdstat->metadata_version+12, &pos, 10) - == (unsigned)container && - pos > mdstat->metadata_version+12 && - *pos == '/' && - strcmp(pos+1, subdev) == 0 - ) { - free_mdstat(mdstat->next); - mdstat->next = NULL; - return mdstat; - } + if (ent) + free_mdstat(ent); ent = mdstat; mdstat = mdstat->next; ent->next = NULL; - free_mdstat(ent); + + if (ent->metadata_version == NULL || + strncmp(ent->metadata_version, "external:", 9) != 0) + continue; + + if (!metadata_container_matches(ent->metadata_version+9, + container) || + !metadata_subdev_matches(ent->metadata_version+9, + subdev)) + continue; + + free_mdstat(mdstat); + return ent; } return NULL; } diff -Nru mdadm-3.2.5/md_u.h mdadm-3.3/md_u.h --- mdadm-3.2.5/md_u.h 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/md_u.h 2013-09-03 04:47:47.000000000 +0000 @@ -120,4 +120,3 @@ } mdu_param_t; #endif - diff -Nru mdadm-3.2.5/mkinitramfs mdadm-3.3/mkinitramfs --- mdadm-3.2.5/mkinitramfs 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/mkinitramfs 2013-09-03 04:47:47.000000000 +0000 @@ -53,5 +53,3 @@ ) > init.cpio.gz rm -rf initramfs ls -l init.cpio.gz - - diff -Nru mdadm-3.2.5/monitor.c mdadm-3.3/monitor.c --- mdadm-3.2.5/monitor.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/monitor.c 2013-09-03 04:47:47.000000000 +0000 @@ -38,8 +38,17 @@ static void add_fd(fd_set *fds, int *maxfd, int fd) { + 
struct stat st; if (fd < 0) return; + if (fstat(fd, &st) == -1) { + dprintf("%s: Invalid fd %d\n", __func__, fd); + return; + } + if (st.st_nlink == 0) { + dprintf("%s: fd %d was deleted\n", __func__, fd); + return; + } if (fd > *maxfd) *maxfd = fd; FD_SET(fd, fds); @@ -66,18 +75,21 @@ return n; } -static unsigned long long read_resync_start(int fd) +static void read_resync_start(int fd, unsigned long long *v) { char buf[30]; int n; n = read_attr(buf, 30, fd); - if (n <= 0) - return 0; + if (n <= 0) { + dprintf("%s: Failed to read resync_start (%d)\n", + __func__, fd); + return; + } if (strncmp(buf, "none", 4) == 0) - return MaxSector; + *v = MaxSector; else - return strtoull(buf, NULL, 10); + *v = strtoull(buf, NULL, 10); } static unsigned long long read_sync_completed(int fd) @@ -222,23 +234,42 @@ struct mdinfo *mdi; int ret = 0; int count = 0; + struct timeval tv; a->next_state = bad_word; a->next_action = bad_action; a->curr_state = read_state(a->info.state_fd); a->curr_action = read_action(a->action_fd); - a->info.resync_start = read_resync_start(a->resync_start_fd); + if (a->curr_state != clear) + /* + * In "clear" state, resync_start may wrongly be set to "0" + * when the kernel called md_clean but didn't remove the + * sysfs attributes yet + */ + read_resync_start(a->resync_start_fd, &a->info.resync_start); sync_completed = read_sync_completed(a->sync_completed_fd); for (mdi = a->info.devs; mdi ; mdi = mdi->next) { mdi->next_state = 0; mdi->curr_state = 0; if (mdi->state_fd >= 0) { - mdi->recovery_start = read_resync_start(mdi->recovery_fd); + read_resync_start(mdi->recovery_fd, + &mdi->recovery_start); mdi->curr_state = read_dev_state(mdi->state_fd); } } + gettimeofday(&tv, NULL); + dprintf("%s(%d): %ld.%06ld state:%s prev:%s action:%s prev: %s start:%llu\n", + __func__, a->info.container_member, + tv.tv_sec, tv.tv_usec, + array_states[a->curr_state], + array_states[a->prev_state], + sync_actions[a->curr_action], + sync_actions[a->prev_action], + 
a->info.resync_start + ); + if (a->curr_state > inactive && a->prev_state == inactive) { /* array has been started @@ -246,7 +277,7 @@ */ a->container->ss->set_array_state(a, 0); } - if (a->curr_state <= inactive && + if ((a->curr_state == bad_word || a->curr_state <= inactive) && a->prev_state > inactive) { /* array has been stopped */ a->container->ss->set_array_state(a, 1); @@ -269,8 +300,7 @@ a->container->ss->set_array_state(a, 1); } if (a->curr_state == active || - a->curr_state == suspended || - a->curr_state == bad_word) + a->curr_state == suspended) ret |= ARRAY_DIRTY; if (a->curr_state == readonly) { /* Well, I'm ready to handle things. If readonly @@ -398,7 +428,8 @@ if (sync_completed > a->last_checkpoint) a->last_checkpoint = sync_completed; - a->container->ss->sync_metadata(a->container); + if (deactivate || a->curr_state >= clean) + a->container->ss->sync_metadata(a->container); dprintf("%s(%d): state:%s action:%s next(", __func__, a->info.container_member, array_states[a->curr_state], sync_actions[a->curr_action]); @@ -573,9 +604,9 @@ */ int fd; if (sigterm) - fd = open_dev_excl(container->devnum); + fd = open_dev_excl(container->devnm); else - fd = open_dev_flags(container->devnum, O_RDONLY|O_EXCL); + fd = open_dev_flags(container->devnm, O_RDONLY|O_EXCL); if (fd >= 0 || errno != EBUSY) { /* OK, we are safe to leave */ if (sigterm && !dirty_arrays) @@ -586,7 +617,7 @@ /* On SIGTERM, someone (the take-over mdmon) will * clean up */ - remove_pidfile(container->devname); + remove_pidfile(container->devnm); exit_now = 1; signal_manager(); close(fd); @@ -609,10 +640,17 @@ monitor_loop_cnt |= 1; rv = pselect(maxfd+1, NULL, NULL, &rfds, &ts, &set); monitor_loop_cnt += 1; - if (rv == -1 && errno == EINTR) - rv = 0; + if (rv == -1) { + if (errno == EINTR) { + rv = 0; + dprintf("monitor: caught signal\n"); + } else + dprintf("monitor: error %d in pselect\n", + errno); + } #ifdef DEBUG - dprint_wake_reasons(&rfds); + else + dprint_wake_reasons(&rfds); #endif 
container->retry_soon = 0; } diff -Nru mdadm-3.2.5/Monitor.c mdadm-3.3/Monitor.c --- mdadm-3.2.5/Monitor.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Monitor.c 2013-09-03 04:47:47.000000000 +0000 @@ -30,25 +30,21 @@ #include #include -/* The largest number of disks current arrays can manage is 384 - * This really should be dynamically, but that will have to wait - * At least it isn't MD_SB_DISKS. - */ -#define MaxDisks 384 struct state { char *devname; - int devnum; /* to sync with mdstat info */ + char devnm[32]; /* to sync with mdstat info */ long utime; int err; char *spare_group; int active, working, failed, spare, raid; + int from_config; int expected_spares; - int devstate[MaxDisks]; - dev_t devid[MaxDisks]; + int devstate[MAX_DISKS]; + dev_t devid[MAX_DISKS]; int percent; - int parent_dev; /* For subarray, devnum of parent. - * For others, NoMdDev - */ + char parent_devnm[32]; /* For subarray, devnm of parent. + * For others, "" + */ struct supertype *metadata; struct state *subarray;/* for a container it is a link to first subarray * for a subarray it is a link to next subarray @@ -77,9 +73,10 @@ int Monitor(struct mddev_dev *devlist, char *mailaddr, char *alert_cmd, - int period, int daemonise, int scan, int oneshot, - int dosyslog, int test, char *pidfile, int increments, - int share, char *prefer) + struct context *c, + int daemonise, int oneshot, + int dosyslog, char *pidfile, int increments, + int share) { /* * Every few seconds, scan every md device looking for changes @@ -132,20 +129,20 @@ if (!mailaddr) { mailaddr = conf_get_mailaddr(); - if (mailaddr && ! scan) - fprintf(stderr, Name ": Monitor using email address \"%s\" from config file\n", + if (mailaddr && ! c->scan) + pr_err("Monitor using email address \"%s\" from config file\n", mailaddr); } mailfrom = conf_get_mailfrom(); if (!alert_cmd) { alert_cmd = conf_get_program(); - if (alert_cmd && ! 
scan) - fprintf(stderr, Name ": Monitor using program \"%s\" from config file\n", + if (alert_cmd && ! c->scan) + pr_err("Monitor using program \"%s\" from config file\n", alert_cmd); } - if (scan && !mailaddr && !alert_cmd && !dosyslog) { - fprintf(stderr, Name ": No mail address or alert command - not monitoring.\n"); + if (c->scan && !mailaddr && !alert_cmd && !dosyslog) { + pr_err("No mail address or alert command - not monitoring.\n"); return 1; } info.alert_cmd = alert_cmd; @@ -159,8 +156,8 @@ return rv; } - if (share) - if (check_one_sharer(scan)) + if (share) + if (check_one_sharer(c->scan)) return 1; if (devlist == NULL) { @@ -171,46 +168,42 @@ continue; if (strcasecmp(mdlist->devname, "") == 0) continue; - st = calloc(1, sizeof *st); - if (st == NULL) - continue; + st = xcalloc(1, sizeof *st); if (mdlist->devname[0] == '/') - st->devname = strdup(mdlist->devname); + st->devname = xstrdup(mdlist->devname); else { - st->devname = malloc(8+strlen(mdlist->devname)+1); + st->devname = xmalloc(8+strlen(mdlist->devname)+1); strcpy(strcpy(st->devname, "/dev/md/"), mdlist->devname); } st->next = statelist; - st->devnum = INT_MAX; - st->percent = -2; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; + st->from_config = 1; st->expected_spares = mdlist->spare_disks; if (mdlist->spare_group) - st->spare_group = strdup(mdlist->spare_group); + st->spare_group = xstrdup(mdlist->spare_group); statelist = st; } } else { struct mddev_dev *dv; for (dv=devlist ; dv; dv=dv->next) { struct mddev_ident *mdlist = conf_get_ident(dv->devname); - struct state *st = calloc(1, sizeof *st); - if (st == NULL) - continue; - st->devname = strdup(dv->devname); + struct state *st = xcalloc(1, sizeof *st); + st->devname = xstrdup(dv->devname); st->next = statelist; - st->devnum = INT_MAX; - st->percent = -2; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; st->expected_spares = -1; if (mdlist) { st->expected_spares = mdlist->spare_disks; if (mdlist->spare_group) - st->spare_group = 
strdup(mdlist->spare_group); + st->spare_group = xstrdup(mdlist->spare_group); } statelist = st; } } - while (! finished) { int new_found = 0; struct state *st; @@ -221,13 +214,13 @@ mdstat = mdstat_read(oneshot?0:1, 0); for (st=statelist; st; st=st->next) - if (check_array(st, mdstat, test, &info, - increments, prefer)) + if (check_array(st, mdstat, c->test, &info, + increments, c->prefer)) anydegraded = 1; - + /* now check if there are any new devices found in mdstat */ - if (scan) - new_found = add_new_arrays(mdstat, &statelist, test, + if (c->scan) + new_found = add_new_arrays(mdstat, &statelist, c->test, &info); /* If an array has active < raid && spare == 0 && spare_group != NULL @@ -240,9 +233,9 @@ if (oneshot) break; else - mdstat_wait(period); + mdstat_wait(c->delay); } - test = 0; + c->test = 0; } for (st2 = statelist; st2; st2 = statelist) { statelist = st2->next; @@ -306,13 +299,13 @@ rv = stat(dir, &buf); if (rv != -1) { if (scan) { - fprintf(stderr, Name ": Only one " + pr_err("Only one " "autorebuild process allowed" " in scan mode, aborting\n"); fclose(fp); return 1; } else { - fprintf(stderr, Name ": Warning: One" + pr_err("Warning: One" " autorebuild process already" " running.\n"); } @@ -322,12 +315,12 @@ if (scan) { if (mkdir(MDMON_DIR, S_IRWXU) < 0 && errno != EEXIST) { - fprintf(stderr, Name ": Can't create " + pr_err("Can't create " "autorebuild.pid file\n"); } else { fp = fopen(path, "w"); if (!fp) - fprintf(stderr, Name ": Cannot create" + pr_err("Cannot create" " autorebuild.pid" "file\n"); else { @@ -452,7 +445,7 @@ * or found by directly examining the array, and return * '1' if the array is degraded, or '0' if it is optimal (or dead). 
*/ - struct { int state, major, minor; } info[MaxDisks]; + struct { int state, major, minor; } info[MAX_DISKS]; mdu_array_info_t array; struct mdstat_ent *mse = NULL, *mse2; char *dev = st->devname; @@ -460,6 +453,7 @@ int i; int remaining_disks; int last_disk; + int new_array = 0; if (test) alert("TestMessage", dev, NULL, ainfo); @@ -482,26 +476,18 @@ * have a device disappear than all of them that can */ if (array.level == 0 || array.level == -1) { - if (!st->err) + if (!st->err && !st->from_config) alert("DeviceDisappeared", dev, "Wrong-Level", ainfo); st->err = 1; close(fd); return 0; } - if (st->devnum == INT_MAX) { - struct stat stb; - if (fstat(fd, &stb) == 0 && - (S_IFMT&stb.st_mode)==S_IFBLK) { - if (major(stb.st_rdev) == MD_MAJOR) - st->devnum = minor(stb.st_rdev); - else - st->devnum = -1- (minor(stb.st_rdev)>>6); - } - } + if (st->devnm[0] == 0) + strcpy(st->devnm, fd2devnm(fd)); for (mse2 = mdstat ; mse2 ; mse2=mse2->next) - if (mse2->devnum == st->devnum) { - mse2->devnum = INT_MAX; /* flag it as "used" */ + if (strcmp(mse2->devnm, st->devnm) == 0) { + mse2->devnm[0] = 0; /* flag it as "used" */ mse = mse2; } @@ -518,6 +504,14 @@ * just make sure it is always different. 
*/ array.utime = st->utime + 1;; + if (st->err) { + /* New array appeared where previously had and error */ + st->err = 0; + st->percent = RESYNC_NONE; + new_array = 1; + alert("NewArray", st->devname, NULL, ainfo); + } + if (st->utime == array.utime && st->failed == array.failed_disks && st->working == array.working_disks && @@ -526,7 +520,6 @@ mse->percent == st->percent ))) { close(fd); - st->err = 0; if ((st->active < st->raid) && st->spare == 0) return 1; else @@ -541,7 +534,7 @@ st->expected_spares > 0 && array.spare_disks < st->expected_spares) alert("SparesMissing", dev, NULL, ainfo); - if (st->percent == -1 && + if (st->percent < 0 && st->percent != RESYNC_UNKNOWN && mse->percent >= 0) alert("RebuildStarted", dev, NULL, ainfo); if (st->percent >= 0 && @@ -557,14 +550,14 @@ alert(percentalert, dev, NULL, ainfo); } - if (mse->percent == -1 && + if (mse->percent == RESYNC_NONE && st->percent >= 0) { /* Rebuild/sync/whatever just finished. * If there is a number in /mismatch_cnt, * we should report that. 
*/ struct mdinfo *sra = - sysfs_read(-1, st->devnum, GET_MISMATCH); + sysfs_read(-1, st->devnm, GET_MISMATCH); if (sra && sra->mismatch_cnt > 0) { char cnt[80]; snprintf(cnt, sizeof(cnt), @@ -579,7 +572,7 @@ st->percent = mse->percent; remaining_disks = array.nr_disks; - for (i=0; i 0; + for (i=0; i 0; i++) { mdu_disk_info_t disc; disc.number = i; @@ -596,27 +589,29 @@ if (mse->metadata_version && strncmp(mse->metadata_version, "external:", 9) == 0 && - is_subarray(mse->metadata_version+9)) - st->parent_dev = - devname2devnum(mse->metadata_version+10); - else - st->parent_dev = NoMdDev; + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, + mse->metadata_version+10); + sl = strchr(st->parent_devnm, '/'); + if (sl) + *sl = 0; + } else + st->parent_devnm[0] = 0; if (st->metadata == NULL && - st->parent_dev == NoMdDev) + st->parent_devnm[0] == 0) st->metadata = super_by_fd(fd, NULL); close(fd); - for (i=0; i= last_disk) { - newstate = 0; - disc.major = disc.minor = 0; - } else if (info[i].major || info[i].minor) { + if (i < last_disk && + (info[i].major || info[i].minor)) { newstate = info[i].state; dv = map_dev_preferred( info[i].major, info[i].minor, 1, @@ -624,37 +619,22 @@ disc.state = newstate; disc.major = info[i].major; disc.minor = info[i].minor; - } else if (mse && mse->pattern && i < (int)strlen(mse->pattern)) { - switch(mse->pattern[i]) { - case 'U': newstate = 6 /* ACTIVE/SYNC */; break; - case '_': newstate = 8 /* REMOVED */; break; - } - disc.major = disc.minor = 0; - } + } else + newstate = (1 << MD_DISK_REMOVED); + if (dv == NULL && st->devid[i]) dv = map_dev_preferred( major(st->devid[i]), minor(st->devid[i]), 1, prefer); change = newstate ^ st->devstate[i]; - if (st->utime && change && !st->err) { - if (i < array.raid_disks && - (((newstate&change)&(1<devstate[i]&change)&(1<devstate[i]&change)&(1<utime && change && !st->err && !new_array) { + if ((st->devstate[i]&change)&(1<= array.raid_disks && + else if ((newstate & 
(1<devid[i] == makedev(disc.major, disc.minor) && - ((newstate&change)&(1<devid[i] == makedev(disc.major, disc.minor)) alert("FailSpare", dev, dv, ainfo); - else if (i < array.raid_disks && - ! (newstate & (1<devstate[i]&change)&(1<devstate[i] = newstate; @@ -679,17 +659,15 @@ int new_found = 0; for (mse=mdstat; mse; mse=mse->next) - if (mse->devnum != INT_MAX && + if (mse->devnm[0] && (!mse->level || /* retrieve containers */ (strcmp(mse->level, "raid0") != 0 && strcmp(mse->level, "linear") != 0)) ) { - struct state *st = calloc(1, sizeof *st); + struct state *st = xcalloc(1, sizeof *st); mdu_array_info_t array; int fd; - if (st == NULL) - continue; - st->devname = strdup(get_md_name(mse->devnum)); + st->devname = xstrdup(get_md_name(mse->devnm)); if ((fd = open(st->devname, O_RDONLY)) < 0 || ioctl(fd, GET_ARRAY_INFO, &array)< 0) { /* no such array */ @@ -706,20 +684,22 @@ close(fd); st->next = *statelist; st->err = 1; - st->devnum = mse->devnum; - st->percent = -2; + strcpy(st->devnm, mse->devnm); + st->percent = RESYNC_UNKNOWN; st->expected_spares = -1; if (mse->metadata_version && strncmp(mse->metadata_version, "external:", 9) == 0 && - is_subarray(mse->metadata_version+9)) - st->parent_dev = - devname2devnum(mse->metadata_version+10); - else - st->parent_dev = NoMdDev; + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, + mse->metadata_version+10); + sl = strchr(st->parent_devnm, '/'); + *sl = 0; + } else + st->parent_devnm[0] = 0; *statelist = st; if (test) alert("TestMessage", st->devname, NULL, info); - alert("NewArray", st->devname, NULL, info); new_found = 1; } return new_found; @@ -782,7 +762,7 @@ int d; dev_t dev = 0; - for (d = from->raid; !dev && d < MaxDisks; d++) { + for (d = from->raid; !dev && d < MAX_DISKS; d++) { if (from->devid[d] > 0 && from->devstate[d] == 0) { struct dev_policy *pol; @@ -797,7 +777,7 @@ dev_size < min_size) continue; - pol = devnum_policy(from->devid[d]); + pol = devid_policy(from->devid[d]); if 
(from->spare_group) pol_add(&pol, pol_domain, from->spare_group, NULL); @@ -828,12 +808,12 @@ close(fd); return 0; } - + err = st->ss->load_container(st, fd, NULL); close(fd); if (err) return 0; - + if (from == to) { /* We must check if number of active disks has not increased * since ioctl in main loop. mdmon may have added spare @@ -874,7 +854,6 @@ return dev; } - static void try_spare_migration(struct state *statelist, struct alert_info *info) { struct state *from; @@ -889,11 +868,11 @@ struct state *to = st; unsigned long long min_size; - if (to->parent_dev != NoMdDev && !to->parent) + if (to->parent_devnm[0] && !to->parent) /* subarray monitored without parent container * we can't move spares here */ continue; - + if (to->parent) /* member of a container */ to = to->parent; @@ -909,7 +888,7 @@ if (devid > 0) continue; } - for (d = 0; d < MaxDisks; d++) + for (d = 0; d < MAX_DISKS; d++) if (to->devid[d]) domainlist_add_dev(&domlist, to->devid[d], @@ -957,11 +936,11 @@ st->subarray = NULL; } for (st = list; st; st = st->next) - if (st->parent_dev != NoMdDev) + if (st->parent_devnm[0]) for (cont = list; cont; cont = cont->next) if (!cont->err && - cont->parent_dev == NoMdDev && - cont->devnum == st->parent_dev) { + cont->parent_devnm[0] == 0 && + strcmp(cont->devnm, st->parent_devnm) == 0) { st->parent = cont; st->subarray = cont->subarray; cont->subarray = st; @@ -973,31 +952,44 @@ int Wait(char *dev) { struct stat stb; - int devnum; + char devnm[32]; int rv = 1; if (stat(dev, &stb) != 0) { - fprintf(stderr, Name ": Cannot find %s: %s\n", dev, + pr_err("Cannot find %s: %s\n", dev, strerror(errno)); return 2; } - devnum = stat2devnum(&stb); + strcpy(devnm, stat2devnm(&stb)); while(1) { struct mdstat_ent *ms = mdstat_read(1, 0); struct mdstat_ent *e; for (e=ms ; e; e=e->next) - if (e->devnum == devnum) + if (strcmp(e->devnm, devnm) == 0) break; - if (!e || e->percent < 0) { + if (e->percent == RESYNC_NONE) { + /* We could be in the brief pause before something + * 
starts. /proc/mdstat doesn't show that, but + * sync_action does. + */ + struct mdinfo mdi; + char buf[21]; + sysfs_init(&mdi, -1, devnm); + if (sysfs_get_str(&mdi, NULL, "sync_action", + buf, 20) > 0 && + strcmp(buf,"idle\n") != 0) + e->percent = RESYNC_UNKNOWN; + } + if (!e || e->percent == RESYNC_NONE) { if (e && e->metadata_version && strncmp(e->metadata_version, "external:", 9) == 0) { if (is_subarray(&e->metadata_version[9])) ping_monitor(&e->metadata_version[9]); else - ping_monitor_by_id(devnum); + ping_monitor(devnm); } free_mdstat(ms); return rv; @@ -1018,21 +1010,21 @@ int fd; struct mdinfo *mdi; int rv = 1; - int devnum; + char devnm[32]; fd = open(dev, O_RDONLY); if (fd < 0) { if (verbose) - fprintf(stderr, Name ": Couldn't open %s: %s\n", dev, strerror(errno)); + pr_err("Couldn't open %s: %s\n", dev, strerror(errno)); return 1; } - devnum = fd2devnum(fd); - mdi = sysfs_read(fd, devnum, GET_VERSION|GET_LEVEL|GET_SAFEMODE); + strcpy(devnm, fd2devnm(fd)); + mdi = sysfs_read(fd, devnm, GET_VERSION|GET_LEVEL|GET_SAFEMODE); if (!mdi) { if (verbose) - fprintf(stderr, Name ": Failed to read sysfs attributes for " - "%s\n", dev); + pr_err("Failed to read sysfs attributes for " + "%s\n", dev); close(fd); return 0; } @@ -1056,19 +1048,14 @@ rv = 0; if (rv) { - int state_fd = sysfs_open(fd2devnum(fd), NULL, "array_state"); + int state_fd = sysfs_open(fd2devnm(fd), NULL, "array_state"); char buf[20]; - fd_set fds; - struct timeval tm; + int delay = 5000; /* minimize the safe_mode_delay and prepare to wait up to 5s * for writes to quiesce */ sysfs_set_safemode(mdi, 1); - tm.tv_sec = 5; - tm.tv_usec = 0; - - FD_ZERO(&fds); /* wait for array_state to be clean */ while (1) { @@ -1077,8 +1064,7 @@ break; if (sysfs_match_word(buf, clean_states) <= 4) break; - FD_SET(state_fd, &fds); - rv = select(state_fd + 1, NULL, NULL, &fds, &tm); + rv = sysfs_wait(state_fd, &delay); if (rv < 0 && errno != EINTR) break; lseek(state_fd, 0, SEEK_SET); @@ -1095,7 +1081,7 @@ } else rv = 
1; if (rv && verbose) - fprintf(stderr, Name ": Error waiting for %s to be clean\n", + pr_err("Error waiting for %s to be clean\n", dev); /* restore the original safe_mode_delay */ diff -Nru mdadm-3.2.5/msg.c mdadm-3.3/msg.c --- mdadm-3.2.5/msg.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/msg.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * Copyright (C) 2008 Intel Corporation * - * mdmon socket / message handling + * mdmon socket / message handling * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -78,7 +78,6 @@ return 0; } - int send_message(int fd, struct metadata_update *msg, int tmo) { __s32 len = msg->len; @@ -106,9 +105,7 @@ if (rv < 0 || len > MSG_MAX_LEN) return -1; if (len > 0) { - msg->buf = malloc(len); - if (msg->buf == NULL) - return -1; + msg->buf = xmalloc(len); rv = recv_buf(fd, msg->buf, len, tmo); if (rv < 0) { free(msg->buf); @@ -218,20 +215,6 @@ return err; } -/* ping monitor using device number */ -int ping_monitor_by_id(int devnum) -{ - int err = -1; - char *container = devnum2devname(devnum); - - if (container) { - err = ping_monitor(container); - free(container); - } - - return err; -} - static char *ping_monitor_version(char *devname) { int sfd = connect_monitor(devname); @@ -293,9 +276,8 @@ int check_mdmon_version(char *container) { char *version = NULL; - int devnum = devname2devnum(container); - if (!mdmon_running(devnum)) { + if (!mdmon_running(container)) { /* if mdmon is not active we assume that any instance that is * later started will match the current mdadm version, if this * assumption is violated we may inadvertantly rebuild an array @@ -310,9 +292,8 @@ ver = version ? 
mdadm_version(version) : -1; free(version); if (ver < 3002000) { - fprintf(stderr, Name - ": mdmon instance for %s cannot be disabled\n", - container); + pr_err("mdmon instance for %s cannot be disabled\n", + container); return -1; } } @@ -351,8 +332,7 @@ ent = mdstat_read(0, 0); if (!ent) { - fprintf(stderr, Name - ": failed to read /proc/mdstat while disabling mdmon\n"); + pr_err("failed to read /proc/mdstat while disabling mdmon\n"); return -1; } @@ -361,11 +341,10 @@ if (!is_container_member(e, container)) continue; sysfs_free(sra); - sra = sysfs_read(-1, e->devnum, GET_VERSION); + sra = sysfs_read(-1, e->devnm, GET_VERSION); if (!sra) { - fprintf(stderr, Name - ": failed to read sysfs for subarray%s\n", - to_subarray(e, container)); + pr_err("failed to read sysfs for subarray%s\n", + to_subarray(e, container)); break; } /* can't reshape an array that we can't monitor */ @@ -398,7 +377,7 @@ * or part-spares */ sysfs_free(sra); - sra = sysfs_read(-1, e->devnum, GET_DEVS | GET_STATE); + sra = sysfs_read(-1, e->devnm, GET_DEVS | GET_STATE); if (sra && sra->array.spare_disks > 0) { unblock_subarray(sra, freeze); break; @@ -406,7 +385,7 @@ } if (e) { - fprintf(stderr, Name ": failed to freeze subarray%s\n", + pr_err("failed to freeze subarray%s\n", to_subarray(e, container)); /* thaw the partially frozen container */ @@ -414,9 +393,9 @@ if (!is_container_member(e2, container)) continue; sysfs_free(sra); - sra = sysfs_read(-1, e2->devnum, GET_VERSION); + sra = sysfs_read(-1, e2->devnm, GET_VERSION); if (unblock_subarray(sra, freeze)) - fprintf(stderr, Name ": Failed to unfreeze %s\n", e2->dev); + pr_err("Failed to unfreeze %s\n", e2->dev); } ping_monitor(container); /* cleared frozen */ @@ -437,8 +416,7 @@ ent = mdstat_read(0, 0); if (!ent) { - fprintf(stderr, Name - ": failed to read /proc/mdstat while unblocking container\n"); + pr_err("failed to read /proc/mdstat while unblocking container\n"); return; } @@ -447,13 +425,13 @@ if (!is_container_member(e, 
container)) continue; sysfs_free(sra); - sra = sysfs_read(-1, e->devnum, GET_VERSION|GET_LEVEL); + sra = sysfs_read(-1, e->devnm, GET_VERSION|GET_LEVEL); if (!sra) continue; if (sra->array.level > 0) to_ping++; if (unblock_subarray(sra, unfreeze)) - fprintf(stderr, Name ": Failed to unfreeze %s\n", e->dev); + pr_err("Failed to unfreeze %s\n", e->dev); } if (to_ping) ping_monitor(container); @@ -462,8 +440,6 @@ free_mdstat(ent); } - - /* give the manager a chance to view the updated container state. This * would naturally happen due to the manager noticing a change in * /proc/mdstat; however, pinging encourages this detection to happen diff -Nru mdadm-3.2.5/msg.h mdadm-3.3/msg.h --- mdadm-3.2.5/msg.h 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/msg.h 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * Copyright (C) 2008 Intel Corporation * - * mdmon socket / message handling + * mdmon socket / message handling * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -17,7 +17,6 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
*/ - struct mdinfo; struct metadata_update; @@ -27,7 +26,6 @@ extern int wait_reply(int fd, int tmo); extern int connect_monitor(char *devname); extern int ping_monitor(char *devname); -extern int ping_monitor_by_id(int devnum); extern int block_subarray(struct mdinfo *sra); extern int unblock_subarray(struct mdinfo *sra, const int unfreeze); extern int block_monitor(char *container, const int freeze); diff -Nru mdadm-3.2.5/part.h mdadm-3.3/part.h --- mdadm-3.2.5/part.h 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/part.h 2013-09-03 04:47:47.000000000 +0000 @@ -26,7 +26,6 @@ /* Structure definitions ext for MBR and GPT partition tables */ - #define MBR_SIGNATURE_MAGIC __cpu_to_le16(0xAA55) #define MBR_PARTITIONS 4 @@ -49,8 +48,6 @@ __u16 magic; } __attribute__((packed)); - - #define GPT_SIGNATURE_MAGIC __cpu_to_le64(0x5452415020494645ULL) #define MBR_GPT_PARTITION_TYPE 0xEE diff -Nru mdadm-3.2.5/platform-intel.c mdadm-3.3/platform-intel.c --- mdadm-3.2.5/platform-intel.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/platform-intel.c 2013-09-03 04:47:47.000000000 +0000 @@ -30,13 +30,12 @@ #include #include - static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long long *val); static __u16 devpath_to_vendor(const char *dev_path); -void free_sys_dev(struct sys_dev **list) +static void free_sys_dev(struct sys_dev **list) { while (*list) { struct sys_dev *next = (*list)->next; @@ -102,10 +101,10 @@ /* start / add list entry */ if (!head) { - head = malloc(sizeof(*head)); + head = xmalloc(sizeof(*head)); list = head; } else { - list->next = malloc(sizeof(*head)); + list->next = xmalloc(sizeof(*head)); list = list->next; } @@ -116,7 +115,7 @@ list->dev_id = (__u16) dev_id; list->type = type; - list->path = canonicalize_file_name(path); + list->path = realpath(path, NULL); list->next = NULL; if ((list->pci_id = strrchr(list->path, '/')) != NULL) list->pci_id++; @@ -125,8 +124,8 @@ return head; } - static struct sys_dev *intel_devices=NULL; 
+static time_t valid_time = 0; static enum sys_dev_type device_type_by_id(__u16 device_id) { @@ -154,7 +153,6 @@ return n; } - static __u16 devpath_to_vendor(const char *dev_path) { char path[strlen(dev_path) + strlen("/vendor") + 1]; @@ -183,6 +181,12 @@ { struct sys_dev *ahci, *isci; + if (valid_time > time(0) - 10) + return intel_devices; + + if (intel_devices) + free_sys_dev(&intel_devices); + isci = find_driver_devices("pci", "isci"); ahci = find_driver_devices("pci", "ahci"); @@ -194,7 +198,9 @@ elem = elem->next; elem->next = isci; } - return ahci; + intel_devices = ahci; + valid_time = time(0); + return intel_devices; } /* @@ -245,7 +251,6 @@ return 0; } - const struct imsm_orom *imsm_platform_test(enum sys_dev_type hba_id, int *populated, struct imsm_orom *imsm_orom) { @@ -280,8 +285,6 @@ return imsm_orom; } - - static const struct imsm_orom *find_imsm_hba_orom(enum sys_dev_type hba_id) { unsigned long align; @@ -297,7 +300,7 @@ } if (check_env("IMSM_TEST_OROM")) { dprintf("OROM CAP: %p, pid: %d pop: %d\n", - &imsm_orom[hba_id], (int) getpid(), populated_orom[hba_id]); + &imsm_orom[hba_id], (int) getpid(), populated_orom[hba_id]); return imsm_platform_test(hba_id, &populated_orom[hba_id], &imsm_orom[hba_id]); } /* return empty OROM capabilities in EFI test mode */ @@ -305,11 +308,7 @@ check_env("IMSM_TEST_SCU_EFI")) return NULL; - - if (intel_devices != NULL) - free_sys_dev(&intel_devices); - - intel_devices = find_intel_devices(); + find_intel_devices(); if (intel_devices == NULL) return NULL; @@ -326,10 +325,6 @@ scan_adapter_roms(scan); probe_roms_exit(); - if (intel_devices != NULL) - free_sys_dev(&intel_devices); - intel_devices = NULL; - if (populated_orom[hba_id]) return &imsm_orom[hba_id]; return NULL; @@ -345,7 +340,6 @@ (c) & 0xff, ((c) >> 8) & 0xff, \ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) - #define SYS_EFI_VAR_PATH "/sys/firmware/efi/vars" #define SCU_PROP "RstScuV" #define AHCI_PROP "RstSataV" @@ -381,7 +375,7 @@ errno = 0; 
var_data_len = strtoul(buf, NULL, 16); if ((errno == ERANGE && (var_data_len == LONG_MAX)) - || (errno != 0 && var_data_len == 0)) + || (errno != 0 && var_data_len == 0)) return 1; /* get data */ @@ -446,7 +440,6 @@ { const struct imsm_orom *cap=NULL; - if ((cap = find_imsm_efi(hba_id)) != NULL) return cap; if ((cap = find_imsm_hba_orom(hba_id)) != NULL) @@ -459,7 +452,7 @@ char device[46]; sprintf(device, "/sys/dev/block/%d:%d/device", major(dev), minor(dev)); - return canonicalize_file_name(device); + return realpath(device, NULL); } char *diskfd_to_devpath(int fd) diff -Nru mdadm-3.2.5/platform-intel.h mdadm-3.3/platform-intel.h --- mdadm-3.2.5/platform-intel.h 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/platform-intel.h 2013-09-03 04:47:47.000000000 +0000 @@ -133,7 +133,6 @@ return !!(orom->sss & (1 << (fs - 1))); } - /** * fls - find last (most-significant) bit set * @x: the word to search @@ -202,11 +201,9 @@ char *diskfd_to_devpath(int fd); struct sys_dev *find_driver_devices(const char *bus, const char *driver); struct sys_dev *find_intel_devices(void); -void free_sys_dev(struct sys_dev **list); const struct imsm_orom *find_imsm_capability(enum sys_dev_type hba_id); const struct imsm_orom *find_imsm_orom(void); int disk_attached_to_hba(int fd, const char *hba_path); char *devt_to_devpath(dev_t dev); int path_attached_to_hba(const char *disk_path, const char *hba_path); const char *get_sys_dev_type(enum sys_dev_type); - diff -Nru mdadm-3.2.5/policy.c mdadm-3.3/policy.c --- mdadm-3.2.5/policy.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/policy.c 2013-09-03 04:47:47.000000000 +0000 @@ -43,7 +43,7 @@ static void pol_new(struct dev_policy **pol, char *name, const char *val, const char *metadata) { - struct dev_policy *n = malloc(sizeof(*n)); + struct dev_policy *n = xmalloc(sizeof(*n)); const char *real_metadata = NULL; int i; @@ -67,7 +67,7 @@ if (!real_metadata) { static const char *prev = NULL; if (prev != metadata) { - fprintf(stderr, Name ": 
metadata=%s unrecognised - ignoring rule\n", + pr_err("metadata=%s unrecognised - ignoring rule\n", metadata); prev = metadata; } @@ -217,7 +217,7 @@ if (stb.st_rdev != makedev(disk->disk.major, disk->disk.minor)) continue; closedir(by_path); - return strdup(ent->d_name); + return xstrdup(ent->d_name); } closedir(by_path); /* A NULL path isn't really acceptable - use the devname.. */ @@ -228,9 +228,9 @@ nm[rv] = 0; dname = strrchr(nm, '/'); if (dname) - return strdup(dname + 1); + return xstrdup(dname + 1); } - return strdup("unknown"); + return xstrdup("unknown"); } char type_part[] = "part"; @@ -401,7 +401,6 @@ pol_dedup(*pol); } - /* * disk_policy() gathers policy information for the * disk described in the given mdinfo (disk.{major,minor}). @@ -421,7 +420,7 @@ return pol; } -struct dev_policy *devnum_policy(int dev) +struct dev_policy *devid_policy(int dev) { struct mdinfo disk; disk.disk.major = major(dev); @@ -451,10 +450,10 @@ if (strncmp(w, name, len) != 0 || w[len] != '=') return 0; - r = malloc(sizeof(*r)); + r = xmalloc(sizeof(*r)); r->next = *rp; r->name = name; - r->value = strdup(w+len+1); + r->value = xstrdup(w+len+1); r->dups = NULL; *rp = r; return 1; @@ -468,7 +467,7 @@ if (config_rules_end == NULL) config_rules_end = &config_rules; - pr = malloc(sizeof(*pr)); + pr = xmalloc(sizeof(*pr)); pr->type = type; pr->rule = NULL; for (w = dl_next(line); w != line ; w = dl_next(w)) { @@ -479,7 +478,7 @@ ! try_rule(w, pol_act, &pr->rule) && ! try_rule(w, pol_domain, &pr->rule) && ! 
try_rule(w, pol_auto, &pr->rule)) - fprintf(stderr, Name ": policy rule %s unrecognised and ignored\n", + pr_err("policy rule %s unrecognised and ignored\n", w); } pr->next = config_rules; @@ -492,7 +491,7 @@ struct pol_rule *pr; char *name, *val; - pr = malloc(sizeof(*pr)); + pr = xmalloc(sizeof(*pr)); pr->type = type; pr->rule = NULL; @@ -501,10 +500,10 @@ struct rule *r; val = va_arg(ap, char*); - r = malloc(sizeof(*r)); + r = xmalloc(sizeof(*r)); r->next = pr->rule; r->name = name; - r->value = strdup(val); + r->value = xstrdup(val); r->dups = NULL; pr->rule = r; } @@ -592,7 +591,6 @@ return rv; } - /* Domain policy: * Any device can have a list of domains asserted by different policy * statements. @@ -618,7 +616,7 @@ dom = *domp; } if (dom == NULL || strcmp(dom->dom, domain) != 0) { - dom = malloc(sizeof(*dom)); + dom = xmalloc(sizeof(*dom)); dom->next = *domp; dom->dom = domain; *domp = dom; @@ -677,9 +675,9 @@ return found_any; } -void domainlist_add_dev(struct domainlist **dom, int devnum, const char *metadata) +void domainlist_add_dev(struct domainlist **dom, int devid, const char *metadata) { - struct dev_policy *pol = devnum_policy(devnum); + struct dev_policy *pol = devid_policy(devid); domain_merge(dom, pol, metadata); dev_policy_free(pol); } @@ -703,7 +701,6 @@ domain_merge_one(domp, domain); } - void domain_free(struct domainlist *dl) { while (dl) { @@ -731,7 +728,7 @@ FILE *f = NULL; if (mkdir(FAILED_SLOTS_DIR, S_IRWXU) < 0 && errno != EEXIST) { - fprintf(stderr, Name ": can't create file to save path " + pr_err("can't create file to save path " "to old disk: %s\n", strerror(errno)); return; } @@ -739,7 +736,7 @@ snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path); f = fopen(path, "w"); if (!f) { - fprintf(stderr, Name ": can't create file to" + pr_err("can't create file to" " save path to old disk: %s\n", strerror(errno)); return; @@ -749,8 +746,8 @@ array->metadata, array->uuid[0], array->uuid[1], array->uuid[2], array->uuid[3]) <= 0) - 
fprintf(stderr, Name ": Failed to write to " - " cookie\n"); + pr_err("Failed to write to " + " cookie\n"); fclose(f); } @@ -791,13 +788,13 @@ /* find rule named rule_type and return its value */ char *find_rule(struct rule *rule, char *rule_type) { - while (rule) { - if (rule->name == rule_type) - return rule->value; - - rule = rule->next; - } - return NULL; + while (rule) { + if (rule->name == rule_type) + return rule->value; + + rule = rule->next; + } + return NULL; } #define UDEV_RULE_FORMAT \ @@ -834,44 +831,44 @@ */ int generate_entries(int fd) { - struct pol_rule *loop, *dup; - char *loop_value, *dup_value; - int duplicate; - - for (loop = config_rules; loop; loop = loop->next) { - if (loop->type != rule_policy && loop->type != rule_part) - continue; - duplicate = 0; - - /* only policies with paths and with actions supporting - * bare disks are considered */ - loop_value = find_rule(loop->rule, pol_act); - if (!loop_value || map_act(loop_value) < act_spare_same_slot) - continue; - loop_value = find_rule(loop->rule, rule_path); - if (!loop_value) - continue; - for (dup = config_rules; dup != loop; dup = dup->next) { - if (dup->type != rule_policy && loop->type != rule_part) - continue; - dup_value = find_rule(dup->rule, pol_act); - if (!dup_value || map_act(dup_value) < act_spare_same_slot) - continue; - dup_value = find_rule(dup->rule, rule_path); - if (!dup_value) - continue; - if (strcmp(loop_value, dup_value) == 0) { - duplicate = 1; - break; - } - } - - /* not a dup or first occurrence */ - if (!duplicate) - if (!write_rule(loop->rule, fd, loop->type == rule_part) ) - return 0; - } - return 1; + struct pol_rule *loop, *dup; + char *loop_value, *dup_value; + int duplicate; + + for (loop = config_rules; loop; loop = loop->next) { + if (loop->type != rule_policy && loop->type != rule_part) + continue; + duplicate = 0; + + /* only policies with paths and with actions supporting + * bare disks are considered */ + loop_value = find_rule(loop->rule, pol_act); + 
if (!loop_value || map_act(loop_value) < act_spare_same_slot) + continue; + loop_value = find_rule(loop->rule, rule_path); + if (!loop_value) + continue; + for (dup = config_rules; dup != loop; dup = dup->next) { + if (dup->type != rule_policy && loop->type != rule_part) + continue; + dup_value = find_rule(dup->rule, pol_act); + if (!dup_value || map_act(dup_value) < act_spare_same_slot) + continue; + dup_value = find_rule(dup->rule, rule_path); + if (!dup_value) + continue; + if (strcmp(loop_value, dup_value) == 0) { + duplicate = 1; + break; + } + } + + /* not a dup or first occurrence */ + if (!duplicate) + if (!write_rule(loop->rule, fd, loop->type == rule_part) ) + return 0; + } + return 1; } /* Write_rules routine creates dynamic udev rules used to handle @@ -879,40 +876,40 @@ */ int Write_rules(char *rule_name) { - int fd; - char udev_rule_file[PATH_MAX]; + int fd; + char udev_rule_file[PATH_MAX]; - if (rule_name) { - strncpy(udev_rule_file, rule_name, sizeof(udev_rule_file) - 6); - udev_rule_file[sizeof(udev_rule_file) - 6] = '\0'; - strcat(udev_rule_file, ".temp"); - fd = creat(udev_rule_file, - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - if (fd == -1) - return 1; - } else - fd = 1; - - /* write static invocation */ - if (write(fd, udev_template_start, - sizeof(udev_template_start) - 1) - != (int)sizeof(udev_template_start)-1) - goto abort; - - /* iterate, if none created or error occurred, remove file */ - if (generate_entries(fd) < 0) - goto abort; - - fsync(fd); - if (rule_name) { - close(fd); - rename(udev_rule_file, rule_name); - } - return 0; + if (rule_name) { + strncpy(udev_rule_file, rule_name, sizeof(udev_rule_file) - 6); + udev_rule_file[sizeof(udev_rule_file) - 6] = '\0'; + strcat(udev_rule_file, ".temp"); + fd = creat(udev_rule_file, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd == -1) + return 1; + } else + fd = 1; + + /* write static invocation */ + if (write(fd, udev_template_start, + sizeof(udev_template_start) - 1) + != 
(int)sizeof(udev_template_start)-1) + goto abort; + + /* iterate, if none created or error occurred, remove file */ + if (generate_entries(fd) < 0) + goto abort; + + fsync(fd); + if (rule_name) { + close(fd); + rename(udev_rule_file, rule_name); + } + return 0; abort: - if (rule_name) { - close(fd); - unlink(udev_rule_file); - } - return 1; + if (rule_name) { + close(fd); + unlink(udev_rule_file); + } + return 1; } diff -Nru mdadm-3.2.5/probe_roms.c mdadm-3.3/probe_roms.c --- mdadm-3.2.5/probe_roms.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/probe_roms.c 2013-09-03 04:47:47.000000000 +0000 @@ -109,7 +109,7 @@ if (rc == 0) rom_fd = fd; else { - if (fd >= 0) + if (fd >= 0) close(fd); probe_roms_exit(); } @@ -150,39 +150,39 @@ }; static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", + .name = "Adapter ROM", .start = 0xc8000, .data = 0, .end = 0, }, { - .name = "Adapter ROM", + .name = "Adapter ROM", .start = 0, .data = 0, .end = 0, }, { - .name = "Adapter ROM", + .name = "Adapter ROM", .start = 0, .data = 0, .end = 0, }, { - .name = "Adapter ROM", + .name = "Adapter ROM", .start = 0, .data = 0, .end = 0, }, { - .name = "Adapter ROM", + .name = "Adapter ROM", .start = 0, .data = 0, .end = 0, }, { - .name = "Adapter ROM", + .name = "Adapter ROM", .start = 0, .data = 0, .end = 0, } }; static struct resource video_rom_resource = { - .name = "Video ROM", + .name = "Video ROM", .start = 0xc0000, .data = 0, .end = 0xc7fff, diff -Nru mdadm-3.2.5/Query.c mdadm-3.3/Query.c --- mdadm-3.2.5/Query.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/Query.c 2013-09-03 04:47:47.000000000 +0000 @@ -47,7 +47,7 @@ char *activity; if (fd < 0){ - fprintf(stderr, Name ": cannot open %s: %s\n", + pr_err("cannot open %s: %s\n", dev, strerror(errno)); return 1; } @@ -76,22 +76,26 @@ else { printf("%s: %s %s %d devices, %d spare%s. 
Use mdadm --detail for more detail.\n", dev, - human_size_brief(larray_size), + human_size_brief(larray_size,IEC), map_num(pers, array.level), array.raid_disks, array.spare_disks, array.spare_disks==1?"":"s"); } st = guess_super(fd); - if (st) + if (st && st->ss->compare_super != NULL) superror = st->ss->load_super(st, fd, dev); else superror = -1; close(fd); if (superror == 0) { /* array might be active... */ + int uuid[4]; + struct map_ent *me, *map = NULL; st->ss->getinfo_super(st, &info, NULL); - if (st->ss == &super0) { - mddev = get_md_name(info.array.md_minor); + st->ss->uuid_from_super(st, uuid); + me = map_by_uuid(&map, uuid); + if (me) { + mddev = me->path; disc.number = info.disk.number; activity = "undetected"; if (mddev && (fd = open(mddev, O_RDONLY))>=0) { @@ -106,7 +110,7 @@ close(fd); } } else { - activity = "unknown"; + activity = "inactive"; mddev = "array"; } printf("%s: device %d in %d device %s %s %s. Use mdadm --examine for more detail.\n", @@ -120,4 +124,3 @@ } return 0; } - diff -Nru mdadm-3.2.5/raid6check.c mdadm-3.3/raid6check.c --- mdadm-3.2.5/raid6check.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/raid6check.c 2013-09-03 04:47:47.000000000 +0000 @@ -27,10 +27,21 @@ #include #include +enum repair { + NO_REPAIR = 0, + MANUAL_REPAIR, + AUTO_REPAIR +}; + int geo_map(int block, unsigned long long stripe, int raid_disks, int level, int layout); void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size); void make_tables(void); +void ensure_zero_has_size(int chunk_size); +void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs); +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + uint8_t **ptrs); +void xor_blocks(char *target, char **sources, int disks, int size); /* Collect per stripe consistency information */ void raid6_collect(int chunk_size, uint8_t *p, uint8_t *q, @@ -101,37 +112,59 @@ return curr_broken_disk; } +int lock_stripe(struct mdinfo *info, unsigned long long 
start, + int chunk_size, int data_disks, sighandler_t *sig) { + int rv; + if(mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + return 2; + } + + sig[0] = signal(SIGTERM, SIG_IGN); + sig[1] = signal(SIGINT, SIG_IGN); + sig[2] = signal(SIGQUIT, SIG_IGN); + + rv = sysfs_set_num(info, NULL, "suspend_lo", start * chunk_size * data_disks); + rv |= sysfs_set_num(info, NULL, "suspend_hi", (start + 1) * chunk_size * data_disks); + return rv * 256; +} + +int unlock_all_stripes(struct mdinfo *info, sighandler_t *sig) { + int rv; + rv = sysfs_set_num(info, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + rv |= sysfs_set_num(info, NULL, "suspend_hi", 0); + rv |= sysfs_set_num(info, NULL, "suspend_lo", 0); + + signal(SIGQUIT, sig[2]); + signal(SIGINT, sig[1]); + signal(SIGTERM, sig[0]); + + if(munlockall() != 0) + return 3; + return rv * 256; +} + int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets, int raid_disks, int chunk_size, int level, int layout, - unsigned long long start, unsigned long long length, char *name[]) + unsigned long long start, unsigned long long length, char *name[], + enum repair repair, int failed_disk1, int failed_disk2) { /* read the data and p and q blocks, and check we got them right */ - char *stripe_buf = malloc(raid_disks * chunk_size); - char **stripes = malloc(raid_disks * sizeof(char*)); - char **blocks = malloc(raid_disks * sizeof(char*)); - uint8_t *p = malloc(chunk_size); - uint8_t *q = malloc(chunk_size); - int *results = malloc(chunk_size * sizeof(int)); + char *stripe_buf = xmalloc(raid_disks * chunk_size); + char **stripes = xmalloc(raid_disks * sizeof(char*)); + char **blocks = xmalloc(raid_disks * sizeof(char*)); + int *block_index_for_slot = xmalloc(raid_disks * sizeof(int)); + uint8_t *p = xmalloc(chunk_size); + uint8_t *q = xmalloc(chunk_size); + int *results = xmalloc(chunk_size * sizeof(int)); + sighandler_t *sig = xmalloc(3 * sizeof(sighandler_t)); int i; int diskP, diskQ; int data_disks = raid_disks - 2; int 
err = 0; - sighandler_t sig[3]; - int rv; extern int tables_ready; - if((stripe_buf == NULL) || - (stripes == NULL) || - (blocks == NULL) || - (p == NULL) || - (q == NULL) || - (results == NULL)) { - err = 1; - goto exitCheck; - } - if (!tables_ready) make_tables(); @@ -143,47 +176,51 @@ printf("pos --> %llu\n", start); - if(mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { - err = 2; + err = lock_stripe(info, start, chunk_size, data_disks, sig); + if(err != 0) { + if (err != 2) + unlock_all_stripes(info, sig); goto exitCheck; } - sig[0] = signal(SIGTERM, SIG_IGN); - sig[1] = signal(SIGINT, SIG_IGN); - sig[2] = signal(SIGQUIT, SIG_IGN); - rv = sysfs_set_num(info, NULL, "suspend_lo", start * chunk_size * data_disks); - rv |= sysfs_set_num(info, NULL, "suspend_hi", (start + 1) * chunk_size * data_disks); for (i = 0 ; i < raid_disks ; i++) { - lseek64(source[i], offsets[i] + start * chunk_size, 0); - read(source[i], stripes[i], chunk_size); + off64_t seek_res = lseek64(source[i], offsets[i] + start * chunk_size, + SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek to source %d failed\n", i); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } + int read_res = read(source[i], stripes[i], chunk_size); + if (read_res < chunk_size) { + fprintf(stderr, "Failed to read complete chunk disk %d, aborting\n", i); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } } - rv |= sysfs_set_num(info, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); - rv |= sysfs_set_num(info, NULL, "suspend_hi", 0); - rv |= sysfs_set_num(info, NULL, "suspend_lo", 0); - signal(SIGQUIT, sig[2]); - signal(SIGINT, sig[1]); - signal(SIGTERM, sig[0]); - if(munlockall() != 0) { - err = 3; + err = unlock_all_stripes(info, sig); + if(err != 0) goto exitCheck; - } - - if(rv != 0) { - err = rv * 256; - goto exitCheck; - } for (i = 0 ; i < data_disks ; i++) { int disk = geo_map(i, start, raid_disks, level, layout); blocks[i] = stripes[disk]; + block_index_for_slot[disk] = i; 
printf("%d->%d\n", i, disk); } qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size); diskP = geo_map(-1, start, raid_disks, level, layout); + diskQ = geo_map(-2, start, raid_disks, level, layout); + blocks[data_disks] = stripes[diskP]; + block_index_for_slot[diskP] = data_disks; + blocks[data_disks+1] = stripes[diskQ]; + block_index_for_slot[diskQ] = data_disks+1; + if (memcmp(p, stripes[diskP], chunk_size) != 0) { printf("P(%d) wrong at %llu\n", diskP, start); } - diskQ = geo_map(-2, start, raid_disks, level, layout); if (memcmp(q, stripes[diskQ], chunk_size) != 0) { printf("Q(%d) wrong at %llu\n", diskQ, start); } @@ -200,6 +237,129 @@ if(disk == -65535) { printf("Error detected at %llu: disk slot unknown\n", start); } + if(repair == MANUAL_REPAIR) { + printf("Repairing stripe %llu\n", start); + printf("Assuming slots %d (%s) and %d (%s) are incorrect\n", + failed_disk1, name[failed_disk1], + failed_disk2, name[failed_disk2]); + + if (failed_disk1 == diskQ || failed_disk2 == diskQ) { + char *all_but_failed_blocks[data_disks]; + int failed_data_or_p; + int failed_block_index; + + if (failed_disk1 == diskQ) + failed_data_or_p = failed_disk2; + else + failed_data_or_p = failed_disk1; + printf("Repairing D/P(%d) and Q\n", failed_data_or_p); + failed_block_index = block_index_for_slot[failed_data_or_p]; + for (i=0; i < data_disks; i++) + if (failed_block_index == i) + all_but_failed_blocks[i] = stripes[diskP]; + else + all_but_failed_blocks[i] = blocks[i]; + xor_blocks(stripes[failed_data_or_p], + all_but_failed_blocks, data_disks, chunk_size); + qsyndrome(p, (uint8_t*)stripes[diskQ], (uint8_t**)blocks, data_disks, chunk_size); + } else { + ensure_zero_has_size(chunk_size); + if (failed_disk1 == diskP || failed_disk2 == diskP) { + int failed_data, failed_block_index; + if (failed_disk1 == diskP) + failed_data = failed_disk2; + else + failed_data = failed_disk1; + failed_block_index = block_index_for_slot[failed_data]; + printf("Repairing D(%d) and P\n", 
failed_data); + raid6_datap_recov(raid_disks, chunk_size, failed_block_index, (uint8_t**)blocks); + } else { + printf("Repairing D and D\n"); + int failed_block_index1 = block_index_for_slot[failed_disk1]; + int failed_block_index2 = block_index_for_slot[failed_disk2]; + if (failed_block_index1 > failed_block_index2) { + int t = failed_block_index1; + failed_block_index1 = failed_block_index2; + failed_block_index2 = t; + } + raid6_2data_recov(raid_disks, chunk_size, failed_block_index1, failed_block_index2, (uint8_t**)blocks); + } + } + + err = lock_stripe(info, start, chunk_size, data_disks, sig); + if(err != 0) { + if (err != 2) + unlock_all_stripes(info, sig); + goto exitCheck; + } + + int write_res1, write_res2; + off64_t seek_res; + + seek_res = lseek64(source[failed_disk1], + offsets[failed_disk1] + start * chunk_size, SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek failed for failed_disk1\n"); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } + write_res1 = write(source[failed_disk1], stripes[failed_disk1], chunk_size); + + seek_res = lseek64(source[failed_disk2], + offsets[failed_disk2] + start * chunk_size, SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek failed for failed_disk1\n"); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } + write_res2 = write(source[failed_disk2], stripes[failed_disk2], chunk_size); + + err = unlock_all_stripes(info, sig); + if(err != 0) + goto exitCheck; + + if (write_res1 != chunk_size || write_res2 != chunk_size) { + fprintf(stderr, "Failed to write a complete chunk.\n"); + goto exitCheck; + } + + } else if (disk >= 0 && repair == AUTO_REPAIR) { + printf("Auto-repairing slot %d (%s)\n", disk, name[disk]); + if (disk == diskQ) { + qsyndrome(p, (uint8_t*)stripes[diskQ], (uint8_t**)blocks, data_disks, chunk_size); + } else { + char *all_but_failed_blocks[data_disks]; + int failed_block_index = block_index_for_slot[disk]; + for (i=0; i < data_disks; i++) + if 
(failed_block_index == i) + all_but_failed_blocks[i] = stripes[diskP]; + else + all_but_failed_blocks[i] = blocks[i]; + xor_blocks(stripes[disk], + all_but_failed_blocks, data_disks, chunk_size); + } + + err = lock_stripe(info, start, chunk_size, data_disks, sig); + if(err != 0) { + if (err != 2) + unlock_all_stripes(info, sig); + goto exitCheck; + } + + lseek64(source[disk], offsets[disk] + start * chunk_size, 0); + int write_res = write(source[disk], stripes[disk], chunk_size); + + err = unlock_all_stripes(info, sig); + if(err != 0 || write_res != chunk_size) + goto exitCheck; + + if (write_res != chunk_size) { + fprintf(stderr, "Failed to write a full chunk.\n"); + goto exitCheck; + } + } length--; start++; @@ -210,9 +370,11 @@ free(stripe_buf); free(stripes); free(blocks); + free(block_index_for_slot); free(p); free(q); free(results); + free(sig); return err; } @@ -240,6 +402,9 @@ int chunk_size = 0; int layout = -1; int level = 6; + enum repair repair = NO_REPAIR; + int failed_disk1 = -1; + int failed_disk2 = -1; unsigned long long start, length; int i; int mdfd; @@ -255,7 +420,8 @@ prg++; if (argc < 4) { - fprintf(stderr, "Usage: %s md_device start_stripe length_stripes\n", prg); + fprintf(stderr, "Usage: %s md_device start_stripe length_stripes [autorepair]\n", prg); + fprintf(stderr, " or: %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg); exit_err = 1; goto exitHere; } @@ -263,12 +429,12 @@ mdfd = open(argv[1], O_RDONLY); if(mdfd < 0) { perror(argv[1]); - fprintf(stderr,"%s: cannot open %s\n", prg, argv[1]); + fprintf(stderr, "%s: cannot open %s\n", prg, argv[1]); exit_err = 2; goto exitHere; } - info = sysfs_read(mdfd, -1, + info = sysfs_read(mdfd, NULL, GET_LEVEL| GET_LAYOUT| GET_DISKS| @@ -321,8 +487,40 @@ raid_disks = info->array.raid_disks; chunk_size = info->array.chunk_size; layout = info->array.layout; - start = getnum(argv[2], &err); - length = getnum(argv[3], &err); + if (strcmp(argv[2], "repair")==0) { + if (argc < 6) { + 
fprintf(stderr, "For repair mode, call %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg); + exit_err = 1; + goto exitHere; + } + repair = MANUAL_REPAIR; + start = getnum(argv[3], &err); + length = 1; + failed_disk1 = getnum(argv[4], &err); + failed_disk2 = getnum(argv[5], &err); + + if(failed_disk1 >= info->array.raid_disks) { + fprintf(stderr, "%s: failed_slot_1 index is higher than number of devices in raid\n", prg); + exit_err = 4; + goto exitHere; + } + if(failed_disk2 >= info->array.raid_disks) { + fprintf(stderr, "%s: failed_slot_2 index is higher than number of devices in raid\n", prg); + exit_err = 4; + goto exitHere; + } + if(failed_disk1 == failed_disk2) { + fprintf(stderr, "%s: failed_slot_1 and failed_slot_2 are the same\n", prg); + exit_err = 4; + goto exitHere; + } + } + else { + start = getnum(argv[2], &err); + length = getnum(argv[3], &err); + if (argc >= 5 && strcmp(argv[4], "autorepair")==0) + repair = AUTO_REPAIR; + } if (err) { fprintf(stderr, "%s: Bad number: %s\n", prg, err); @@ -340,21 +538,11 @@ length = (info->component_size * 512) / chunk_size - start; } - disk_name = malloc(raid_disks * sizeof(*disk_name)); - fds = malloc(raid_disks * sizeof(*fds)); - offsets = malloc(raid_disks * sizeof(*offsets)); - buf = malloc(raid_disks * chunk_size); - - if((disk_name == NULL) || - (fds == NULL) || - (offsets == NULL) || - (buf == NULL)) { - fprintf(stderr, "%s: allocation fail\n", prg); - exit_err = 5; - goto exitHere; - } + disk_name = xmalloc(raid_disks * sizeof(*disk_name)); + fds = xmalloc(raid_disks * sizeof(*fds)); + offsets = xcalloc(raid_disks, sizeof(*offsets)); + buf = xmalloc(raid_disks * chunk_size); - memset(offsets, 0, raid_disks * sizeof(*offsets)); for(i=0; i + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -24,7 +24,13 @@ #include "mdadm.h" -char Version[] = Name " - v3.2.5 - 18th May 2012\n"; +#ifndef VERSION +#define VERSION "3.3" +#endif +#ifndef 
VERS_DATE +#define VERS_DATE "3rd September 2013" +#endif +char Version[] = Name " - v" VERSION " - " VERS_DATE "\n"; /* * File: ReadMe.c @@ -32,37 +38,16 @@ * This file contains general comments about the implementation * and the various usage messages that can be displayed by mdadm * - * mdadm is a single program that can be used to control Linux md devices. - * It is intended to provide all the functionality of the mdtools and - * raidtools but with a very different interface. - * mdadm can perform all functions without a configuration file. - * There is the option of using a configuration file, but not in the same - * way that raidtools uses one - * raidtools uses a configuration file to describe how to create a RAID - * array, and also uses this file partially to start a previously - * created RAID array. Further, raidtools requires the configuration - * file for such things as stopping a raid array which needs to know - * nothing about the array. - * - * The configuration file that can be used by mdadm lists two - * different things: - * 1/ a mapping from uuid to md device to identify which arrays are - * expect and what names (numbers) they should be given - * 2/ a list of devices that should be scanned for md sub-devices - * - * */ /* * mdadm has 7 major modes of operation: * 1/ Create * This mode is used to create a new array with a superblock - * It can progress in several step create-add-add-run - * or it can all happen with one command * 2/ Assemble * This mode is used to assemble the parts of a previously created * array into an active array. Components can be explicitly given - * or can be searched for. mdadm (optionally) check that the components + * or can be searched for. mdadm (optionally) checks that the components * do form a bona-fide array, and can, on request, fiddle superblock * version numbers so as to assemble a faulty array. 
* 3/ Build @@ -83,14 +68,18 @@ * 7/ Grow * This mode allows for changing of key attributes of a raid array, such * as size, number of devices, and possibly even layout. - * At the time if writing, there is only minimal support. + * 8/ Incremental + * Is assembles an array incrementally instead of all at once. + * As devices are discovered they can be passed to "mdadm --incremental" + * which will collect them. When enough devices to for an array are + * found, it is started. */ char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; char short_bitmap_options[]= - "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; char short_bitmap_auto_options[]= - "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:"; + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:"; struct option long_options[] = { {"manage", 0, 0, ManageOpt}, @@ -103,7 +92,7 @@ {"follow", 0, 0, 'F'}, {"grow", 0, 0, 'G'}, {"incremental",0,0, 'I'}, - {"zero-superblock", 0, 0, 'K'}, /* deliberately no a short_option */ + {"zero-superblock", 0, 0, KillOpt}, /* deliberately not a short_option */ {"query", 0, 0, 'Q'}, {"examine-bitmap", 0, 0, 'X'}, {"auto-detect", 0, 0, AutoDetect}, @@ -112,11 +101,16 @@ {"update-subarray", 1, 0, UpdateSubarray}, {"udev-rules", 2, 0, UdevRules}, {"offroot", 0, 0, OffRootOpt}, + {"examine-badblocks", 0, 0, ExamineBB}, + + {"dump", 1, 0, Dump}, + {"restore", 1, 0, Restore}, /* synonyms */ {"monitor", 0, 0, 'F'}, /* after those will normally come the name of the md device */ + {"help", 0, 0, 'h'}, {"help-options",0,0, HelpOptions}, {"version", 0, 0, 'V'}, @@ -145,6 +139,7 @@ {"re-add", 0, 0, ReAdd}, {"homehost", 1, 0, HomeHost}, {"symlinks", 1, 0, Symlinks}, + {"data-offset",1, 0, DataOffset}, /* For assemble */ {"uuid", 1, 0, 'u'}, @@ -161,6 +156,8 @@ {"remove", 0, 0, Remove}, {"fail", 0, 0, Fail}, {"set-faulty",0, 0, Fail}, + {"replace", 0, 0, Replace}, 
+ {"with", 0, 0, With}, {"run", 0, 0, 'R'}, {"stop", 0, 0, 'S'}, {"readonly", 0, 0, 'o'}, @@ -254,8 +251,8 @@ " --verbose -v : Be more verbose about what is happening\n" " --quiet -q : Don't print un-necessary messages\n" " --brief -b : Be less verbose, more brief\n" -" --export -Y : With --detail, use key=value format for easy\n" -" import into environment\n" +" --export -Y : With --detail, --detail-platform or --examine use\n" +" key=value format for easy import into environment\n" " --force -f : Override normal checks and be more forceful\n" "\n" " --assemble -A : Assemble an array\n" @@ -264,16 +261,13 @@ " --detail -D : Display details of an array\n" " --examine -E : Examine superblock on an array component\n" " --examine-bitmap -X: Display the detail of a bitmap file\n" +" --examine-badblocks: Display list of known bad blocks on device\n" " --monitor -F : monitor (follow) some arrays\n" " --grow -G : resize/ reshape and array\n" " --incremental -I : add/remove a single device to/from an array as appropriate\n" " --query -Q : Display general information about how a\n" " device relates to the md driver\n" " --auto-detect : Start arrays auto-detected by the kernel\n" -" --offroot : Set first character of argv[0] to @ to indicate the\n" -" application was launched from initrd/initramfs and\n" -" should not be shutdown by systemd as part of the\n" -" regular shutdown process.\n" ; /* "\n" @@ -281,15 +275,16 @@ " --bitmap= -b : File to store bitmap in - may pre-exist for --build\n" " --chunk= -c : chunk size of kibibytes\n" " --rounding= : rounding factor for linear array (==chunk size)\n" -" --level= -l : raid level: 0,1,4,5,6,linear,mp. 
0 or linear for build\n" +" --level= -l : raid level: 0,1,4,5,6,10,linear, or mp for create.\n" +" : 0,1,10,mp,faulty or linear for build.\n" " --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" -" --layout= : same as --parity\n" +" --layout= : same as --parity, for RAID10: [fno]NN \n" " --raid-devices= -n : number of active devices in array\n" -" --spare-devices= -x: number of spares (eXtras) devices in initial array\n" +" --spare-devices= -x: number of spare (eXtra) devices in initial array\n" " --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" " --force -f : Honour devices as listed on command line. Don't\n" " : insert a missing drive for RAID5.\n" -" --assume-clean : Assume the array is already in-sync. This is dangerous.\n" +" --assume-clean : Assume the array is already in-sync. This is dangerous for RAID5.\n" " --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n" " --delay= -d : seconds between bitmap updates\n" " --write-behind= : number of simultaneous write-behind requests to allow (requires bitmap)\n" @@ -319,9 +314,11 @@ "\n" " General management:\n" " --add -a : add, or hotadd subsequent devices\n" +" --re-add : re-add a recently removed device\n" " --remove -r : remove subsequent devices\n" -" --fail -f : mark subsequent devices a faulty\n" +" --fail -f : mark subsequent devices as faulty\n" " --set-faulty : same as --fail\n" +" --replace : mark a device for replacement\n" " --run -R : start a partially built array\n" " --stop -S : deactivate array, releasing all resources\n" " --readonly -o : mark array as readonly\n" @@ -354,14 +351,17 @@ "\n" " Options that are valid with --create (-C) are:\n" " --bitmap= : Create a bitmap for the array with the given filename\n" -" --chunk= -c : chunk size of kibibytes\n" +" : or an internal bitmap is 'internal' is given\n" +" --chunk= -c : chunk size in kibibytes\n" " --rounding= : rounding factor for linear array (==chunk size)\n" -" --level= -l : raid 
level: 0,1,4,5,6,linear,multipath and synonyms\n" +" --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n" " --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" -" --layout= : same as --parity\n" +" --layout= : same as --parity, for RAID10: [fno]NN \n" " --raid-devices= -n : number of active devices in array\n" -" --spare-devices= -x: number of spares (eXtras) devices in initial array\n" +" --spare-devices= -x: number of spare (eXtra) devices in initial array\n" " --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" +" --data-offset= : Space to leave between start of device and start\n" +" : of array data.\n" " --force -f : Honour devices as listed on command line. Don't\n" " : insert a missing drive for RAID5.\n" " --run -R : insist of running the array even if not all\n" @@ -382,13 +382,13 @@ " assembling the array, except that hopefully there is useful data\n" " there in the second case.\n" "\n" -" The level may only be 0, raid0, or linear.\n" +" The level may only be 0, 1, 10, linear, multipath, or faulty.\n" " All devices must be listed and the array will be started once complete.\n" " Options that are valid with --build (-B) are:\n" " --bitmap= : file to store/find bitmap information in.\n" " --chunk= -c : chunk size of kibibytes\n" " --rounding= : rounding factor for linear array (==chunk size)\n" -" --level= -l : 0, raid0, or linear\n" +" --level= -l : 0, 1, 10, linear, multipath, faulty\n" " --raid-devices= -n : number of active devices in array\n" " --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" " --delay= -d : bitmap update delay in seconds.\n" @@ -404,9 +404,10 @@ "the array, and a number of sub devices. These can be found in a number\n" "of ways.\n" "\n" -"The md device is either given on the command line or is found listed\n" -"in the config file. 
The array identity is determined either from the\n" -"--uuid or --super-minor commandline arguments, from the config file,\n" +"The md device is given on the command line, is found listed in the\n" +"config file, or can be deduced from the array identity.\n" +"The array identity is determined either from the --uuid, --name, or\n" +"--super-minor commandline arguments, from the config file,\n" "or from the first component device on the command line.\n" "\n" "The different combinations of these are as follows:\n" @@ -421,6 +422,9 @@ " If the --scan option is given, and no devices are listed, then\n" " every array listed in the config file is considered for assembly.\n" " The identity of candidate devices are determined from the config file.\n" +" After these arrays are assembled, mdadm will look for other devices\n" +" that could form further arrays and tries to assemble them. This can\n" +" be disabled using the 'AUTO' option in the config file.\n" "\n" " If the --scan option is given as well as one or more devices, then\n" " Those devices are md devices that are to be assembled. Their identity\n" @@ -435,7 +439,7 @@ " is only newly missing devices that cause the array not to be started.\n" "\n" "Options that are valid with --assemble (-A) are:\n" -" --bitmap= : bitmap file to use wit the array\n" +" --bitmap= : bitmap file to use with the array\n" " --uuid= -u : uuid of array to assemble. Devices which don't\n" " have this uuid are excluded\n" " --super-minor= -m : minor number to look for in super-block when\n" @@ -449,6 +453,7 @@ " : out-of-date. This involves modifying the superblocks.\n" " --update= -U : Update superblock: try '-A --update=?' for option list.\n" " --no-degraded : Assemble but do not start degraded arrays.\n" +" --readonly -o : Mark the array as read-only. 
No resync will start.\n" ; char Help_manage[] = @@ -457,14 +462,20 @@ "This usage is for managing the component devices within an array.\n" "The --manage option is not needed and is assumed if the first argument\n" "is a device name or a management option.\n" -"The first device listed will be taken to be an md array device, and\n" +"The first device listed will be taken to be an md array device, any\n" "subsequent devices are (potential) components of that array.\n" "\n" "Options that are valid with management mode are:\n" " --add -a : hotadd subsequent devices to the array\n" +" --re-add : subsequent devices are re-added if there were\n" +" : recent members of the array\n" " --remove -r : remove subsequent devices, which must not be active\n" " --fail -f : mark subsequent devices a faulty\n" " --set-faulty : same as --fail\n" +" --replace : mark device(s) to be replaced by spares. Once\n" +" : replacement completes, device will be marked faulty\n" +" --with : Indicate which spare a previous '--replace' should\n" +" : prefer to use\n" " --run -R : start a partially built array\n" " --stop -S : deactivate array, releasing all resources\n" " --readonly -o : mark array as readonly\n" @@ -486,6 +497,7 @@ " --detail-platform : Display hardware/firmware details\n" " --examine -E : Examine superblock on an array component\n" " --examine-bitmap -X: Display contents of a bitmap file\n" +" --examine-badblocks: Display list of known bad blocks on device\n" " --zero-superblock : erase the MD superblock from a device.\n" " --run -R : start a partially built array\n" " --stop -S : deactivate array, releasing all resources\n" @@ -504,8 +516,8 @@ "all devices listed in the config file are monitored.\n" "The address for mailing advisories to, and the program to handle\n" "each change can be specified in the config file or on the command line.\n" -"If no mail address or program are specified, then mdadm reports all\n" -"state changes to stdout.\n" +"There must be at least one 
destination for advisories, whether\n" +"an email address, a program, or --syslog\n" "\n" "Options that are valid with the monitor (-F --follow) mode are:\n" " --mail= -m : Address to mail alerts of failure to\n" @@ -527,26 +539,31 @@ "\n" "This usage causes mdadm to attempt to reconfigure a running array.\n" "This is only possibly if the kernel being used supports a particular\n" -"reconfiguration. This version supports changing the number of\n" -"devices in a RAID1/5/6, changing the active size of all devices in\n" -"a RAID1/4/5/6, adding or removing a write-intent bitmap, and changing\n" -"the error mode for a 'FAULTY' array.\n" +"reconfiguration.\n" "\n" "Options that are valid with the grow (-G --grow) mode are:\n" -" --level= -l : Tell mdadm what level the array is so that it can\n" -" : interpret '--layout' properly.\n" +" --level= -l : Tell mdadm what level to convert the array to.\n" " --layout= -p : For a FAULTY array, set/change the error mode.\n" +" : for other arrays, update the layout\n" " --size= -z : Change the active size of devices in an array.\n" " : This is useful if all devices have been replaced\n" " : with larger devices. Value is in Kilobytes, or\n" " : the special word 'max' meaning 'as large as possible'.\n" +" --assume-clean : When increasing the --size, this flag will avoid\n" +" : a resync of the new space\n" +" --chunk= -c : Change the chunksize of the array\n" " --raid-devices= -n : Change the number of active devices in an array.\n" +" --add= -a : Add listed devices as part of reshape. This is\n" +" : needed for resizing a RAID0 which cannot have\n" +" : spares already present.\n" " --bitmap= -b : Add or remove a write-intent bitmap.\n" -" --backup-file= file : A file on a differt device to store data for a\n" +" --backup-file= file : A file on a different device to store data for a\n" " : short time while increasing raid-devices on a\n" -" : RAID4/5/6 array. Not needed when a spare is present.\n" +" : RAID4/5/6 array. 
Also needed throughout a reshape\n" +" : when changing parameters other than raid-devices\n" " --array-size= -Z : Change visible size of array. This does not change\n" " : any data on the device, and is not stable across restarts.\n" +" --data-offset= : Location on device to move start of data to.\n" ; char Help_incr[] = @@ -568,18 +585,17 @@ " : partial arrays.\n" " --scan -s : Use with -R to start any arrays that have the minimal\n" " : required number of devices, but are not yet started.\n" -" --fail -f : First fail (if needed) and then remove device from\n" -" : any array that it is a member of.\n" +" --fail -f : First fail (if needed) and then remove device from\n" +" : any array that it is a member of.\n" ; char Help_config[] = "The /etc/mdadm.conf config file:\n\n" " The config file contains, apart from blank lines and comment lines that\n" -" start with a hash(#), four sorts of configuration lines: array lines, \n" -" device lines, mailaddr lines and program lines.\n" -" Each configuration line is constructed of a number of space separated\n" -" words, and can be continued on subsequent physical lines by indenting\n" -" those lines.\n" +" start with a hash(#), array lines, device lines, and various\n" +" configuration lines.\n" +" Each line is constructed of a number of space separated words, and can\n" +" be continued on subsequent physical lines by indenting those lines.\n" "\n" " A device line starts with the word 'device' and then has a number of words\n" " which identify devices. These words should be names of devices in the\n" @@ -596,13 +612,22 @@ " containing wildcards, preceded by 'devices='. If multiple critea are given,\n" " than a device must match all of them to be considered.\n" "\n" -" A mailaddr line starts with the word 'mailaddr' and should contain exactly\n" -" one Email address. 'mdadm --monitor --scan' will send alerts of failed drives\n" -" to this Email address." 
-"\n" -" A program line starts with the word 'program' and should contain exactly\n" -" one program name. 'mdadm --monitor --scan' will run this program when any\n" -" event is detected.\n" +" Other configuration lines include:\n" +" mailaddr, mailfrom, program used for --monitor mode\n" +" create, auto used when creating device names in /dev\n" +" homehost, policy, part-policy used to guide policy in various\n" +" situations\n" "\n" ; +char *mode_help[mode_count] = { + [0] = Help, + [ASSEMBLE] = Help_assemble, + [BUILD] = Help_build, + [CREATE] = Help_create, + [MANAGE] = Help_manage, + [MISC] = Help_misc, + [MONITOR] = Help_monitor, + [GROW] = Help_grow, + [INCREMENTAL] = Help_incr, +}; diff -Nru mdadm-3.2.5/restripe.c mdadm-3.3/restripe.c --- mdadm-3.2.5/restripe.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/restripe.c 2013-09-03 04:47:47.000000000 +0000 @@ -83,7 +83,6 @@ case 500 + ALGORITHM_PARITY_0: return block + 1; - case 600 + ALGORITHM_PARITY_N_6: if (block == -2) return raid_disks - 1; @@ -131,7 +130,6 @@ return raid_disks - 1; return block + 1; - case 600 + ALGORITHM_PARITY_0: if (block == -1) return 0; @@ -173,7 +171,6 @@ if (block == -2) return (pd+1) % raid_disks; return (pd + 2 + block) % raid_disks; - case 600 + ALGORITHM_ROTATING_N_RESTART: /* Same a left_asymmetric, by first stripe is * D D D P Q rather than @@ -210,8 +207,7 @@ } } - -static void xor_blocks(char *target, char **sources, int disks, int size) +void xor_blocks(char *target, char **sources, int disks, int size) { int i, j; /* Amazingly inefficient... 
*/ @@ -243,7 +239,6 @@ } } - /* * The following was taken from linux/drivers/md/mktables.c, and modified * to create in-memory tables rather than C code @@ -335,6 +330,17 @@ uint8_t *zero; int zero_size; + +void ensure_zero_has_size(int chunk_size) +{ + if (zero == NULL || chunk_size > zero_size) { + if (zero) + free(zero); + zero = xcalloc(1, chunk_size); + zero_size = chunk_size; + } +} + /* Following was taken from linux/drivers/md/raid6recov.c */ /* Recover two failed data blocks. */ @@ -425,11 +431,9 @@ if((Px != 0) && (Qx == 0)) curr_broken_disk = diskP; - if((Px == 0) && (Qx != 0)) curr_broken_disk = diskQ; - if((Px != 0) && (Qx != 0)) { data_id = (raid6_gflog[Qx] - raid6_gflog[Px]); if(data_id < 0) data_id += 255; @@ -510,15 +514,7 @@ if (!tables_ready) make_tables(); - - if (zero == NULL || chunk_size > zero_size) { - if (zero) - free(zero); - zero = malloc(chunk_size); - if (zero) - memset(zero, 0, chunk_size); - zero_size = chunk_size; - } + ensure_zero_has_size(chunk_size); len = data_disks * chunk_size; length_test = length / len; @@ -684,8 +680,8 @@ char *src_buf) { char *stripe_buf; - char **stripes = malloc(raid_disks * sizeof(char*)); - char **blocks = malloc(raid_disks * sizeof(char*)); + char **stripes = xmalloc(raid_disks * sizeof(char*)); + char **blocks = xmalloc(raid_disks * sizeof(char*)); int i; int rv; @@ -697,9 +693,7 @@ if (zero == NULL || chunk_size > zero_size) { if (zero) free(zero); - zero = malloc(chunk_size); - if (zero) - memset(zero, 0, chunk_size); + zero = xcalloc(1, chunk_size); zero_size = chunk_size; } @@ -779,7 +773,7 @@ syndrome_disks = data_disks; } qsyndrome((uint8_t*)stripes[disk], - (uint8_t*)stripes[qdisk], + (uint8_t*)stripes[qdisk], (uint8_t**)blocks, syndrome_disks, chunk_size); break; @@ -816,11 +810,11 @@ unsigned long long start, unsigned long long length) { /* ready the data and p (and q) blocks, and check we got them right */ - char *stripe_buf = malloc(raid_disks * chunk_size); - char **stripes = 
malloc(raid_disks * sizeof(char*)); - char **blocks = malloc(raid_disks * sizeof(char*)); - char *p = malloc(chunk_size); - char *q = malloc(chunk_size); + char *stripe_buf = xmalloc(raid_disks * chunk_size); + char **stripes = xmalloc(raid_disks * sizeof(char*)); + char **blocks = xmalloc(raid_disks * sizeof(char*)); + char *p = xmalloc(chunk_size); + char *q = xmalloc(chunk_size); int i; int diskP, diskQ; @@ -935,9 +929,8 @@ raid_disks, argc-9); exit(2); } - fds = malloc(raid_disks * sizeof(*fds)); - offsets = malloc(raid_disks * sizeof(*offsets)); - memset(offsets, 0, raid_disks * sizeof(*offsets)); + fds = xmalloc(raid_disks * sizeof(*fds)); + offsets = xcalloc(raid_disks, sizeof(*offsets)); storefd = open(file, O_RDWR); if (storefd < 0) { @@ -953,7 +946,7 @@ *p++ = '\0'; offsets[i] = atoll(p) * 512; } - + fds[i] = open(argv[9+i], O_RDWR); if (fds[i] < 0) { perror(argv[9+i]); @@ -962,7 +955,7 @@ } } - buf = malloc(raid_disks * chunk_size); + buf = xmalloc(raid_disks * chunk_size); if (save == 1) { int rv = save_stripes(fds, offsets, diff -Nru mdadm-3.2.5/sg_io.c mdadm-3.3/sg_io.c --- mdadm-3.2.5/sg_io.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/sg_io.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * Copyright (C) 2007-2008 Intel Corporation * - * Retrieve drive serial numbers for scsi disks + * Retrieve drive serial numbers for scsi disks * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, diff -Nru mdadm-3.2.5/sha1.c mdadm-3.3/sha1.c --- mdadm-3.2.5/sha1.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/sha1.c 2013-09-03 04:47:47.000000000 +0000 @@ -50,7 +50,6 @@ 64-byte boundary. (RFC 1321, 3.1: Step 1) */ static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ... */ }; - /* Take a pointer to a 160 bit block of data (five 32 bit ints) and initialize it to the start constants of the SHA1 algorithm. 
This must be called before using hash in the call to sha1_hash. */ diff -Nru mdadm-3.2.5/sha1.h mdadm-3.3/sha1.h --- mdadm-3.2.5/sha1.h 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/sha1.h 2013-09-03 04:47:47.000000000 +0000 @@ -1,11 +1,11 @@ /* Declarations of functions and data types used for SHA1 sum library functions. - Copyright (C) 2000, 2001, 2003, 2005, 2006, 2008, 2010 + Copyright (C) 2000, 2001, 2003, 2005, 2006, 2008 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the - Free Software Foundation; either version 3, or (at your option) any + Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, @@ -26,8 +26,6 @@ # include #endif -#include "ansidecl.h" - /* The following contortions are an attempt to use the C preprocessor to determine an unsigned integral type that is 32 bits wide. An alternative approach is to use autoconf's AC_CHECK_SIZEOF macro, but @@ -86,7 +84,6 @@ sha1_uint32 buffer[32]; }; - /* Initialize structure containing state of computation. */ extern void sha1_init_ctx (struct sha1_ctx *ctx); @@ -113,7 +110,6 @@ aligned for a 32 bits value. */ extern void *sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf); - /* Put result from CTX in first 20 bytes following RESBUF. The result is always in little endian byte order, so that a byte-wise output yields to the wanted ASCII representation of the message digest. @@ -122,7 +118,6 @@ aligned for a 32 bits value. */ extern void *sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf); - /* Compute SHA1 message digest for bytes read from STREAM. The resulting message digest number will be written into the 20 bytes beginning at RESBLOCK. 
*/ diff -Nru mdadm-3.2.5/super0.c mdadm-3.3/super0.c --- mdadm-3.2.5/super0.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/super0.c 2013-09-03 04:47:47.000000000 +0000 @@ -36,7 +36,6 @@ * .. other stuff */ - static unsigned long calc_sb0_csum(mdp_super_t *super) { unsigned long csum = super->sb_csum; @@ -47,7 +46,6 @@ return newcsum; } - static void super0_swap_endian(struct mdp_superblock_s *sb) { /* as super0 superblocks are host-endian, it is sometimes @@ -281,6 +279,51 @@ + sb->events_lo); } +static int copy_metadata0(struct supertype *st, int from, int to) +{ + /* Read 64K from the appropriate offset of 'from' + * and if it looks a little like a 0.90 superblock, + * write it to the same offset of 'to' + */ + void *buf; + unsigned long long dsize, offset; + const int bufsize = 64*1024; + mdp_super_t *super; + + if (posix_memalign(&buf, 4096, bufsize) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (dsize < MD_RESERVED_SECTORS*512) + goto err; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(from, offset, 0) < 0LL) + goto err; + if (read(from, buf, bufsize) != bufsize) + goto err; + + if (lseek64(to, offset, 0) < 0LL) + goto err; + super = buf; + if (super->md_magic != MD_SB_MAGIC || + super->major_version != 0 || + calc_sb0_csum(super) != super->sb_csum) + goto err; + if (write(to, buf, bufsize) != bufsize) + goto err; + free(buf); + return 0; +err: + free(buf); + return 1; +} + static void detail_super0(struct supertype *st, char *homehost) { mdp_super_t *sb = st->sb; @@ -418,7 +461,7 @@ if (subarray) return NULL; - info = malloc(sizeof(*info)); + info = xmalloc(sizeof(*info)); getinfo_super0(st, info, NULL); return info; } @@ -435,6 +478,18 @@ int rv = 0; int uuid[4]; mdp_super_t *sb = st->sb; + + if (strcmp(update, "homehost") == 0 && + homehost) { + /* note that 'homehost' is special as it is really + * a "uuid" update. 
+ */ + uuid_set = 0; + update = "uuid"; + info->uuid[0] = sb->set_uuid0; + info->uuid[1] = sb->set_uuid1; + } + if (strcmp(update, "sparc2.2")==0 ) { /* 2.2 sparc put the events in the wrong place * So we copy the tail of the superblock @@ -445,12 +500,12 @@ sb32+MD_SB_GENERIC_CONSTANT_WORDS+7+1, (MD_SB_WORDS - (MD_SB_GENERIC_CONSTANT_WORDS+7+1))*4); if (verbose >= 0) - fprintf (stderr, Name ": adjusting superblock of %s for 2.2/sparc compatability.\n", - devname); + pr_err("adjusting superblock of %s for 2.2/sparc compatibility.\n", + devname); } else if (strcmp(update, "super-minor") ==0) { sb->md_minor = info->array.md_minor; if (verbose > 0) - fprintf(stderr, Name ": updating superblock of %s with minor number %d\n", + pr_err("updating superblock of %s with minor number %d\n", devname, info->array.md_minor); } else if (strcmp(update, "summaries") == 0) { unsigned int i; @@ -551,12 +606,6 @@ /* make sure resync happens */ sb->state &= ~(1<recovery_cp = 0; - } else if (strcmp(update, "homehost") == 0 && - homehost) { - uuid_set = 0; - update = "uuid"; - info->uuid[0] = sb->set_uuid0; - info->uuid[1] = sb->set_uuid1; } else if (strcmp(update, "uuid") == 0) { if (!uuid_set && homehost) { char buf[20]; @@ -575,6 +624,58 @@ uuid_from_super0(st, uuid); memcpy(bm->uuid, uuid, 16); } + } else if (strcmp(update, "metadata") == 0) { + /* Create some v1.0 metadata to match ours but make the + * ctime bigger. Also update info->array.*_version. + * We need to arrange that store_super writes out + * the v1.0 metadata. + * Not permitted for unclean array, or array with + * bitmap. 
+ */ + if (info->bitmap_offset) { + pr_err("Cannot update metadata when bitmap is present\n"); + rv = -2; + } else if (info->array.state != 1) { + pr_err("Cannot update metadata on unclean array\n"); + rv = -2; + } else { + info->array.major_version = 1; + info->array.minor_version = 0; + uuid_from_super0(st, info->uuid); + st->other = super1_make_v0(st, info, st->sb); + } + } else if (strcmp(update, "revert-reshape") == 0) { + rv = -2; + if (sb->minor_version <= 90) + pr_err("No active reshape to revert on %s\n", + devname); + else if (sb->delta_disks == 0) + pr_err("%s: Can only revert reshape which changes number of devices\n", + devname); + else { + int tmp; + int parity = sb->level == 6 ? 2 : 1; + rv = 0; + + if (sb->level >= 4 && sb->level <= 6 && + sb->reshape_position % ( + sb->new_chunk/512 * + (sb->raid_disks - sb->delta_disks - parity))) { + pr_err("Reshape position is not suitably aligned.\n"); + pr_err("Try normal assembly and stop again\n"); + return -2; + } + sb->raid_disks -= sb->delta_disks; + sb->delta_disks = -sb->delta_disks; + + tmp = sb->new_layout; + sb->new_layout = sb->layout; + sb->layout = tmp; + + tmp = sb->new_chunk; + sb->new_chunk = sb->chunk_size; + sb->chunk_size = tmp; + } } else if (strcmp(update, "no-bitmap") == 0) { sb->state &= ~(1<working_disks - info->active_disks; if (info->raid_disks + spares > MD_SB_DISKS) { - fprintf(stderr, Name ": too many devices requested: %d+%d > %d\n", + pr_err("too many devices requested: %d+%d > %d\n", info->raid_disks , spares, MD_SB_DISKS); return 0; } @@ -633,9 +738,9 @@ sb->gvalid_words = 0; /* ignored */ sb->ctime = time(0); sb->level = info->level; - if (size != (unsigned long long)info->size) + sb->size = size; + if (size != (unsigned long long)sb->size) return 0; - sb->size = info->size; sb->nr_disks = info->nr_disks; sb->raid_disks = info->raid_disks; sb->md_minor = info->md_minor; @@ -657,7 +762,7 @@ if (rfd >= 0) close(rfd); } - if (homehost) { + if (homehost && !uuid) { char buf[20]; 
char *hash = sha1_buffer(homehost, strlen(homehost), @@ -690,7 +795,7 @@ #ifndef MDASSEMBLE /* Add a device to the superblock being created */ static int add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo, - int fd, char *devname) + int fd, char *devname, unsigned long long data_offset) { mdp_super_t *sb = st->sb; mdp_disk_t *dk = &sb->disks[dinfo->number]; @@ -710,7 +815,7 @@ dip = (struct devinfo **)&st->info; while (*dip) dip = &(*dip)->next; - di = malloc(sizeof(struct devinfo)); + di = xmalloc(sizeof(struct devinfo)); di->fd = fd; di->devname = devname; di->disk = *dinfo; @@ -733,6 +838,24 @@ if (dsize < MD_RESERVED_SECTORS*512) return 2; + if (st->other) { + /* Writing out v1.0 metadata for --update=metadata */ + int ret = 0; + + offset = dsize/512 - 8*2; + offset &= ~(4*2-1); + offset *= 512; + if (lseek64(fd, offset, 0)< 0LL) + ret = 3; + else if (write(fd, st->other, 1024) != 1024) + ret = 4; + else + fsync(fd); + free(st->other); + st->other = NULL; + return ret; + } + offset = MD_NEW_SIZE_SECTORS(dsize>>9); offset *= 512; @@ -746,7 +869,7 @@ if (super->state & (1<magic) == BITMAP_MAGIC) - if (write(fd, bm, ROUND_UP(sizeof(*bm),4096)) != + if (write(fd, bm, ROUND_UP(sizeof(*bm),4096)) != ROUND_UP(sizeof(*bm),4096)) return 5; } @@ -768,7 +891,7 @@ continue; if (di->fd == -1) continue; - while (Kill(di->devname, NULL, 0, 1, 1) == 0) + while (Kill(di->devname, NULL, 0, -1, 1) == 0) ; sb->disks[di->disk.number].state &= ~(1<ss->write_bitmap(st, di->fd); if (rv) - fprintf(stderr, - Name ": failed to write superblock to %s\n", - di->devname); + pr_err("failed to write superblock to %s\n", + di->devname); } return rv; } @@ -806,10 +928,9 @@ return 1; if (!first) { if (posix_memalign((void**)&first, 4096, - MD_SB_BYTES + + MD_SB_BYTES + ROUND_UP(sizeof(struct bitmap_super_s), 4096)) != 0) { - fprintf(stderr, Name - ": %s could not allocate superblock\n", __func__); + pr_err("%s could not allocate superblock\n", __func__); return 1; } memcpy(first, second, 
MD_SB_BYTES + sizeof(struct bitmap_super_s)); @@ -834,7 +955,6 @@ return 0; } - static void free_super0(struct supertype *st); static int load_super0(struct supertype *st, int fd, char *devname) @@ -858,21 +978,19 @@ if (dsize < MD_RESERVED_SECTORS*512) { if (devname) - fprintf(stderr, Name - ": %s is too small for md: size is %llu sectors.\n", - devname, dsize); + pr_err("%s is too small for md: size is %llu sectors.\n", + devname, dsize); return 1; } + st->devsize = dsize; offset = MD_NEW_SIZE_SECTORS(dsize>>9); offset *= 512; - ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */ - if (lseek64(fd, offset, 0)< 0LL) { if (devname) - fprintf(stderr, Name ": Cannot seek to superblock on %s: %s\n", + pr_err("Cannot seek to superblock on %s: %s\n", devname, strerror(errno)); return 1; } @@ -880,14 +998,13 @@ if (posix_memalign((void**)&super, 4096, MD_SB_BYTES + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) { - fprintf(stderr, Name - ": %s could not allocate superblock\n", __func__); + pr_err("%s could not allocate superblock\n", __func__); return 1; } if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) { if (devname) - fprintf(stderr, Name ": Cannot read superblock on %s\n", + pr_err("Cannot read superblock on %s\n", devname); free(super); return 1; @@ -898,7 +1015,7 @@ if (super->md_magic != MD_SB_MAGIC) { if (devname) - fprintf(stderr, Name ": No super block found on %s (Expected magic %08x, got %08x)\n", + pr_err("No super block found on %s (Expected magic %08x, got %08x)\n", devname, MD_SB_MAGIC, super->md_magic); free(super); return 2; @@ -906,7 +1023,7 @@ if (super->major_version != 0) { if (devname) - fprintf(stderr, Name ": Cannot interpret superblock on %s - version is %d\n", + pr_err("Cannot interpret superblock on %s - version is %d\n", devname, super->major_version); free(super); return 2; @@ -946,11 +1063,9 @@ static struct supertype *match_metadata_desc0(char *arg) { - struct supertype *st = calloc(1, sizeof(*st)); - if (!st) - return st; + 
struct supertype *st = xcalloc(1, sizeof(*st)); - st->container_dev = NoMdDev; + st->container_devnm[0] = 0; st->ss = &super0; st->info = NULL; st->minor_version = 90; @@ -981,8 +1096,11 @@ return NULL; } -static __u64 avail_size0(struct supertype *st, __u64 devsize) +static __u64 avail_size0(struct supertype *st, __u64 devsize, + unsigned long long data_offset) { + if (data_offset != 0 && data_offset != INVALID_SECTORS) + return 0ULL; if (devsize < MD_RESERVED_SECTORS) return 0ULL; return MD_NEW_SIZE_SECTORS(devsize); @@ -1007,7 +1125,6 @@ bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MD_SB_BYTES); int uuid[4]; - min_chunk = 4096; /* sub-page chunks don't work yet.. */ bits = (size * 512) / min_chunk + 1; while (bits > max_bits) { @@ -1039,7 +1156,6 @@ return 1; } - static void locate_bitmap0(struct supertype *st, int fd) { unsigned long long dsize; @@ -1127,6 +1243,7 @@ static int validate_geometry0(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, char *subdev, unsigned long long *freesize, int verbose) { @@ -1142,22 +1259,22 @@ if (level == LEVEL_CONTAINER) { if (verbose) - fprintf(stderr, Name ": 0.90 metadata does not support containers\n"); + pr_err("0.90 metadata does not support containers\n"); return 0; } if (raiddisks > MD_SB_DISKS) { if (verbose) - fprintf(stderr, Name ": 0.90 metadata supports at most %d devices per array\n", + pr_err("0.90 metadata supports at most %d devices per array\n", MD_SB_DISKS); return 0; } if (size >= tbmax * 2ULL*1024*1024*1024) { if (verbose) - fprintf(stderr, Name ": 0.90 metadata supports at most " - "%d terabytes per device\n", tbmax); + pr_err("0.90 metadata supports at most " + "%d terabytes per device\n", tbmax); return 0; } - if (chunk && *chunk == UnSet) + if (*chunk == UnSet) *chunk = DEFAULT_CHUNK; if (!subdev) @@ -1166,7 +1283,7 @@ fd = open(subdev, O_RDONLY|O_EXCL, 0); if (fd < 0) { if (verbose) - fprintf(stderr, Name ": 
super0.90 cannot open %s: %s\n", + pr_err("super0.90 cannot open %s: %s\n", subdev, strerror(errno)); return 0; } @@ -1194,6 +1311,7 @@ .write_init_super = write_init_super0, .validate_geometry = validate_geometry0, .add_to_super = add_to_super0, + .copy_metadata = copy_metadata0, #endif .match_home = match_home0, .uuid_from_super = uuid_from_super0, diff -Nru mdadm-3.2.5/super1.c mdadm-3.3/super1.c --- mdadm-3.2.5/super1.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/super1.c 2013-09-03 04:47:47.000000000 +0000 @@ -57,8 +57,11 @@ __u64 reshape_position; /* next address in array-space for reshape */ __u32 delta_disks; /* change in number of raid_disks */ __u32 new_layout; /* new layout */ - __u32 new_chunk; /* new chunk size (bytes) */ - __u8 pad1[128-124]; /* set to 0 when written */ + __u32 new_chunk; /* new chunk size (sectors) */ + __u32 new_offset; /* signed number to add to data_offset in new + * layout. 0 == no-change. This can be + * different on each device in the array. + */ /* constant this-device information - 64 bytes */ __u64 data_offset; /* sector start of data, often 0 */ @@ -68,9 +71,14 @@ __u32 dev_number; /* permanent identifier of this device - not role in raid */ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ - __u8 devflags; /* per-device flags. Only one defined...*/ + __u8 devflags; /* per-device flags. Only one defined...*/ #define WriteMostly1 1 /* mask for writemostly flag in above */ - __u8 pad2[64-57]; /* set to 0 when writing */ + /* bad block log. If there are any bad blocks the feature flag is set. + * if offset and size are non-zero, that space is reserved and available. 
+ */ + __u8 bblog_shift; /* shift from sectors to block size for badblocklist */ + __u16 bblog_size; /* number of sectors reserved for badblocklist */ + __u32 bblog_offset; /* sector offset from superblock to bblog, signed */ /* array state information - 64 bytes */ __u64 utime; /* 40 bits second, 24 btes microseconds */ @@ -106,8 +114,24 @@ * must be honoured */ #define MD_FEATURE_RESHAPE_ACTIVE 4 - -#define MD_FEATURE_ALL (1|2|4) +#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ +#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an + * active device with same 'role'. + * 'recovery_offset' is also set. + */ +#define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number + * of devices, but is going + * backwards anyway. + */ +#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ +#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ + |MD_FEATURE_RECOVERY_OFFSET \ + |MD_FEATURE_RESHAPE_ACTIVE \ + |MD_FEATURE_BAD_BLOCKS \ + |MD_FEATURE_REPLACEMENT \ + |MD_FEATURE_RESHAPE_BACKWARDS \ + |MD_FEATURE_NEW_OFFSET \ + ) #ifndef offsetof #define offsetof(t,f) ((size_t)&(((t*)0)->f)) @@ -245,6 +269,7 @@ int l = homehost ? 
strlen(homehost) : 0; int layout; unsigned long long sb_offset; + struct mdinfo info; printf(" Magic : %08x\n", __le32_to_cpu(sb->magic)); printf(" Version : 1"); @@ -303,10 +328,23 @@ if (sb->data_offset) printf(" Data Offset : %llu sectors\n", (unsigned long long)__le64_to_cpu(sb->data_offset)); + if (sb->new_offset && + (__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) { + unsigned long long offset = __le64_to_cpu(sb->data_offset); + offset += (signed)(int32_t)__le32_to_cpu(sb->new_offset); + printf(" New Offset : %llu sectors\n", offset); + } printf(" Super Offset : %llu sectors\n", (unsigned long long)__le64_to_cpu(sb->super_offset)); if (__le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET) printf("Recovery Offset : %llu sectors\n", (unsigned long long)__le64_to_cpu(sb->recovery_offset)); + + st->ss->getinfo_super(st, &info, NULL); + if (info.space_after != 1 && + !(__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + printf(" Unused Space : before=%llu sectors, after=%llu sectors\n", + info.space_before, info.space_after); + printf(" State : %s\n", (__le64_to_cpu(sb->resync_offset)+1)? 
"active":"clean"); printf(" Device UUID : "); for (i=0; i<16; i++) { @@ -319,7 +357,7 @@ printf("Internal Bitmap : %ld sectors from superblock\n", (long)(int32_t)__le32_to_cpu(sb->bitmap_offset)); } - if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE)) { + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)) { printf(" Reshape pos'n : %llu%s\n", (unsigned long long)__le64_to_cpu(sb->reshape_position)/2, human_size(__le64_to_cpu(sb->reshape_position)<<9)); if (__le32_to_cpu(sb->delta_disks)) { @@ -363,6 +401,16 @@ atime = __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL; printf(" Update Time : %.24s\n", ctime(&atime)); + if (sb->bblog_size && sb->bblog_offset) { + printf(" Bad Block Log : %d entries available at offset %ld sectors", + __le16_to_cpu(sb->bblog_size)*512/8, + (long)(int32_t)__le32_to_cpu(sb->bblog_offset)); + if (sb->feature_map & + __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + printf(" - bad blocks present."); + printf("\n"); + } + if (calc_sb_1_csum(sb) == sb->sb_csum) printf(" Checksum : %x - correct\n", __le32_to_cpu(sb->sb_csum)); else @@ -421,6 +469,8 @@ role = 0xFFFF; if (role >= 0xFFFE) printf("spare\n"); + else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_REPLACEMENT)) + printf("Replacement device %d\n", role); else printf("Active device %d\n", role); @@ -433,9 +483,14 @@ if (role == d) cnt++; } - if (cnt > 1) printf("?"); - else if (cnt == 1) printf("A"); - else printf ("."); + if (cnt == 2) + printf("R"); + else if (cnt == 1) + printf("A"); + else if (cnt == 0) + printf("."); + else + printf("?"); } #if 0 /* This is confusing too */ @@ -447,11 +502,10 @@ } if (faulty) printf(" %d failed", faulty); #endif - printf(" ('A' == active, '.' == missing)"); + printf(" ('A' == active, '.' == missing, 'R' == replacing)"); printf("\n"); } - static void brief_examine_super1(struct supertype *st, int verbose) { struct mdp_superblock_1 *sb = st->sb; @@ -468,7 +522,12 @@ else nm = NULL; - printf("ARRAY%s%s", nm ? 
" /dev/md/":"", nm); + printf("ARRAY "); + if (nm) { + printf("/dev/md/"); + print_escape(nm); + putchar(' '); + } if (verbose && c) printf(" level=%s", c); sb_offset = __le64_to_cpu(sb->super_offset); @@ -485,8 +544,10 @@ if ((i&3)==0 && i != 0) printf(":"); printf("%02x", sb->set_uuid[i]); } - if (sb->set_name[0]) - printf(" name=%.32s", sb->set_name); + if (sb->set_name[0]) { + printf(" name="); + print_quoted(sb->set_name); + } printf("\n"); } @@ -495,6 +556,7 @@ struct mdp_superblock_1 *sb = st->sb; int i; int len = 32; + int layout; printf("MD_LEVEL=%s\n", map_num(pers, __le32_to_cpu(sb->level))); printf("MD_DEVICES=%d\n", __le32_to_cpu(sb->raid_disks)); @@ -506,6 +568,24 @@ } if (len) printf("MD_NAME=%.*s\n", len, sb->set_name); + if (__le32_to_cpu(sb->level) > 0) { + int ddsks = 0, ddsks_denom = 1; + switch(__le32_to_cpu(sb->level)) { + case 1: ddsks=1;break; + case 4: + case 5: ddsks = __le32_to_cpu(sb->raid_disks)-1; break; + case 6: ddsks = __le32_to_cpu(sb->raid_disks)-2; break; + case 10: + layout = __le32_to_cpu(sb->layout); + ddsks = __le32_to_cpu(sb->raid_disks); + ddsks_denom = (layout&255) * ((layout>>8)&255); + } + if (ddsks) { + long long asize = __le64_to_cpu(sb->size); + asize = (asize << 9) * ddsks / ddsks_denom; + printf("MD_ARRAY_SIZE=%s\n",human_size_brief(asize,JEDEC)); + } + } printf("MD_UUID="); for (i=0; i<16; i++) { if ((i&3)==0 && i != 0) printf(":"); @@ -524,6 +604,143 @@ (unsigned long long)__le64_to_cpu(sb->events)); } +static int copy_metadata1(struct supertype *st, int from, int to) +{ + /* Read superblock. If it looks good, write it out. + * Then if a bitmap is present, copy that. + * And if a bad-block-list is present, copy that too. 
+ */ + void *buf; + unsigned long long dsize, sb_offset; + const int bufsize = 4*1024; + struct mdp_superblock_1 super, *sb; + + if (posix_memalign(&buf, 4096, bufsize) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + dsize >>= 9; + if (dsize < 24) + goto err; + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + goto err; + } + + if (lseek64(from, sb_offset << 9, 0) < 0LL) + goto err; + if (read(from, buf, bufsize) != bufsize) + goto err; + + sb = buf; + super = *sb; // save most of sb for when we reuse buf + + if (__le32_to_cpu(super.magic) != MD_SB_MAGIC || + __le32_to_cpu(super.major_version) != 1 || + __le64_to_cpu(super.super_offset) != sb_offset || + calc_sb_1_csum(sb) != super.sb_csum) + goto err; + + if (lseek64(to, sb_offset << 9, 0) < 0LL) + goto err; + if (write(to, buf, bufsize) != bufsize) + goto err; + + if (super.feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) { + unsigned long long bitmap_offset = sb_offset; + int bytes = 4096; // just an estimate. 
+ int written = 0; + struct align_fd afrom, ato; + + init_afd(&afrom, from); + init_afd(&ato, to); + + bitmap_offset += (int32_t)__le32_to_cpu(super.bitmap_offset); + + if (lseek64(from, bitmap_offset<<9, 0) < 0) + goto err; + if (lseek64(to, bitmap_offset<<9, 0) < 0) + goto err; + + for (written = 0; written < bytes ; ) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (aread(&afrom, buf, n) != n) + goto err; + if (written == 0) { + /* have the header, can calculate + * correct bitmap bytes */ + bitmap_super_t *bms; + int bits; + bms = (void*)buf; + bits = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); + bytes = (bits+7) >> 3; + bytes += sizeof(bitmap_super_t); + bytes = ROUND_UP(bytes, 512); + if (n > bytes) + n = bytes; + } + if (awrite(&ato, buf, n) != n) + goto err; + written += n; + } + } + + if (super.bblog_size != 0 && + __le32_to_cpu(super.bblog_size) <= 100 && + super.bblog_offset != 0 && + (super.feature_map & __le32_to_cpu(MD_FEATURE_BAD_BLOCKS))) { + /* There is a bad block log */ + unsigned long long bb_offset = sb_offset; + int bytes = __le32_to_cpu(super.bblog_size) * 512; + int written = 0; + struct align_fd afrom, ato; + + init_afd(&afrom, from); + init_afd(&ato, to); + + bb_offset += (int32_t)__le32_to_cpu(super.bblog_offset); + + if (lseek64(from, bb_offset<<9, 0) < 0) + goto err; + if (lseek64(to, bb_offset<<9, 0) < 0) + goto err; + + for (written = 0; written < bytes ; ) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (aread(&afrom, buf, n) != n) + goto err; + + if (awrite(&ato, buf, n) != n) + goto err; + written += n; + } + } + + free(buf); + return 0; + +err: + free(buf); + return 1; +} + static void detail_super1(struct supertype *st, char *homehost) { struct mdp_superblock_1 *sb = st->sb; @@ -548,8 +765,10 @@ struct mdp_superblock_1 *sb = st->sb; int i; - if (sb->set_name[0]) - printf(" name=%.32s", sb->set_name); + if (sb->set_name[0]) { + printf(" name="); + print_quoted(sb->set_name); + } 
printf(" UUID="); for (i=0; i<16; i++) { if ((i&3)==0 && i != 0) printf(":"); @@ -573,6 +792,62 @@ printf("MD_NAME=%.*s\n", len, sb->set_name); } +static int examine_badblocks_super1(struct supertype *st, int fd, char *devname) +{ + struct mdp_superblock_1 *sb = st->sb; + unsigned long long offset; + int size; + __u64 *bbl, *bbp; + int i; + + if (!sb->bblog_size || __le32_to_cpu(sb->bblog_size) > 100 + || !sb->bblog_offset){ + printf("No bad-blocks list configured on %s\n", devname); + return 0; + } + if ((sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + == 0) { + printf("Bad-blocks list is empty in %s\n", devname); + return 0; + } + + size = __le32_to_cpu(sb->bblog_size)* 512; + if (posix_memalign((void**)&bbl, 4096, size) != 0) { + pr_err("%s could not allocate badblocks list\n", __func__); + return 0; + } + offset = __le64_to_cpu(sb->super_offset) + + (int)__le32_to_cpu(sb->bblog_offset); + offset <<= 9; + if (lseek64(fd, offset, 0) < 0) { + pr_err("Cannot seek to bad-blocks list\n"); + return 1; + } + if (read(fd, bbl, size) != size) { + pr_err("Cannot read bad-blocks list\n"); + return 1; + } + /* 64bits per entry. 10 bits is block-count, 54 bits is block + * offset. 
Blocks are sectors unless bblog->shift makes them bigger + */ + bbp = (__u64*)bbl; + printf("Bad-blocks on %s:\n", devname); + for (i = 0; i < size/8; i++, bbp++) { + __u64 bb = __le64_to_cpu(*bbp); + int count = bb & 0x3ff; + unsigned long long sector = bb >> 10; + + if (bb + 1 == 0) + break; + + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + + printf("%20llu for %d sectors\n", sector, count); + } + return 0; +} + #endif static int match_home1(struct supertype *st, char *homehost) @@ -597,10 +872,14 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) { struct mdp_superblock_1 *sb = st->sb; + struct bitmap_super_s *bsb = (void*)(((char*)sb)+MAX_SB_SIZE); + struct misc_dev_info *misc = (void*)(((char*)sb)+MAX_SB_SIZE+BM_SUPER_SIZE); int working = 0; unsigned int i; unsigned int role; unsigned int map_disks = info->array.raid_disks; + unsigned long long super_offset; + unsigned long long data_size; memset(info, 0, sizeof(*info)); info->array.major_version = 1; @@ -631,6 +910,67 @@ else role = __le16_to_cpu(sb->dev_roles[__le32_to_cpu(sb->dev_number)]); + super_offset = __le64_to_cpu(sb->super_offset); + if (info->array.level <= 0) + data_size = __le64_to_cpu(sb->data_size); + else + data_size = __le64_to_cpu(sb->size); + if (info->data_offset < super_offset) { + unsigned long long end; + info->space_before = info->data_offset; + end = super_offset; + + if (sb->bblog_offset && sb->bblog_size) { + unsigned long long bboffset = super_offset; + bboffset += (int32_t)__le32_to_cpu(sb->bblog_offset); + if (bboffset < end) + end = bboffset; + } + + if (super_offset + info->bitmap_offset < end) + end = super_offset + info->bitmap_offset; + + if (info->data_offset + data_size < end) + info->space_after = end - data_size - info->data_offset; + else + info->space_after = 0; + } else { + unsigned long long earliest; + earliest = super_offset + (32+4)*2; /* match kernel */ + if (info->bitmap_offset > 0) { + unsigned long long bmend = 
info->bitmap_offset; + unsigned long long size = __le64_to_cpu(bsb->sync_size); + size /= __le32_to_cpu(bsb->chunksize) >> 9; + size = (size + 7) >> 3; + size += sizeof(bitmap_super_t); + size = ROUND_UP(size, 4096); + size /= 512; + bmend += size; + if (bmend > earliest) + bmend = earliest; + } + if (sb->bblog_offset && sb->bblog_size) { + unsigned long long bbend = super_offset; + bbend += (int32_t)__le32_to_cpu(sb->bblog_offset); + bbend += __le32_to_cpu(sb->bblog_size); + if (bbend > earliest) + earliest = bbend; + } + if (earliest < info->data_offset) + info->space_before = info->data_offset - earliest; + else + info->space_before = 0; + info->space_after = misc->device_size - data_size - info->data_offset; + } + if (info->space_before == 0 && info->space_after == 0) { + /* It will look like we don't support data_offset changes, + * be we do - it's just that there is no room. + * A change that reduced the number of devices should + * still be allowed, so set the otherwise useless value of '1' + */ + info->space_after = 1; + } + info->disk.raid_disk = -1; switch(role) { case 0xFFFF: @@ -654,6 +994,11 @@ strncpy(info->name, sb->set_name, 32); info->name[32] = 0; + if ((__le32_to_cpu(sb->feature_map)&MD_FEATURE_REPLACEMENT)) { + info->disk.state &= ~(1 << MD_DISK_SYNC); + info->disk.state |= 1 << MD_DISK_REPLACEMENT; + } + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RECOVERY_OFFSET)) info->recovery_start = __le32_to_cpu(sb->recovery_offset); else @@ -661,6 +1006,9 @@ if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE)) { info->reshape_active = 1; + if ((sb->feature_map & __le32_to_cpu(MD_FEATURE_NEW_OFFSET)) && + sb->new_offset != 0) + info->reshape_active |= RESHAPE_NO_BACKUP; info->reshape_progress = __le64_to_cpu(sb->reshape_position); info->new_level = __le32_to_cpu(sb->new_level); info->delta_disks = __le32_to_cpu(sb->delta_disks); @@ -695,7 +1043,7 @@ if (subarray) return NULL; - info = malloc(sizeof(*info)); + info = xmalloc(sizeof(*info)); 
getinfo_super1(st, info, NULL); return info; } @@ -712,6 +1060,21 @@ int rv = 0; struct mdp_superblock_1 *sb = st->sb; + if (strcmp(update, "homehost") == 0 && + homehost) { + /* Note that 'homehost' is special as it is really + * a "name" update. + */ + char *c; + update = "name"; + c = strchr(sb->set_name, ':'); + if (c) + strncpy(info->name, c+1, 31 - (c-sb->set_name)); + else + strncpy(info->name, sb->set_name, 32); + info->name[32] = 0; + } + if (strcmp(update, "force-one")==0) { /* Not enough devices for a working array, * so bring this one up-to-date @@ -733,7 +1096,7 @@ } else if (strcmp(update, "assemble")==0) { int d = info->disk.number; int want; - if (info->disk.state == 6) + if (info->disk.state & (1<disk.raid_disk; else want = 0xFFFF; @@ -814,16 +1177,58 @@ } } else if (strcmp(update, "no-bitmap") == 0) { sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); - } else if (strcmp(update, "homehost") == 0 && - homehost) { - char *c; - update = "name"; - c = strchr(sb->set_name, ':'); - if (c) - strncpy(info->name, c+1, 31 - (c-sb->set_name)); - else - strncpy(info->name, sb->set_name, 32); - info->name[32] = 0; + } else if (strcmp(update, "bbl") == 0) { + /* only possible if there is room after the bitmap, or if + * there is no bitmap + */ + unsigned long long sb_offset = __le64_to_cpu(sb->super_offset); + unsigned long long data_offset = __le64_to_cpu(sb->data_offset); + long bitmap_offset = (long)(int32_t)__le32_to_cpu(sb->bitmap_offset); + long bm_sectors = 0; + long space; + +#ifndef MDASSEMBLE + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + struct bitmap_super_s *bsb; + bsb = (struct bitmap_super_s *)(((char*)sb)+MAX_SB_SIZE); + bm_sectors = bitmap_sectors(bsb); + } +#endif + if (sb_offset < data_offset) { + /* 1.1 or 1.2. 
Put bbl after bitmap leaving at least 32K + */ + long bb_offset; + bb_offset = sb_offset + 8; + if (bm_sectors && bitmap_offset > 0) + bb_offset = bitmap_offset + bm_sectors; + while (bb_offset < (long)sb_offset + 8 + 32*2 + && bb_offset + 8+8 <= (long)data_offset) + /* too close to bitmap, and room to grow */ + bb_offset += 8; + if (bb_offset + 8 <= (long)data_offset) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(bb_offset); + } + } else { + /* 1.0 - Put bbl just before super block */ + if (bm_sectors && bitmap_offset < 0) + space = -bitmap_offset - bm_sectors; + else + space = sb_offset - data_offset - + __le64_to_cpu(sb->data_size); + if (space >= 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32((unsigned)-8); + } + } + } else if (strcmp(update, "no-bbl") == 0) { + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + pr_err("Cannot remove active bbl from %s\n",devname); + else { + sb->bblog_size = 0; + sb->bblog_shift = 0; + sb->bblog_offset = 0; + } } else if (strcmp(update, "name") == 0) { if (info->name[0] == 0) sprintf(info->name, "%d", info->array.md_minor); @@ -848,6 +1253,63 @@ misc->device_size - __le64_to_cpu(sb->data_offset)); printf("Size is %llu\n", (unsigned long long) __le64_to_cpu(sb->data_size)); + } else if (strcmp(update, "revert-reshape") == 0) { + rv = -2; + if (!(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE))) + pr_err("No active reshape to revert on %s\n", + devname); + else { + __u32 temp; + unsigned long long reshape_sectors; + long reshape_chunk; + rv = 0; + /* reshape_position is a little messy. + * Its value must be a multiple of the larger + * chunk size, and of the "after" data disks. + * So when reverting we need to change it to + * be a multiple of the new "after" data disks, + * which is the old "before". 
+ * If it isn't already a multiple of 'before', + * the only thing we could do would be + * copy some block around on the disks, which + * is easy to get wrong. + * So we reject a revert-reshape unless the + * alignment is good. + */ + if (__le32_to_cpu(sb->level) >= 4 && + __le32_to_cpu(sb->level) <= 6) { + reshape_sectors = __le64_to_cpu(sb->reshape_position); + reshape_chunk = __le32_to_cpu(sb->new_chunk); + reshape_chunk *= __le32_to_cpu(sb->raid_disks) - __le32_to_cpu(sb->delta_disks) - + (__le32_to_cpu(sb->level)==6 ? 2 : 1); + if (reshape_sectors % reshape_chunk) { + pr_err("Reshape position is not suitably aligned.\n"); + pr_err("Try normal assembly and stop again\n"); + return -2; + } + } + sb->raid_disks = __cpu_to_le32(__le32_to_cpu(sb->raid_disks) - + __le32_to_cpu(sb->delta_disks)); + if (sb->delta_disks == 0) + sb->feature_map ^= __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + else + sb->delta_disks = __cpu_to_le32(-__le32_to_cpu(sb->delta_disks)); + + temp = sb->new_layout; + sb->new_layout = sb->layout; + sb->layout = temp; + + temp = sb->new_chunk; + sb->new_chunk = sb->chunksize; + sb->chunksize = temp; + + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_NEW_OFFSET)) { + long offset_delta = (int32_t)__le32_to_cpu(sb->new_offset); + sb->data_offset = __cpu_to_le64(__le64_to_cpu(sb->data_offset) + offset_delta); + sb->new_offset = __cpu_to_le32(-offset_delta); + sb->data_size = __cpu_to_le64(__le64_to_cpu(sb->data_size) - offset_delta); + } + } } else if (strcmp(update, "_reshape_progress")==0) sb->reshape_position = __cpu_to_le64(info->reshape_progress); else if (strcmp(update, "writemostly")==0) @@ -862,7 +1324,8 @@ } static int init_super1(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, char *homehost, int *uuid) + unsigned long long size, char *name, char *homehost, + int *uuid, unsigned long long data_offset) { struct mdp_superblock_1 *sb; int spares; @@ -871,8 +1334,7 @@ int sbsize; if 
(posix_memalign((void**)&sb, 4096, SUPER1_SIZE) != 0) { - fprintf(stderr, Name - ": %s could not allocate superblock\n", __func__); + pr_err("%s could not allocate superblock\n", __func__); return 0; } memset(sb, 0, SUPER1_SIZE); @@ -885,7 +1347,7 @@ spares = info->working_disks - info->active_disks; if (info->raid_disks + spares > MAX_DEVS) { - fprintf(stderr, Name ": too many devices requested: %d+%d > %d\n", + pr_err("too many devices requested: %d+%d > %d\n", info->raid_disks , spares, MAX_DEVS); return 0; } @@ -926,7 +1388,7 @@ sb->chunksize = __cpu_to_le32(info->chunk_size>>9); sb->raid_disks = __cpu_to_le32(info->raid_disks); - sb->data_offset = __cpu_to_le64(0); + sb->data_offset = __cpu_to_le64(data_offset); sb->data_size = __cpu_to_le64(0); sb->super_offset = __cpu_to_le64(0); sb->recovery_offset = __cpu_to_le64(0); @@ -949,13 +1411,14 @@ struct devinfo { int fd; char *devname; + long long data_offset; mdu_disk_info_t disk; struct devinfo *next; }; #ifndef MDASSEMBLE /* Add a device to the superblock being created */ static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk, - int fd, char *devname) + int fd, char *devname, unsigned long long data_offset) { struct mdp_superblock_1 *sb = st->sb; __u16 *rp = sb->dev_roles + dk->number; @@ -979,10 +1442,11 @@ dip = (struct devinfo **)&st->info; while (*dip) dip = &(*dip)->next; - di = malloc(sizeof(struct devinfo)); + di = xmalloc(sizeof(struct devinfo)); di->fd = fd; di->devname = devname; di->disk = *dk; + di->data_offset = data_offset; di->next = NULL; *dip = di; @@ -1034,12 +1498,10 @@ return -EINVAL; } - - if (sb_offset != __le64_to_cpu(sb->super_offset) && 0 != __le64_to_cpu(sb->super_offset) ) { - fprintf(stderr, Name ": internal error - sb_offset is wrong\n"); + pr_err("internal error - sb_offset is wrong\n"); abort(); } @@ -1091,18 +1553,18 @@ int rfd; int rv = 0; unsigned long long bm_space; - unsigned long long reserved; struct devinfo *di; unsigned long long dsize, array_size; - unsigned 
long long sb_offset, headroom; + unsigned long long sb_offset; + unsigned long long data_offset; - for (di = st->info; di && ! rv ; di = di->next) { + for (di = st->info; di; di = di->next) { if (di->disk.state == 1) continue; if (di->fd < 0) continue; - while (Kill(di->devname, NULL, 0, 1, 1) == 0) + while (Kill(di->devname, NULL, 0, -1, 1) == 0) ; sb->dev_number = __cpu_to_le32(di->disk.number); @@ -1122,7 +1584,7 @@ sb->events = 0; refst = dup_super(st); - if (load_super1(refst, di->fd, NULL)==0) { + if (load_super1(refst, di->fd, NULL)==0) { struct mdp_superblock_1 *refsb = refst->sb; memcpy(sb->device_uuid, refsb->device_uuid, 16); @@ -1152,7 +1614,6 @@ goto error_out; } - /* * Calculate the position of the superblock. * It is always aligned to a 4K boundary and @@ -1160,78 +1621,72 @@ * 0: At least 8K, but less than 12K, from end of device * 1: At start of device * 2: 4K from start of device. - * Depending on the array size, we might leave extra space - * for a bitmap. + * data_offset has already been set. 
*/ array_size = __le64_to_cpu(sb->size); - /* work out how much space we left for a bitmap */ - bm_space = choose_bm_space(array_size); - - /* We try to leave 0.1% at the start for reshape - * operations, but limit this to 128Meg (0.1% of 10Gig) - * which is plenty for efficient reshapes - */ - headroom = 128 * 1024 * 2; - while (headroom << 10 > array_size) - headroom >>= 1; - + /* work out how much space we left for a bitmap, + * Add 8 sectors for bad block log */ + bm_space = choose_bm_space(array_size) + 8; + + data_offset = di->data_offset; + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; switch(st->minor_version) { case 0: + if (data_offset == INVALID_SECTORS) + data_offset = 0; sb_offset = dsize; sb_offset -= 8*2; sb_offset &= ~(4*2-1); + sb->data_offset = __cpu_to_le64(data_offset); sb->super_offset = __cpu_to_le64(sb_offset); - sb->data_offset = __cpu_to_le64(0); if (sb_offset < array_size + bm_space) bm_space = sb_offset - array_size; sb->data_size = __cpu_to_le64(sb_offset - bm_space); + if (bm_space >= 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32((unsigned)-8); + } break; case 1: sb->super_offset = __cpu_to_le64(0); - reserved = bm_space + 4*2; - if (reserved < headroom) - reserved = headroom; - if (reserved + array_size > dsize) - reserved = dsize - array_size; - /* Try for multiple of 1Meg so it is nicely aligned */ - #define ONE_MEG (2*1024) - if (reserved > ONE_MEG) - reserved = (reserved/ONE_MEG) * ONE_MEG; - - /* force 4K alignment */ - reserved &= ~7ULL; + if (data_offset == INVALID_SECTORS) + data_offset = 16; - sb->data_offset = __cpu_to_le64(reserved); - sb->data_size = __cpu_to_le64(dsize - reserved); + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(dsize - data_offset); + if (data_offset >= 8 + 32*2 + 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(8 + 32*2); + } else if (data_offset >= 16) { + sb->bblog_size = __cpu_to_le16(8); 
+ sb->bblog_offset = __cpu_to_le32(data_offset-8); + } break; case 2: sb_offset = 4*2; - sb->super_offset = __cpu_to_le64(4*2); - if (4*2 + 4*2 + bm_space + array_size - > dsize) - bm_space = dsize - array_size - - 4*2 - 4*2; - - reserved = bm_space + 4*2 + 4*2; - if (reserved < headroom) - reserved = headroom; - if (reserved + array_size > dsize) - reserved = dsize - array_size; - /* Try for multiple of 1Meg so it is nicely aligned */ - #define ONE_MEG (2*1024) - if (reserved > ONE_MEG) - reserved = (reserved/ONE_MEG) * ONE_MEG; - - /* force 4K alignment */ - reserved &= ~7ULL; + sb->super_offset = __cpu_to_le64(sb_offset); + if (data_offset == INVALID_SECTORS) + data_offset = 24; - sb->data_offset = __cpu_to_le64(reserved); - sb->data_size = __cpu_to_le64(dsize - reserved); + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(dsize - data_offset); + if (data_offset >= 16 + 32*2 + 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(8 + 32*2); + } else if (data_offset >= 16+16) { + sb->bblog_size = __cpu_to_le16(8); + /* '8' sectors for the bblog, and another '8' + * because we want offset from superblock, not + * start of device. 
+ */ + sb->bblog_offset = __cpu_to_le32(data_offset-8-8); + } break; default: - fprintf(stderr, Name ": Failed to write invalid " - "metadata format 1.%i to %s\n", - st->minor_version, di->devname); + pr_err("Failed to write invalid " + "metadata format 1.%i to %s\n", + st->minor_version, di->devname); rv = -EINVAL; goto out; } @@ -1242,11 +1697,13 @@ rv = st->ss->write_bitmap(st, di->fd); close(di->fd); di->fd = -1; + if (rv) + goto error_out; } error_out: if (rv) - fprintf(stderr, Name ": Failed to write metadata to %s\n", - di->devname); + pr_err("Failed to write metadata to %s\n", + di->devname); out: return rv; } @@ -1271,8 +1728,7 @@ if (!first) { if (posix_memalign((void**)&first, 4096, SUPER1_SIZE) != 0) { - fprintf(stderr, Name - ": %s could not allocate superblock\n", __func__); + pr_err("%s could not allocate superblock\n", __func__); return 1; } memcpy(first, second, SUPER1_SIZE); @@ -1346,7 +1802,7 @@ if (dsize < 24) { if (devname) - fprintf(stderr, Name ": %s is too small for md: size is %llu sectors.\n", + pr_err("%s is too small for md: size is %llu sectors.\n", devname, dsize); return 1; } @@ -1375,25 +1831,22 @@ return -EINVAL; } - ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */ - - if (lseek64(fd, sb_offset << 9, 0)< 0LL) { if (devname) - fprintf(stderr, Name ": Cannot seek to superblock on %s: %s\n", + pr_err("Cannot seek to superblock on %s: %s\n", devname, strerror(errno)); return 1; } if (posix_memalign((void**)&super, 4096, SUPER1_SIZE) != 0) { - fprintf(stderr, Name ": %s could not allocate superblock\n", + pr_err("%s could not allocate superblock\n", __func__); return 1; } if (aread(&afd, super, MAX_SB_SIZE) != MAX_SB_SIZE) { if (devname) - fprintf(stderr, Name ": Cannot read superblock on %s\n", + pr_err("Cannot read superblock on %s\n", devname); free(super); return 1; @@ -1401,7 +1854,7 @@ if (__le32_to_cpu(super->magic) != MD_SB_MAGIC) { if (devname) - fprintf(stderr, Name ": No super block found on %s (Expected magic 
%08x, got %08x)\n", + pr_err("No super block found on %s (Expected magic %08x, got %08x)\n", devname, MD_SB_MAGIC, __le32_to_cpu(super->magic)); free(super); return 2; @@ -1409,14 +1862,14 @@ if (__le32_to_cpu(super->major_version) != 1) { if (devname) - fprintf(stderr, Name ": Cannot interpret superblock on %s - version is %d\n", + pr_err("Cannot interpret superblock on %s - version is %d\n", devname, __le32_to_cpu(super->major_version)); free(super); return 2; } if (__le64_to_cpu(super->super_offset) != sb_offset) { if (devname) - fprintf(stderr, Name ": No superblock found on %s (super_offset is wrong)\n", + pr_err("No superblock found on %s (super_offset is wrong)\n", devname); free(super); return 2; @@ -1427,6 +1880,8 @@ misc = (struct misc_dev_info*) (((char*)super)+MAX_SB_SIZE+BM_SUPER_SIZE); misc->device_size = dsize; + if (st->data_offset == INVALID_SECTORS) + st->data_offset = __le64_to_cpu(super->data_offset); /* Now check on the bitmap superblock */ if ((__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) == 0) @@ -1451,17 +1906,15 @@ return 0; } - static struct supertype *match_metadata_desc1(char *arg) { - struct supertype *st = calloc(1, sizeof(*st)); - if (!st) - return st; + struct supertype *st = xcalloc(1, sizeof(*st)); - st->container_dev = NoMdDev; + st->container_devnm[0] = 0; st->ss = &super1; st->max_devs = MAX_DEVS; st->sb = NULL; + st->data_offset = INVALID_SECTORS; /* leading zeros can be safely ignored. --detail generates them. 
*/ while (*arg == '0') arg++; @@ -1498,47 +1951,57 @@ * superblock type st, and reserving 'reserve' sectors for * a possible bitmap */ -static __u64 avail_size1(struct supertype *st, __u64 devsize) +static __u64 avail_size1(struct supertype *st, __u64 devsize, + unsigned long long data_offset) { struct mdp_superblock_1 *super = st->sb; + int bmspace = 0; + int bbspace = 0; if (devsize < 24) return 0; - if (super == NULL) - /* creating: allow suitable space for bitmap */ - devsize -= choose_bm_space(devsize); #ifndef MDASSEMBLE - else if (__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) { + if (__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) { /* hot-add. allow for actual size of bitmap */ struct bitmap_super_s *bsb; bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE); - devsize -= bitmap_sectors(bsb); + bmspace = bitmap_sectors(bsb); } #endif + /* Allow space for bad block log */ + if (super->bblog_size) + bbspace = __le16_to_cpu(super->bblog_size); if (st->minor_version < 0) /* not specified, so time to set default */ st->minor_version = 2; - if (super == NULL && st->minor_version > 0) { - /* haven't committed to a size yet, so allow some - * slack for space for reshape. 
- * Limit slack to 128M, but aim for about 0.1% - */ - unsigned long long headroom = 128*1024*2; - while ((headroom << 10) > devsize) - headroom >>= 1; - devsize -= headroom; - } + + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + + if (data_offset != INVALID_SECTORS) + switch(st->minor_version) { + case 0: + return devsize - data_offset - 8*2 - bbspace; + case 1: + case 2: + return devsize - data_offset; + default: + return 0; + } + + devsize -= bmspace; + switch(st->minor_version) { case 0: /* at end */ - return ((devsize - 8*2 ) & ~(4*2-1)); + return ((devsize - 8*2 - bbspace ) & ~(4*2-1)); case 1: /* at start, 4K for superblock and possible bitmap */ - return devsize - 4*2; + return devsize - 4*2 - bbspace; case 2: /* 4k from start, 4K for superblock and possible bitmap */ - return devsize - (4+4)*2; + return devsize - (4+4)*2 - bbspace; } return 0; } @@ -1562,6 +2025,7 @@ unsigned long long max_bits; unsigned long long min_chunk; long offset; + long bbl_offset, bbl_size; unsigned long long chunk = *chunkp; int room = 0; int creating = 0; @@ -1583,15 +2047,23 @@ */ offset = 0; room = choose_bm_space(__le64_to_cpu(sb->size)); + bbl_size = 8; } else { room = __le64_to_cpu(sb->super_offset) - __le64_to_cpu(sb->data_offset) - __le64_to_cpu(sb->data_size); + bbl_size = __le16_to_cpu(sb->bblog_size); + if (bbl_size < 8) + bbl_size = 8; + bbl_offset = (__s32)__le32_to_cpu(sb->bblog_offset); + if (bbl_size < -bbl_offset) + bbl_size = -bbl_offset; if (!may_change || (room < 3*2 && __le32_to_cpu(sb->max_dev) <= 384)) { room = 3*2; offset = 1*2; + bbl_size = 0; } else { offset = 0; /* means movable offset */ } @@ -1602,12 +2074,20 @@ if (creating) { offset = 4*2; room = choose_bm_space(__le64_to_cpu(sb->size)); + bbl_size = 8; } else { room = __le64_to_cpu(sb->data_offset) - __le64_to_cpu(sb->super_offset); + bbl_size = __le16_to_cpu(sb->bblog_size); + if (bbl_size) + room = __le32_to_cpu(sb->bblog_offset) + bbl_size; + else + bbl_size = 8; + if 
(!may_change) { room -= 2; /* Leave 1K for superblock */ offset = 2; + bbl_size = 0; } else { room -= 4*2; /* leave 4K for superblock */ offset = 4*2; @@ -1618,6 +2098,7 @@ return 0; } + room -= bbl_size; if (chunk == UnSet && room > 128*2) /* Limit to 128K of bitmap when chunk size not requested */ room = 128*2; @@ -1649,7 +2130,7 @@ bits = (size*512) / chunk + 1; room = ((bits+7)/8 + sizeof(bitmap_super_t) +4095)/4096; room *= 8; /* convert 4K blocks to sectors */ - offset = -room; + offset = -room - bbl_size; } sb->bitmap_offset = (int32_t)__cpu_to_le32(offset); @@ -1750,27 +2231,34 @@ static int validate_geometry1(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, char *subdev, unsigned long long *freesize, int verbose) { - unsigned long long ldsize; + unsigned long long ldsize, devsize; + int bmspace; + unsigned long long headroom; int fd; if (level == LEVEL_CONTAINER) { if (verbose) - fprintf(stderr, Name ": 1.x metadata does not support containers\n"); + pr_err("1.x metadata does not support containers\n"); return 0; } - if (chunk && *chunk == UnSet) + if (*chunk == UnSet) *chunk = DEFAULT_CHUNK; if (!subdev) return 1; + if (st->minor_version < 0) + /* not specified, so time to set default */ + st->minor_version = 2; + fd = open(subdev, O_RDONLY|O_EXCL, 0); if (fd < 0) { if (verbose) - fprintf(stderr, Name ": super1.x cannot open %s: %s\n", + pr_err("super1.x cannot open %s: %s\n", subdev, strerror(errno)); return 0; } @@ -1781,11 +2269,118 @@ } close(fd); - *freesize = avail_size1(st, ldsize >> 9); + devsize = ldsize >> 9; + if (devsize < 24) { + *freesize = 0; + return 0; + } + + /* creating: allow suitable space for bitmap */ + bmspace = choose_bm_space(devsize); + + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + if (data_offset == INVALID_SECTORS) + switch (st->minor_version) { + case 0: + data_offset = 0; + break; + case 1: + case 2: + /* Choose data 
offset appropriate for this device + * and use as default for whole array. + * The data_offset must allow for bitmap space + * and base metadata, should allow for some headroom + * for reshape, and should be rounded to multiple + * of 1M. + * Headroom is limited to 128M, but aim for about 0.1% + */ + headroom = 128*1024*2; + while ((headroom << 10) > devsize && + (*chunk == 0 || + headroom / 2 >= ((unsigned)(*chunk)*2)*2)) + headroom >>= 1; + data_offset = 12*2 + bmspace + headroom; + #define ONE_MEG (2*1024) + if (data_offset > ONE_MEG) + data_offset = (data_offset / ONE_MEG) * ONE_MEG; + break; + } + if (st->data_offset == INVALID_SECTORS) + st->data_offset = data_offset; + switch(st->minor_version) { + case 0: /* metadata at end. Round down and subtract space to reserve */ + devsize = (devsize & ~(4ULL*2-1)); + /* space for metadata, bblog, bitmap */ + devsize -= 8*2 + 8 + bmspace; + break; + case 1: + case 2: + devsize -= data_offset; + break; + } + *freesize = devsize; return 1; } #endif /* MDASSEMBLE */ +void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0) +{ + /* Create a v1.0 superblock based on 'info'*/ + void *ret; + struct mdp_superblock_1 *sb; + int i; + int rfd; + unsigned long long offset; + + if (posix_memalign(&ret, 4096, 1024) != 0) + return NULL; + sb = ret; + memset(ret, 0, 1024); + sb->magic = __cpu_to_le32(MD_SB_MAGIC); + sb->major_version = __cpu_to_le32(1); + + copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid); + sprintf(sb->set_name, "%d", sb0->md_minor); + sb->ctime = __cpu_to_le32(info->array.ctime+1); + sb->level = __cpu_to_le32(info->array.level); + sb->layout = __cpu_to_le32(info->array.layout); + sb->size = __cpu_to_le64(info->component_size); + sb->chunksize = __cpu_to_le32(info->array.chunk_size/512); + sb->raid_disks = __cpu_to_le32(info->array.raid_disks); + if (info->array.level > 0) + sb->data_size = sb->size; + else + sb->data_size = st->ss->avail_size(st, st->devsize/512, 0); + sb->resync_offset 
= MaxSector; + sb->max_dev = __cpu_to_le32(MD_SB_DISKS); + sb->dev_number = __cpu_to_le32(info->disk.number); + sb->utime = __cpu_to_le64(info->array.utime); + + offset = st->devsize/512 - 8*2; + offset &= ~(4*2-1); + sb->super_offset = __cpu_to_le64(offset); + //*(__u64*)(st->other + 128 + 8 + 8) = __cpu_to_le64(offset); + + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->device_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->device_uuid, r, 16); + } + if (rfd >= 0) + close(rfd); + + for (i = 0; i < MD_SB_DISKS; i++) { + int state = sb0->disks[i].state; + sb->dev_roles[i] = 0xFFFF; + if ((state & (1<<MD_DISK_SYNC)) && + !(state & (1<<MD_DISK_FAULTY))) + sb->dev_roles[i] = __cpu_to_le16(sb0->disks[i].raid_disk); + } + sb->sb_csum = calc_sb_1_csum(sb); + return ret; +} + struct superswitch super1 = { #ifndef MDASSEMBLE .examine_super = examine_super1, @@ -1797,6 +2392,8 @@ .write_init_super = write_init_super1, .validate_geometry = validate_geometry1, .add_to_super = add_to_super1, + .examine_badblocks = examine_badblocks_super1, + .copy_metadata = copy_metadata1, #endif .match_home = match_home1, .uuid_from_super = uuid_from_super1, diff -Nru mdadm-3.2.5/super-ddf.c mdadm-3.3/super-ddf.c --- mdadm-3.2.5/super-ddf.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/super-ddf.c 2013-09-03 04:47:47.000000000 +0000 @@ -44,6 +44,13 @@ const unsigned char *buf, unsigned len); +#define DDF_NOTFOUND (~0U) +#define DDF_CONTAINER (DDF_NOTFOUND-1) + +/* Default for safe_mode_delay. Same value as for IMSM. + */ +static const int DDF_SAFE_MODE_DELAY = 4000; + /* The DDF metadata handling. * DDF metadata lives at the end of the device.
* The last 512 byte block provides an 'anchor' which is used to locate @@ -55,6 +62,46 @@ * */ +typedef struct __be16 { + __u16 _v16; +} be16; +#define be16_eq(x, y) ((x)._v16 == (y)._v16) +#define be16_and(x, y) ((x)._v16 & (y)._v16) +#define be16_or(x, y) ((x)._v16 | (y)._v16) +#define be16_clear(x, y) ((x)._v16 &= ~(y)._v16) +#define be16_set(x, y) ((x)._v16 |= (y)._v16) + +typedef struct __be32 { + __u32 _v32; +} be32; +#define be32_eq(x, y) ((x)._v32 == (y)._v32) + +typedef struct __be64 { + __u64 _v64; +} be64; +#define be64_eq(x, y) ((x)._v64 == (y)._v64) + +#define be16_to_cpu(be) __be16_to_cpu((be)._v16) +static inline be16 cpu_to_be16(__u16 x) +{ + be16 be = { ._v16 = __cpu_to_be16(x) }; + return be; +} + +#define be32_to_cpu(be) __be32_to_cpu((be)._v32) +static inline be32 cpu_to_be32(__u32 x) +{ + be32 be = { ._v32 = __cpu_to_be32(x) }; + return be; +} + +#define be64_to_cpu(be) __be64_to_cpu((be)._v64) +static inline be64 cpu_to_be64(__u64 x) +{ + be64 be = { ._v64 = __cpu_to_be64(x) }; + return be; +} + /* Primary Raid Level (PRL) */ #define DDF_RAID0 0x00 #define DDF_RAID1 0x01 @@ -92,28 +139,28 @@ #define DDF_2SPANNED 0x03 /* This is also weird - be careful */ /* Magic numbers */ -#define DDF_HEADER_MAGIC __cpu_to_be32(0xDE11DE11) -#define DDF_CONTROLLER_MAGIC __cpu_to_be32(0xAD111111) -#define DDF_PHYS_RECORDS_MAGIC __cpu_to_be32(0x22222222) -#define DDF_PHYS_DATA_MAGIC __cpu_to_be32(0x33333333) -#define DDF_VIRT_RECORDS_MAGIC __cpu_to_be32(0xDDDDDDDD) -#define DDF_VD_CONF_MAGIC __cpu_to_be32(0xEEEEEEEE) -#define DDF_SPARE_ASSIGN_MAGIC __cpu_to_be32(0x55555555) -#define DDF_VU_CONF_MAGIC __cpu_to_be32(0x88888888) -#define DDF_VENDOR_LOG_MAGIC __cpu_to_be32(0x01dBEEF0) -#define DDF_BBM_LOG_MAGIC __cpu_to_be32(0xABADB10C) +#define DDF_HEADER_MAGIC cpu_to_be32(0xDE11DE11) +#define DDF_CONTROLLER_MAGIC cpu_to_be32(0xAD111111) +#define DDF_PHYS_RECORDS_MAGIC cpu_to_be32(0x22222222) +#define DDF_PHYS_DATA_MAGIC cpu_to_be32(0x33333333) +#define 
DDF_VIRT_RECORDS_MAGIC cpu_to_be32(0xDDDDDDDD) +#define DDF_VD_CONF_MAGIC cpu_to_be32(0xEEEEEEEE) +#define DDF_SPARE_ASSIGN_MAGIC cpu_to_be32(0x55555555) +#define DDF_VU_CONF_MAGIC cpu_to_be32(0x88888888) +#define DDF_VENDOR_LOG_MAGIC cpu_to_be32(0x01dBEEF0) +#define DDF_BBM_LOG_MAGIC cpu_to_be32(0xABADB10C) #define DDF_GUID_LEN 24 #define DDF_REVISION_0 "01.00.00" #define DDF_REVISION_2 "01.02.00" struct ddf_header { - __u32 magic; /* DDF_HEADER_MAGIC */ - __u32 crc; + be32 magic; /* DDF_HEADER_MAGIC */ + be32 crc; char guid[DDF_GUID_LEN]; char revision[8]; /* 01.02.00 */ - __u32 seq; /* starts at '1' */ - __u32 timestamp; + be32 seq; /* starts at '1' */ + be32 timestamp; __u8 openflag; __u8 foreignflag; __u8 enforcegroups; @@ -121,38 +168,38 @@ __u8 pad1[12]; /* 12 * 0xff */ /* 64 bytes so far */ __u8 header_ext[32]; /* reserved: fill with 0xff */ - __u64 primary_lba; - __u64 secondary_lba; + be64 primary_lba; + be64 secondary_lba; __u8 type; __u8 pad2[3]; /* 0xff */ - __u32 workspace_len; /* sectors for vendor space - + be32 workspace_len; /* sectors for vendor space - * at least 32768(sectors) */ - __u64 workspace_lba; - __u16 max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */ - __u16 max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */ - __u16 max_partitions; /* i.e. max num of configuration + be64 workspace_lba; + be16 max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */ + be16 max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */ + be16 max_partitions; /* i.e. 
max num of configuration record entries per disk */ - __u16 config_record_len; /* 1 +ROUNDUP(max_primary_element_entries + be16 config_record_len; /* 1 +ROUNDUP(max_primary_element_entries *12/512) */ - __u16 max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */ + be16 max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */ __u8 pad3[54]; /* 0xff */ /* 192 bytes so far */ - __u32 controller_section_offset; - __u32 controller_section_length; - __u32 phys_section_offset; - __u32 phys_section_length; - __u32 virt_section_offset; - __u32 virt_section_length; - __u32 config_section_offset; - __u32 config_section_length; - __u32 data_section_offset; - __u32 data_section_length; - __u32 bbm_section_offset; - __u32 bbm_section_length; - __u32 diag_space_offset; - __u32 diag_space_length; - __u32 vendor_offset; - __u32 vendor_length; + be32 controller_section_offset; + be32 controller_section_length; + be32 phys_section_offset; + be32 phys_section_length; + be32 virt_section_offset; + be32 virt_section_length; + be32 config_section_offset; + be32 config_section_length; + be32 data_section_offset; + be32 data_section_length; + be32 bbm_section_offset; + be32 bbm_section_length; + be32 diag_space_offset; + be32 diag_space_length; + be32 vendor_offset; + be32 vendor_length; /* 256 bytes so far */ __u8 pad4[256]; /* 0xff */ }; @@ -164,14 +211,14 @@ /* The content of the 'controller section' - global scope */ struct ddf_controller_data { - __u32 magic; /* DDF_CONTROLLER_MAGIC */ - __u32 crc; + be32 magic; /* DDF_CONTROLLER_MAGIC */ + be32 crc; char guid[DDF_GUID_LEN]; struct controller_type { - __u16 vendor_id; - __u16 device_id; - __u16 sub_vendor_id; - __u16 sub_device_id; + be16 vendor_id; + be16 device_id; + be16 sub_vendor_id; + be16 sub_device_id; } type; char product_id[16]; __u8 pad[8]; /* 0xff */ @@ -180,17 +227,17 @@ /* The content of phys_section - global scope */ struct phys_disk { - __u32 magic; /* DDF_PHYS_RECORDS_MAGIC */ - __u32 crc; - __u16 
used_pdes; - __u16 max_pdes; + be32 magic; /* DDF_PHYS_RECORDS_MAGIC */ + be32 crc; + be16 used_pdes; + be16 max_pdes; __u8 pad[52]; struct phys_disk_entry { char guid[DDF_GUID_LEN]; - __u32 refnum; - __u16 type; - __u16 state; - __u64 config_size; /* DDF structures must be after here */ + be32 refnum; + be16 type; + be16 state; + be64 config_size; /* DDF structures must be after here */ char path[18]; /* another horrible structure really */ __u8 pad[6]; } entries[0]; @@ -221,17 +268,17 @@ /* The content of the virt_section global scope */ struct virtual_disk { - __u32 magic; /* DDF_VIRT_RECORDS_MAGIC */ - __u32 crc; - __u16 populated_vdes; - __u16 max_vdes; + be32 magic; /* DDF_VIRT_RECORDS_MAGIC */ + be32 crc; + be16 populated_vdes; + be16 max_vdes; __u8 pad[52]; struct virtual_entry { char guid[DDF_GUID_LEN]; - __u16 unit; + be16 unit; __u16 pad0; /* 0xffff */ - __u16 guid_crc; - __u16 type; + be16 guid_crc; + be16 type; __u8 state; __u8 init_state; __u8 pad1[14]; @@ -275,25 +322,25 @@ */ struct vd_config { - __u32 magic; /* DDF_VD_CONF_MAGIC */ - __u32 crc; + be32 magic; /* DDF_VD_CONF_MAGIC */ + be32 crc; char guid[DDF_GUID_LEN]; - __u32 timestamp; - __u32 seqnum; + be32 timestamp; + be32 seqnum; __u8 pad0[24]; - __u16 prim_elmnt_count; + be16 prim_elmnt_count; __u8 chunk_shift; /* 0 == 512, 1==1024 etc */ __u8 prl; __u8 rlq; __u8 sec_elmnt_count; __u8 sec_elmnt_seq; __u8 srl; - __u64 blocks; /* blocks per component could be different + be64 blocks; /* blocks per component could be different * on different component devices...(only * for concat I hope) */ - __u64 array_blocks; /* blocks in array */ + be64 array_blocks; /* blocks in array */ __u8 pad1[8]; - __u32 spare_refs[8]; + be32 spare_refs[8]; __u8 cache_pol[8]; __u8 bg_rate; __u8 pad2[3]; @@ -304,10 +351,11 @@ __u8 v2[16]; /* reserved- 0xff */ __u8 v3[16]; /* reserved- 0xff */ __u8 vendor[32]; - __u32 phys_refnum[0]; /* refnum of each disk in sequence */ + be32 phys_refnum[0]; /* refnum of each disk in 
sequence */ /*__u64 lba_offset[0]; LBA offset in each phys. Note extents in a bvd are always the same size */ }; +#define LBA_OFFSET(ddf, vd) ((be64 *) &(vd)->phys_refnum[(ddf)->mppe]) /* vd_config.cache_pol[7] is a bitmap */ #define DDF_cache_writeback 1 /* else writethrough */ @@ -319,17 +367,17 @@ #define DDF_cache_rallowed 64 /* enable read caching */ struct spare_assign { - __u32 magic; /* DDF_SPARE_ASSIGN_MAGIC */ - __u32 crc; - __u32 timestamp; + be32 magic; /* DDF_SPARE_ASSIGN_MAGIC */ + be32 crc; + be32 timestamp; __u8 reserved[7]; __u8 type; - __u16 populated; /* SAEs used */ - __u16 max; /* max SAEs */ + be16 populated; /* SAEs used */ + be16 max; /* max SAEs */ __u8 pad[8]; struct spare_assign_entry { char guid[DDF_GUID_LEN]; - __u16 secondary_element; + be16 secondary_element; __u8 pad[6]; } spare_ents[0]; }; @@ -341,10 +389,10 @@ /* The data_section contents - local scope */ struct disk_data { - __u32 magic; /* DDF_PHYS_DATA_MAGIC */ - __u32 crc; + be32 magic; /* DDF_PHYS_DATA_MAGIC */ + be32 crc; char guid[DDF_GUID_LEN]; - __u32 refnum; /* crc of some magic drive data ... */ + be32 refnum; /* crc of some magic drive data ... 
*/ __u8 forced_ref; /* set when above was not result of magic */ __u8 forced_guid; /* set if guid was forced rather than magic */ __u8 vendor[32]; @@ -353,16 +401,16 @@ /* bbm_section content */ struct bad_block_log { - __u32 magic; - __u32 crc; - __u16 entry_count; - __u32 spare_count; + be32 magic; + be32 crc; + be16 entry_count; + be32 spare_count; __u8 pad[10]; - __u64 first_spare; + be64 first_spare; struct mapped_block { - __u64 defective_start; - __u32 replacement_start; - __u16 remap_count; + be64 defective_start; + be32 replacement_start; + be16 remap_count; __u8 pad[2]; } entries[0]; }; @@ -395,6 +443,7 @@ struct ddf_header *active; struct phys_disk *phys; struct virtual_disk *virt; + char *conf; int pdsize, vdsize; unsigned int max_part, mppe, conf_rec_len; int currentdev; @@ -404,9 +453,8 @@ char space[512]; struct { struct vcl *next; - __u64 *lba_offset; /* location in 'conf' of - * the lba table */ unsigned int vcnum; /* index into ->virt */ + struct vd_config **other_bvds; __u64 *block_sizes; /* NULL if all the same */ }; }; @@ -421,6 +469,9 @@ char *devname; int fd; unsigned long long size; /* sectors */ + be64 primary_lba; /* sectors */ + be64 secondary_lba; /* sectors */ + be64 workspace_lba; /* sectors */ int pdnum; /* index in ->phys */ struct spare_assign *spare; void *mdupdate; /* hold metadata update */ @@ -439,21 +490,253 @@ #define offsetof(t,f) ((size_t)&(((t*)0)->f)) #endif +#if DEBUG +static int all_ff(const char *guid); +static void pr_state(struct ddf_super *ddf, const char *msg) +{ + unsigned int i; + dprintf("%s/%s: ", __func__, msg); + for (i = 0; i < be16_to_cpu(ddf->active->max_vd_entries); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + continue; + dprintf("%u(s=%02x i=%02x) ", i, + ddf->virt->entries[i].state, + ddf->virt->entries[i].init_state); + } + dprintf("\n"); +} +#else +static void pr_state(const struct ddf_super *ddf, const char *msg) {} +#endif + +static void _ddf_set_updates_pending(struct ddf_super *ddf, const 
char *func) +{ + ddf->updates_pending = 1; + ddf->active->seq = cpu_to_be32((be32_to_cpu(ddf->active->seq)+1)); + pr_state(ddf, func); +} + +#define ddf_set_updates_pending(x) _ddf_set_updates_pending((x), __func__) -static unsigned int calc_crc(void *buf, int len) +static unsigned int get_pd_index_from_refnum(const struct vcl *vc, + be32 refnum, unsigned int nmax, + const struct vd_config **bvd, + unsigned int *idx); + +static be32 calc_crc(void *buf, int len) { /* crcs are always at the same place as in the ddf_header */ struct ddf_header *ddf = buf; - __u32 oldcrc = ddf->crc; + be32 oldcrc = ddf->crc; __u32 newcrc; - ddf->crc = 0xffffffff; + ddf->crc = cpu_to_be32(0xffffffff); newcrc = crc32(0, buf, len); ddf->crc = oldcrc; /* The crc is store (like everything) bigendian, so convert * here for simplicity */ - return __cpu_to_be32(newcrc); + return cpu_to_be32(newcrc); +} + +#define DDF_INVALID_LEVEL 0xff +#define DDF_NO_SECONDARY 0xff +static int err_bad_md_layout(const mdu_array_info_t *array) +{ + pr_err("RAID%d layout %x with %d disks is unsupported for DDF\n", + array->level, array->layout, array->raid_disks); + return -1; +} + +static int layout_md2ddf(const mdu_array_info_t *array, + struct vd_config *conf) +{ + be16 prim_elmnt_count = cpu_to_be16(array->raid_disks); + __u8 prl = DDF_INVALID_LEVEL, rlq = 0; + __u8 sec_elmnt_count = 1; + __u8 srl = DDF_NO_SECONDARY; + + switch (array->level) { + case LEVEL_LINEAR: + prl = DDF_CONCAT; + break; + case 0: + rlq = DDF_RAID0_SIMPLE; + prl = DDF_RAID0; + break; + case 1: + switch (array->raid_disks) { + case 2: + rlq = DDF_RAID1_SIMPLE; + break; + case 3: + rlq = DDF_RAID1_MULTI; + break; + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID1; + break; + case 4: + if (array->layout != 0) + return err_bad_md_layout(array); + rlq = DDF_RAID4_N; + prl = DDF_RAID4; + break; + case 5: + switch (array->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + rlq = DDF_RAID5_N_RESTART; + break; + case 
ALGORITHM_RIGHT_ASYMMETRIC: + rlq = DDF_RAID5_0_RESTART; + break; + case ALGORITHM_LEFT_SYMMETRIC: + rlq = DDF_RAID5_N_CONTINUE; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + /* not mentioned in standard */ + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID5; + break; + case 6: + switch (array->layout) { + case ALGORITHM_ROTATING_N_RESTART: + rlq = DDF_RAID5_N_RESTART; + break; + case ALGORITHM_ROTATING_ZERO_RESTART: + rlq = DDF_RAID6_0_RESTART; + break; + case ALGORITHM_ROTATING_N_CONTINUE: + rlq = DDF_RAID5_N_CONTINUE; + break; + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID6; + break; + case 10: + if (array->raid_disks % 2 == 0 && array->layout == 0x102) { + rlq = DDF_RAID1_SIMPLE; + prim_elmnt_count = cpu_to_be16(2); + sec_elmnt_count = array->raid_disks / 2; + } else if (array->raid_disks % 3 == 0 + && array->layout == 0x103) { + rlq = DDF_RAID1_MULTI; + prim_elmnt_count = cpu_to_be16(3); + sec_elmnt_count = array->raid_disks / 3; + } else + return err_bad_md_layout(array); + srl = DDF_2SPANNED; + prl = DDF_RAID1; + break; + default: + return err_bad_md_layout(array); + } + conf->prl = prl; + conf->prim_elmnt_count = prim_elmnt_count; + conf->rlq = rlq; + conf->srl = srl; + conf->sec_elmnt_count = sec_elmnt_count; + return 0; +} + +static int err_bad_ddf_layout(const struct vd_config *conf) +{ + pr_err("DDF RAID %u qualifier %u with %u disks is unsupported\n", + conf->prl, conf->rlq, be16_to_cpu(conf->prim_elmnt_count)); + return -1; +} + +static int layout_ddf2md(const struct vd_config *conf, + mdu_array_info_t *array) +{ + int level = LEVEL_UNSUPPORTED; + int layout = 0; + int raiddisks = be16_to_cpu(conf->prim_elmnt_count); + + if (conf->sec_elmnt_count > 1) { + /* see also check_secondary() */ + if (conf->prl != DDF_RAID1 || + (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED)) { + pr_err("Unsupported secondary RAID level %u/%u\n", + conf->prl, conf->srl); + return -1; + } + if (raiddisks == 2 && conf->rlq == 
DDF_RAID1_SIMPLE) + layout = 0x102; + else if (raiddisks == 3 && conf->rlq == DDF_RAID1_MULTI) + layout = 0x103; + else + return err_bad_ddf_layout(conf); + raiddisks *= conf->sec_elmnt_count; + level = 10; + goto good; + } + + switch (conf->prl) { + case DDF_CONCAT: + level = LEVEL_LINEAR; + break; + case DDF_RAID0: + if (conf->rlq != DDF_RAID0_SIMPLE) + return err_bad_ddf_layout(conf); + level = 0; + break; + case DDF_RAID1: + if (!((conf->rlq == DDF_RAID1_SIMPLE && raiddisks == 2) || + (conf->rlq == DDF_RAID1_MULTI && raiddisks == 3))) + return err_bad_ddf_layout(conf); + level = 1; + break; + case DDF_RAID4: + if (conf->rlq != DDF_RAID4_N) + return err_bad_ddf_layout(conf); + level = 4; + break; + case DDF_RAID5: + switch (conf->rlq) { + case DDF_RAID5_N_RESTART: + layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case DDF_RAID5_0_RESTART: + layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case DDF_RAID5_N_CONTINUE: + layout = ALGORITHM_LEFT_SYMMETRIC; + break; + default: + return err_bad_ddf_layout(conf); + } + level = 5; + break; + case DDF_RAID6: + switch (conf->rlq) { + case DDF_RAID5_N_RESTART: + layout = ALGORITHM_ROTATING_N_RESTART; + break; + case DDF_RAID6_0_RESTART: + layout = ALGORITHM_ROTATING_ZERO_RESTART; + break; + case DDF_RAID5_N_CONTINUE: + layout = ALGORITHM_ROTATING_N_CONTINUE; + break; + default: + return err_bad_ddf_layout(conf); + } + level = 6; + break; + default: + return err_bad_ddf_layout(conf); + }; + +good: + array->level = level; + array->layout = layout; + array->raid_disks = raiddisks; + return 0; } static int load_ddf_header(int fd, unsigned long long lba, @@ -476,28 +759,34 @@ if (read(fd, hdr, 512) != 512) return 0; - if (hdr->magic != DDF_HEADER_MAGIC) + if (!be32_eq(hdr->magic, DDF_HEADER_MAGIC)) { + pr_err("%s: bad header magic\n", __func__); return 0; - if (calc_crc(hdr, 512) != hdr->crc) + } + if (!be32_eq(calc_crc(hdr, 512), hdr->crc)) { + pr_err("%s: bad CRC\n", __func__); return 0; + } if (memcmp(anchor->guid, hdr->guid, 
DDF_GUID_LEN) != 0 || memcmp(anchor->revision, hdr->revision, 8) != 0 || - anchor->primary_lba != hdr->primary_lba || - anchor->secondary_lba != hdr->secondary_lba || + !be64_eq(anchor->primary_lba, hdr->primary_lba) || + !be64_eq(anchor->secondary_lba, hdr->secondary_lba) || hdr->type != type || memcmp(anchor->pad2, hdr->pad2, 512 - - offsetof(struct ddf_header, pad2)) != 0) + offsetof(struct ddf_header, pad2)) != 0) { + pr_err("%s: header mismatch\n", __func__); return 0; + } /* Looks good enough to me... */ return 1; } static void *load_section(int fd, struct ddf_super *super, void *buf, - __u32 offset_be, __u32 len_be, int check) + be32 offset_be, be32 len_be, int check) { - unsigned long long offset = __be32_to_cpu(offset_be); - unsigned long long len = __be32_to_cpu(len_be); + unsigned long long offset = be32_to_cpu(offset_be); + unsigned long long len = be32_to_cpu(len_be); int dofree = (buf == NULL); if (check) @@ -507,20 +796,16 @@ if (len > 1024) return NULL; - if (buf) { - /* All pre-allocated sections are a single block */ - if (len != 1) - return NULL; - } else if (posix_memalign(&buf, 512, len<<9) != 0) + if (!buf && posix_memalign(&buf, 512, len<<9) != 0) buf = NULL; if (!buf) return NULL; if (super->active->type == 1) - offset += __be64_to_cpu(super->active->primary_lba); + offset += be64_to_cpu(super->active->primary_lba); else - offset += __be64_to_cpu(super->active->secondary_lba); + offset += be64_to_cpu(super->active->secondary_lba); if ((unsigned long long)lseek64(fd, offset<<9, 0) != (offset<<9)) { if (dofree) @@ -543,60 +828,63 @@ if (lseek64(fd, dsize-512, 0) < 0) { if (devname) - fprintf(stderr, - Name": Cannot seek to anchor block on %s: %s\n", - devname, strerror(errno)); + pr_err("Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); return 1; } if (read(fd, &super->anchor, 512) != 512) { if (devname) - fprintf(stderr, - Name ": Cannot read anchor block on %s: %s\n", - devname, strerror(errno)); + pr_err("Cannot read 
anchor block on %s: %s\n", + devname, strerror(errno)); return 1; } - if (super->anchor.magic != DDF_HEADER_MAGIC) { + if (!be32_eq(super->anchor.magic, DDF_HEADER_MAGIC)) { if (devname) - fprintf(stderr, Name ": no DDF anchor found on %s\n", + pr_err("no DDF anchor found on %s\n", devname); return 2; } - if (calc_crc(&super->anchor, 512) != super->anchor.crc) { + if (!be32_eq(calc_crc(&super->anchor, 512), super->anchor.crc)) { if (devname) - fprintf(stderr, Name ": bad CRC on anchor on %s\n", + pr_err("bad CRC on anchor on %s\n", devname); return 2; } if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 && memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) { if (devname) - fprintf(stderr, Name ": can only support super revision" + pr_err("can only support super revision" " %.8s and earlier, not %.8s on %s\n", DDF_REVISION_2, super->anchor.revision,devname); return 2; } - if (load_ddf_header(fd, __be64_to_cpu(super->anchor.primary_lba), + super->active = NULL; + if (load_ddf_header(fd, be64_to_cpu(super->anchor.primary_lba), dsize >> 9, 1, &super->primary, &super->anchor) == 0) { if (devname) - fprintf(stderr, - Name ": Failed to load primary DDF header " - "on %s\n", devname); - return 2; - } - super->active = &super->primary; - if (load_ddf_header(fd, __be64_to_cpu(super->anchor.secondary_lba), + pr_err("Failed to load primary DDF header " + "on %s\n", devname); + } else + super->active = &super->primary; + + if (load_ddf_header(fd, be64_to_cpu(super->anchor.secondary_lba), dsize >> 9, 2, &super->secondary, &super->anchor)) { - if ((__be32_to_cpu(super->primary.seq) - < __be32_to_cpu(super->secondary.seq) && - !super->secondary.openflag) - || (__be32_to_cpu(super->primary.seq) - == __be32_to_cpu(super->secondary.seq) && + if (super->active == NULL + || (be32_to_cpu(super->primary.seq) + < be32_to_cpu(super->secondary.seq) && + !super->secondary.openflag) + || (be32_to_cpu(super->primary.seq) + == be32_to_cpu(super->secondary.seq) && 
super->primary.openflag && !super->secondary.openflag) ) super->active = &super->secondary; - } + } else if (devname) + pr_err("Failed to load secondary DDF header on %s\n", + devname); + if (super->active == NULL) + return 2; return 0; } @@ -611,13 +899,13 @@ super->active->phys_section_offset, super->active->phys_section_length, 1); - super->pdsize = __be32_to_cpu(super->active->phys_section_length) * 512; + super->pdsize = be32_to_cpu(super->active->phys_section_length) * 512; super->virt = load_section(fd, super, NULL, super->active->virt_section_offset, super->active->virt_section_length, 1); - super->vdsize = __be32_to_cpu(super->active->virt_section_length) * 512; + super->vdsize = be32_to_cpu(super->active->virt_section_length) * 512; if (!ok || !super->phys || !super->virt) { @@ -630,12 +918,60 @@ super->conflist = NULL; super->dlist = NULL; - super->max_part = __be16_to_cpu(super->active->max_partitions); - super->mppe = __be16_to_cpu(super->active->max_primary_element_entries); - super->conf_rec_len = __be16_to_cpu(super->active->config_record_len); + super->max_part = be16_to_cpu(super->active->max_partitions); + super->mppe = be16_to_cpu(super->active->max_primary_element_entries); + super->conf_rec_len = be16_to_cpu(super->active->config_record_len); + return 0; +} + +#define DDF_UNUSED_BVD 0xff +static int alloc_other_bvds(const struct ddf_super *ddf, struct vcl *vcl) +{ + unsigned int n_vds = vcl->conf.sec_elmnt_count - 1; + unsigned int i, vdsize; + void *p; + if (n_vds == 0) { + vcl->other_bvds = NULL; + return 0; + } + vdsize = ddf->conf_rec_len * 512; + if (posix_memalign(&p, 512, n_vds * + (vdsize + sizeof(struct vd_config *))) != 0) + return -1; + vcl->other_bvds = (struct vd_config **) (p + n_vds * vdsize); + for (i = 0; i < n_vds; i++) { + vcl->other_bvds[i] = p + i * vdsize; + memset(vcl->other_bvds[i], 0, vdsize); + vcl->other_bvds[i]->sec_elmnt_seq = DDF_UNUSED_BVD; + } return 0; } +static void add_other_bvd(struct vcl *vcl, struct 
vd_config *vd, + unsigned int len) +{ + int i; + for (i = 0; i < vcl->conf.sec_elmnt_count-1; i++) + if (vcl->other_bvds[i]->sec_elmnt_seq == vd->sec_elmnt_seq) + break; + + if (i < vcl->conf.sec_elmnt_count-1) { + if (be32_to_cpu(vd->seqnum) <= + be32_to_cpu(vcl->other_bvds[i]->seqnum)) + return; + } else { + for (i = 0; i < vcl->conf.sec_elmnt_count-1; i++) + if (vcl->other_bvds[i]->sec_elmnt_seq == DDF_UNUSED_BVD) + break; + if (i == vcl->conf.sec_elmnt_count-1) { + pr_err("no space for sec level config %u, count is %u\n", + vd->sec_elmnt_seq, vcl->conf.sec_elmnt_count); + return; + } + } + memcpy(vcl->other_bvds[i], vd, len); +} + static int load_ddf_local(int fd, struct ddf_super *super, char *devname, int keep) { @@ -645,14 +981,15 @@ unsigned int i; unsigned int confsec; int vnum; - unsigned int max_virt_disks = __be16_to_cpu(super->active->max_vd_entries); + unsigned int max_virt_disks = be16_to_cpu + (super->active->max_vd_entries); unsigned long long dsize; /* First the local disk info */ if (posix_memalign((void**)&dl, 512, sizeof(*dl) + (super->max_part) * sizeof(dl->vlist[0])) != 0) { - fprintf(stderr, Name ": %s could not allocate disk info buffer\n", + pr_err("%s could not allocate disk info buffer\n", __func__); return 1; } @@ -661,7 +998,7 @@ super->active->data_section_offset, super->active->data_section_length, 0); - dl->devname = devname ? strdup(devname) : NULL; + dl->devname = devname ? xstrdup(devname) : NULL; fstat(fd, &stb); dl->major = major(stb.st_rdev); @@ -672,12 +1009,19 @@ dl->size = 0; if (get_dev_size(fd, devname, &dsize)) dl->size = dsize >> 9; + /* If the disks have different sizes, the LBAs will differ + * between phys disks. + * At this point here, the values in super->active must be valid + * for this phys disk. 
*/ + dl->primary_lba = super->active->primary_lba; + dl->secondary_lba = super->active->secondary_lba; + dl->workspace_lba = super->active->workspace_lba; dl->spare = NULL; for (i = 0 ; i < super->max_part ; i++) dl->vlist[i] = NULL; super->dlist = dl; dl->pdnum = -1; - for (i = 0; i < __be16_to_cpu(super->active->max_pd_entries); i++) + for (i = 0; i < be16_to_cpu(super->active->max_pd_entries); i++) if (memcmp(super->phys->entries[i].guid, dl->disk.guid, DDF_GUID_LEN) == 0) dl->pdnum = i; @@ -688,34 +1032,33 @@ * the conflist */ - conf = load_section(fd, super, NULL, + conf = load_section(fd, super, super->conf, super->active->config_section_offset, super->active->config_section_length, 0); - + super->conf = conf; vnum = 0; for (confsec = 0; - confsec < __be32_to_cpu(super->active->config_section_length); + confsec < be32_to_cpu(super->active->config_section_length); confsec += super->conf_rec_len) { struct vd_config *vd = (struct vd_config *)((char*)conf + confsec*512); struct vcl *vcl; - if (vd->magic == DDF_SPARE_ASSIGN_MAGIC) { + if (be32_eq(vd->magic, DDF_SPARE_ASSIGN_MAGIC)) { if (dl->spare) continue; if (posix_memalign((void**)&dl->spare, 512, super->conf_rec_len*512) != 0) { - fprintf(stderr, Name - ": %s could not allocate spare info buf\n", - __func__); + pr_err("%s could not allocate spare info buf\n", + __func__); return 1; } - + memcpy(dl->spare, vd, super->conf_rec_len*512); continue; } - if (vd->magic != DDF_VD_CONF_MAGIC) + if (!be32_eq(vd->magic, DDF_VD_CONF_MAGIC)) continue; for (vcl = super->conflist; vcl; vcl = vcl->next) { if (memcmp(vcl->conf.guid, @@ -725,27 +1068,35 @@ if (vcl) { dl->vlist[vnum++] = vcl; - if (__be32_to_cpu(vd->seqnum) <= - __be32_to_cpu(vcl->conf.seqnum)) + if (vcl->other_bvds != NULL && + vcl->conf.sec_elmnt_seq != vd->sec_elmnt_seq) { + add_other_bvd(vcl, vd, super->conf_rec_len*512); + continue; + } + if (be32_to_cpu(vd->seqnum) <= + be32_to_cpu(vcl->conf.seqnum)) continue; } else { if (posix_memalign((void**)&vcl, 
512, (super->conf_rec_len*512 + offsetof(struct vcl, conf))) != 0) { - fprintf(stderr, Name - ": %s could not allocate vcl buf\n", - __func__); + pr_err("%s could not allocate vcl buf\n", + __func__); return 1; } vcl->next = super->conflist; vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + vcl->conf.sec_elmnt_count = vd->sec_elmnt_count; + if (alloc_other_bvds(super, vcl) != 0) { + pr_err("%s could not allocate other bvds\n", + __func__); + free(vcl); + return 1; + }; super->conflist = vcl; dl->vlist[vnum++] = vcl; } memcpy(&vcl->conf, vd, super->conf_rec_len*512); - vcl->lba_offset = (__u64*) - &vcl->conf.phys_refnum[super->mppe]; - for (i=0; i < max_virt_disks ; i++) if (memcmp(super->virt->entries[i].guid, vcl->conf.guid, DDF_GUID_LEN)==0) @@ -753,7 +1104,6 @@ if (i < max_virt_disks) vcl->vcnum = i; } - free(conf); return 0; } @@ -775,32 +1125,30 @@ if (get_dev_size(fd, devname, &dsize) == 0) return 1; - if (test_partition(fd)) + if (!st->ignore_hw_compat && test_partition(fd)) /* DDF is not allowed on partitions */ return 1; /* 32M is a lower bound */ if (dsize <= 32*1024*1024) { if (devname) - fprintf(stderr, - Name ": %s is too small for ddf: " - "size is %llu sectors.\n", - devname, dsize>>9); + pr_err("%s is too small for ddf: " + "size is %llu sectors.\n", + devname, dsize>>9); return 1; } if (dsize & 511) { if (devname) - fprintf(stderr, - Name ": %s is an odd size for ddf: " - "size is %llu bytes.\n", - devname, dsize); + pr_err("%s is an odd size for ddf: " + "size is %llu bytes.\n", + devname, dsize); return 1; } free_super_ddf(st); if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) { - fprintf(stderr, Name ": malloc of %zu failed.\n", + pr_err("malloc of %zu failed.\n", sizeof(*super)); return 1; } @@ -818,9 +1166,8 @@ if (rv) { if (devname) - fprintf(stderr, - Name ": Failed to load all information " - "sections on %s\n", devname); + pr_err("Failed to load all information " + "sections on %s\n", devname); free(super); return rv; } @@ 
-829,9 +1176,8 @@ if (rv) { if (devname) - fprintf(stderr, - Name ": Failed to load all information " - "sections on %s\n", devname); + pr_err("Failed to load all information " + "sections on %s\n", devname); free(super); return rv; } @@ -855,11 +1201,18 @@ return; free(ddf->phys); free(ddf->virt); + free(ddf->conf); while (ddf->conflist) { struct vcl *v = ddf->conflist; ddf->conflist = v->next; if (v->block_sizes) free(v->block_sizes); + if (v->other_bvds) + /* + v->other_bvds[0] points to beginning of buffer, + see alloc_other_bvds() + */ + free(v->other_bvds[0]); free(v); } while (ddf->dlist) { @@ -893,9 +1246,7 @@ ) return NULL; - st = malloc(sizeof(*st)); - memset(st, 0, sizeof(*st)); - st->container_dev = NoMdDev; + st = xcalloc(1, sizeof(*st)); st->ss = &super_ddf; st->max_devs = 512; st->minor_version = 0; @@ -903,7 +1254,6 @@ return st; } - #ifndef MDASSEMBLE static mapping_t ddf_state[] = { @@ -956,34 +1306,7 @@ }; #endif -struct num_mapping { - int num1, num2; -}; -static struct num_mapping ddf_level_num[] = { - { DDF_RAID0, 0 }, - { DDF_RAID1, 1 }, - { DDF_RAID3, LEVEL_UNSUPPORTED }, - { DDF_RAID4, 4 }, - { DDF_RAID5, 5 }, - { DDF_RAID1E, LEVEL_UNSUPPORTED }, - { DDF_JBOD, LEVEL_UNSUPPORTED }, - { DDF_CONCAT, LEVEL_LINEAR }, - { DDF_RAID5E, LEVEL_UNSUPPORTED }, - { DDF_RAID5EE, LEVEL_UNSUPPORTED }, - { DDF_RAID6, 6}, - { MAXINT, MAXINT } -}; - -static int map_num1(struct num_mapping *map, int num) -{ - int i; - for (i=0 ; map[i].num1 != MAXINT; i++) - if (map[i].num1 == num) - break; - return map[i].num2; -} - -static int all_ff(char *guid) +static int all_ff(const char *guid) { int i; for (i = 0; i < DDF_GUID_LEN; i++) @@ -992,6 +1315,22 @@ return 1; } +static const char *guid_str(const char *guid) +{ + static char buf[DDF_GUID_LEN*2+1]; + int i; + char *p = buf; + for (i = 0; i < DDF_GUID_LEN; i++) { + unsigned char c = guid[i]; + if (c >= 32 && c < 127) + p += sprintf(p, "%c", c); + else + p += sprintf(p, "%02x", c); + } + *p = '\0'; + return (const 
char *) buf; +} + #ifndef MDASSEMBLE static void print_guid(char *guid, int tstamp) { @@ -1038,19 +1377,20 @@ unsigned int i; struct vd_config *vc = &vcl->conf; - if (calc_crc(vc, crl*512) != vc->crc) + if (!be32_eq(calc_crc(vc, crl*512), vc->crc)) continue; if (memcmp(vc->guid, guid, DDF_GUID_LEN) != 0) continue; /* Ok, we know about this VD, let's give more details */ printf(" Raid Devices[%d] : %d (", n, - __be16_to_cpu(vc->prim_elmnt_count)); - for (i = 0; i < __be16_to_cpu(vc->prim_elmnt_count); i++) { + be16_to_cpu(vc->prim_elmnt_count)); + for (i = 0; i < be16_to_cpu(vc->prim_elmnt_count); i++) { int j; - int cnt = __be16_to_cpu(sb->phys->used_pdes); + int cnt = be16_to_cpu(sb->phys->used_pdes); for (j=0; jphys_refnum[i] == sb->phys->entries[j].refnum) + if (be32_eq(vc->phys_refnum[i], + sb->phys->entries[j].refnum)) break; if (i) printf(" "); if (j < cnt) @@ -1060,8 +1400,8 @@ } printf(")\n"); if (vc->chunk_shift != 255) - printf(" Chunk Size[%d] : %d sectors\n", n, - 1 << vc->chunk_shift); + printf(" Chunk Size[%d] : %d sectors\n", n, + 1 << vc->chunk_shift); printf(" Raid Level[%d] : %s\n", n, map_num(ddf_level, vc->prl)?:"-unknown-"); if (vc->sec_elmnt_count != 1) { @@ -1071,32 +1411,34 @@ map_num(ddf_sec_level, vc->srl) ?: "-unknown-"); } printf(" Device Size[%d] : %llu\n", n, - (unsigned long long)__be64_to_cpu(vc->blocks)/2); + be64_to_cpu(vc->blocks)/2); printf(" Array Size[%d] : %llu\n", n, - (unsigned long long)__be64_to_cpu(vc->array_blocks)/2); + be64_to_cpu(vc->array_blocks)/2); } } static void examine_vds(struct ddf_super *sb) { - int cnt = __be16_to_cpu(sb->virt->populated_vdes); - int i; + int cnt = be16_to_cpu(sb->virt->populated_vdes); + unsigned int i; printf(" Virtual Disks : %d\n", cnt); - for (i=0; ivirt->max_vdes); i++) { struct virtual_entry *ve = &sb->virt->entries[i]; + if (all_ff(ve->guid)) + continue; printf("\n"); printf(" VD GUID[%d] : ", i); print_guid(ve->guid, 1); printf("\n"); - printf(" unit[%d] : %d\n", i, 
__be16_to_cpu(ve->unit)); + printf(" unit[%d] : %d\n", i, be16_to_cpu(ve->unit)); printf(" state[%d] : %s, %s%s\n", i, map_num(ddf_state, ve->state & 7), - (ve->state & 8) ? "Morphing, ": "", - (ve->state & 16)? "Not Consistent" : "Consistent"); + (ve->state & DDF_state_morphing) ? "Morphing, ": "", + (ve->state & DDF_state_inconsistent)? "Not Consistent" : "Consistent"); printf(" init state[%d] : %s\n", i, - map_num(ddf_init_state, ve->init_state&3)); + map_num(ddf_init_state, ve->init_state&DDF_initstate_mask)); printf(" access[%d] : %s\n", i, - map_num(ddf_access, (ve->init_state>>6) & 3)); + map_num(ddf_access, (ve->init_state & DDF_access_mask) >> 6)); printf(" Name[%d] : %.16s\n", i, ve->name); examine_vd(i, sb, ve->guid); } @@ -1105,7 +1447,7 @@ static void examine_pds(struct ddf_super *sb) { - int cnt = __be16_to_cpu(sb->phys->used_pdes); + int cnt = be16_to_cpu(sb->phys->used_pdes); int i; struct dl *dl; printf(" Physical Disks : %d\n", cnt); @@ -1113,17 +1455,17 @@ for (i=0 ; iphys->entries[i]; - int type = __be16_to_cpu(pd->type); - int state = __be16_to_cpu(pd->state); + int type = be16_to_cpu(pd->type); + int state = be16_to_cpu(pd->state); //printf(" PD GUID[%d] : ", i); print_guid(pd->guid, 0); //printf("\n"); printf(" %3d %08x ", i, - __be32_to_cpu(pd->refnum)); - printf("%8lluK ", - (unsigned long long)__be64_to_cpu(pd->config_size)>>1); + be32_to_cpu(pd->refnum)); + printf("%8lluK ", + be64_to_cpu(pd->config_size)>>1); for (dl = sb->dlist; dl ; dl = dl->next) { - if (dl->disk.refnum == pd->refnum) { + if (be32_eq(dl->disk.refnum, pd->refnum)) { char *dv = map_dev(dl->major, dl->minor, 0); if (dv) { printf("%-15s", dv); @@ -1158,14 +1500,15 @@ { struct ddf_super *sb = st->sb; - printf(" Magic : %08x\n", __be32_to_cpu(sb->anchor.magic)); + printf(" Magic : %08x\n", be32_to_cpu(sb->anchor.magic)); printf(" Version : %.8s\n", sb->anchor.revision); printf("Controller GUID : "); print_guid(sb->controller.guid, 0); printf("\n"); printf(" Container GUID : 
"); print_guid(sb->anchor.guid, 1); printf("\n"); - printf(" Seq : %08x\n", __be32_to_cpu(sb->active->seq)); - printf(" Redundant hdr : %s\n", sb->secondary.magic == DDF_HEADER_MAGIC + printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq)); + printf(" Redundant hdr : %s\n", be32_eq(sb->secondary.magic, + DDF_HEADER_MAGIC) ?"yes" : "no"); examine_vds(sb); examine_pds(sb); @@ -1173,8 +1516,40 @@ static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map); +static void uuid_from_ddf_guid(const char *guid, int uuid[4]); static void uuid_from_super_ddf(struct supertype *st, int uuid[4]); +static unsigned int get_vd_num_of_subarray(struct supertype *st) +{ + /* + * Figure out the VD number for this supertype. + * Returns DDF_CONTAINER for the container itself, + * and DDF_NOTFOUND on error. + */ + struct ddf_super *ddf = st->sb; + struct mdinfo *sra; + char *sub, *end; + unsigned int vcnum; + + if (*st->container_devnm == '\0') + return DDF_CONTAINER; + + sra = sysfs_read(-1, st->devnm, GET_VERSION); + if (!sra || sra->array.major_version != -1 || + sra->array.minor_version != -2 || + !is_subarray(sra->text_version)) + return DDF_NOTFOUND; + + sub = strchr(sra->text_version + 1, '/'); + if (sub != NULL) + vcnum = strtoul(sub + 1, &end, 10); + if (sub == NULL || *sub == '\0' || *end != '\0' || + vcnum >= be16_to_cpu(ddf->active->max_vd_entries)) + return DDF_NOTFOUND; + + return vcnum; +} + static void brief_examine_super_ddf(struct supertype *st, int verbose) { /* We just write a generic DDF ARRAY entry @@ -1198,7 +1573,7 @@ getinfo_super_ddf(st, &info, NULL); fname_from_uuid(st, &info, nbuf, ':'); - for (i = 0; i < __be16_to_cpu(ddf->virt->max_vdes); i++) { + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { struct virtual_entry *ve = &ddf->virt->entries[i]; struct vcl vcl; char nbuf1[64]; @@ -1222,8 +1597,71 @@ printf("MD_METADATA=ddf\n"); printf("MD_LEVEL=container\n"); printf("MD_UUID=%s\n", nbuf+5); + printf("MD_DEVICES=%u\n", + 
be16_to_cpu(((struct ddf_super *)st->sb)->phys->used_pdes)); +} + +static int copy_metadata_ddf(struct supertype *st, int from, int to) +{ + void *buf; + unsigned long long dsize, offset; + int bytes; + struct ddf_header *ddf; + int written = 0; + + /* The meta consists of an anchor, a primary, and a secondary. + * This all lives at the end of the device. + * So it is easiest to find the earliest of primary and + * secondary, and copy everything from there. + * + * Anchor is 512 from end It contains primary_lba and secondary_lba + * we choose one of those + */ + + if (posix_memalign(&buf, 4096, 4096) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (lseek64(from, dsize-512, 0) < 0) + goto err; + if (read(from, buf, 512) != 512) + goto err; + ddf = buf; + if (!be32_eq(ddf->magic, DDF_HEADER_MAGIC) || + !be32_eq(calc_crc(ddf, 512), ddf->crc) || + (memcmp(ddf->revision, DDF_REVISION_0, 8) != 0 && + memcmp(ddf->revision, DDF_REVISION_2, 8) != 0)) + goto err; + + offset = dsize - 512; + if ((be64_to_cpu(ddf->primary_lba) << 9) < offset) + offset = be64_to_cpu(ddf->primary_lba) << 9; + if ((be64_to_cpu(ddf->secondary_lba) << 9) < offset) + offset = be64_to_cpu(ddf->secondary_lba) << 9; + + bytes = dsize - offset; + + if (lseek64(from, offset, 0) < 0 || + lseek64(to, offset, 0) < 0) + goto err; + while (written < bytes) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (read(from, buf, n) != n) + goto err; + if (write(to, buf, n) != n) + goto err; + written += n; + } + free(buf); + return 0; +err: + free(buf); + return 1; } - static void detail_super_ddf(struct supertype *st, char *homehost) { @@ -1237,13 +1675,16 @@ static void brief_detail_super_ddf(struct supertype *st) { - /* FIXME I really need to know which array we are detailing. - * Can that be stored in ddf_super?? 
- */ -// struct ddf_super *ddf = st->sb; struct mdinfo info; char nbuf[64]; - getinfo_super_ddf(st, &info, NULL); + struct ddf_super *ddf = st->sb; + unsigned int vcnum = get_vd_num_of_subarray(st); + if (vcnum == DDF_CONTAINER) + uuid_from_super_ddf(st, info.uuid); + else if (vcnum == DDF_NOTFOUND) + return; + else + uuid_from_ddf_guid(ddf->virt->entries[vcnum].guid, info.uuid); fname_from_uuid(st, &info, nbuf,':'); printf(" UUID=%s", nbuf + 5); } @@ -1269,29 +1710,100 @@ } #ifndef MDASSEMBLE -static struct vd_config *find_vdcr(struct ddf_super *ddf, unsigned int inst) +static int find_index_in_bvd(const struct ddf_super *ddf, + const struct vd_config *conf, unsigned int n, + unsigned int *n_bvd) +{ + /* + * Find the index of the n-th valid physical disk in this BVD + */ + unsigned int i, j; + for (i = 0, j = 0; i < ddf->mppe && + j < be16_to_cpu(conf->prim_elmnt_count); i++) { + if (be32_to_cpu(conf->phys_refnum[i]) != 0xffffffff) { + if (n == j) { + *n_bvd = i; + return 1; + } + j++; + } + } + dprintf("%s: couldn't find BVD member %u (total %u)\n", + __func__, n, be16_to_cpu(conf->prim_elmnt_count)); + return 0; +} + +static struct vd_config *find_vdcr(struct ddf_super *ddf, unsigned int inst, + unsigned int n, + unsigned int *n_bvd, struct vcl **vcl) { struct vcl *v; - for (v = ddf->conflist; v; v = v->next) - if (inst == v->vcnum) - return &v->conf; + for (v = ddf->conflist; v; v = v->next) { + unsigned int nsec, ibvd = 0; + struct vd_config *conf; + if (inst != v->vcnum) + continue; + conf = &v->conf; + if (conf->sec_elmnt_count == 1) { + if (find_index_in_bvd(ddf, conf, n, n_bvd)) { + *vcl = v; + return conf; + } else + goto bad; + } + if (v->other_bvds == NULL) { + pr_err("%s: BUG: other_bvds is NULL, nsec=%u\n", + __func__, conf->sec_elmnt_count); + goto bad; + } + nsec = n / be16_to_cpu(conf->prim_elmnt_count); + if (conf->sec_elmnt_seq != nsec) { + for (ibvd = 1; ibvd < conf->sec_elmnt_count; ibvd++) { + if (v->other_bvds[ibvd-1]->sec_elmnt_seq + == 
nsec) + break; + } + if (ibvd == conf->sec_elmnt_count) + goto bad; + conf = v->other_bvds[ibvd-1]; + } + if (!find_index_in_bvd(ddf, conf, + n - nsec*conf->sec_elmnt_count, n_bvd)) + goto bad; + dprintf("%s: found disk %u as member %u in bvd %d of array %u\n" + , __func__, n, *n_bvd, ibvd, inst); + *vcl = v; + return conf; + } +bad: + pr_err("%s: Could't find disk %d in array %u\n", __func__, n, inst); return NULL; } #endif -static int find_phys(struct ddf_super *ddf, __u32 phys_refnum) +static int find_phys(const struct ddf_super *ddf, be32 phys_refnum) { /* Find the entry in phys_disk which has the given refnum * and return it's index */ unsigned int i; - for (i = 0; i < __be16_to_cpu(ddf->phys->max_pdes); i++) - if (ddf->phys->entries[i].refnum == phys_refnum) + for (i = 0; i < be16_to_cpu(ddf->phys->max_pdes); i++) + if (be32_eq(ddf->phys->entries[i].refnum, phys_refnum)) return i; return -1; } +static void uuid_from_ddf_guid(const char *guid, int uuid[4]) +{ + char buf[20]; + struct sha1_ctx ctx; + sha1_init_ctx(&ctx); + sha1_process_bytes(guid, DDF_GUID_LEN, &ctx); + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, 4*4); +} + static void uuid_from_super_ddf(struct supertype *st, int uuid[4]) { /* The uuid returned here is used for: @@ -1302,7 +1814,7 @@ * not the device-set. * uuid to recognise same set when adding a missing device back * to an array. This is a uuid for the device-set. - * + * * For each of these we can make do with a truncated * or hashed uuid rather than the original, as long as * everyone agrees. 
@@ -1316,18 +1828,12 @@ struct ddf_super *ddf = st->sb; struct vcl *vcl = ddf->currentconf; char *guid; - char buf[20]; - struct sha1_ctx ctx; if (vcl) guid = vcl->conf.guid; else guid = ddf->anchor.guid; - - sha1_init_ctx(&ctx); - sha1_process_bytes(guid, DDF_GUID_LEN, &ctx); - sha1_finish_ctx(&ctx, buf); - memcpy(uuid, buf, 4*4); + uuid_from_ddf_guid(guid, uuid); } static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map); @@ -1344,7 +1850,7 @@ } memset(info, 0, sizeof(*info)); - info->array.raid_disks = __be16_to_cpu(ddf->phys->used_pdes); + info->array.raid_disks = be16_to_cpu(ddf->phys->used_pdes); info->array.level = LEVEL_CONTAINER; info->array.layout = 0; info->array.md_minor = -1; @@ -1355,16 +1861,15 @@ info->array.chunk_size = 0; info->container_enough = 1; - info->disk.major = 0; info->disk.minor = 0; if (ddf->dlist) { - info->disk.number = __be32_to_cpu(ddf->dlist->disk.refnum); + info->disk.number = be32_to_cpu(ddf->dlist->disk.refnum); info->disk.raid_disk = find_phys(ddf, ddf->dlist->disk.refnum); - info->data_offset = __be64_to_cpu(ddf->phys-> - entries[info->disk.raid_disk]. - config_size); + info->data_offset = be64_to_cpu(ddf->phys-> + entries[info->disk.raid_disk]. 
+ config_size); info->component_size = ddf->dlist->size - info->data_offset; } else { info->disk.number = -1; @@ -1373,7 +1878,6 @@ } info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); - info->recovery_start = MaxSector; info->reshape_active = 0; info->recovery_blocked = 0; @@ -1390,8 +1894,10 @@ int i; for (i = 0 ; i < map_disks; i++) { if (i < info->array.raid_disks && - (__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Online) && - !(__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Failed)) + (be16_to_cpu(ddf->phys->entries[i].state) + & DDF_Online) && + !(be16_to_cpu(ddf->phys->entries[i].state) + & DDF_Failed)) map[i] = 1; else map[i] = 0; @@ -1399,42 +1905,47 @@ } } -static int rlq_to_layout(int rlq, int prl, int raiddisks); - static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map) { struct ddf_super *ddf = st->sb; struct vcl *vc = ddf->currentconf; int cd = ddf->currentdev; + int n_prim; int j; struct dl *dl; int map_disks = info->array.raid_disks; __u32 *cptr; + struct vd_config *conf; memset(info, 0, sizeof(*info)); - /* FIXME this returns BVD info - what if we want SVD ?? 
*/ - - info->array.raid_disks = __be16_to_cpu(vc->conf.prim_elmnt_count); - info->array.level = map_num1(ddf_level_num, vc->conf.prl); - info->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl, - info->array.raid_disks); + if (layout_ddf2md(&vc->conf, &info->array) == -1) + return; info->array.md_minor = -1; cptr = (__u32 *)(vc->conf.guid + 16); info->array.ctime = DECADE + __be32_to_cpu(*cptr); - info->array.utime = DECADE + __be32_to_cpu(vc->conf.timestamp); + info->array.utime = DECADE + be32_to_cpu(vc->conf.timestamp); info->array.chunk_size = 512 << vc->conf.chunk_shift; info->custom_array_size = 0; + conf = &vc->conf; + n_prim = be16_to_cpu(conf->prim_elmnt_count); + if (conf->sec_elmnt_count > 1 && cd >= n_prim) { + int ibvd = cd / n_prim - 1; + cd %= n_prim; + conf = vc->other_bvds[ibvd]; + } + if (cd >= 0 && (unsigned)cd < ddf->mppe) { - info->data_offset = __be64_to_cpu(vc->lba_offset[cd]); + info->data_offset = + be64_to_cpu(LBA_OFFSET(ddf, conf)[cd]); if (vc->block_sizes) info->component_size = vc->block_sizes[cd]; else - info->component_size = __be64_to_cpu(vc->conf.blocks); + info->component_size = be64_to_cpu(conf->blocks); } for (dl = ddf->dlist; dl ; dl = dl->next) - if (dl->raiddisk == ddf->currentdev) + if (be32_eq(dl->disk.refnum, conf->phys_refnum[cd])) break; info->disk.major = 0; @@ -1443,7 +1954,8 @@ if (dl) { info->disk.major = dl->major; info->disk.minor = dl->minor; - info->disk.raid_disk = dl->raiddisk; + info->disk.raid_disk = cd + conf->sec_elmnt_seq + * be16_to_cpu(conf->prim_elmnt_count); info->disk.number = dl->pdnum; info->disk.state = (1<array.major_version = -1; info->array.minor_version = -2; sprintf(info->text_version, "/%s/%d", - devnum2devname(st->container_dev), + st->container_devnm, info->container_member); - info->safe_mode_delay = 200; + info->safe_mode_delay = DDF_SAFE_MODE_DELAY; memcpy(info->name, ddf->virt->entries[info->container_member].name, 16); info->name[16]=0; @@ -1481,15 +1993,16 @@ map[j] = 0; if (j < 
info->array.raid_disks) { int i = find_phys(ddf, vc->conf.phys_refnum[j]); - if (i >= 0 && - (__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Online) && - !(__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Failed)) + if (i >= 0 && + (be16_to_cpu(ddf->phys->entries[i].state) + & DDF_Online) && + !(be16_to_cpu(ddf->phys->entries[i].state) + & DDF_Failed)) map[i] = 1; } } } - static int update_super_ddf(struct supertype *st, struct mdinfo *info, char *update, char *devname, int verbose, @@ -1561,7 +2074,7 @@ static void make_header_guid(char *guid) { - __u32 stamp; + be32 stamp; /* Create a DDF Header of Virtual Disk GUID */ /* 24 bytes of fiction required. @@ -1570,28 +2083,68 @@ * Remaining 8 random number plus timestamp */ memcpy(guid, T10, sizeof(T10)); - stamp = __cpu_to_be32(0xdeadbeef); + stamp = cpu_to_be32(0xdeadbeef); memcpy(guid+8, &stamp, 4); - stamp = __cpu_to_be32(0); + stamp = cpu_to_be32(0); memcpy(guid+12, &stamp, 4); - stamp = __cpu_to_be32(time(0) - DECADE); + stamp = cpu_to_be32(time(0) - DECADE); memcpy(guid+16, &stamp, 4); - stamp = random32(); + stamp._v32 = random32(); memcpy(guid+20, &stamp, 4); } -static int init_super_ddf_bvd(struct supertype *st, - mdu_array_info_t *info, - unsigned long long size, - char *name, char *homehost, - int *uuid); - -static int init_super_ddf(struct supertype *st, - mdu_array_info_t *info, - unsigned long long size, char *name, char *homehost, - int *uuid) +static unsigned int find_unused_vde(const struct ddf_super *ddf) { - /* This is primarily called by Create when creating a new array. 
+ unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + return i; + } + return DDF_NOTFOUND; +} + +static unsigned int find_vde_by_name(const struct ddf_super *ddf, + const char *name) +{ + unsigned int i; + if (name == NULL) + return DDF_NOTFOUND; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + continue; + if (!strncmp(name, ddf->virt->entries[i].name, + sizeof(ddf->virt->entries[i].name))) + return i; + } + return DDF_NOTFOUND; +} + +#ifndef MDASSEMBLE +static unsigned int find_vde_by_guid(const struct ddf_super *ddf, + const char *guid) +{ + unsigned int i; + if (guid == NULL || all_ff(guid)) + return DDF_NOTFOUND; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) + if (!memcmp(ddf->virt->entries[i].guid, guid, DDF_GUID_LEN)) + return i; + return DDF_NOTFOUND; +} +#endif + +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid, unsigned long long data_offset); + +static int init_super_ddf(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + /* This is primarily called by Create when creating a new array. * We will then get add_to_super called for each component, and then * write_init_super called to write it out to each device. 
* For DDF, Create can create on fresh devices or on a pre-existing @@ -1625,11 +2178,17 @@ struct phys_disk *pd; struct virtual_disk *vd; + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset not supported by DDF\n"); + return 0; + } + if (st->sb) - return init_super_ddf_bvd(st, info, size, name, homehost, uuid); + return init_super_ddf_bvd(st, info, size, name, homehost, uuid, + data_offset); if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { - fprintf(stderr, Name ": %s could not allocate superblock\n", __func__); + pr_err("%s could not allocate superblock\n", __func__); return 0; } memset(ddf, 0, sizeof(*ddf)); @@ -1654,37 +2213,37 @@ make_header_guid(ddf->anchor.guid); memcpy(ddf->anchor.revision, DDF_REVISION_2, 8); - ddf->anchor.seq = __cpu_to_be32(1); - ddf->anchor.timestamp = __cpu_to_be32(time(0) - DECADE); + ddf->anchor.seq = cpu_to_be32(1); + ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE); ddf->anchor.openflag = 0xFF; ddf->anchor.foreignflag = 0; ddf->anchor.enforcegroups = 0; /* Is this best?? */ ddf->anchor.pad0 = 0xff; memset(ddf->anchor.pad1, 0xff, 12); memset(ddf->anchor.header_ext, 0xff, 32); - ddf->anchor.primary_lba = ~(__u64)0; - ddf->anchor.secondary_lba = ~(__u64)0; + ddf->anchor.primary_lba = cpu_to_be64(~(__u64)0); + ddf->anchor.secondary_lba = cpu_to_be64(~(__u64)0); ddf->anchor.type = DDF_HEADER_ANCHOR; memset(ddf->anchor.pad2, 0xff, 3); - ddf->anchor.workspace_len = __cpu_to_be32(32768); /* Must be reserved */ - ddf->anchor.workspace_lba = ~(__u64)0; /* Put this at bottom - of 32M reserved.. */ + ddf->anchor.workspace_len = cpu_to_be32(32768); /* Must be reserved */ + /* Put this at bottom of 32M reserved.. 
*/ + ddf->anchor.workspace_lba = cpu_to_be64(~(__u64)0); max_phys_disks = 1023; /* Should be enough */ - ddf->anchor.max_pd_entries = __cpu_to_be16(max_phys_disks); + ddf->anchor.max_pd_entries = cpu_to_be16(max_phys_disks); max_virt_disks = 255; - ddf->anchor.max_vd_entries = __cpu_to_be16(max_virt_disks); /* ?? */ - ddf->anchor.max_partitions = __cpu_to_be16(64); /* ?? */ + ddf->anchor.max_vd_entries = cpu_to_be16(max_virt_disks); /* ?? */ + ddf->anchor.max_partitions = cpu_to_be16(64); /* ?? */ ddf->max_part = 64; ddf->mppe = 256; ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512; - ddf->anchor.config_record_len = __cpu_to_be16(ddf->conf_rec_len); - ddf->anchor.max_primary_element_entries = __cpu_to_be16(ddf->mppe); + ddf->anchor.config_record_len = cpu_to_be16(ddf->conf_rec_len); + ddf->anchor.max_primary_element_entries = cpu_to_be16(ddf->mppe); memset(ddf->anchor.pad3, 0xff, 54); /* controller sections is one sector long immediately * after the ddf header */ sector = 1; - ddf->anchor.controller_section_offset = __cpu_to_be32(sector); - ddf->anchor.controller_section_length = __cpu_to_be32(1); + ddf->anchor.controller_section_offset = cpu_to_be32(sector); + ddf->anchor.controller_section_length = cpu_to_be32(1); sector += 1; /* phys is 8 sectors after that */ @@ -1695,9 +2254,9 @@ case 2: case 8: case 32: case 128: case 512: break; default: abort(); } - ddf->anchor.phys_section_offset = __cpu_to_be32(sector); + ddf->anchor.phys_section_offset = cpu_to_be32(sector); ddf->anchor.phys_section_length = - __cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */ + cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */ sector += pdsize/512; /* virt is another 32 sectors */ @@ -1708,26 +2267,26 @@ case 2: case 8: case 32: case 128: case 512: break; default: abort(); } - ddf->anchor.virt_section_offset = __cpu_to_be32(sector); + ddf->anchor.virt_section_offset = cpu_to_be32(sector); ddf->anchor.virt_section_length = - __cpu_to_be32(vdsize/512); 
/* max_vd_entries/8 */ + cpu_to_be32(vdsize/512); /* max_vd_entries/8 */ sector += vdsize/512; clen = ddf->conf_rec_len * (ddf->max_part+1); - ddf->anchor.config_section_offset = __cpu_to_be32(sector); - ddf->anchor.config_section_length = __cpu_to_be32(clen); + ddf->anchor.config_section_offset = cpu_to_be32(sector); + ddf->anchor.config_section_length = cpu_to_be32(clen); sector += clen; - ddf->anchor.data_section_offset = __cpu_to_be32(sector); - ddf->anchor.data_section_length = __cpu_to_be32(1); + ddf->anchor.data_section_offset = cpu_to_be32(sector); + ddf->anchor.data_section_length = cpu_to_be32(1); sector += 1; - ddf->anchor.bbm_section_length = __cpu_to_be32(0); - ddf->anchor.bbm_section_offset = __cpu_to_be32(0xFFFFFFFF); - ddf->anchor.diag_space_length = __cpu_to_be32(0); - ddf->anchor.diag_space_offset = __cpu_to_be32(0xFFFFFFFF); - ddf->anchor.vendor_length = __cpu_to_be32(0); - ddf->anchor.vendor_offset = __cpu_to_be32(0xFFFFFFFF); + ddf->anchor.bbm_section_length = cpu_to_be32(0); + ddf->anchor.bbm_section_offset = cpu_to_be32(0xFFFFFFFF); + ddf->anchor.diag_space_length = cpu_to_be32(0); + ddf->anchor.diag_space_offset = cpu_to_be32(0xFFFFFFFF); + ddf->anchor.vendor_length = cpu_to_be32(0); + ddf->anchor.vendor_offset = cpu_to_be32(0xFFFFFFFF); memset(ddf->anchor.pad4, 0xff, 256); @@ -1756,10 +2315,10 @@ for (i = strlen(T10) ; i+hostlen < 24; i++) ddf->controller.guid[i] = ' '; - ddf->controller.type.vendor_id = __cpu_to_be16(0xDEAD); - ddf->controller.type.device_id = __cpu_to_be16(0xBEEF); - ddf->controller.type.sub_vendor_id = 0; - ddf->controller.type.sub_device_id = 0; + ddf->controller.type.vendor_id = cpu_to_be16(0xDEAD); + ddf->controller.type.device_id = cpu_to_be16(0xBEEF); + ddf->controller.type.sub_vendor_id = cpu_to_be16(0); + ddf->controller.type.sub_device_id = cpu_to_be16(0); memcpy(ddf->controller.product_id, "What Is My PID??", 16); memset(ddf->controller.pad, 0xff, 8); memset(ddf->controller.vendor_data, 0xff, 448); @@ -1767,7 
+2326,7 @@ strcpy((char*)ddf->controller.vendor_data, homehost); if (posix_memalign((void**)&pd, 512, pdsize) != 0) { - fprintf(stderr, Name ": %s could not allocate pd\n", __func__); + pr_err("%s could not allocate pd\n", __func__); return 0; } ddf->phys = pd; @@ -1776,27 +2335,29 @@ memset(pd, 0xff, pdsize); memset(pd, 0, sizeof(*pd)); pd->magic = DDF_PHYS_RECORDS_MAGIC; - pd->used_pdes = __cpu_to_be16(0); - pd->max_pdes = __cpu_to_be16(max_phys_disks); + pd->used_pdes = cpu_to_be16(0); + pd->max_pdes = cpu_to_be16(max_phys_disks); memset(pd->pad, 0xff, 52); + for (i = 0; i < max_phys_disks; i++) + memset(pd->entries[i].guid, 0xff, DDF_GUID_LEN); if (posix_memalign((void**)&vd, 512, vdsize) != 0) { - fprintf(stderr, Name ": %s could not allocate vd\n", __func__); + pr_err("%s could not allocate vd\n", __func__); return 0; } ddf->virt = vd; ddf->vdsize = vdsize; memset(vd, 0, vdsize); vd->magic = DDF_VIRT_RECORDS_MAGIC; - vd->populated_vdes = __cpu_to_be16(0); - vd->max_vdes = __cpu_to_be16(max_virt_disks); + vd->populated_vdes = cpu_to_be16(0); + vd->max_vdes = cpu_to_be16(max_virt_disks); memset(vd->pad, 0xff, 52); for (i=0; ientries[i], 0xff, sizeof(struct virtual_entry)); st->sb = ddf; - ddf->updates_pending = 1; + ddf_set_updates_pending(ddf); return 1; } @@ -1805,100 +2366,6 @@ return ffs(chunksize/512)-1; } -static int level_to_prl(int level) -{ - switch (level) { - case LEVEL_LINEAR: return DDF_CONCAT; - case 0: return DDF_RAID0; - case 1: return DDF_RAID1; - case 4: return DDF_RAID4; - case 5: return DDF_RAID5; - case 6: return DDF_RAID6; - default: return -1; - } -} -static int layout_to_rlq(int level, int layout, int raiddisks) -{ - switch(level) { - case 0: - return DDF_RAID0_SIMPLE; - case 1: - switch(raiddisks) { - case 2: return DDF_RAID1_SIMPLE; - case 3: return DDF_RAID1_MULTI; - default: return -1; - } - case 4: - switch(layout) { - case 0: return DDF_RAID4_N; - } - break; - case 5: - switch(layout) { - case ALGORITHM_LEFT_ASYMMETRIC: - return 
DDF_RAID5_N_RESTART; - case ALGORITHM_RIGHT_ASYMMETRIC: - return DDF_RAID5_0_RESTART; - case ALGORITHM_LEFT_SYMMETRIC: - return DDF_RAID5_N_CONTINUE; - case ALGORITHM_RIGHT_SYMMETRIC: - return -1; /* not mentioned in standard */ - } - case 6: - switch(layout) { - case ALGORITHM_ROTATING_N_RESTART: - return DDF_RAID5_N_RESTART; - case ALGORITHM_ROTATING_ZERO_RESTART: - return DDF_RAID6_0_RESTART; - case ALGORITHM_ROTATING_N_CONTINUE: - return DDF_RAID5_N_CONTINUE; - } - } - return -1; -} - -static int rlq_to_layout(int rlq, int prl, int raiddisks) -{ - switch(prl) { - case DDF_RAID0: - return 0; /* hopefully rlq == DDF_RAID0_SIMPLE */ - case DDF_RAID1: - return 0; /* hopefully rlq == SIMPLE or MULTI depending - on raiddisks*/ - case DDF_RAID4: - switch(rlq) { - case DDF_RAID4_N: - return 0; - default: - /* not supported */ - return -1; /* FIXME this isn't checked */ - } - case DDF_RAID5: - switch(rlq) { - case DDF_RAID5_N_RESTART: - return ALGORITHM_LEFT_ASYMMETRIC; - case DDF_RAID5_0_RESTART: - return ALGORITHM_RIGHT_ASYMMETRIC; - case DDF_RAID5_N_CONTINUE: - return ALGORITHM_LEFT_SYMMETRIC; - default: - return -1; - } - case DDF_RAID6: - switch(rlq) { - case DDF_RAID5_N_RESTART: - return ALGORITHM_ROTATING_N_RESTART; - case DDF_RAID6_0_RESTART: - return ALGORITHM_ROTATING_ZERO_RESTART; - case DDF_RAID5_N_CONTINUE: - return ALGORITHM_ROTATING_N_CONTINUE; - default: - return -1; - } - } - return -1; -} - #ifndef MDASSEMBLE struct extent { unsigned long long start, size; @@ -1920,33 +2387,34 @@ * (dnum) of the given ddf. * Return a malloced array of 'struct extent' -FIXME ignore DDF_Legacy devices? + * FIXME ignore DDF_Legacy devices? 
*/ struct extent *rv; int n = 0; - unsigned int i, j; + unsigned int i; + __u16 state = be16_to_cpu(ddf->phys->entries[dl->pdnum].state); - rv = malloc(sizeof(struct extent) * (ddf->max_part + 2)); - if (!rv) + if ((state & (DDF_Online|DDF_Failed|DDF_Missing)) != DDF_Online) return NULL; + rv = xmalloc(sizeof(struct extent) * (ddf->max_part + 2)); + for (i = 0; i < ddf->max_part; i++) { + const struct vd_config *bvd; + unsigned int ibvd; struct vcl *v = dl->vlist[i]; - if (v == NULL) + if (v == NULL || + get_pd_index_from_refnum(v, dl->disk.refnum, ddf->mppe, + &bvd, &ibvd) == DDF_NOTFOUND) continue; - for (j = 0; j < v->conf.prim_elmnt_count; j++) - if (v->conf.phys_refnum[j] == dl->disk.refnum) { - /* This device plays role 'j' in 'v'. */ - rv[n].start = __be64_to_cpu(v->lba_offset[j]); - rv[n].size = __be64_to_cpu(v->conf.blocks); - n++; - break; - } + rv[n].start = be64_to_cpu(LBA_OFFSET(ddf, bvd)[ibvd]); + rv[n].size = be64_to_cpu(bvd->blocks); + n++; } qsort(rv, n, sizeof(*rv), cmp_extent); - rv[n].start = __be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size); + rv[n].start = be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size); rv[n].size = 0; return rv; } @@ -1956,45 +2424,25 @@ mdu_array_info_t *info, unsigned long long size, char *name, char *homehost, - int *uuid) + int *uuid, unsigned long long data_offset) { /* We are creating a BVD inside a pre-existing container. * so st->sb is already set. 
* We need to create a new vd_config and a new virtual_entry */ struct ddf_super *ddf = st->sb; - unsigned int venum; + unsigned int venum, i; struct virtual_entry *ve; struct vcl *vcl; struct vd_config *vc; - if (__be16_to_cpu(ddf->virt->populated_vdes) - >= __be16_to_cpu(ddf->virt->max_vdes)) { - fprintf(stderr, Name": This ddf already has the " - "maximum of %d virtual devices\n", - __be16_to_cpu(ddf->virt->max_vdes)); + if (find_vde_by_name(ddf, name) != DDF_NOTFOUND) { + pr_err("This ddf already has an array called %s\n", name); return 0; } - - if (name) - for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++) - if (!all_ff(ddf->virt->entries[venum].guid)) { - char *n = ddf->virt->entries[venum].name; - - if (strncmp(name, n, 16) == 0) { - fprintf(stderr, Name ": This ddf already" - " has an array called %s\n", - name); - return 0; - } - } - - for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++) - if (all_ff(ddf->virt->entries[venum].guid)) - break; - if (venum == __be16_to_cpu(ddf->virt->max_vdes)) { - fprintf(stderr, Name ": Cannot find spare slot for " - "virtual disk - DDF is corrupt\n"); + venum = find_unused_vde(ddf); + if (venum == DDF_NOTFOUND) { + pr_err("Cannot find spare slot for virtual disk\n"); return 0; } ve = &ddf->virt->entries[venum]; @@ -2003,10 +2451,11 @@ * timestamp, random number */ make_header_guid(ve->guid); - ve->unit = __cpu_to_be16(info->md_minor); + ve->unit = cpu_to_be16(info->md_minor); ve->pad0 = 0xFFFF; - ve->guid_crc = crc32(0, (unsigned char*)ddf->anchor.guid, DDF_GUID_LEN); - ve->type = 0; + ve->guid_crc._v16 = crc32(0, (unsigned char *)ddf->anchor.guid, + DDF_GUID_LEN); + ve->type = cpu_to_be16(0); ve->state = DDF_state_degraded; /* Will be modified as devices are added */ if (info->state & 1) /* clean */ ve->init_state = DDF_init_full; @@ -2018,45 +2467,51 @@ if (name) strncpy(ve->name, name, 16); ddf->virt->populated_vdes = - __cpu_to_be16(__be16_to_cpu(ddf->virt->populated_vdes)+1); + 
cpu_to_be16(be16_to_cpu(ddf->virt->populated_vdes)+1); /* Now create a new vd_config */ if (posix_memalign((void**)&vcl, 512, (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)) != 0) { - fprintf(stderr, Name ": %s could not allocate vd_config\n", __func__); + pr_err("%s could not allocate vd_config\n", __func__); return 0; } - vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe]; vcl->vcnum = venum; vcl->block_sizes = NULL; /* FIXME not for CONCAT */ - vc = &vcl->conf; vc->magic = DDF_VD_CONF_MAGIC; memcpy(vc->guid, ve->guid, DDF_GUID_LEN); - vc->timestamp = __cpu_to_be32(time(0)-DECADE); - vc->seqnum = __cpu_to_be32(1); + vc->timestamp = cpu_to_be32(time(0)-DECADE); + vc->seqnum = cpu_to_be32(1); memset(vc->pad0, 0xff, 24); - vc->prim_elmnt_count = __cpu_to_be16(info->raid_disks); vc->chunk_shift = chunk_to_shift(info->chunk_size); - vc->prl = level_to_prl(info->level); - vc->rlq = layout_to_rlq(info->level, info->layout, info->raid_disks); - vc->sec_elmnt_count = 1; + if (layout_md2ddf(info, vc) == -1 || + be16_to_cpu(vc->prim_elmnt_count) > ddf->mppe) { + pr_err("%s: unsupported RAID level/layout %d/%d with %d disks\n", + __func__, info->level, info->layout, info->raid_disks); + free(vcl); + return 0; + } vc->sec_elmnt_seq = 0; - vc->srl = 0; - vc->blocks = __cpu_to_be64(info->size * 2); - vc->array_blocks = __cpu_to_be64( + if (alloc_other_bvds(ddf, vcl) != 0) { + pr_err("%s could not allocate other bvds\n", + __func__); + free(vcl); + return 0; + } + vc->blocks = cpu_to_be64(info->size * 2); + vc->array_blocks = cpu_to_be64( calc_array_size(info->level, info->raid_disks, info->layout, info->chunk_size, info->size*2)); memset(vc->pad1, 0xff, 8); - vc->spare_refs[0] = 0xffffffff; - vc->spare_refs[1] = 0xffffffff; - vc->spare_refs[2] = 0xffffffff; - vc->spare_refs[3] = 0xffffffff; - vc->spare_refs[4] = 0xffffffff; - vc->spare_refs[5] = 0xffffffff; - vc->spare_refs[6] = 0xffffffff; - vc->spare_refs[7] = 0xffffffff; + vc->spare_refs[0] = 
cpu_to_be32(0xffffffff); + vc->spare_refs[1] = cpu_to_be32(0xffffffff); + vc->spare_refs[2] = cpu_to_be32(0xffffffff); + vc->spare_refs[3] = cpu_to_be32(0xffffffff); + vc->spare_refs[4] = cpu_to_be32(0xffffffff); + vc->spare_refs[5] = cpu_to_be32(0xffffffff); + vc->spare_refs[6] = cpu_to_be32(0xffffffff); + vc->spare_refs[7] = cpu_to_be32(0xffffffff); memset(vc->cache_pol, 0, 8); vc->bg_rate = 0x80; memset(vc->pad2, 0xff, 3); @@ -2071,14 +2526,22 @@ memset(vc->phys_refnum, 0xff, 4*ddf->mppe); memset(vc->phys_refnum+ddf->mppe, 0x00, 8*ddf->mppe); + for (i = 1; i < vc->sec_elmnt_count; i++) { + memcpy(vcl->other_bvds[i-1], vc, ddf->conf_rec_len * 512); + vcl->other_bvds[i-1]->sec_elmnt_seq = i; + } + vcl->next = ddf->conflist; ddf->conflist = vcl; ddf->currentconf = vcl; - ddf->updates_pending = 1; + ddf_set_updates_pending(ddf); return 1; } + #ifndef MDASSEMBLE +static int get_svd_state(const struct ddf_super *, const struct vcl *); + static void add_to_super_ddf_bvd(struct supertype *st, mdu_disk_info_t *dk, int fd, char *devname) { @@ -2095,11 +2558,10 @@ struct dl *dl; struct ddf_super *ddf = st->sb; struct vd_config *vc; - __u64 *lba_offset; - unsigned int working; unsigned int i; unsigned long long blocks, pos, esize; struct extent *ex; + unsigned int raid_disk = dk->raid_disk; if (fd == -1) { for (dl = ddf->dlist; dl ; dl = dl->next) @@ -2115,14 +2577,19 @@ return; vc = &ddf->currentconf->conf; - lba_offset = ddf->currentconf->lba_offset; + if (vc->sec_elmnt_count > 1) { + unsigned int n = be16_to_cpu(vc->prim_elmnt_count); + if (raid_disk >= n) + vc = ddf->currentconf->other_bvds[raid_disk / n - 1]; + raid_disk %= n; + } ex = get_extents(ddf, dl); if (!ex) return; i = 0; pos = 0; - blocks = __be64_to_cpu(vc->blocks); + blocks = be64_to_cpu(vc->blocks); if (ddf->currentconf->block_sizes) blocks = ddf->currentconf->block_sizes[dk->raid_disk]; @@ -2139,8 +2606,8 @@ return; ddf->currentdev = dk->raid_disk; - vc->phys_refnum[dk->raid_disk] = dl->disk.refnum; - 
lba_offset[dk->raid_disk] = __cpu_to_be64(pos); + vc->phys_refnum[raid_disk] = dl->disk.refnum; + LBA_OFFSET(ddf, vc)[raid_disk] = cpu_to_be64(pos); for (i = 0; i < ddf->max_part ; i++) if (dl->vlist[i] == NULL) @@ -2154,38 +2621,38 @@ if (devname) dl->devname = devname; - /* Check how many working raid_disks, and if we can mark - * array as optimal yet - */ - working = 0; - - for (i = 0; i < __be16_to_cpu(vc->prim_elmnt_count); i++) - if (vc->phys_refnum[i] != 0xffffffff) - working++; - - /* Find which virtual_entry */ + /* Check if we can mark array as optimal yet */ i = ddf->currentconf->vcnum; - if (working == __be16_to_cpu(vc->prim_elmnt_count)) - ddf->virt->entries[i].state = - (ddf->virt->entries[i].state & ~DDF_state_mask) - | DDF_state_optimal; - - if (vc->prl == DDF_RAID6 && - working+1 == __be16_to_cpu(vc->prim_elmnt_count)) - ddf->virt->entries[i].state = - (ddf->virt->entries[i].state & ~DDF_state_mask) - | DDF_state_part_optimal; + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & ~DDF_state_mask) + | get_svd_state(ddf, ddf->currentconf); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + dprintf("%s: added disk %d/%08x to VD %d/%s as disk %d\n", + __func__, dl->pdnum, be32_to_cpu(dl->disk.refnum), + ddf->currentconf->vcnum, guid_str(vc->guid), + dk->raid_disk); + ddf_set_updates_pending(ddf); +} - ddf->phys->entries[dl->pdnum].type &= ~__cpu_to_be16(DDF_Global_Spare); - ddf->phys->entries[dl->pdnum].type |= __cpu_to_be16(DDF_Active_in_VD); - ddf->updates_pending = 1; +static unsigned int find_unused_pde(const struct ddf_super *ddf) +{ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->phys->max_pdes); i++) { + if (all_ff(ddf->phys->entries[i].guid)) + return i; + } + return DDF_NOTFOUND; } /* add a device to a container, either while creating it or while * expanding a pre-existing container */ static int 
add_to_super_ddf(struct supertype *st, - mdu_disk_info_t *dk, int fd, char *devname) + mdu_disk_info_t *dk, int fd, char *devname, + unsigned long long data_offset) { struct ddf_super *ddf = st->sb; struct dl *dd; @@ -2206,11 +2673,25 @@ * a phys_disk entry and a more detailed disk_data entry. */ fstat(fd, &stb); + n = find_unused_pde(ddf); + if (n == DDF_NOTFOUND) { + pr_err("%s: No free slot in array, cannot add disk\n", + __func__); + return 1; + } + pde = &ddf->phys->entries[n]; + get_dev_size(fd, NULL, &size); + if (size <= 32*1024*1024) { + pr_err("%s: device size must be at least 32MB\n", + __func__); + return 1; + } + size >>= 9; + if (posix_memalign((void**)&dd, 512, sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part) != 0) { - fprintf(stderr, Name - ": %s could allocate buffer for new disk, aborting\n", - __func__); + pr_err("%s could allocate buffer for new disk, aborting\n", + __func__); return 1; } dd->major = major(stb.st_rdev); @@ -2230,10 +2711,11 @@ do { /* Cannot be bothered finding a CRC of some irrelevant details*/ - dd->disk.refnum = random32(); - for (i = __be16_to_cpu(ddf->active->max_pd_entries); + dd->disk.refnum._v32 = random32(); + for (i = be16_to_cpu(ddf->active->max_pd_entries); i > 0; i--) - if (ddf->phys->entries[i-1].refnum == dd->disk.refnum) + if (be32_eq(ddf->phys->entries[i-1].refnum, + dd->disk.refnum)) break; } while (i > 0); @@ -2245,8 +2727,6 @@ for (i = 0; i < ddf->max_part ; i++) dd->vlist[i] = NULL; - n = __be16_to_cpu(ddf->phys->used_pdes); - pde = &ddf->phys->entries[n]; dd->pdnum = n; if (st->update_tail) { @@ -2254,34 +2734,51 @@ sizeof(struct phys_disk_entry)); struct phys_disk *pd; - pd = malloc(len); + pd = xmalloc(len); pd->magic = DDF_PHYS_RECORDS_MAGIC; - pd->used_pdes = __cpu_to_be16(n); + pd->used_pdes = cpu_to_be16(n); pde = &pd->entries[0]; dd->mdupdate = pd; - } else { - n++; - ddf->phys->used_pdes = __cpu_to_be16(n); - } + } else + ddf->phys->used_pdes = cpu_to_be16( + 1 + 
be16_to_cpu(ddf->phys->used_pdes)); memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN); pde->refnum = dd->disk.refnum; - pde->type = __cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare); - pde->state = __cpu_to_be16(DDF_Online); - get_dev_size(fd, NULL, &size); - /* We are required to reserve 32Meg, and record the size in sectors */ - pde->config_size = __cpu_to_be64( (size - 32*1024*1024) / 512); + pde->type = cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare); + pde->state = cpu_to_be16(DDF_Online); + dd->size = size; + /* + * If there is already a device in dlist, try to reserve the same + * amount of workspace. Otherwise, use 32MB. + * We checked disk size above already. + */ +#define __calc_lba(new, old, lba, mb) do { \ + unsigned long long dif; \ + if ((old) != NULL) \ + dif = (old)->size - be64_to_cpu((old)->lba); \ + else \ + dif = (new)->size; \ + if ((new)->size > dif) \ + (new)->lba = cpu_to_be64((new)->size - dif); \ + else \ + (new)->lba = cpu_to_be64((new)->size - (mb*1024*2)); \ + } while (0) + __calc_lba(dd, ddf->dlist, workspace_lba, 32); + __calc_lba(dd, ddf->dlist, primary_lba, 16); + __calc_lba(dd, ddf->dlist, secondary_lba, 32); + pde->config_size = dd->workspace_lba; + sprintf(pde->path, "%17.17s","Information: nil") ; memset(pde->pad, 0xff, 6); - dd->size = size >> 9; if (st->update_tail) { dd->next = ddf->add_list; ddf->add_list = dd; } else { dd->next = ddf->dlist; ddf->dlist = dd; - ddf->updates_pending = 1; + ddf_set_updates_pending(ddf); } return 0; @@ -2312,14 +2809,15 @@ sizeof(struct phys_disk_entry)); struct phys_disk *pd; - pd = malloc(len); + pd = xmalloc(len); pd->magic = DDF_PHYS_RECORDS_MAGIC; - pd->used_pdes = __cpu_to_be16(dl->pdnum); - pd->entries[0].state = __cpu_to_be16(DDF_Missing); + pd->used_pdes = cpu_to_be16(dl->pdnum); + pd->entries[0].state = cpu_to_be16(DDF_Missing); append_metadata_update(st, pd, len); } return 0; } +#endif /* * This is the write_init_super method for a ddf container. 
It is @@ -2328,116 +2826,173 @@ */ #define NULL_CONF_SZ 4096 -static int __write_init_super_ddf(struct supertype *st) +static int __write_ddf_structure(struct dl *d, struct ddf_super *ddf, __u8 type) { + unsigned long long sector; + struct ddf_header *header; + int fd, i, n_config, conf_size, buf_size; + int ret = 0; + char *conf; - struct ddf_super *ddf = st->sb; - int i; - struct dl *d; - int n_config; - int conf_size; - int attempts = 0; - int successes = 0; - unsigned long long size, sector; - char *null_aligned; + fd = d->fd; - if (posix_memalign((void**)&null_aligned, 4096, NULL_CONF_SZ) != 0) { - return -ENOMEM; + switch (type) { + case DDF_HEADER_PRIMARY: + header = &ddf->primary; + sector = be64_to_cpu(header->primary_lba); + break; + case DDF_HEADER_SECONDARY: + header = &ddf->secondary; + sector = be64_to_cpu(header->secondary_lba); + break; + default: + return 0; } - memset(null_aligned, 0xff, NULL_CONF_SZ); - - /* try to write updated metadata, - * if we catch a failure move on to the next disk - */ - for (d = ddf->dlist; d; d=d->next) { - int fd = d->fd; - if (fd < 0) - continue; - - attempts++; - /* We need to fill in the primary, (secondary) and workspace - * lba's in the headers, set their checksums, - * Also checksum phys, virt.... - * - * Then write everything out, finally the anchor is written. 
- */ - get_dev_size(fd, NULL, &size); - size /= 512; - ddf->anchor.workspace_lba = __cpu_to_be64(size - 32*1024*2); - ddf->anchor.primary_lba = __cpu_to_be64(size - 16*1024*2); - ddf->anchor.seq = __cpu_to_be32(1); - memcpy(&ddf->primary, &ddf->anchor, 512); - memcpy(&ddf->secondary, &ddf->anchor, 512); - - ddf->anchor.openflag = 0xFF; /* 'open' means nothing */ - ddf->anchor.seq = 0xFFFFFFFF; /* no sequencing in anchor */ - ddf->anchor.crc = calc_crc(&ddf->anchor, 512); - - ddf->primary.openflag = 0; - ddf->primary.type = DDF_HEADER_PRIMARY; + header->type = type; + header->openflag = 1; + header->crc = calc_crc(header, 512); + + lseek64(fd, sector<<9, 0); + if (write(fd, header, 512) < 0) + goto out; + + ddf->controller.crc = calc_crc(&ddf->controller, 512); + if (write(fd, &ddf->controller, 512) < 0) + goto out; + + ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize); + if (write(fd, ddf->phys, ddf->pdsize) < 0) + goto out; + ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize); + if (write(fd, ddf->virt, ddf->vdsize) < 0) + goto out; + + /* Now write lots of config records. 
*/ + n_config = ddf->max_part; + conf_size = ddf->conf_rec_len * 512; + conf = ddf->conf; + buf_size = conf_size * (n_config + 1); + if (!conf) { + if (posix_memalign((void**)&conf, 512, buf_size) != 0) + goto out; + ddf->conf = conf; + } + for (i = 0 ; i <= n_config ; i++) { + struct vcl *c; + struct vd_config *vdc = NULL; + if (i == n_config) { + c = (struct vcl *)d->spare; + if (c) + vdc = &c->conf; + } else { + unsigned int dummy; + c = d->vlist[i]; + if (c) + get_pd_index_from_refnum( + c, d->disk.refnum, + ddf->mppe, + (const struct vd_config **)&vdc, + &dummy); + } + if (c) { + dprintf("writing conf record %i on disk %08x for %s/%u\n", + i, be32_to_cpu(d->disk.refnum), + guid_str(vdc->guid), + vdc->sec_elmnt_seq); + vdc->seqnum = header->seq; + vdc->crc = calc_crc(vdc, conf_size); + memcpy(conf + i*conf_size, vdc, conf_size); + } else + memset(conf + i*conf_size, 0xff, conf_size); + } + if (write(fd, conf, buf_size) != buf_size) + goto out; + + d->disk.crc = calc_crc(&d->disk, 512); + if (write(fd, &d->disk, 512) < 0) + goto out; + + ret = 1; +out: + header->openflag = 0; + header->crc = calc_crc(header, 512); + + lseek64(fd, sector<<9, 0); + if (write(fd, header, 512) < 0) + ret = 0; - ddf->secondary.openflag = 0; - ddf->secondary.type = DDF_HEADER_SECONDARY; + return ret; +} - ddf->primary.crc = calc_crc(&ddf->primary, 512); - ddf->secondary.crc = calc_crc(&ddf->secondary, 512); +static int _write_super_to_disk(struct ddf_super *ddf, struct dl *d) +{ + unsigned long long size; + int fd = d->fd; + if (fd < 0) + return 0; - sector = size - 16*1024*2; - lseek64(fd, sector<<9, 0); - if (write(fd, &ddf->primary, 512) < 0) - continue; + /* We need to fill in the primary, (secondary) and workspace + * lba's in the headers, set their checksums, + * Also checksum phys, virt.... + * + * Then write everything out, finally the anchor is written. 
+ */ + get_dev_size(fd, NULL, &size); + size /= 512; + if (be64_to_cpu(d->workspace_lba) != 0ULL) + ddf->anchor.workspace_lba = d->workspace_lba; + else + ddf->anchor.workspace_lba = + cpu_to_be64(size - 32*1024*2); + if (be64_to_cpu(d->primary_lba) != 0ULL) + ddf->anchor.primary_lba = d->primary_lba; + else + ddf->anchor.primary_lba = + cpu_to_be64(size - 16*1024*2); + if (be64_to_cpu(d->secondary_lba) != 0ULL) + ddf->anchor.secondary_lba = d->secondary_lba; + else + ddf->anchor.secondary_lba = + cpu_to_be64(size - 32*1024*2); + ddf->anchor.seq = ddf->active->seq; + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); - ddf->controller.crc = calc_crc(&ddf->controller, 512); - if (write(fd, &ddf->controller, 512) < 0) - continue; + ddf->anchor.openflag = 0xFF; /* 'open' means nothing */ + ddf->anchor.seq = cpu_to_be32(0xFFFFFFFF); /* no sequencing in anchor */ + ddf->anchor.crc = calc_crc(&ddf->anchor, 512); - ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize); + if (!__write_ddf_structure(d, ddf, DDF_HEADER_PRIMARY)) + return 0; - if (write(fd, ddf->phys, ddf->pdsize) < 0) - continue; + if (!__write_ddf_structure(d, ddf, DDF_HEADER_SECONDARY)) + return 0; - ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize); - if (write(fd, ddf->virt, ddf->vdsize) < 0) - continue; + lseek64(fd, (size-1)*512, SEEK_SET); + if (write(fd, &ddf->anchor, 512) < 0) + return 0; - /* Now write lots of config records. 
*/ - n_config = ddf->max_part; - conf_size = ddf->conf_rec_len * 512; - for (i = 0 ; i <= n_config ; i++) { - struct vcl *c = d->vlist[i]; - if (i == n_config) - c = (struct vcl*)d->spare; + return 1; +} - if (c) { - c->conf.crc = calc_crc(&c->conf, conf_size); - if (write(fd, &c->conf, conf_size) < 0) - break; - } else { - unsigned int togo = conf_size; - while (togo > NULL_CONF_SZ) { - if (write(fd, null_aligned, NULL_CONF_SZ) < 0) - break; - togo -= NULL_CONF_SZ; - } - if (write(fd, null_aligned, togo) < 0) - break; - } - } - if (i <= n_config) - continue; - d->disk.crc = calc_crc(&d->disk, 512); - if (write(fd, &d->disk, 512) < 0) - continue; +#ifndef MDASSEMBLE +static int __write_init_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + struct dl *d; + int attempts = 0; + int successes = 0; - /* Maybe do the same for secondary */ + pr_state(ddf, __func__); - lseek64(fd, (size-1)*512, SEEK_SET); - if (write(fd, &ddf->anchor, 512) < 0) - continue; - successes++; + /* try to write updated metadata, + * if we catch a failure move on to the next disk + */ + for (d = ddf->dlist; d; d=d->next) { + attempts++; + successes += _write_super_to_disk(ddf, d); } - free(null_aligned); return attempts != successes; } @@ -2454,7 +3009,8 @@ /* queue the virtual_disk and vd_config as metadata updates */ struct virtual_disk *vd; struct vd_config *vc; - int len; + int len, tlen; + unsigned int i; if (!currentconf) { int len = (sizeof(struct phys_disk) + @@ -2473,31 +3029,37 @@ /* First the virtual disk. 
We have a slightly fake header */ len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry); - vd = malloc(len); + vd = xmalloc(len); *vd = *ddf->virt; vd->entries[0] = ddf->virt->entries[currentconf->vcnum]; - vd->populated_vdes = __cpu_to_be16(currentconf->vcnum); + vd->populated_vdes = cpu_to_be16(currentconf->vcnum); append_metadata_update(st, vd, len); /* Then the vd_config */ len = ddf->conf_rec_len * 512; - vc = malloc(len); + tlen = len * currentconf->conf.sec_elmnt_count; + vc = xmalloc(tlen); memcpy(vc, ¤tconf->conf, len); - append_metadata_update(st, vc, len); + for (i = 1; i < currentconf->conf.sec_elmnt_count; i++) + memcpy((char *)vc + i*len, currentconf->other_bvds[i-1], + len); + append_metadata_update(st, vc, tlen); /* FIXME I need to close the fds! */ return 0; - } else { + } else { struct dl *d; - for (d = ddf->dlist; d; d=d->next) - while (Kill(d->devname, NULL, 0, 1, 1) == 0); + if (!currentconf) + for (d = ddf->dlist; d; d=d->next) + while (Kill(d->devname, NULL, 0, -1, 1) == 0); return __write_init_super_ddf(st); } } #endif -static __u64 avail_size_ddf(struct supertype *st, __u64 devsize) +static __u64 avail_size_ddf(struct supertype *st, __u64 devsize, + unsigned long long data_offset) { /* We must reserve the last 32Meg */ if (devsize <= 32*1024*2) @@ -2521,7 +3083,7 @@ int cnt = 0; for (dl = ddf->dlist; dl ; dl=dl->next) { - dl->raiddisk = -1; + dl->raiddisk = -1; dl->esize = 0; } /* Now find largest extent on each device */ @@ -2554,7 +3116,7 @@ free(e); } if (cnt < raiddisks) { - fprintf(stderr, Name ": not enough devices with space to create array.\n"); + pr_err("not enough devices with space to create array.\n"); return 0; /* No enough free spaces large enough */ } if (size == 0) { @@ -2577,7 +3139,7 @@ } *freesize = size; if (size < 32) { - fprintf(stderr, Name ": not enough spare devices to create array.\n"); + pr_err("not enough spare devices to create array.\n"); return 0; } } @@ -2587,31 +3149,32 @@ for (dl = ddf->dlist ; 
dl && cnt < raiddisks ; dl=dl->next) { if (dl->esize < size) continue; - + dl->raiddisk = cnt; cnt++; } return 1; } - - static int validate_geometry_ddf_container(struct supertype *st, int level, int layout, int raiddisks, int chunk, unsigned long long size, + unsigned long long data_offset, char *dev, unsigned long long *freesize, int verbose); static int validate_geometry_ddf_bvd(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, char *dev, unsigned long long *freesize, int verbose); static int validate_geometry_ddf(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, char *dev, unsigned long long *freesize, int verbose) { @@ -2626,28 +3189,29 @@ * If given BVDs, we make an SVD, changing all the GUIDs in the process. */ - if (chunk && *chunk == UnSet) + if (*chunk == UnSet) *chunk = DEFAULT_CHUNK; - + if (level == -1000000) level = LEVEL_CONTAINER; if (level == LEVEL_CONTAINER) { /* Must be a fresh device to add to a container */ return validate_geometry_ddf_container(st, level, layout, - raiddisks, chunk?*chunk:0, - size, dev, freesize, + raiddisks, *chunk, + size, data_offset, dev, + freesize, verbose); } if (!dev) { - /* Initial sanity check. Exclude illegal levels. */ - int i; - for (i=0; ddf_level_num[i].num1 != MAXINT; i++) - if (ddf_level_num[i].num2 == level) - break; - if (ddf_level_num[i].num1 == MAXINT) { + mdu_array_info_t array = { + .level = level, .layout = layout, + .raid_disks = raiddisks + }; + struct vd_config conf; + if (layout_md2ddf(&array, &conf) == -1) { if (verbose) - fprintf(stderr, Name ": DDF does not support level %d arrays\n", - level); + pr_err("DDF does not support level %d /layout %d arrays with %d disks\n", + level, layout, raiddisks); return 0; } /* Should check layout? etc */ @@ -2660,7 +3224,7 @@ * chosen so that add_to_super/getinfo_super * can return them. 
*/ - return reserve_space(st, raiddisks, size, chunk?*chunk:0, freesize); + return reserve_space(st, raiddisks, size, *chunk, freesize); } return 1; } @@ -2671,7 +3235,8 @@ * Should make a distinction one day. */ return validate_geometry_ddf_bvd(st, level, layout, raiddisks, - chunk, size, dev, freesize, + chunk, size, data_offset, dev, + freesize, verbose); } /* This is the first device for the array. @@ -2683,7 +3248,7 @@ */ fd = open(dev, O_RDONLY|O_EXCL, 0); if (fd >= 0) { - sra = sysfs_read(fd, 0, GET_VERSION); + sra = sysfs_read(fd, NULL, GET_VERSION); close(fd); if (sra && sra->array.major_version == -1 && strcmp(sra->text_version, "ddf") == 0) { @@ -2695,16 +3260,15 @@ } if (verbose) - fprintf(stderr, - Name ": ddf: Cannot create this array " - "on device %s - a container is required.\n", - dev); + pr_err("ddf: Cannot create this array " + "on device %s - a container is required.\n", + dev); return 0; } if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { if (verbose) - fprintf(stderr, Name ": ddf: Cannot open %s: %s\n", - dev, strerror(errno)); + pr_err("ddf: Cannot open %s: %s\n", + dev, strerror(errno)); return 0; } /* Well, it is in use by someone, maybe a 'ddf' container. 
*/ @@ -2712,11 +3276,11 @@ if (cfd < 0) { close(fd); if (verbose) - fprintf(stderr, Name ": ddf: Cannot use %s: %s\n", - dev, strerror(EBUSY)); + pr_err("ddf: Cannot use %s: %s\n", + dev, strerror(EBUSY)); return 0; } - sra = sysfs_read(cfd, 0, GET_VERSION); + sra = sysfs_read(cfd, NULL, GET_VERSION); close(fd); if (sra && sra->array.major_version == -1 && strcmp(sra->text_version, "ddf") == 0) { @@ -2726,10 +3290,11 @@ struct ddf_super *ddf; if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL) == 0) { st->sb = ddf; - st->container_dev = fd2devnum(cfd); + strcpy(st->container_devnm, fd2devnm(cfd)); close(cfd); return validate_geometry_ddf_bvd(st, level, layout, raiddisks, chunk, size, + data_offset, dev, freesize, verbose); } @@ -2744,6 +3309,7 @@ validate_geometry_ddf_container(struct supertype *st, int level, int layout, int raiddisks, int chunk, unsigned long long size, + unsigned long long data_offset, char *dev, unsigned long long *freesize, int verbose) { @@ -2758,8 +3324,8 @@ fd = open(dev, O_RDONLY|O_EXCL, 0); if (fd < 0) { if (verbose) - fprintf(stderr, Name ": ddf: Cannot open %s: %s\n", - dev, strerror(errno)); + pr_err("ddf: Cannot open %s: %s\n", + dev, strerror(errno)); return 0; } if (!get_dev_size(fd, dev, &ldsize)) { @@ -2768,7 +3334,7 @@ } close(fd); - *freesize = avail_size_ddf(st, ldsize >> 9); + *freesize = avail_size_ddf(st, ldsize >> 9, INVALID_SECTORS); if (*freesize == 0) return 0; @@ -2778,6 +3344,7 @@ static int validate_geometry_ddf_bvd(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, char *dev, unsigned long long *freesize, int verbose) { @@ -2791,7 +3358,7 @@ /* ddf/bvd supports lots of things, but not containers */ if (level == LEVEL_CONTAINER) { if (verbose) - fprintf(stderr, Name ": DDF cannot create a container within an container\n"); + pr_err("DDF cannot create a container within an container\n"); return 0; } /* We must have the container info 
already read in. */ @@ -2828,10 +3395,9 @@ } if (dcnt < raiddisks) { if (verbose) - fprintf(stderr, - Name ": ddf: Not enough devices with " - "space for this array (%d < %d)\n", - dcnt, raiddisks); + pr_err("ddf: Not enough devices with " + "space for this array (%d < %d)\n", + dcnt, raiddisks); return 0; } return 1; @@ -2848,22 +3414,22 @@ } if (!dl) { if (verbose) - fprintf(stderr, Name ": ddf: %s is not in the " - "same DDF set\n", - dev); + pr_err("ddf: %s is not in the " + "same DDF set\n", + dev); return 0; } e = get_extents(ddf, dl); maxsize = 0; i = 0; if (e) do { - unsigned long long esize; - esize = e[i].start - pos; - if (esize >= maxsize) - maxsize = esize; - pos = e[i].start + e[i].size; - i++; - } while (e[i-1].size); + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= maxsize) + maxsize = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); *freesize = maxsize; // FIXME here I am @@ -2903,7 +3469,7 @@ rv = load_ddf_headers(dfd, super, NULL); close(dfd); if (rv == 0) { - seq = __be32_to_cpu(super->active->seq); + seq = be32_to_cpu(super->active->seq); if (super->active->openflag) seq--; if (!best || seq > bestseq) { @@ -2943,7 +3509,7 @@ st->minor_version = 0; st->max_devs = 512; } - st->container_dev = fd2devnum(fd); + strcpy(st->container_devnm, fd2devnm(fd)); return 0; } @@ -2955,6 +3521,115 @@ #endif /* MDASSEMBLE */ +static int check_secondary(const struct vcl *vc) +{ + const struct vd_config *conf = &vc->conf; + int i; + + /* The only DDF secondary RAID level md can support is + * RAID 10, if the stripe sizes and Basic volume sizes + * are all equal. + * Other configurations could in theory be supported by exposing + * the BVDs to user space and using device mapper for the secondary + * mapping. So far we don't support that. 
+ */ + + __u64 sec_elements[4] = {0, 0, 0, 0}; +#define __set_sec_seen(n) (sec_elements[(n)>>6] |= (1<<((n)&63))) +#define __was_sec_seen(n) ((sec_elements[(n)>>6] & (1<<((n)&63))) != 0) + + if (vc->other_bvds == NULL) { + pr_err("No BVDs for secondary RAID found\n"); + return -1; + } + if (conf->prl != DDF_RAID1) { + pr_err("Secondary RAID level only supported for mirrored BVD\n"); + return -1; + } + if (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED) { + pr_err("Secondary RAID level %d is unsupported\n", + conf->srl); + return -1; + } + __set_sec_seen(conf->sec_elmnt_seq); + for (i = 0; i < conf->sec_elmnt_count-1; i++) { + const struct vd_config *bvd = vc->other_bvds[i]; + if (bvd->sec_elmnt_seq == DDF_UNUSED_BVD) + continue; + if (bvd->srl != conf->srl) { + pr_err("Inconsistent secondary RAID level across BVDs\n"); + return -1; + } + if (bvd->prl != conf->prl) { + pr_err("Different RAID levels for BVDs are unsupported\n"); + return -1; + } + if (!be16_eq(bvd->prim_elmnt_count, conf->prim_elmnt_count)) { + pr_err("All BVDs must have the same number of primary elements\n"); + return -1; + } + if (bvd->chunk_shift != conf->chunk_shift) { + pr_err("Different strip sizes for BVDs are unsupported\n"); + return -1; + } + if (!be64_eq(bvd->array_blocks, conf->array_blocks)) { + pr_err("Different BVD sizes are unsupported\n"); + return -1; + } + __set_sec_seen(bvd->sec_elmnt_seq); + } + for (i = 0; i < conf->sec_elmnt_count; i++) { + if (!__was_sec_seen(i)) { + pr_err("BVD %d is missing\n", i); + return -1; + } + } + return 0; +} + +static unsigned int get_pd_index_from_refnum(const struct vcl *vc, + be32 refnum, unsigned int nmax, + const struct vd_config **bvd, + unsigned int *idx) +{ + unsigned int i, j, n, sec, cnt; + + cnt = be16_to_cpu(vc->conf.prim_elmnt_count); + sec = (vc->conf.sec_elmnt_count == 1 ? 
0 : vc->conf.sec_elmnt_seq); + + for (i = 0, j = 0 ; i < nmax ; i++) { + /* j counts valid entries for this BVD */ + if (be32_to_cpu(vc->conf.phys_refnum[i]) != 0xffffffff) + j++; + if (be32_eq(vc->conf.phys_refnum[i], refnum)) { + *bvd = &vc->conf; + *idx = i; + return sec * cnt + j - 1; + } + } + if (vc->other_bvds == NULL) + goto bad; + + for (n = 1; n < vc->conf.sec_elmnt_count; n++) { + struct vd_config *vd = vc->other_bvds[n-1]; + sec = vd->sec_elmnt_seq; + if (sec == DDF_UNUSED_BVD) + continue; + for (i = 0, j = 0 ; i < nmax ; i++) { + if (be32_to_cpu(vd->phys_refnum[i]) != 0xffffffff) + j++; + if (be32_eq(vd->phys_refnum[i], refnum)) { + *bvd = vd; + *idx = i; + return sec * cnt + j - 1; + } + } + } +bad: + *bvd = NULL; + return DDF_NOTFOUND; +} + static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray) { /* Given a container loaded by load_super_ddf_all, @@ -2976,29 +3651,32 @@ struct mdinfo *this; char *ep; __u32 *cptr; + unsigned int pd; if (subarray && (strtoul(subarray, &ep, 10) != vc->vcnum || *ep != '\0')) continue; - this = malloc(sizeof(*this)); - memset(this, 0, sizeof(*this)); + if (vc->conf.sec_elmnt_count > 1) { + if (check_secondary(vc) != 0) + continue; + } + + this = xcalloc(1, sizeof(*this)); this->next = rest; rest = this; - this->array.level = map_num1(ddf_level_num, vc->conf.prl); - this->array.raid_disks = - __be16_to_cpu(vc->conf.prim_elmnt_count); - this->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl, - this->array.raid_disks); + if (layout_ddf2md(&vc->conf, &this->array)) + continue; this->array.md_minor = -1; this->array.major_version = -1; this->array.minor_version = -2; + this->safe_mode_delay = DDF_SAFE_MODE_DELAY; cptr = (__u32 *)(vc->conf.guid + 16); this->array.ctime = DECADE + __be32_to_cpu(*cptr); this->array.utime = DECADE + - __be32_to_cpu(vc->conf.timestamp); + be32_to_cpu(vc->conf.timestamp); this->array.chunk_size = 512 << vc->conf.chunk_shift; i = vc->vcnum; @@ -3018,64 +3696,65 @@ 
this->name[j] = 0; memset(this->uuid, 0, sizeof(this->uuid)); - this->component_size = __be64_to_cpu(vc->conf.blocks); + this->component_size = be64_to_cpu(vc->conf.blocks); this->array.size = this->component_size / 2; this->container_member = i; ddf->currentconf = vc; uuid_from_super_ddf(st, this->uuid); - ddf->currentconf = NULL; + if (!subarray) + ddf->currentconf = NULL; sprintf(this->text_version, "/%s/%d", - devnum2devname(st->container_dev), - this->container_member); + st->container_devnm, this->container_member); - for (i = 0 ; i < ddf->mppe ; i++) { + for (pd = 0; pd < be16_to_cpu(ddf->phys->used_pdes); pd++) { struct mdinfo *dev; struct dl *d; + const struct vd_config *bvd; + unsigned int iphys; int stt; - int pd; - if (vc->conf.phys_refnum[i] == 0xFFFFFFFF) + if (be32_to_cpu(ddf->phys->entries[pd].refnum) + == 0xFFFFFFFF) continue; - for (pd = __be16_to_cpu(ddf->phys->used_pdes); - pd--;) - if (ddf->phys->entries[pd].refnum - == vc->conf.phys_refnum[i]) - break; - if (pd < 0) - continue; - - stt = __be16_to_cpu(ddf->phys->entries[pd].state); + stt = be16_to_cpu(ddf->phys->entries[pd].state); if ((stt & (DDF_Online|DDF_Failed|DDF_Rebuilding)) != DDF_Online) continue; + i = get_pd_index_from_refnum( + vc, ddf->phys->entries[pd].refnum, + ddf->mppe, &bvd, &iphys); + if (i == DDF_NOTFOUND) + continue; + this->array.working_disks++; for (d = ddf->dlist; d ; d=d->next) - if (d->disk.refnum == vc->conf.phys_refnum[i]) + if (be32_eq(d->disk.refnum, + ddf->phys->entries[pd].refnum)) break; if (d == NULL) /* Haven't found that one yet, maybe there are others */ continue; - dev = malloc(sizeof(*dev)); - memset(dev, 0, sizeof(*dev)); + dev = xcalloc(1, sizeof(*dev)); dev->next = this->devs; this->devs = dev; - dev->disk.number = __be32_to_cpu(d->disk.refnum); + dev->disk.number = be32_to_cpu(d->disk.refnum); dev->disk.major = d->major; dev->disk.minor = d->minor; dev->disk.raid_disk = i; dev->disk.state = (1<recovery_start = MaxSector; - dev->events = 
__be32_to_cpu(ddf->primary.seq); - dev->data_offset = __be64_to_cpu(vc->lba_offset[i]); - dev->component_size = __be64_to_cpu(vc->conf.blocks); + dev->events = be32_to_cpu(ddf->primary.seq); + dev->data_offset = + be64_to_cpu(LBA_OFFSET(ddf, bvd)[iphys]); + dev->component_size = be64_to_cpu(bvd->blocks); if (d->devname) strcpy(dev->name, d->devname); } @@ -3093,15 +3772,36 @@ if (!ddf) return 1; - /* ->dlist and ->conflist will be set for updates, currently not - * supported - */ - if (ddf->dlist || ddf->conflist) - return 1; - if (!get_dev_size(fd, NULL, &dsize)) return 1; + if (ddf->dlist || ddf->conflist) { + struct stat sta; + struct dl *dl; + int ofd, ret; + + if (fstat(fd, &sta) == -1 || !S_ISBLK(sta.st_mode)) { + pr_err("%s: file descriptor for invalid device\n", + __func__); + return 1; + } + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == (int)major(sta.st_rdev) && + dl->minor == (int)minor(sta.st_rdev)) + break; + if (!dl) { + pr_err("%s: couldn't find disk %d/%d\n", __func__, + (int)major(sta.st_rdev), + (int)minor(sta.st_rdev)); + return 1; + } + ofd = dl->fd; + dl->fd = fd; + ret = (_write_super_to_disk(ddf, dl) != 1); + dl->fd = ofd; + return ret; + } + if (posix_memalign(&buf, 512, 512) != 0) return 1; memset(buf, 0, 512); @@ -3125,6 +3825,9 @@ */ struct ddf_super *first = st->sb; struct ddf_super *second = tst->sb; + struct dl *dl1, *dl2; + struct vcl *vl1, *vl2; + unsigned int max_vds, max_pds, pd, vd; if (!first) { st->sb = tst->sb; @@ -3135,7 +3838,146 @@ if (memcmp(first->anchor.guid, second->anchor.guid, DDF_GUID_LEN) != 0) return 2; + if (!be32_eq(first->anchor.seq, second->anchor.seq)) { + dprintf("%s: sequence number mismatch %u/%u\n", __func__, + be32_to_cpu(first->anchor.seq), + be32_to_cpu(second->anchor.seq)); + return 3; + } + if (first->max_part != second->max_part || + !be16_eq(first->phys->used_pdes, second->phys->used_pdes) || + !be16_eq(first->virt->populated_vdes, + second->virt->populated_vdes)) { + dprintf("%s: PD/VD 
number mismatch\n", __func__); + return 3; + } + + max_pds = be16_to_cpu(first->phys->used_pdes); + for (dl2 = second->dlist; dl2; dl2 = dl2->next) { + for (pd = 0; pd < max_pds; pd++) + if (be32_eq(first->phys->entries[pd].refnum, + dl2->disk.refnum)) + break; + if (pd == max_pds) { + dprintf("%s: no match for disk %08x\n", __func__, + be32_to_cpu(dl2->disk.refnum)); + return 3; + } + } + + max_vds = be16_to_cpu(first->active->max_vd_entries); + for (vl2 = second->conflist; vl2; vl2 = vl2->next) { + if (!be32_eq(vl2->conf.magic, DDF_VD_CONF_MAGIC)) + continue; + for (vd = 0; vd < max_vds; vd++) + if (!memcmp(first->virt->entries[vd].guid, + vl2->conf.guid, DDF_GUID_LEN)) + break; + if (vd == max_vds) { + dprintf("%s: no match for VD config\n", __func__); + return 3; + } + } /* FIXME should I look at anything else? */ + + /* + At this point we are fairly sure that the meta data matches. + But the new disk may contain additional local data. + Add it to the super block. + */ + for (vl2 = second->conflist; vl2; vl2 = vl2->next) { + for (vl1 = first->conflist; vl1; vl1 = vl1->next) + if (!memcmp(vl1->conf.guid, vl2->conf.guid, + DDF_GUID_LEN)) + break; + if (vl1) { + if (vl1->other_bvds != NULL && + vl1->conf.sec_elmnt_seq != + vl2->conf.sec_elmnt_seq) { + dprintf("%s: adding BVD %u\n", __func__, + vl2->conf.sec_elmnt_seq); + add_other_bvd(vl1, &vl2->conf, + first->conf_rec_len*512); + } + continue; + } + + if (posix_memalign((void **)&vl1, 512, + (first->conf_rec_len*512 + + offsetof(struct vcl, conf))) != 0) { + pr_err("%s could not allocate vcl buf\n", + __func__); + return 3; + } + + vl1->next = first->conflist; + vl1->block_sizes = NULL; + memcpy(&vl1->conf, &vl2->conf, first->conf_rec_len*512); + if (alloc_other_bvds(first, vl1) != 0) { + pr_err("%s could not allocate other bvds\n", + __func__); + free(vl1); + return 3; + } + for (vd = 0; vd < max_vds; vd++) + if (!memcmp(first->virt->entries[vd].guid, + vl1->conf.guid, DDF_GUID_LEN)) + break; + vl1->vcnum = vd; 
+ dprintf("%s: added config for VD %u\n", __func__, vl1->vcnum); + first->conflist = vl1; + } + + for (dl2 = second->dlist; dl2; dl2 = dl2->next) { + for (dl1 = first->dlist; dl1; dl1 = dl1->next) + if (be32_eq(dl1->disk.refnum, dl2->disk.refnum)) + break; + if (dl1) + continue; + + if (posix_memalign((void **)&dl1, 512, + sizeof(*dl1) + (first->max_part) * sizeof(dl1->vlist[0])) + != 0) { + pr_err("%s could not allocate disk info buffer\n", + __func__); + return 3; + } + memcpy(dl1, dl2, sizeof(*dl1)); + dl1->mdupdate = NULL; + dl1->next = first->dlist; + dl1->fd = -1; + for (pd = 0; pd < max_pds; pd++) + if (be32_eq(first->phys->entries[pd].refnum, + dl1->disk.refnum)) + break; + dl1->pdnum = pd; + if (dl2->spare) { + if (posix_memalign((void **)&dl1->spare, 512, + first->conf_rec_len*512) != 0) { + pr_err("%s could not allocate spare info buf\n", + __func__); + return 3; + } + memcpy(dl1->spare, dl2->spare, first->conf_rec_len*512); + } + for (vd = 0 ; vd < first->max_part ; vd++) { + if (!dl2->vlist[vd]) { + dl1->vlist[vd] = NULL; + continue; + } + for (vl1 = first->conflist; vl1; vl1 = vl1->next) { + if (!memcmp(vl1->conf.guid, + dl2->vlist[vd]->conf.guid, + DDF_GUID_LEN)) + break; + dl1->vlist[vd] = vl1; + } + } + first->dlist = dl1; + dprintf("%s: added disk %d: %08x\n", __func__, dl1->pdnum, + be32_to_cpu(dl1->disk.refnum)); + } + return 0; } @@ -3148,8 +3990,41 @@ */ static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst) { - dprintf("ddf: open_new %s\n", inst); - a->info.container_member = atoi(inst); + struct ddf_super *ddf = c->sb; + int n = atoi(inst); + struct mdinfo *dev; + struct dl *dl; + static const char faulty[] = "faulty"; + + if (all_ff(ddf->virt->entries[n].guid)) { + pr_err("%s: subarray %d doesn't exist\n", __func__, n); + return -ENODEV; + } + dprintf("%s: new subarray %d, GUID: %s\n", __func__, n, + guid_str(ddf->virt->entries[n].guid)); + for (dev = a->info.devs; dev; dev = dev->next) { + for (dl = ddf->dlist; 
dl; dl = dl->next) + if (dl->major == dev->disk.major && + dl->minor == dev->disk.minor) + break; + if (!dl) { + pr_err("%s: device %d/%d of subarray %d not found in meta data\n", + __func__, dev->disk.major, dev->disk.minor, n); + return -1; + } + if ((be16_to_cpu(ddf->phys->entries[dl->pdnum].state) & + (DDF_Online|DDF_Missing|DDF_Failed)) != DDF_Online) { + pr_err("%s: new subarray %d contains broken device %d/%d (%02x)\n", + __func__, n, dl->major, dl->minor, + be16_to_cpu( + ddf->phys->entries[dl->pdnum].state)); + if (write(dev->state_fd, faulty, sizeof(faulty)-1) != + sizeof(faulty) - 1) + pr_err("Write to state_fd failed\n"); + dev->curr_state = DS_FAULTY; + } + } + a->info.container_member = n; return 0; } @@ -3178,7 +4053,7 @@ else ddf->virt->entries[inst].state |= DDF_state_inconsistent; if (old != ddf->virt->entries[inst].state) - ddf->updates_pending = 1; + ddf_set_updates_pending(ddf); old = ddf->virt->entries[inst].init_state; ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask; @@ -3189,16 +4064,96 @@ else ddf->virt->entries[inst].init_state |= DDF_init_quick; if (old != ddf->virt->entries[inst].init_state) - ddf->updates_pending = 1; + ddf_set_updates_pending(ddf); - dprintf("ddf mark %d %s %llu\n", inst, consistent?"clean":"dirty", + dprintf("ddf mark %d/%s (%d) %s %llu\n", inst, + guid_str(ddf->virt->entries[inst].guid), a->curr_state, + consistent?"clean":"dirty", a->info.resync_start); return consistent; } -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) +static int get_bvd_state(const struct ddf_super *ddf, + const struct vd_config *vc) +{ + unsigned int i, n_bvd, working = 0; + unsigned int n_prim = be16_to_cpu(vc->prim_elmnt_count); + int pd, st, state; + for (i = 0; i < n_prim; i++) { + if (!find_index_in_bvd(ddf, vc, i, &n_bvd)) + continue; + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd < 0) + continue; + st = 
be16_to_cpu(ddf->phys->entries[pd].state); + if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding)) + == DDF_Online) + working++; + } + + state = DDF_state_degraded; + if (working == n_prim) + state = DDF_state_optimal; + else + switch (vc->prl) { + case DDF_RAID0: + case DDF_CONCAT: + case DDF_JBOD: + state = DDF_state_failed; + break; + case DDF_RAID1: + if (working == 0) + state = DDF_state_failed; + else if (working >= 2) + state = DDF_state_part_optimal; + break; + case DDF_RAID4: + case DDF_RAID5: + if (working < n_prim - 1) + state = DDF_state_failed; + break; + case DDF_RAID6: + if (working < n_prim - 2) + state = DDF_state_failed; + else if (working == n_prim - 1) + state = DDF_state_part_optimal; + break; + } + return state; +} + +static int secondary_state(int state, int other, int seclevel) +{ + if (state == DDF_state_optimal && other == DDF_state_optimal) + return DDF_state_optimal; + if (seclevel == DDF_2MIRRORED) { + if (state == DDF_state_optimal || other == DDF_state_optimal) + return DDF_state_part_optimal; + if (state == DDF_state_failed && other == DDF_state_failed) + return DDF_state_failed; + return DDF_state_degraded; + } else { + if (state == DDF_state_failed || other == DDF_state_failed) + return DDF_state_failed; + if (state == DDF_state_degraded || other == DDF_state_degraded) + return DDF_state_degraded; + return DDF_state_part_optimal; + } +} + +static int get_svd_state(const struct ddf_super *ddf, const struct vcl *vcl) +{ + int state = get_bvd_state(ddf, &vcl->conf); + unsigned int i; + for (i = 1; i < vcl->conf.sec_elmnt_count; i++) { + state = secondary_state( + state, + get_bvd_state(ddf, vcl->other_bvds[i-1]), + vcl->conf.srl); + } + return state; +} + /* * The state of each disk is stored in the global phys_disk structure * in phys_disk.entries[n].state. 
@@ -3216,13 +4171,15 @@ static void ddf_set_disk(struct active_array *a, int n, int state) { struct ddf_super *ddf = a->container->sb; - unsigned int inst = a->info.container_member; - struct vd_config *vc = find_vdcr(ddf, inst); - int pd = find_phys(ddf, vc->phys_refnum[n]); - int i, st, working; + unsigned int inst = a->info.container_member, n_bvd; + struct vcl *vcl; + struct vd_config *vc = find_vdcr(ddf, inst, (unsigned int)n, + &n_bvd, &vcl); + int pd; struct mdinfo *mdi; struct dl *dl; + dprintf("%s: %d to %x\n", __func__, n, state); if (vc == NULL) { dprintf("ddf: cannot find instance %d!!\n", inst); return; @@ -3231,8 +4188,11 @@ for (mdi = a->info.devs; mdi; mdi = mdi->next) if (mdi->disk.raid_disk == n) break; - if (!mdi) + if (!mdi) { + pr_err("%s: cannot find raid disk %d\n", + __func__, n); return; + } /* and find the 'dl' entry corresponding to that. */ for (dl = ddf->dlist; dl; dl = dl->next) @@ -3240,80 +4200,59 @@ mdi->disk.major == dl->major && mdi->disk.minor == dl->minor) break; - if (!dl) + if (!dl) { + pr_err("%s: cannot find raid disk %d (%d/%d)\n", + __func__, n, + mdi->disk.major, mdi->disk.minor); return; + } + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); if (pd < 0 || pd != dl->pdnum) { /* disk doesn't currently exist or has changed. * If it is now in_sync, insert it. */ + dprintf("%s: phys disk not found for %d: %d/%d ref %08x\n", + __func__, dl->pdnum, dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum)); + dprintf("%s: array %u disk %u ref %08x pd %d\n", + __func__, inst, n_bvd, + be32_to_cpu(vc->phys_refnum[n_bvd]), pd); if ((state & DS_INSYNC) && ! 
(state & DS_FAULTY)) { - struct vcl *vcl; - pd = dl->pdnum; - vc->phys_refnum[n] = dl->disk.refnum; - vcl = container_of(vc, struct vcl, conf); - vcl->lba_offset[n] = mdi->data_offset; - ddf->phys->entries[pd].type &= - ~__cpu_to_be16(DDF_Global_Spare); - ddf->phys->entries[pd].type |= - __cpu_to_be16(DDF_Active_in_VD); - ddf->updates_pending = 1; + pd = dl->pdnum; /* FIXME: is this really correct ? */ + vc->phys_refnum[n_bvd] = dl->disk.refnum; + LBA_OFFSET(ddf, vc)[n_bvd] = + cpu_to_be64(mdi->data_offset); + be16_clear(ddf->phys->entries[pd].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[pd].type, + cpu_to_be16(DDF_Active_in_VD)); + ddf_set_updates_pending(ddf); } } else { - int old = ddf->phys->entries[pd].state; + be16 old = ddf->phys->entries[pd].state; if (state & DS_FAULTY) - ddf->phys->entries[pd].state |= __cpu_to_be16(DDF_Failed); + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Failed)); if (state & DS_INSYNC) { - ddf->phys->entries[pd].state |= __cpu_to_be16(DDF_Online); - ddf->phys->entries[pd].state &= __cpu_to_be16(~DDF_Rebuilding); + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online)); + be16_clear(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Rebuilding)); } - if (old != ddf->phys->entries[pd].state) - ddf->updates_pending = 1; + if (!be16_eq(old, ddf->phys->entries[pd].state)) + ddf_set_updates_pending(ddf); } - dprintf("ddf: set_disk %d to %x\n", n, state); + dprintf("ddf: set_disk %d (%08x) to %x->%02x\n", n, + be32_to_cpu(dl->disk.refnum), state, + be16_to_cpu(ddf->phys->entries[pd].state)); /* Now we need to check the state of the array and update * virtual_disk.entries[n].state. * It needs to be one of "optimal", "degraded", "failed". * I don't understand 'deleted' or 'missing'. 
*/ - working = 0; - for (i=0; i < a->info.array.raid_disks; i++) { - pd = find_phys(ddf, vc->phys_refnum[i]); - if (pd < 0) - continue; - st = __be16_to_cpu(ddf->phys->entries[pd].state); - if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding)) - == DDF_Online) - working++; - } - state = DDF_state_degraded; - if (working == a->info.array.raid_disks) - state = DDF_state_optimal; - else switch(vc->prl) { - case DDF_RAID0: - case DDF_CONCAT: - case DDF_JBOD: - state = DDF_state_failed; - break; - case DDF_RAID1: - if (working == 0) - state = DDF_state_failed; - else if (working == 2 && state == DDF_state_degraded) - state = DDF_state_part_optimal; - break; - case DDF_RAID4: - case DDF_RAID5: - if (working < a->info.array.raid_disks-1) - state = DDF_state_failed; - break; - case DDF_RAID6: - if (working < a->info.array.raid_disks-2) - state = DDF_state_failed; - else if (working == a->info.array.raid_disks-1) - state = DDF_state_part_optimal; - break; - } + state = get_svd_state(ddf, vcl); if (ddf->virt->entries[inst].state != ((ddf->virt->entries[inst].state & ~DDF_state_mask) @@ -3322,7 +4261,7 @@ ddf->virt->entries[inst].state = (ddf->virt->entries[inst].state & ~DDF_state_mask) | state; - ddf->updates_pending = 1; + ddf_set_updates_pending(ddf); } } @@ -3345,6 +4284,112 @@ dprintf("ddf: sync_metadata\n"); } +static int del_from_conflist(struct vcl **list, const char *guid) +{ + struct vcl **p; + int found = 0; + for (p = list; p && *p; p = &((*p)->next)) + if (!memcmp((*p)->conf.guid, guid, DDF_GUID_LEN)) { + found = 1; + *p = (*p)->next; + } + return found; +} + +static int _kill_subarray_ddf(struct ddf_super *ddf, const char *guid) +{ + struct dl *dl; + unsigned int vdnum, i; + vdnum = find_vde_by_guid(ddf, guid); + if (vdnum == DDF_NOTFOUND) { + pr_err("%s: could not find VD %s\n", __func__, + guid_str(guid)); + return -1; + } + if (del_from_conflist(&ddf->conflist, guid) == 0) { + pr_err("%s: could not find conf %s\n", __func__, + guid_str(guid)); + return -1; + } + 
for (dl = ddf->dlist; dl; dl = dl->next) + for (i = 0; i < ddf->max_part; i++) + if (dl->vlist[i] != NULL && + !memcmp(dl->vlist[i]->conf.guid, guid, + DDF_GUID_LEN)) + dl->vlist[i] = NULL; + memset(ddf->virt->entries[vdnum].guid, 0xff, DDF_GUID_LEN); + dprintf("%s: deleted %s\n", __func__, guid_str(guid)); + return 0; +} + +static int kill_subarray_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + /* + * currentconf is set in container_content_ddf, + * called with subarray arg + */ + struct vcl *victim = ddf->currentconf; + struct vd_config *conf; + ddf->currentconf = NULL; + unsigned int vdnum; + if (!victim) { + pr_err("%s: nothing to kill\n", __func__); + return -1; + } + conf = &victim->conf; + vdnum = find_vde_by_guid(ddf, conf->guid); + if (vdnum == DDF_NOTFOUND) { + pr_err("%s: could not find VD %s\n", __func__, + guid_str(conf->guid)); + return -1; + } + if (st->update_tail) { + struct virtual_disk *vd; + int len = sizeof(struct virtual_disk) + + sizeof(struct virtual_entry); + vd = xmalloc(len); + if (vd == NULL) { + pr_err("%s: failed to allocate %d bytes\n", __func__, + len); + return -1; + } + memset(vd, 0 , len); + vd->magic = DDF_VIRT_RECORDS_MAGIC; + vd->populated_vdes = cpu_to_be16(0); + memcpy(vd->entries[0].guid, conf->guid, DDF_GUID_LEN); + /* we use DDF_state_deleted as marker */ + vd->entries[0].state = DDF_state_deleted; + append_metadata_update(st, vd, len); + } else { + _kill_subarray_ddf(ddf, conf->guid); + ddf_set_updates_pending(ddf); + ddf_sync_metadata(st); + } + return 0; +} + +static void copy_matching_bvd(struct ddf_super *ddf, + struct vd_config *conf, + const struct metadata_update *update) +{ + unsigned int mppe = + be16_to_cpu(ddf->anchor.max_primary_element_entries); + unsigned int len = ddf->conf_rec_len * 512; + char *p; + struct vd_config *vc; + for (p = update->buf; p < update->buf + update->len; p += len) { + vc = (struct vd_config *) p; + if (vc->sec_elmnt_seq == conf->sec_elmnt_seq) { + 
memcpy(conf->phys_refnum, vc->phys_refnum, + mppe * (sizeof(__u32) + sizeof(__u64))); + return; + } + } + pr_err("%s: no match for BVD %d of %s in update\n", __func__, + conf->sec_elmnt_seq, guid_str(conf->guid)); +} + static void ddf_process_update(struct supertype *st, struct metadata_update *update) { @@ -3376,33 +4421,32 @@ * a spare-assignment record. */ struct ddf_super *ddf = st->sb; - __u32 *magic = (__u32*)update->buf; + be32 *magic = (be32 *)update->buf; struct phys_disk *pd; struct virtual_disk *vd; struct vd_config *vc; struct vcl *vcl; struct dl *dl; - unsigned int mppe; unsigned int ent; - unsigned int pdnum, pd2; + unsigned int pdnum, pd2, len; - dprintf("Process update %x\n", *magic); + dprintf("Process update %x\n", be32_to_cpu(*magic)); - switch (*magic) { - case DDF_PHYS_RECORDS_MAGIC: + if (be32_eq(*magic, DDF_PHYS_RECORDS_MAGIC)) { if (update->len != (sizeof(struct phys_disk) + sizeof(struct phys_disk_entry))) return; pd = (struct phys_disk*)update->buf; - ent = __be16_to_cpu(pd->used_pdes); - if (ent >= __be16_to_cpu(ddf->phys->max_pdes)) + ent = be16_to_cpu(pd->used_pdes); + if (ent >= be16_to_cpu(ddf->phys->max_pdes)) return; - if (pd->entries[0].state & __cpu_to_be16(DDF_Missing)) { + if (be16_and(pd->entries[0].state, cpu_to_be16(DDF_Missing))) { struct dl **dlp; /* removing this disk. 
*/ - ddf->phys->entries[ent].state |= __cpu_to_be16(DDF_Missing); + be16_set(ddf->phys->entries[ent].state, + cpu_to_be16(DDF_Missing)); for (dlp = &ddf->dlist; *dlp; dlp = &(*dlp)->next) { struct dl *dl = *dlp; if (dl->pdnum == (signed)ent) { @@ -3415,15 +4459,15 @@ break; } } - ddf->updates_pending = 1; + ddf_set_updates_pending(ddf); return; } if (!all_ff(ddf->phys->entries[ent].guid)) return; ddf->phys->entries[ent] = pd->entries[0]; - ddf->phys->used_pdes = __cpu_to_be16(1 + - __be16_to_cpu(ddf->phys->used_pdes)); - ddf->updates_pending = 1; + ddf->phys->used_pdes = cpu_to_be16 + (1 + be16_to_cpu(ddf->phys->used_pdes)); + ddf_set_updates_pending(ddf); if (ddf->add_list) { struct active_array *a; struct dl *al = ddf->add_list; @@ -3438,123 +4482,173 @@ for (a = st->arrays ; a; a=a->next) a->check_degraded = 1; } - break; - - case DDF_VIRT_RECORDS_MAGIC: + } else if (be32_eq(*magic, DDF_VIRT_RECORDS_MAGIC)) { if (update->len != (sizeof(struct virtual_disk) + sizeof(struct virtual_entry))) return; vd = (struct virtual_disk*)update->buf; - ent = __be16_to_cpu(vd->populated_vdes); - if (ent >= __be16_to_cpu(ddf->virt->max_vdes)) - return; - if (!all_ff(ddf->virt->entries[ent].guid)) - return; - ddf->virt->entries[ent] = vd->entries[0]; - ddf->virt->populated_vdes = __cpu_to_be16(1 + - __be16_to_cpu(ddf->virt->populated_vdes)); - ddf->updates_pending = 1; - break; + if (vd->entries[0].state == DDF_state_deleted) { + if (_kill_subarray_ddf(ddf, vd->entries[0].guid)) + return; + } else { - case DDF_VD_CONF_MAGIC: - dprintf("len %d %d\n", update->len, ddf->conf_rec_len); + ent = find_vde_by_guid(ddf, vd->entries[0].guid); + if (ent != DDF_NOTFOUND) { + dprintf("%s: VD %s exists already in slot %d\n", + __func__, guid_str(vd->entries[0].guid), + ent); + return; + } + ent = find_unused_vde(ddf); + if (ent == DDF_NOTFOUND) + return; + ddf->virt->entries[ent] = vd->entries[0]; + ddf->virt->populated_vdes = + cpu_to_be16( + 1 + be16_to_cpu( + ddf->virt->populated_vdes)); + 
dprintf("%s: added VD %s in slot %d(s=%02x i=%02x)\n", + __func__, guid_str(vd->entries[0].guid), ent, + ddf->virt->entries[ent].state, + ddf->virt->entries[ent].init_state); + } + ddf_set_updates_pending(ddf); + } - mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries); - if ((unsigned)update->len != ddf->conf_rec_len * 512) - return; + else if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) { vc = (struct vd_config*)update->buf; + len = ddf->conf_rec_len * 512; + if ((unsigned int)update->len != len * vc->sec_elmnt_count) { + pr_err("%s: %s: insufficient data (%d) for %u BVDs\n", + __func__, guid_str(vc->guid), update->len, + vc->sec_elmnt_count); + return; + } for (vcl = ddf->conflist; vcl ; vcl = vcl->next) if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0) break; - dprintf("vcl = %p\n", vcl); + dprintf("%s: conf update for %s (%s)\n", __func__, + guid_str(vc->guid), (vcl ? "old" : "new")); if (vcl) { /* An update, just copy the phys_refnum and lba_offset * fields */ - memcpy(vcl->conf.phys_refnum, vc->phys_refnum, - mppe * (sizeof(__u32) + sizeof(__u64))); + unsigned int i; + unsigned int k; + copy_matching_bvd(ddf, &vcl->conf, update); + for (k = 0; k < be16_to_cpu(vc->prim_elmnt_count); k++) + dprintf("BVD %u has %08x at %llu\n", 0, + be32_to_cpu(vcl->conf.phys_refnum[k]), + be64_to_cpu(LBA_OFFSET(ddf, + &vcl->conf)[k])); + for (i = 1; i < vc->sec_elmnt_count; i++) { + copy_matching_bvd(ddf, vcl->other_bvds[i-1], + update); + for (k = 0; k < be16_to_cpu( + vc->prim_elmnt_count); k++) + dprintf("BVD %u has %08x at %llu\n", i, + be32_to_cpu + (vcl->other_bvds[i-1]-> + phys_refnum[k]), + be64_to_cpu + (LBA_OFFSET + (ddf, + vcl->other_bvds[i-1])[k])); + } } else { /* A new VD_CONF */ + unsigned int i; if (!update->space) return; vcl = update->space; update->space = NULL; vcl->next = ddf->conflist; - memcpy(&vcl->conf, vc, update->len); - vcl->lba_offset = (__u64*) - &vcl->conf.phys_refnum[mppe]; - for (ent = 0; - ent < 
__be16_to_cpu(ddf->virt->populated_vdes); - ent++) - if (memcmp(vc->guid, ddf->virt->entries[ent].guid, - DDF_GUID_LEN) == 0) { - vcl->vcnum = ent; - break; - } + memcpy(&vcl->conf, vc, len); + ent = find_vde_by_guid(ddf, vc->guid); + if (ent == DDF_NOTFOUND) + return; + vcl->vcnum = ent; ddf->conflist = vcl; + for (i = 1; i < vc->sec_elmnt_count; i++) + memcpy(vcl->other_bvds[i-1], + update->buf + len * i, len); } /* Set DDF_Transition on all Failed devices - to help * us detect those that are no longer in use */ - for (pdnum = 0; pdnum < __be16_to_cpu(ddf->phys->used_pdes); pdnum++) - if (ddf->phys->entries[pdnum].state - & __be16_to_cpu(DDF_Failed)) - ddf->phys->entries[pdnum].state - |= __be16_to_cpu(DDF_Transition); + for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->used_pdes); + pdnum++) + if (be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Failed))) + be16_set(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Transition)); /* Now make sure vlist is correct for each dl. 
*/ for (dl = ddf->dlist; dl; dl = dl->next) { - unsigned int dn; unsigned int vn = 0; int in_degraded = 0; - for (vcl = ddf->conflist; vcl ; vcl = vcl->next) - for (dn=0; dn < ddf->mppe ; dn++) - if (vcl->conf.phys_refnum[dn] == - dl->disk.refnum) { - int vstate; - dprintf("dev %d has %p at %d\n", - dl->pdnum, vcl, vn); - /* Clear the Transition flag */ - if (ddf->phys->entries[dl->pdnum].state - & __be16_to_cpu(DDF_Failed)) - ddf->phys->entries[dl->pdnum].state &= - ~__be16_to_cpu(DDF_Transition); - - dl->vlist[vn++] = vcl; - vstate = ddf->virt->entries[vcl->vcnum].state - & DDF_state_mask; - if (vstate == DDF_state_degraded || - vstate == DDF_state_part_optimal) - in_degraded = 1; - break; - } + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) { + unsigned int dn, ibvd; + const struct vd_config *conf; + int vstate; + dn = get_pd_index_from_refnum(vcl, + dl->disk.refnum, + ddf->mppe, + &conf, &ibvd); + if (dn == DDF_NOTFOUND) + continue; + dprintf("dev %d/%08x has %s (sec=%u) at %d\n", + dl->pdnum, + be32_to_cpu(dl->disk.refnum), + guid_str(conf->guid), + conf->sec_elmnt_seq, vn); + /* Clear the Transition flag */ + if (be16_and + (ddf->phys->entries[dl->pdnum].state, + cpu_to_be16(DDF_Failed))) + be16_clear(ddf->phys + ->entries[dl->pdnum].state, + cpu_to_be16(DDF_Transition)); + dl->vlist[vn++] = vcl; + vstate = ddf->virt->entries[vcl->vcnum].state + & DDF_state_mask; + if (vstate == DDF_state_degraded || + vstate == DDF_state_part_optimal) + in_degraded = 1; + } while (vn < ddf->max_part) dl->vlist[vn++] = NULL; if (dl->vlist[0]) { - ddf->phys->entries[dl->pdnum].type &= - ~__cpu_to_be16(DDF_Global_Spare); - if (!(ddf->phys->entries[dl->pdnum].type & - __cpu_to_be16(DDF_Active_in_VD))) { - ddf->phys->entries[dl->pdnum].type |= - __cpu_to_be16(DDF_Active_in_VD); - if (in_degraded) - ddf->phys->entries[dl->pdnum].state |= - __cpu_to_be16(DDF_Rebuilding); - } + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + if 
(!be16_and(ddf->phys + ->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD))) { + be16_set(ddf->phys + ->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + if (in_degraded) + be16_set(ddf->phys + ->entries[dl->pdnum] + .state, + cpu_to_be16 + (DDF_Rebuilding)); + } } if (dl->spare) { - ddf->phys->entries[dl->pdnum].type &= - ~__cpu_to_be16(DDF_Global_Spare); - ddf->phys->entries[dl->pdnum].type |= - __cpu_to_be16(DDF_Spare); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare)); } if (!dl->vlist[0] && !dl->spare) { - ddf->phys->entries[dl->pdnum].type |= - __cpu_to_be16(DDF_Global_Spare); - ddf->phys->entries[dl->pdnum].type &= - ~__cpu_to_be16(DDF_Spare | - DDF_Active_in_VD); + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare)); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); } } @@ -3563,32 +4657,40 @@ * Once done, we need to update all dl->pdnum numbers. 
*/ pd2 = 0; - for (pdnum = 0; pdnum < __be16_to_cpu(ddf->phys->used_pdes); pdnum++) - if ((ddf->phys->entries[pdnum].state - & __be16_to_cpu(DDF_Failed)) - && (ddf->phys->entries[pdnum].state - & __be16_to_cpu(DDF_Transition))) - /* skip this one */; - else if (pdnum == pd2) + for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->used_pdes); + pdnum++) { + if (be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Failed)) + && be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Transition))) { + /* skip this one unless in dlist*/ + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->pdnum == (int)pdnum) + break; + if (!dl) + continue; + } + if (pdnum == pd2) pd2++; else { - ddf->phys->entries[pd2] = ddf->phys->entries[pdnum]; + ddf->phys->entries[pd2] = + ddf->phys->entries[pdnum]; for (dl = ddf->dlist; dl; dl = dl->next) if (dl->pdnum == (int)pdnum) dl->pdnum = pd2; pd2++; } - ddf->phys->used_pdes = __cpu_to_be16(pd2); + } + ddf->phys->used_pdes = cpu_to_be16(pd2); while (pd2 < pdnum) { - memset(ddf->phys->entries[pd2].guid, 0xff, DDF_GUID_LEN); + memset(ddf->phys->entries[pd2].guid, 0xff, + DDF_GUID_LEN); pd2++; } - ddf->updates_pending = 1; - break; - case DDF_SPARE_ASSIGN_MAGIC: - default: break; + ddf_set_updates_pending(ddf); } + /* case DDF_SPARE_ASSIGN_MAGIC */ } static void ddf_prepare_update(struct supertype *st, @@ -3599,12 +4701,66 @@ * If a malloc is needed, do it here. 
*/ struct ddf_super *ddf = st->sb; - __u32 *magic = (__u32*)update->buf; - if (*magic == DDF_VD_CONF_MAGIC) + be32 *magic = (be32 *)update->buf; + if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) { + struct vcl *vcl; + struct vd_config *conf = (struct vd_config *) update->buf; if (posix_memalign(&update->space, 512, - offsetof(struct vcl, conf) - + ddf->conf_rec_len * 512) != 0) + offsetof(struct vcl, conf) + + ddf->conf_rec_len * 512) != 0) { + update->space = NULL; + return; + } + vcl = update->space; + vcl->conf.sec_elmnt_count = conf->sec_elmnt_count; + if (alloc_other_bvds(ddf, vcl) != 0) { + free(update->space); update->space = NULL; + } + } +} + +/* + * Check degraded state of a RAID10. + * returns 2 for good, 1 for degraded, 0 for failed, and -1 for error + */ +static int raid10_degraded(struct mdinfo *info) +{ + int n_prim, n_bvds; + int i; + struct mdinfo *d; + char *found; + int ret = -1; + + n_prim = info->array.layout & ~0x100; + n_bvds = info->array.raid_disks / n_prim; + found = xmalloc(n_bvds); + if (found == NULL) + return ret; + memset(found, 0, n_bvds); + for (d = info->devs; d; d = d->next) { + i = d->disk.raid_disk / n_prim; + if (i >= n_bvds) { + pr_err("%s: BUG: invalid raid disk\n", __func__); + goto out; + } + if (d->state_fd > 0) + found[i]++; + } + ret = 2; + for (i = 0; i < n_bvds; i++) + if (!found[i]) { + dprintf("%s: BVD %d/%d failed\n", __func__, i, n_bvds); + ret = 0; + goto out; + } else if (found[i] < n_prim) { + dprintf("%s: BVD %d/%d degraded\n", __func__, i, + n_bvds); + ret = 1; + } +out: + free(found); + return ret; } /* @@ -3634,19 +4790,22 @@ struct metadata_update *mu; struct dl *dl; int i; + unsigned int j; + struct vcl *vcl; struct vd_config *vc; - __u64 *lba; + unsigned int n_bvd; for (d = a->info.devs ; d ; d = d->next) { if ((d->curr_state & DS_FAULTY) && - d->state_fd >= 0) + d->state_fd >= 0) /* wait for Removal to happen */ return NULL; if (d->state_fd >= 0) working ++; } - dprintf("ddf_activate: working=%d (%d) 
level=%d\n", working, a->info.array.raid_disks, + dprintf("%s: working=%d (%d) level=%d\n", __func__, working, + a->info.array.raid_disks, a->info.array.level); if (working == a->info.array.raid_disks) return NULL; /* array not degraded */ @@ -3664,6 +4823,10 @@ if (working < a->info.array.raid_disks - 2) return NULL; /* failed */ break; + case 10: + if (raid10_degraded(&a->info) < 1) + return NULL; + break; default: /* concat or stripe */ return NULL; /* failed */ } @@ -3688,24 +4851,35 @@ int is_dedicated = 0; struct extent *ex; unsigned int j; + be16 state = ddf->phys->entries[dl->pdnum].state; + if (be16_and(state, + cpu_to_be16(DDF_Failed|DDF_Missing)) || + !be16_and(state, + cpu_to_be16(DDF_Online))) + continue; + /* If in this array, skip */ for (d2 = a->info.devs ; d2 ; d2 = d2->next) if (d2->state_fd >= 0 && d2->disk.major == dl->major && d2->disk.minor == dl->minor) { - dprintf("%x:%x already in array\n", dl->major, dl->minor); + dprintf("%x:%x (%08x) already in array\n", + dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum)); break; } if (d2) continue; - if (ddf->phys->entries[dl->pdnum].type & - __cpu_to_be16(DDF_Spare)) { + if (be16_and(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare))) { /* Check spare assign record */ if (dl->spare) { if (dl->spare->type & DDF_spare_dedicated) { /* check spare_ents for guid */ for (j = 0 ; - j < __be16_to_cpu(dl->spare->populated); + j < be16_to_cpu + (dl->spare + ->populated); j++) { if (memcmp(dl->spare->spare_ents[j].guid, ddf->virt->entries[a->info.container_member].guid, @@ -3715,14 +4889,19 @@ } else is_global = 1; } - } else if (ddf->phys->entries[dl->pdnum].type & - __cpu_to_be16(DDF_Global_Spare)) { + } else if (be16_and(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare))) { + is_global = 1; + } else if (!be16_and(ddf->phys + ->entries[dl->pdnum].state, + cpu_to_be16(DDF_Failed))) { + /* we can possibly use some of this */ is_global = 1; } if ( ! 
(is_dedicated || (is_global && global_ok))) { dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor, - is_dedicated, is_global); + is_dedicated, is_global); continue; } @@ -3754,10 +4933,7 @@ } /* Cool, we have a device with some space at pos */ - di = malloc(sizeof(*di)); - if (!di) - continue; - memset(di, 0, sizeof(*di)); + di = xcalloc(1, sizeof(*di)); di->disk.number = i; di->disk.raid_disk = i; di->disk.major = dl->major; @@ -3769,8 +4945,9 @@ di->container_member = dl->pdnum; di->next = rv; rv = di; - dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor, - i, pos); + dprintf("%x:%x (%08x) to be %d at %llu\n", + dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum), i, pos); break; } @@ -3789,35 +4966,51 @@ * Create a metadata_update record to update the * phys_refnum and lba_offset values */ - mu = malloc(sizeof(*mu)); - if (mu && posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0) { + vc = find_vdcr(ddf, a->info.container_member, rv->disk.raid_disk, + &n_bvd, &vcl); + if (vc == NULL) + return NULL; + + mu = xmalloc(sizeof(*mu)); + if (posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0) { free(mu); mu = NULL; } - if (!mu) { - while (rv) { - struct mdinfo *n = rv->next; - free(rv); - rv = n; - } - return NULL; - } - - mu->buf = malloc(ddf->conf_rec_len * 512); - mu->len = ddf->conf_rec_len * 512; + mu->len = ddf->conf_rec_len * 512 * vcl->conf.sec_elmnt_count; + mu->buf = xmalloc(mu->len); mu->space = NULL; mu->space_list = NULL; mu->next = *updates; - vc = find_vdcr(ddf, a->info.container_member); - memcpy(mu->buf, vc, ddf->conf_rec_len * 512); + memcpy(mu->buf, &vcl->conf, ddf->conf_rec_len * 512); + for (j = 1; j < vcl->conf.sec_elmnt_count; j++) + memcpy(mu->buf + j * ddf->conf_rec_len * 512, + vcl->other_bvds[j-1], ddf->conf_rec_len * 512); vc = (struct vd_config*)mu->buf; - lba = (__u64*)&vc->phys_refnum[ddf->mppe]; for (di = rv ; di ; di = di->next) { - vc->phys_refnum[di->disk.raid_disk] = - ddf->phys->entries[dl->pdnum].refnum; - 
lba[di->disk.raid_disk] = di->data_offset; + unsigned int i_sec, i_prim; + i_sec = di->disk.raid_disk + / be16_to_cpu(vcl->conf.prim_elmnt_count); + i_prim = di->disk.raid_disk + % be16_to_cpu(vcl->conf.prim_elmnt_count); + vc = (struct vd_config *)(mu->buf + + i_sec * ddf->conf_rec_len * 512); + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == di->disk.major + && dl->minor == di->disk.minor) + break; + if (!dl) { + pr_err("%s: BUG: can't find disk %d (%d/%d)\n", + __func__, di->disk.raid_disk, + di->disk.major, di->disk.minor); + return NULL; + } + vc->phys_refnum[i_prim] = ddf->phys->entries[dl->pdnum].refnum; + LBA_OFFSET(ddf, vc)[i_prim] = cpu_to_be64(di->data_offset); + dprintf("BVD %u gets %u: %08x at %llu\n", i_sec, i_prim, + be32_to_cpu(vc->phys_refnum[i_prim]), + be64_to_cpu(LBA_OFFSET(ddf, vc)[i_prim])); } *updates = mu; return rv; @@ -3863,6 +5056,8 @@ .add_to_super = add_to_super_ddf, .remove_from_super = remove_from_super_ddf, .load_container = load_container_ddf, + .copy_metadata = copy_metadata_ddf, + .kill_subarray = kill_subarray_ddf, #endif .match_home = match_home_ddf, .uuid_from_super= uuid_from_super_ddf, diff -Nru mdadm-3.2.5/super-gpt.c mdadm-3.3/super-gpt.c --- mdadm-3.2.5/super-gpt.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/super-gpt.c 2013-09-03 04:47:47.000000000 +0000 @@ -77,28 +77,26 @@ free_gpt(st); if (posix_memalign((void**)&super, 4096, 32*512) != 0) { - fprintf(stderr, Name ": %s could not allocate superblock\n", + pr_err("%s could not allocate superblock\n", __func__); return 1; } - ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */ - lseek(fd, 0, 0); if (read(fd, super, sizeof(*super)) != sizeof(*super)) { no_read: if (devname) - fprintf(stderr, Name ": Cannot read partition table on %s\n", + pr_err("Cannot read partition table on %s\n", devname); free(super); return 1; } - + if (super->magic != MBR_SIGNATURE_MAGIC || super->parts[0].part_type != MBR_GPT_PARTITION_TYPE) { not_found: if (devname) - 
fprintf(stderr, Name ": No partition table found on %s\n", + pr_err("No partition table found on %s\n", devname); free(super); return 1; @@ -175,7 +173,7 @@ static struct supertype *match_metadata_desc(char *arg) { - struct supertype *st = malloc(sizeof(*st)); + struct supertype *st = xmalloc(sizeof(*st)); if (!st) return st; @@ -196,10 +194,11 @@ static int validate_geometry(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, char *subdev, unsigned long long *freesize, int verbose) { - fprintf(stderr, Name ": gpt metadata cannot be used this way\n"); + pr_err("gpt metadata cannot be used this way\n"); return 0; } #endif diff -Nru mdadm-3.2.5/super-intel.c mdadm-3.3/super-intel.c --- mdadm-3.2.5/super-intel.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/super-intel.c 2013-09-03 04:47:47.000000000 +0000 @@ -101,7 +101,7 @@ #define CONFIGURED_DISK __cpu_to_le32(0x02) /* Member of some RaidDev */ #define FAILED_DISK __cpu_to_le32(0x04) /* Permanent failure */ __u32 status; /* 0xF0 - 0xF3 */ - __u32 owner_cfg_num; /* which config 0,1,2... owns this disk */ + __u32 owner_cfg_num; /* which config 0,1,2... owns this disk */ __u32 total_blocks_hi; /* 0xF4 - 0xF5 total blocks hi */ #define IMSM_DISK_FILLERS 3 __u32 filler[IMSM_DISK_FILLERS]; /* 0xF5 - 0x107 MPB_DISK_FILLERS for future expansion */ @@ -235,7 +235,6 @@ struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES]; } __attribute__ ((__packed__)); - #ifndef MDASSEMBLE static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" }; #endif @@ -249,7 +248,6 @@ * MIGR_REC_BUF_SIZE <= MIGR_REC_POSITION */ - #define UNIT_SRC_NORMAL 0 /* Source data for curr_migr_unit must * be recovered using srcMap */ #define UNIT_SRC_IN_CP_AREA 1 /* Source data for curr_migr_unit has @@ -295,7 +293,7 @@ struct md_list *next; }; -#define pr_vrb(fmt, arg...) 
(void) (verbose && fprintf(stderr, Name fmt, ##arg)) +#define pr_vrb(fmt, arg...) (void) (verbose && pr_err(fmt, ##arg)) static __u8 migr_type(struct imsm_dev *dev) { @@ -431,9 +429,9 @@ }; struct geo_params { - int dev_id; + char devnm[32]; char *dev_name; - long long size; + unsigned long long size; int level; int layout; int chunksize; @@ -508,7 +506,6 @@ enum imsm_update_type type; }; - static const char *_sys_dev_type[] = { [SYS_DEV_UNKNOWN] = "Unknown", [SYS_DEV_SAS] = "SAS", @@ -525,14 +522,14 @@ static struct intel_hba * alloc_intel_hba(struct sys_dev *device) { - struct intel_hba *result = malloc(sizeof(*result)); - if (result) { - result->type = device->type; - result->path = strdup(device->path); - result->next = NULL; - if (result->path && (result->pci_id = strrchr(result->path, '/')) != NULL) - result->pci_id++; - } + struct intel_hba *result = xmalloc(sizeof(*result)); + + result->type = device->type; + result->path = xstrdup(device->path); + result->next = NULL; + if (result->path && (result->pci_id = strrchr(result->path, '/')) != NULL) + result->pci_id++; + return result; } @@ -558,25 +555,16 @@ if (super->hba == NULL) { super->hba = alloc_intel_hba(device); return 1; - } - - hba = super->hba; - /* Intel metadata allows for all disks attached to the same type HBA. - * Do not sypport odf HBA types mixing - */ - if (device->type != hba->type) + } else + /* IMSM metadata disallows to attach disks to multiple + * controllers. 
+ */ return 2; - - while (hba->next) - hba = hba->next; - - hba->next = alloc_intel_hba(device); - return 1; } static struct sys_dev* find_disk_attached_hba(int fd, const char *devname) { - struct sys_dev *list, *elem, *prev; + struct sys_dev *list, *elem; char *disk_path; if ((list = find_intel_devices()) == NULL) @@ -587,32 +575,19 @@ else disk_path = diskfd_to_devpath(fd); - if (!disk_path) { - free_sys_dev(&list); + if (!disk_path) return 0; - } - for (prev = NULL, elem = list; elem; prev = elem, elem = elem->next) { - if (path_attached_to_hba(disk_path, elem->path)) { - if (prev == NULL) - list = list->next; - else - prev->next = elem->next; - elem->next = NULL; - if (disk_path != devname) - free(disk_path); - free_sys_dev(&list); + for (elem = list; elem; elem = elem->next) + if (path_attached_to_hba(disk_path, elem->path)) return elem; - } - } + if (disk_path != devname) free(disk_path); - free_sys_dev(&list); return NULL; } - static int find_intel_hba_capability(int fd, struct intel_super *super, char *devname); @@ -625,11 +600,7 @@ ) return NULL; - st = malloc(sizeof(*st)); - if (!st) - return NULL; - memset(st, 0, sizeof(*st)); - st->container_dev = NoMdDev; + st = xcalloc(1, sizeof(*st)); st->ss = &super_imsm; st->max_devs = IMSM_MAX_DEVICES; st->minor_version = 0; @@ -688,12 +659,12 @@ __u32 *p = (__u32 *) mpb; __u32 sum = 0; - while (end--) { - sum += __le32_to_cpu(*p); + while (end--) { + sum += __le32_to_cpu(*p); p++; } - return sum - __le32_to_cpu(mpb->check_sum); + return sum - __le32_to_cpu(mpb->check_sum); } static size_t sizeof_imsm_map(struct imsm_map *map) @@ -964,9 +935,7 @@ else reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; - rv = malloc(sizeof(struct extent) * (memberships + 1)); - if (!rv) - return NULL; + rv = xcalloc(sizeof(struct extent), (memberships + 1)); e = rv; for (i = 0; i < super->anchor->num_raid_devs; i++) { @@ -981,7 +950,7 @@ } qsort(rv, memberships, sizeof(*rv), cmp_extent); - /* determine the start of the metadata 
+ /* determine the start of the metadata * when no raid devices are defined use the default * ...otherwise allow the metadata to truncate the value * as is the case with older versions of imsm @@ -1325,12 +1294,12 @@ /******************************************************************************* * function: imsm_check_attributes * Description: Function checks if features represented by attributes flags - * are supported by mdadm. + * are supported by mdadm. * Parameters: * attributes - Attributes read from metadata * Returns: - * 0 - passed attributes contains unsupported features flags - * 1 - all features are supported + * 0 - passed attributes contains unsupported features flags + * 1 - all features are supported ******************************************************************************/ static int imsm_check_attributes(__u32 attributes) { @@ -1341,7 +1310,7 @@ not_supported &= attributes; if (not_supported) { - fprintf(stderr, Name "(IMSM): Unsupported attributes : %x\n", + pr_err("(IMSM): Unsupported attributes : %x\n", (unsigned)__le32_to_cpu(not_supported)); if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) { dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY \n"); @@ -1537,6 +1506,59 @@ printf("MD_DEVICES=%u\n", mpb->num_disks); } +static int copy_metadata_imsm(struct supertype *st, int from, int to) +{ + /* The second last 512byte sector of the device contains + * the "struct imsm_super" metadata. + * This contains mpb_size which is the size in bytes of the + * extended metadata. This is located immediately before + * the imsm_super. + * We want to read all that, plus the last sector which + * may contain a migration record, and write it all + * to the target. 
+ */ + void *buf; + unsigned long long dsize, offset; + int sectors; + struct imsm_super *sb; + int written = 0; + + if (posix_memalign(&buf, 4096, 4096) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (lseek64(from, dsize-1024, 0) < 0) + goto err; + if (read(from, buf, 512) != 512) + goto err; + sb = buf; + if (strncmp((char*)sb->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) + goto err; + + sectors = mpb_sectors(sb) + 2; + offset = dsize - sectors * 512; + if (lseek64(from, offset, 0) < 0 || + lseek64(to, offset, 0) < 0) + goto err; + while (written < sectors * 512) { + int n = sectors*512 - written; + if (n > 4096) + n = 4096; + if (read(from, buf, n) != n) + goto err; + if (write(to, buf, n) != n) + goto err; + written += n; + } + free(buf); + return 0; +err: + free(buf); + return 1; +} + static void detail_super_imsm(struct supertype *st, char *homehost) { struct mdinfo info; @@ -1572,8 +1594,8 @@ unsigned long port_mask = (1 << port_count) - 1; if (port_count > (int)sizeof(port_mask) * 8) { - if (verbose) - fprintf(stderr, Name ": port_count %d out of range\n", port_count); + if (verbose > 0) + pr_err("port_count %d out of range\n", port_count); return 2; } @@ -1605,15 +1627,15 @@ /* retrieve the scsi device type */ if (asprintf(&device, "/sys/dev/block/%d:%d/device/xxxxxxx", major, minor) < 0) { - if (verbose) - fprintf(stderr, Name ": failed to allocate 'device'\n"); + if (verbose > 0) + pr_err("failed to allocate 'device'\n"); err = 2; break; } sprintf(device, "/sys/dev/block/%d:%d/device/type", major, minor); if (load_sys(device, buf) != 0) { - if (verbose) - fprintf(stderr, Name ": failed to read device type for %s\n", + if (verbose > 0) + pr_err("failed to read device type for %s\n", path); err = 2; free(device); @@ -1665,8 +1687,8 @@ /* chop device path to 'host%d' and calculate the port number */ c = strchr(&path[hba_len], '/'); if (!c) { - if (verbose) - fprintf(stderr, Name ": %s - invalid path name\n", path + hba_len); + if 
(verbose > 0) + pr_err("%s - invalid path name\n", path + hba_len); err = 2; break; } @@ -1674,9 +1696,9 @@ if (sscanf(&path[hba_len], "host%d", &port) == 1) port -= host_base; else { - if (verbose) { + if (verbose > 0) { *c = '/'; /* repair the full string */ - fprintf(stderr, Name ": failed to determine port number for %s\n", + pr_err("failed to determine port number for %s\n", path); } err = 2; @@ -1725,7 +1747,7 @@ static void print_found_intel_controllers(struct sys_dev *elem) { for (; elem; elem = elem->next) { - fprintf(stderr, Name ": found Intel(R) "); + pr_err("found Intel(R) "); if (elem->type == SYS_DEV_SATA) fprintf(stderr, "SATA "); else if (elem->type == SYS_DEV_SAS) @@ -1803,7 +1825,42 @@ return; } -static int detail_platform_imsm(int verbose, int enumerate_only) +static void print_imsm_capability_export(const struct imsm_orom *orom) +{ + printf("MD_FIRMWARE_TYPE=imsm\n"); + printf("IMSM_VERSION=%d.%d.%d.%d\n",orom->major_ver, orom->minor_ver, + orom->hotfix_ver, orom->build); + printf("IMSM_SUPPORTED_RAID_LEVELS=%s%s%s%s%s\n", + imsm_orom_has_raid0(orom) ? "raid0 " : "", + imsm_orom_has_raid1(orom) ? "raid1 " : "", + imsm_orom_has_raid1e(orom) ? "raid1e " : "", + imsm_orom_has_raid5(orom) ? "raid10 " : "", + imsm_orom_has_raid10(orom) ? "raid5 " : ""); + printf("IMSM_SUPPORTED_CHUNK_SIZES=%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + imsm_orom_has_chunk(orom, 2) ? "2k " : "", + imsm_orom_has_chunk(orom, 4) ? "4k " : "", + imsm_orom_has_chunk(orom, 8) ? "8k " : "", + imsm_orom_has_chunk(orom, 16) ? "16k " : "", + imsm_orom_has_chunk(orom, 32) ? "32k " : "", + imsm_orom_has_chunk(orom, 64) ? "64k " : "", + imsm_orom_has_chunk(orom, 128) ? "128k " : "", + imsm_orom_has_chunk(orom, 256) ? "256k " : "", + imsm_orom_has_chunk(orom, 512) ? "512k " : "", + imsm_orom_has_chunk(orom, 1024*1) ? "1M " : "", + imsm_orom_has_chunk(orom, 1024*2) ? "2M " : "", + imsm_orom_has_chunk(orom, 1024*4) ? "4M " : "", + imsm_orom_has_chunk(orom, 1024*8) ? 
"8M " : "", + imsm_orom_has_chunk(orom, 1024*16) ? "16M " : "", + imsm_orom_has_chunk(orom, 1024*32) ? "32M " : "", + imsm_orom_has_chunk(orom, 1024*64) ? "64M " : ""); + printf("IMSM_2TB_VOLUMES=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB) ? "yes" : "no"); + printf("IMSM_2TB_DISKS=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB_DISK) ? "yes" : "no"); + printf("IMSM_MAX_DISKS=%d\n",orom->tds); + printf("IMSM_MAX_VOLUMES_PER_ARRAY=%d\n",orom->vpa); + printf("IMSM_MAX_VOLUMES_PER_CONTROLLER=%d\n",orom->vphba); +} + +static int detail_platform_imsm(int verbose, int enumerate_only, char *controller_path) { /* There are two components to imsm platform support, the ahci SATA * controller and the option-rom. To find the SATA controller we @@ -1820,7 +1877,7 @@ struct sys_dev *list, *hba; int host_base = 0; int port_count = 0; - int result=0; + int result=1; if (enumerate_only) { if (check_env("IMSM_NO_PLATFORM")) @@ -1834,48 +1891,83 @@ result = 2; break; } + else + result = 0; } - free_sys_dev(&list); return result; } list = find_intel_devices(); if (!list) { - if (verbose) - fprintf(stderr, Name ": no active Intel(R) RAID " + if (verbose > 0) + pr_err("no active Intel(R) RAID " "controller found.\n"); - free_sys_dev(&list); return 2; - } else if (verbose) + } else if (verbose > 0) print_found_intel_controllers(list); for (hba = list; hba; hba = hba->next) { + if (controller_path && (compare_paths(hba->path,controller_path) != 0)) + continue; orom = find_imsm_capability(hba->type); if (!orom) - fprintf(stderr, Name ": imsm capabilities not found for controller: %s (type %s)\n", + pr_err("imsm capabilities not found for controller: %s (type %s)\n", hba->path, get_sys_dev_type(hba->type)); - else + else { + result = 0; print_imsm_capability(orom); + printf(" I/O Controller : %s (%s)\n", + hba->path, get_sys_dev_type(hba->type)); + if (hba->type == SYS_DEV_SATA) { + host_base = ahci_get_port_count(hba->path, &port_count); + if (ahci_enumerate_ports(hba->path, port_count, host_base, 
verbose)) { + if (verbose > 0) + pr_err("failed to enumerate " + "ports on SATA controller at %s.\n", hba->pci_id); + result |= 2; + } + } + } } - for (hba = list; hba; hba = hba->next) { - printf(" I/O Controller : %s (%s)\n", - hba->path, get_sys_dev_type(hba->type)); + if (controller_path && result == 1) + pr_err("no active Intel(R) RAID " + "controller found under %s\n",controller_path); - if (hba->type == SYS_DEV_SATA) { - host_base = ahci_get_port_count(hba->path, &port_count); - if (ahci_enumerate_ports(hba->path, port_count, host_base, verbose)) { - if (verbose) - fprintf(stderr, Name ": failed to enumerate " - "ports on SATA controller at %s.", hba->pci_id); - result |= 2; - } + return result; +} + +static int export_detail_platform_imsm(int verbose, char *controller_path) +{ + const struct imsm_orom *orom; + struct sys_dev *list, *hba; + int result=1; + + list = find_intel_devices(); + if (!list) { + if (verbose > 0) + pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_INTEL_DEVICES\n"); + result = 2; + return result; + } + + for (hba = list; hba; hba = hba->next) { + if (controller_path && (compare_paths(hba->path,controller_path) != 0)) + continue; + orom = find_imsm_capability(hba->type); + if (!orom) { + if (verbose > 0) + pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n",hba->path); + } + else { + print_imsm_capability_export(orom); + result = 0; } } - free_sys_dev(&list); return result; } + #endif static int match_home_imsm(struct supertype *st, char *homehost) @@ -1902,7 +1994,7 @@ * not the device-set. * uuid to recognise same set when adding a missing device back * to an array. This is a uuid for the device-set. - * + * * For each of these we can make do with a truncated * or hashed uuid rather than the original, as long as * everyone agrees. 
@@ -2183,16 +2275,14 @@ get_dev_size(fd, NULL, &dsize); if (lseek64(fd, dsize - MIGR_REC_POSITION, SEEK_SET) < 0) { - fprintf(stderr, - Name ": Cannot seek to anchor block: %s\n", - strerror(errno)); + pr_err("Cannot seek to anchor block: %s\n", + strerror(errno)); goto out; } if (read(fd, super->migr_rec_buf, MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE) { - fprintf(stderr, - Name ": Cannot read migr record block: %s\n", - strerror(errno)); + pr_err("Cannot read migr record block: %s\n", + strerror(errno)); goto out; } ret_val = 0; @@ -2319,7 +2409,7 @@ update_memory_size = sizeof(struct imsm_update_general_migration_checkpoint); - *u = calloc(1, update_memory_size); + *u = xcalloc(1, update_memory_size); if (*u == NULL) { dprintf("error: cannot get memory for " "imsm_create_metadata_checkpoint_update update\n"); @@ -2333,7 +2423,6 @@ return update_memory_size; } - static void imsm_update_metadata_locally(struct supertype *st, void *buf, int len); @@ -2389,16 +2478,14 @@ continue; get_dev_size(fd, NULL, &dsize); if (lseek64(fd, dsize - MIGR_REC_POSITION, SEEK_SET) < 0) { - fprintf(stderr, - Name ": Cannot seek to anchor block: %s\n", - strerror(errno)); + pr_err("Cannot seek to anchor block: %s\n", + strerror(errno)); goto out; } if (write(fd, super->migr_rec_buf, MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE) { - fprintf(stderr, - Name ": Cannot write migr record block: %s\n", - strerror(errno)); + pr_err("Cannot write migr record block: %s\n", + strerror(errno)); goto out; } close(fd); @@ -2492,7 +2579,6 @@ struct imsm_map *prev_map = get_imsm_map(dev, MAP_1); struct imsm_map *map_to_analyse = map; struct dl *dl; - char *devname; int map_disks = info->array.raid_disks; memset(info, 0, sizeof(*info)); @@ -2654,11 +2740,7 @@ info->array.major_version = -1; info->array.minor_version = -2; - devname = devnum2devname(st->container_dev); - *info->text_version = '\0'; - if (devname) - sprintf(info->text_version, "/%s/%d", devname, info->container_member); - free(devname); + 
sprintf(info->text_version, "/%s/%d", st->container_devnm, info->container_member); info->safe_mode_delay = 4000; /* 4 secs like the Matrix driver */ uuid_from_super_imsm(st, info->uuid); @@ -2683,7 +2765,6 @@ static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev, int look_in_map); - #ifndef MDASSEMBLE static void manage_second_map(struct intel_super *super, struct imsm_dev *dev) { @@ -2734,7 +2815,7 @@ info->array.level = LEVEL_CONTAINER; info->array.layout = 0; info->array.md_minor = -1; - info->array.ctime = 0; /* N/A for imsm */ + info->array.ctime = 0; /* N/A for imsm */ info->array.utime = 0; info->array.chunk_size = 0; @@ -2849,23 +2930,11 @@ if (!super || !super->disks) return NULL; dl = super->disks; - mddev = malloc(sizeof(*mddev)); - if (!mddev) { - fprintf(stderr, Name ": Failed to allocate memory.\n"); - return NULL; - } - memset(mddev, 0, sizeof(*mddev)); + mddev = xcalloc(1, sizeof(*mddev)); while (dl) { struct mdinfo *tmp; disk = &dl->disk; - tmp = malloc(sizeof(*tmp)); - if (!tmp) { - fprintf(stderr, Name ": Failed to allocate memory.\n"); - if (mddev) - sysfs_free(mddev); - return NULL; - } - memset(tmp, 0, sizeof(*tmp)); + tmp = xcalloc(1, sizeof(*tmp)); if (mddev->devs) tmp->next = mddev->devs; mddev->devs = tmp; @@ -2971,7 +3040,8 @@ return size; } -static __u64 avail_size_imsm(struct supertype *st, __u64 devsize) +static __u64 avail_size_imsm(struct supertype *st, __u64 devsize, + unsigned long long data_offset) { if (devsize < (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS)) return 0; @@ -3008,21 +3078,22 @@ struct intel_super *first = st->sb; struct intel_super *sec = tst->sb; - if (!first) { - st->sb = tst->sb; - tst->sb = NULL; - return 0; - } + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } /* in platform dependent environment test if the disks * use the same Intel hba + * If not on Intel hba at all, allow anything. 
*/ if (!check_env("IMSM_NO_PLATFORM")) { - if (!first->hba || !sec->hba || - (first->hba->type != sec->hba->type)) { + if (first->hba && sec->hba && + strcmp(first->hba->path, sec->hba->path) != 0) { fprintf(stderr, "HBAs of devices does not match %s != %s\n", - first->hba ? get_sys_dev_type(first->hba->type) : NULL, - sec->hba ? get_sys_dev_type(sec->hba->type) : NULL); + first->hba ? first->hba->path : NULL, + sec->hba ? sec->hba->path : NULL); return 3; } } @@ -3053,7 +3124,6 @@ } - /* if 'first' is a spare promote it to a populated mpb with sec's * family number */ @@ -3067,14 +3137,8 @@ * fails here we don't associate the spare */ for (i = 0; i < sec->anchor->num_raid_devs; i++) { - dv = malloc(sizeof(*dv)); - if (!dv) - break; - dev = malloc(sizeof_imsm_dev(get_imsm_dev(sec, i), 1)); - if (!dev) { - free(dv); - break; - } + dv = xmalloc(sizeof(*dv)); + dev = xmalloc(sizeof_imsm_dev(get_imsm_dev(sec, i), 1)); dv->dev = dev; dv->index = i; dv->next = first->devlist; @@ -3083,7 +3147,7 @@ if (i < sec->anchor->num_raid_devs) { /* allocation failure */ free_devlist(first); - fprintf(stderr, "imsm: failed to associate spare\n"); + pr_err("imsm: failed to associate spare\n"); return 3; } first->anchor->num_raid_devs = sec->anchor->num_raid_devs; @@ -3149,18 +3213,16 @@ if (rv != 0) { if (devname) - fprintf(stderr, - Name ": Failed to retrieve serial for %s\n", - devname); + pr_err("Failed to retrieve serial for %s\n", + devname); return rv; } rsp_len = scsi_serial[3]; if (!rsp_len) { if (devname) - fprintf(stderr, - Name ": Failed to retrieve serial for %s\n", - devname); + pr_err("Failed to retrieve serial for %s\n", + devname); return 2; } rsp_buf = (char *) &scsi_serial[4]; @@ -3249,14 +3311,7 @@ if (rv != 0) return 2; - dl = calloc(1, sizeof(*dl)); - if (!dl) { - if (devname) - fprintf(stderr, - Name ": failed to allocate disk buffer for %s\n", - devname); - return 2; - } + dl = xcalloc(1, sizeof(*dl)); fstat(fd, &stb); dl->major = major(stb.st_rdev); @@ -3270,9 
+3325,9 @@ dl->e = NULL; fd2devname(fd, name); if (devname) - dl->devname = strdup(devname); + dl->devname = xstrdup(devname); else - dl->devname = strdup(name); + dl->devname = xstrdup(name); /* look up this disk's index in the current anchor */ disk = __serial_to_disk(dl->serial, super->anchor, &dl->index); @@ -3399,18 +3454,12 @@ if (len_migr > len) space_needed += len_migr - len; - dv = malloc(sizeof(*dv)); - if (!dv) - return 1; + dv = xmalloc(sizeof(*dv)); if (max_len < len_migr) max_len = len_migr; if (max_len > len_migr) space_needed += max_len - len_migr; - dev_new = malloc(max_len); - if (!dev_new) { - free(dv); - return 1; - } + dev_new = xmalloc(max_len); imsm_copy_dev(dev_new, dev_iter); dv->dev = dev_new; dv->index = i; @@ -3446,7 +3495,7 @@ if (__le32_to_cpu(mpb->bbm_log_size)) { ptr = mpb; ptr += mpb->mpb_size - __le32_to_cpu(mpb->bbm_log_size); - } + } return ptr; } @@ -3505,40 +3554,35 @@ get_dev_size(fd, NULL, &dsize); if (dsize < 1024) { if (devname) - fprintf(stderr, - Name ": %s: device to small for imsm\n", - devname); + pr_err("%s: device to small for imsm\n", + devname); return 1; } if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) { if (devname) - fprintf(stderr, Name - ": Cannot seek to anchor block on %s: %s\n", - devname, strerror(errno)); + pr_err("Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); return 1; } if (posix_memalign((void**)&anchor, 512, 512) != 0) { if (devname) - fprintf(stderr, - Name ": Failed to allocate imsm anchor buffer" - " on %s\n", devname); + pr_err("Failed to allocate imsm anchor buffer" + " on %s\n", devname); return 1; } if (read(fd, anchor, 512) != 512) { if (devname) - fprintf(stderr, - Name ": Cannot read anchor block on %s: %s\n", - devname, strerror(errno)); + pr_err("Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); free(anchor); return 1; } if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) { if (devname) - fprintf(stderr, - Name ": no IMSM anchor 
on %s\n", devname); + pr_err("no IMSM anchor on %s\n", devname); free(anchor); return 2; } @@ -3551,9 +3595,8 @@ super->len = ROUND_UP(anchor->mpb_size, 512); if (posix_memalign(&super->buf, 512, super->len) != 0) { if (devname) - fprintf(stderr, - Name ": unable to allocate %zu byte mpb buffer\n", - super->len); + pr_err("unable to allocate %zu byte mpb buffer\n", + super->len); free(anchor); return 2; } @@ -3563,8 +3606,7 @@ free(anchor); if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) { - fprintf(stderr, Name - ": %s could not allocate migr_rec buffer\n", __func__); + pr_err("%s could not allocate migr_rec buffer\n", __func__); free(super->buf); return 2; } @@ -3574,11 +3616,10 @@ check_sum = __gen_imsm_checksum(super->anchor); if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { if (devname) - fprintf(stderr, - Name ": IMSM checksum %x != %x on %s\n", - check_sum, - __le32_to_cpu(super->anchor->check_sum), - devname); + pr_err("IMSM checksum %x != %x on %s\n", + check_sum, + __le32_to_cpu(super->anchor->check_sum), + devname); return 2; } @@ -3588,27 +3629,24 @@ /* read the extended mpb */ if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) { if (devname) - fprintf(stderr, - Name ": Cannot seek to extended mpb on %s: %s\n", - devname, strerror(errno)); + pr_err("Cannot seek to extended mpb on %s: %s\n", + devname, strerror(errno)); return 1; } if ((unsigned)read(fd, super->buf + 512, super->len - 512) != super->len - 512) { if (devname) - fprintf(stderr, - Name ": Cannot read extended mpb on %s: %s\n", - devname, strerror(errno)); + pr_err("Cannot read extended mpb on %s: %s\n", + devname, strerror(errno)); return 2; } check_sum = __gen_imsm_checksum(super->anchor); if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { if (devname) - fprintf(stderr, - Name ": IMSM checksum %x != %x on %s\n", - check_sum, __le32_to_cpu(super->anchor->check_sum), - devname); + pr_err("IMSM checksum %x != %x on %s\n", + check_sum, 
__le32_to_cpu(super->anchor->check_sum), + devname); return 3; } @@ -3747,13 +3785,10 @@ static struct intel_super *alloc_super(void) { - struct intel_super *super = malloc(sizeof(*super)); + struct intel_super *super = xcalloc(1, sizeof(*super)); - if (super) { - memset(super, 0, sizeof(*super)); - super->current_vol = -1; - super->create_offset = ~((unsigned long long) 0); - } + super->current_vol = -1; + super->create_offset = ~((unsigned long long) 0); return super; } @@ -3773,9 +3808,8 @@ hba_name = find_disk_attached_hba(fd, NULL); if (!hba_name) { if (devname) - fprintf(stderr, - Name ": %s is not attached to Intel(R) RAID controller.\n", - devname); + pr_err("%s is not attached to Intel(R) RAID controller.\n", + devname); return 1; } rv = attach_hba_to_super(super, hba_name); @@ -3783,7 +3817,7 @@ if (devname) { struct intel_hba *hba = super->hba; - fprintf(stderr, Name ": %s is attached to Intel(R) %s RAID " + pr_err("%s is attached to Intel(R) %s RAID " "controller (%s),\n" " but the container is assigned to Intel(R) " "%s RAID controller (", @@ -3799,15 +3833,13 @@ hba = hba->next; } - fprintf(stderr, ").\n" - " Mixing devices attached to different controllers " - "is not allowed.\n"); + fprintf(stderr, ").\n"); + cont_err("Mixing devices attached to multiple controllers " + "is not allowed.\n"); } - free_sys_dev(&hba_name); return 2; } super->orom = find_imsm_capability(hba_name->type); - free_sys_dev(&hba_name); if (!super->orom) return 3; return 0; @@ -3830,13 +3862,11 @@ if (dl) continue; - dl = malloc(sizeof(*dl)); - if (!dl) - return 1; + dl = xmalloc(sizeof(*dl)); dl->major = 0; dl->minor = 0; dl->fd = -1; - dl->devname = strdup("missing"); + dl->devname = xstrdup("missing"); dl->index = i; serialcpy(dl->serial, disk->serial); dl->disk = *disk; @@ -3951,9 +3981,7 @@ is_failed(&idisk->disk)) idisk->disk.status &= ~(SPARE_DISK); } else { - idisk = calloc(1, sizeof(*idisk)); - if (!idisk) - return -1; + idisk = xcalloc(1, sizeof(*idisk)); 
idisk->owner = IMSM_UNKNOWN_OWNER; idisk->disk = *disk; idisk->next = *disk_list; @@ -4008,7 +4036,7 @@ for (s = super_list; s; s = s->next) { if (family_num != s->anchor->family_num) continue; - fprintf(stderr, "Conflict, offlining family %#x on '%s'\n", + pr_err("Conflict, offlining family %#x on '%s'\n", __le32_to_cpu(family_num), s->disks->devname); } } @@ -4094,7 +4122,7 @@ champion = s; if (conflict) - fprintf(stderr, "Chose family %#x on '%s', " + pr_err("Chose family %#x on '%s', " "assemble conflicts to new container with '--update=uuid'\n", __le32_to_cpu(s->anchor->family_num), s->disks->devname); @@ -4167,16 +4195,14 @@ return champion; } - static int get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd); -static int get_super_block(struct intel_super **super_list, int devnum, char *devname, +static int get_super_block(struct intel_super **super_list, char *devnm, char *devname, int major, int minor, int keep_fd); static int get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list, int *max, int keep_fd); - static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, char *devname, struct md_list *devlist, int keep_fd) @@ -4219,7 +4245,7 @@ /* Check migration compatibility */ if ((err == 0) && (check_mpb_migr_compatibility(super) != 0)) { - fprintf(stderr, Name ": Unsupported migration detected"); + pr_err("Unsupported migration detected"); if (devname) fprintf(stderr, " on %s\n", devname); else @@ -4239,15 +4265,14 @@ free_imsm(s); } - if (err) return err; *sbp = super; if (fd >= 0) - st->container_dev = fd2devnum(fd); + strcpy(st->container_devnm, fd2devnm(fd)); else - st->container_dev = NoMdDev; + st->container_devnm[0] = 0; if (err == 0 && st->ss == NULL) { st->ss = &super_imsm; st->minor_version = 0; @@ -4256,7 +4281,6 @@ return 0; } - static int get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list, int *max, int keep_fd) @@ -4272,7 +4296,7 
@@ int lmax = 0; int fd = dev_open(tmpdev->devname, O_RDONLY|O_EXCL); if (fd < 0) { - fprintf(stderr, Name ": cannot open device %s: %s\n", + pr_err("cannot open device %s: %s\n", tmpdev->devname, strerror(errno)); err = 8; goto error; @@ -4290,7 +4314,7 @@ int major = major(tmpdev->st_rdev); int minor = minor(tmpdev->st_rdev); err = get_super_block(super_list, - -1, + NULL, tmpdev->devname, major, minor, keep_fd); @@ -4306,13 +4330,12 @@ return err; } -static int get_super_block(struct intel_super **super_list, int devnum, char *devname, +static int get_super_block(struct intel_super **super_list, char *devnm, char *devname, int major, int minor, int keep_fd) { struct intel_super*s = NULL; char nm[32]; int dfd = -1; - int rv; int err = 0; int retry; @@ -4329,17 +4352,11 @@ goto error; } - rv = find_intel_hba_capability(dfd, s, devname); - /* no orom/efi or non-intel hba of the disk */ - if (rv != 0) { - err = 4; - goto error; - } - + find_intel_hba_capability(dfd, s, devname); err = load_and_parse_mpb(dfd, s, NULL, keep_fd); /* retry the load if we might have raced against mdmon */ - if (err == 3 && (devnum != -1) && mdmon_running(devnum)) + if (err == 3 && devnm && mdmon_running(devnm)) for (retry = 0; retry < 3; retry++) { usleep(3000); err = load_and_parse_mpb(dfd, s, NULL, keep_fd); @@ -4366,11 +4383,11 @@ get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd) { struct mdinfo *sra; - int devnum; + char *devnm; struct mdinfo *sd; int err = 0; int i = 0; - sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + sra = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); if (!sra) return 1; @@ -4381,9 +4398,9 @@ goto error; } /* load all mpbs */ - devnum = fd2devnum(fd); + devnm = fd2devnm(fd); for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) { - if (get_super_block(super_list, devnum, devname, + if (get_super_block(super_list, devnm, devname, sd->disk.major, sd->disk.minor, keep_fd) != 0) { 
err = 7; goto error; @@ -4406,19 +4423,13 @@ struct intel_super *super; int rv; - if (test_partition(fd)) + if (!st->ignore_hw_compat && test_partition(fd)) /* IMSM not allowed on partitions */ return 1; free_super_imsm(st); super = alloc_super(); - if (!super) { - fprintf(stderr, - Name ": malloc of %zu failed.\n", - sizeof(*super)); - return 1; - } /* Load hba and capabilities if they exist. * But do not preclude loading metadata in case capabilities or hba are * non-compliant and ignore_hw_compat is set. @@ -4427,8 +4438,7 @@ /* no orom/efi or non-intel hba of the disk */ if ((rv != 0) && (st->ignore_hw_compat == 0)) { if (devname) - fprintf(stderr, - Name ": No OROM/EFI properties for %s\n", devname); + pr_err("No OROM/EFI properties for %s\n", devname); free_imsm(super); return 2; } @@ -4436,9 +4446,8 @@ if (rv) { if (devname) - fprintf(stderr, - Name ": Failed to load all information " - "sections on %s\n", devname); + pr_err("Failed to load all information " + "sections on %s\n", devname); free_imsm(super); return rv; } @@ -4454,8 +4463,7 @@ if (load_imsm_migr_rec(super, NULL) == 0) { /* Check for unsupported migration features */ if (check_mpb_migr_compatibility(super) != 0) { - fprintf(stderr, - Name ": Unsupported migration detected"); + pr_err("Unsupported migration detected"); if (devname) fprintf(stderr, " on %s\n", devname); else @@ -4549,14 +4557,15 @@ } if (reason && !quiet) - fprintf(stderr, Name ": imsm volume name %s\n", reason); + pr_err("imsm volume name %s\n", reason); return !reason; } static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, unsigned long long size, char *name, - char *homehost, int *uuid) + char *homehost, int *uuid, + long long data_offset) { /* We are creating a volume inside a pre-existing container. * so st->sb is already set. 
@@ -4574,7 +4583,7 @@ unsigned long long num_data_stripes; if (super->orom && mpb->num_raid_devs >= super->orom->vpa) { - fprintf(stderr, Name": This imsm-container already has the " + pr_err("This imsm-container already has the " "maximum of %d volumes\n", super->orom->vpa); return 0; } @@ -4587,14 +4596,13 @@ size_t size_round = ROUND_UP(size_new, 512); if (posix_memalign(&mpb_new, 512, size_round) != 0) { - fprintf(stderr, Name": could not allocate new mpb\n"); + pr_err("could not allocate new mpb\n"); return 0; } if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) { - fprintf(stderr, Name - ": %s could not allocate migr_rec buffer\n", - __func__); + pr_err("%s could not allocate migr_rec buffer\n", + __func__); free(super->buf); free(super); free(mpb_new); @@ -4638,25 +4646,15 @@ for (d = super->missing; d; d = d->next) missing++; if (info->failed_disks > missing) { - fprintf(stderr, Name": unable to add 'missing' disk to container\n"); + pr_err("unable to add 'missing' disk to container\n"); return 0; } } if (!check_name(super, name, 0)) return 0; - dv = malloc(sizeof(*dv)); - if (!dv) { - fprintf(stderr, Name ": failed to allocate device list entry\n"); - return 0; - } - dev = calloc(1, sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1)); - if (!dev) { - free(dv); - fprintf(stderr, Name": could not allocate raid device\n"); - return 0; - } - + dv = xmalloc(sizeof(*dv)); + dev = xcalloc(1, sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1)); strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN); array_blocks = calc_array_size(info->level, info->raid_disks, info->layout, info->chunk_size, @@ -4678,7 +4676,8 @@ map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); map->failed_disk_num = ~0; if (info->level > 0) - map->map_state = IMSM_T_STATE_UNINITIALIZED; + map->map_state = (info->state ? IMSM_T_STATE_NORMAL + : IMSM_T_STATE_UNINITIALIZED); else map->map_state = info->failed_disks ? 
IMSM_T_STATE_FAILED : IMSM_T_STATE_NORMAL; @@ -4687,7 +4686,7 @@ if (info->level == 1 && info->raid_disks > 2) { free(dev); free(dv); - fprintf(stderr, Name": imsm does not support more than 2 disks" + pr_err("imsm does not support more than 2 disks" "in a raid1 volume\n"); return 0; } @@ -4725,7 +4724,8 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, unsigned long long size, char *name, - char *homehost, int *uuid) + char *homehost, int *uuid, + unsigned long long data_offset) { /* This is primarily called by Create when creating a new array. * We will then get add_to_super called for each component, and then @@ -4740,8 +4740,14 @@ size_t mpb_size; char *version; + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset not supported by imsm\n"); + return 0; + } + if (st->sb) - return init_super_imsm_volume(st, info, size, name, homehost, uuid); + return init_super_imsm_volume(st, info, size, name, homehost, uuid, + data_offset); if (info) mpb_size = disks_to_mpb_size(info->nr_disks); @@ -4754,13 +4760,11 @@ super = NULL; } if (!super) { - fprintf(stderr, Name - ": %s could not allocate superblock\n", __func__); + pr_err("%s could not allocate superblock\n", __func__); return 0; } if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) { - fprintf(stderr, Name - ": %s could not allocate migr_rec buffer\n", __func__); + pr_err("%s could not allocate migr_rec buffer\n", __func__); free(super->buf); free(super); return 0; @@ -4801,7 +4805,7 @@ map = get_imsm_map(dev, MAP_0); if (! 
(dk->state & (1<index); if (slot >= 0 && (get_imsm_ord_tbl_ent(dev, slot, MAP_X) & IMSM_ORD_REBUILD) == 0) { - fprintf(stderr, Name ": %s has been included in this array twice\n", + pr_err("%s has been included in this array twice\n", devname); return 1; } @@ -4890,7 +4894,7 @@ _disk = __get_imsm_disk(mpb, dl->index); if (!_dev || !_disk) { - fprintf(stderr, Name ": BUG mpb setup error\n"); + pr_err("BUG mpb setup error\n"); return 1; } *_dev = *dev; @@ -4937,7 +4941,8 @@ } static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, - int fd, char *devname) + int fd, char *devname, + unsigned long long data_offset) { struct intel_super *super = st->sb; struct dl *dd; @@ -4963,23 +4968,16 @@ return add_to_super_imsm_volume(st, dk, fd, devname); fstat(fd, &stb); - dd = malloc(sizeof(*dd)); - if (!dd) { - fprintf(stderr, - Name ": malloc failed %s:%d.\n", __func__, __LINE__); - return 1; - } - memset(dd, 0, sizeof(*dd)); + dd = xcalloc(sizeof(*dd), 1); dd->major = major(stb.st_rdev); dd->minor = minor(stb.st_rdev); - dd->devname = devname ? strdup(devname) : NULL; + dd->devname = devname ? xstrdup(devname) : NULL; dd->fd = fd; dd->e = NULL; dd->action = DISK_ADD; rv = imsm_read_serial(fd, devname, dd->serial); if (rv) { - fprintf(stderr, - Name ": failed to retrieve scsi serial, aborting\n"); + pr_err("failed to retrieve scsi serial, aborting\n"); free(dd); abort(); } @@ -5010,7 +5008,6 @@ return 0; } - static int remove_from_super_imsm(struct supertype *st, mdu_disk_info_t *dk) { struct intel_super *super = st->sb; @@ -5021,18 +5018,11 @@ * is prepared. 
*/ if (!st->update_tail) { - fprintf(stderr, - Name ": %s shall be used in mdmon context only" - "(line %d).\n", __func__, __LINE__); - return 1; - } - dd = malloc(sizeof(*dd)); - if (!dd) { - fprintf(stderr, - Name ": malloc failed %s:%d.\n", __func__, __LINE__); + pr_err("%s shall be used in mdmon context only" + "(line %d).\n", __func__, __LINE__); return 1; } - memset(dd, 0, sizeof(*dd)); + dd = xcalloc(1, sizeof(*dd)); dd->major = dk->major; dd->minor = dk->minor; dd->fd = -1; @@ -5042,7 +5032,6 @@ dd->next = super->disk_mgmt_list; super->disk_mgmt_list = dd; - return 0; } @@ -5089,7 +5078,7 @@ spare->check_sum = __cpu_to_le32(sum); if (store_imsm_mpb(d->fd, spare)) { - fprintf(stderr, "%s: failed for device %d:%d %s\n", + pr_err("%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); return 1; } @@ -5199,7 +5188,6 @@ return 0; } - static int create_array(struct supertype *st, int dev_idx) { size_t len; @@ -5213,13 +5201,7 @@ len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev, 0) + sizeof(*inf) * map->num_members; - u = malloc(len); - if (!u) { - fprintf(stderr, "%s: failed to allocate update buffer\n", - __func__); - return 1; - } - + u = xmalloc(len); u->type = update_create_array; u->dev_idx = dev_idx; imsm_copy_dev(&u->dev, dev); @@ -5245,13 +5227,7 @@ return 0; len = sizeof(*u); - u = malloc(len); - if (!u) { - fprintf(stderr, "%s: failed to allocate update buffer\n", - __func__); - return 1; - } - + u = xmalloc(len); u->type = update_add_remove_disk; append_metadata_update(st, u, len); @@ -5284,7 +5260,7 @@ } else { struct dl *d; for (d = super->disks; d; d = d->next) - Kill(d->devname, NULL, 0, 1, 1); + Kill(d->devname, NULL, 0, -1, 1); return write_super_imsm(st, 1); } } @@ -5313,7 +5289,9 @@ #ifndef MDASSEMBLE static int validate_geometry_imsm_container(struct supertype *st, int level, int layout, int raiddisks, int chunk, - unsigned long long size, char *dev, + unsigned long long size, + unsigned long long data_offset, 
+ char *dev, unsigned long long *freesize, int verbose) { @@ -5329,8 +5307,8 @@ fd = open(dev, O_RDONLY|O_EXCL, 0); if (fd < 0) { - if (verbose) - fprintf(stderr, Name ": imsm: Cannot open %s: %s\n", + if (verbose > 0) + pr_err("imsm: Cannot open %s: %s\n", dev, strerror(errno)); return 0; } @@ -5343,15 +5321,7 @@ * note that there is no fd for the disks in array. */ super = alloc_super(); - if (!super) { - fprintf(stderr, - Name ": malloc of %zu failed.\n", - sizeof(*super)); - close(fd); - return 0; - } - - rv = find_intel_hba_capability(fd, super, verbose ? dev : NULL); + rv = find_intel_hba_capability(fd, super, verbose > 0 ? dev : NULL); if (rv != 0) { #if DEBUG char str[256]; @@ -5368,7 +5338,7 @@ if (super->orom) { if (raiddisks > super->orom->tds) { if (verbose) - fprintf(stderr, Name ": %d exceeds maximum number of" + pr_err("%d exceeds maximum number of" " platform supported disks: %d\n", raiddisks, super->orom->tds); free_imsm(super); @@ -5377,13 +5347,13 @@ if ((super->orom->attr & IMSM_OROM_ATTR_2TB_DISK) == 0 && (ldsize >> 9) >> 32 > 0) { if (verbose) - fprintf(stderr, Name ": %s exceeds maximum platform supported size\n", dev); + pr_err("%s exceeds maximum platform supported size\n", dev); free_imsm(super); return 0; } } - *freesize = avail_size_imsm(st, ldsize >> 9); + *freesize = avail_size_imsm(st, ldsize >> 9, data_offset); free_imsm(super); return 1; @@ -5422,7 +5392,7 @@ * 'maxsize' given the "all disks in an array must share a common start * offset" constraint */ - struct extent *e = calloc(sum_extents, sizeof(*e)); + struct extent *e = xcalloc(sum_extents, sizeof(*e)); struct dl *dl; int i, j; int start_extent; @@ -5431,9 +5401,6 @@ unsigned long long maxsize; unsigned long reserve; - if (!e) - return 0; - /* coalesce and sort all extents. 
also, check to see if we need to * reserve space between member arrays */ @@ -5519,7 +5486,6 @@ return 0; } - static int active_arrays_by_format(char *name, char* hba, struct md_list **devlist, int dpa, int verbose) @@ -5540,17 +5506,15 @@ struct dev_member *dev = memb->members; int fd = -1; while(dev && (fd < 0)) { - char *path = malloc(strlen(dev->name) + strlen("/dev/") + 1); - if (path) { - num = sprintf(path, "%s%s", "/dev/", dev->name); - if (num > 0) - fd = open(path, O_RDONLY, 0); - if ((num <= 0) || (fd < 0)) { - pr_vrb(": Cannot open %s: %s\n", - dev->name, strerror(errno)); - } - free(path); + char *path = xmalloc(strlen(dev->name) + strlen("/dev/") + 1); + num = sprintf(path, "%s%s", "/dev/", dev->name); + if (num > 0) + fd = open(path, O_RDONLY, 0); + if ((num <= 0) || (fd < 0)) { + pr_vrb(": Cannot open %s: %s\n", + dev->name, strerror(errno)); } + free(path); dev = dev->next; } found = 0; @@ -5565,20 +5529,13 @@ } } if (*devlist && (found < dpa)) { - dv = calloc(1, sizeof(*dv)); - if (dv == NULL) - fprintf(stderr, Name ": calloc failed\n"); - else { - dv->devname = malloc(strlen(memb->dev) + strlen("/dev/") + 1); - if (dv->devname != NULL) { - sprintf(dv->devname, "%s%s", "/dev/", memb->dev); - dv->found = found; - dv->used = 0; - dv->next = *devlist; - *devlist = dv; - } else - free(dv); - } + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xmalloc(strlen(memb->dev) + strlen("/dev/") + 1); + sprintf(dv->devname, "%s%s", "/dev/", memb->dev); + dv->found = found; + dv->used = 0; + dv->next = *devlist; + *devlist = dv; } } if (fd >= 0) @@ -5598,17 +5555,8 @@ struct md_list *dv = NULL; for(i = 0; i < 12; i++) { - dv = calloc(1, sizeof(*dv)); - if (dv == NULL) { - fprintf(stderr, Name ": calloc failed\n"); - break; - } - dv->devname = malloc(40); - if (dv->devname == NULL) { - fprintf(stderr, Name ": malloc failed\n"); - free(dv); - break; - } + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xmalloc(40); sprintf(dv->devname, "/dev/loop%d", i); dv->next = 
devlist; devlist = dv; @@ -5656,25 +5604,13 @@ fd2devname(fd, buf); close(fd); } else { - fprintf(stderr, Name ": cannot open device: %s\n", + pr_err("cannot open device: %s\n", ent->d_name); continue; } - - dv = calloc(1, sizeof(*dv)); - if (dv == NULL) { - fprintf(stderr, Name ": malloc failed\n"); - err = 1; - break; - } - dv->devname = strdup(buf); - if (dv->devname == NULL) { - fprintf(stderr, Name ": malloc failed\n"); - err = 1; - free(dv); - break; - } + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xstrdup(buf); dv->next = devlist; devlist = dv; } @@ -5854,7 +5790,6 @@ return count; } - static int count_volumes(char *hba, int dpa, int verbose) { @@ -5912,17 +5847,17 @@ return 0; } - /* capabilities of OROM tested - copied from validate_geometry_imsm_volume */ + /* capabilities of OROM tested - copied from validate_geometry_imsm_volume */ if (!is_raid_level_supported(super->orom, level, raiddisks)) { pr_vrb(": platform does not support raid%d with %d disk%s\n", level, raiddisks, raiddisks > 1 ? 
"s" : ""); return 0; } - if (chunk && (*chunk == 0 || *chunk == UnSet)) + if (*chunk == 0 || *chunk == UnSet) *chunk = imsm_default_chunk(super->orom); - if (super->orom && chunk && !imsm_orom_has_chunk(super->orom, *chunk)) { + if (super->orom && !imsm_orom_has_chunk(super->orom, *chunk)) { pr_vrb(": platform does not support a chunk size of: " "%d\n", *chunk); return 0; @@ -5939,7 +5874,7 @@ return 0; } - if (super->orom && (super->orom->attr & IMSM_OROM_ATTR_2TB) == 0 && chunk && + if (super->orom && (super->orom->attr & IMSM_OROM_ATTR_2TB) == 0 && (calc_array_size(level, raiddisks, layout, *chunk, size) >> 32) > 0) { pr_vrb(": platform does not support a volume size over 2TB\n"); return 0; @@ -5947,12 +5882,14 @@ return 1; } -/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd +/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd * FIX ME add ahci details */ static int validate_geometry_imsm_volume(struct supertype *st, int level, int layout, int raiddisks, int *chunk, - unsigned long long size, char *dev, + unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, int verbose) { @@ -5972,7 +5909,7 @@ mpb = super->anchor; if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, size, verbose)) { - fprintf(stderr, Name ": RAID gemetry validation failed. " + pr_err("RAID gemetry validation failed. 
" "Cannot proceed with the action(s).\n"); return 0; } @@ -6014,7 +5951,7 @@ } if (dcnt < raiddisks) { if (verbose) - fprintf(stderr, Name ": imsm: Not enough " + pr_err("imsm: Not enough " "devices with space for this array " "(%d < %d)\n", dcnt, raiddisks); @@ -6035,7 +5972,7 @@ } if (!dl) { if (verbose) - fprintf(stderr, Name ": %s is not in the " + pr_err("%s is not in the " "same imsm set\n", dev); return 0; } else if (super->orom && dl->index < 0 && mpb->num_raid_devs) { @@ -6044,14 +5981,14 @@ * understand this configuration (all member disks must be * members of each array in the container). */ - fprintf(stderr, Name ": %s is a spare and a volume" + pr_err("%s is a spare and a volume" " is already defined for this container\n", dev); - fprintf(stderr, Name ": The option-rom requires all member" + pr_err("The option-rom requires all member" " disks to be a member of all volumes\n"); return 0; } else if (super->orom && mpb->num_raid_devs > 0 && mpb->num_disks != raiddisks) { - fprintf(stderr, Name ": The option-rom requires all member" + pr_err("The option-rom requires all member" " disks to be a member of all volumes\n"); return 0; } @@ -6074,13 +6011,13 @@ dl->extent_cnt = i; } else { if (verbose) - fprintf(stderr, Name ": unable to determine free space for: %s\n", + pr_err("unable to determine free space for: %s\n", dev); return 0; } if (maxsize < size) { if (verbose) - fprintf(stderr, Name ": %s not enough space (%llu < %llu)\n", + pr_err("%s not enough space (%llu < %llu)\n", dev, maxsize, size); return 0; } @@ -6095,7 +6032,7 @@ if (!check_env("IMSM_NO_PLATFORM") && mpb->num_raid_devs > 0 && size && size != maxsize) { - fprintf(stderr, Name ": attempting to create a second " + pr_err("attempting to create a second " "volume with size less then remaining space. 
" "Aborting...\n"); return 0; @@ -6104,10 +6041,10 @@ if (maxsize < size || maxsize == 0) { if (verbose) { if (maxsize == 0) - fprintf(stderr, Name ": no free space" + pr_err("no free space" " left on device. Aborting...\n"); else - fprintf(stderr, Name ": not enough space" + pr_err("not enough space" " to create volume of given size" " (%llu < %llu). Aborting...\n", maxsize, size); @@ -6181,7 +6118,7 @@ (super->orom && used && used != raiddisks) || maxsize < minsize || maxsize == 0) { - fprintf(stderr, Name ": not enough devices with space to create array.\n"); + pr_err("not enough devices with space to create array.\n"); return 0; /* No enough free spaces large enough */ } @@ -6195,7 +6132,7 @@ } if (!check_env("IMSM_NO_PLATFORM") && mpb->num_raid_devs > 0 && size && size != maxsize) { - fprintf(stderr, Name ": attempting to create a second " + pr_err("attempting to create a second " "volume with size less then remaining space. " "Aborting...\n"); return 0; @@ -6235,6 +6172,7 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, char *dev, unsigned long long *freesize, int verbose) { @@ -6250,7 +6188,8 @@ /* Must be a fresh device to add to a container */ return validate_geometry_imsm_container(st, level, layout, raiddisks, - chunk?*chunk:0, size, + *chunk, + size, data_offset, dev, freesize, verbose); } @@ -6285,7 +6224,7 @@ } if (freesize) return reserve_space(st, raiddisks, size, - chunk?*chunk:0, freesize); + *chunk, freesize); } return 1; } @@ -6293,6 +6232,7 @@ /* creating in a given container */ return validate_geometry_imsm_volume(st, level, layout, raiddisks, chunk, size, + data_offset, dev, freesize, verbose); } @@ -6300,15 +6240,14 @@ fd = open(dev, O_RDONLY|O_EXCL, 0); if (fd >= 0) { if (verbose) - fprintf(stderr, - Name ": Cannot create this array on device %s\n", - dev); + pr_err("Cannot create this array on device %s\n", + dev); 
close(fd); return 0; } if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { if (verbose) - fprintf(stderr, Name ": Cannot open %s: %s\n", + pr_err("Cannot open %s: %s\n", dev, strerror(errno)); return 0; } @@ -6317,11 +6256,11 @@ close(fd); if (cfd < 0) { if (verbose) - fprintf(stderr, Name ": Cannot use %s: It is busy\n", + pr_err("Cannot use %s: It is busy\n", dev); return 0; } - sra = sysfs_read(cfd, 0, GET_VERSION); + sra = sysfs_read(cfd, NULL, GET_VERSION); if (sra && sra->array.major_version == -1 && strcmp(sra->text_version, "imsm") == 0) is_member = 1; @@ -6334,18 +6273,18 @@ if (load_super_imsm_all(st, cfd, (void **) &super, NULL, NULL, 1) == 0) { st->sb = super; - st->container_dev = fd2devnum(cfd); + strcpy(st->container_devnm, fd2devnm(cfd)); close(cfd); return validate_geometry_imsm_volume(st, level, layout, raiddisks, chunk, - size, dev, + size, data_offset, dev, freesize, 1) ? 1 : -1; } } if (verbose) - fprintf(stderr, Name ": failed container membership check\n"); + pr_err("failed container membership check\n"); close(cfd); return 0; @@ -6392,20 +6331,17 @@ if (i < current_vol) continue; sprintf(subarray, "%u", i); - if (is_subarray_active(subarray, st->devname)) { - fprintf(stderr, - Name ": deleting subarray-%d would change the UUID of active subarray-%d, aborting\n", - current_vol, i); + if (is_subarray_active(subarray, st->devnm)) { + pr_err("deleting subarray-%d would change the UUID of active subarray-%d, aborting\n", + current_vol, i); return 2; } } if (st->update_tail) { - struct imsm_update_kill_array *u = malloc(sizeof(*u)); + struct imsm_update_kill_array *u = xmalloc(sizeof(*u)); - if (!u) - return 2; u->type = update_kill_array; u->dev_idx = current_vol; append_metadata_update(st, u, sizeof(*u)); @@ -6451,9 +6387,8 @@ char *ep; int vol; - if (is_subarray_active(subarray, st->devname)) { - fprintf(stderr, - Name ": Unable to update name of active subarray\n"); + if (is_subarray_active(subarray, st->devnm)) { + pr_err("Unable to 
update name of active subarray\n"); return 2; } @@ -6465,10 +6400,8 @@ return 2; if (st->update_tail) { - struct imsm_update_rename_array *u = malloc(sizeof(*u)); + struct imsm_update_rename_array *u = xmalloc(sizeof(*u)); - if (!u) - return 2; u->type = update_rename_array; u->dev_idx = vol; snprintf((char *) u->name, MAX_RAID_SERIAL_LEN, "%s", name); @@ -6602,18 +6535,17 @@ /* do not assemble arrays when not all attributes are supported */ if (imsm_check_attributes(mpb->attributes) == 0) { sb_errors = 1; - fprintf(stderr, Name ": Unsupported attributes in IMSM metadata." + pr_err("Unsupported attributes in IMSM metadata." "Arrays activation is blocked.\n"); } /* check for bad blocks */ if (imsm_bbm_log_size(super->anchor)) { - fprintf(stderr, Name ": BBM log found in IMSM metadata." - "Arrays activation is blocked.\n"); + pr_err("BBM log found in IMSM metadata." + "Arrays activation is blocked.\n"); sb_errors = 1; } - /* count spare devices, not used in maps */ for (d = super->disks; d; d = d->next) @@ -6644,7 +6576,7 @@ */ if (dev->vol.migr_state && (migr_type(dev) == MIGR_STATE_CHANGE)) { - fprintf(stderr, Name ": cannot assemble volume '%.16s':" + pr_err("cannot assemble volume '%.16s':" " unsupported migration in progress\n", dev->volume); continue; @@ -6653,12 +6585,7 @@ * OROM/EFI */ - this = malloc(sizeof(*this)); - if (!this) { - fprintf(stderr, Name ": failed to allocate %zu bytes\n", - sizeof(*this)); - break; - } + this = xmalloc(sizeof(*this)); super->current_vol = i; getinfo_super_imsm_volume(st, this, NULL); @@ -6672,7 +6599,7 @@ map->num_members, /* raid disks */ &chunk, join_u32(dev->size_low, dev->size_high), 1 /* verbose */)) { - fprintf(stderr, Name ": IMSM RAID geometry validation" + pr_err("IMSM RAID geometry validation" " failed. 
Array %s activation is blocked.\n", dev->volume); this->array.state |= @@ -6710,7 +6637,7 @@ if (ord & IMSM_ORD_REBUILD) recovery_start = 0; - /* + /* * if we skip some disks the array will be assmebled degraded; * reset resync start to avoid a dirty-degraded * situation when performing the intial sync @@ -6722,21 +6649,7 @@ if (skip) continue; - info_d = calloc(1, sizeof(*info_d)); - if (!info_d) { - fprintf(stderr, Name ": failed to allocate disk" - " for volume %.16s\n", dev->volume); - info_d = this->devs; - while (info_d) { - struct mdinfo *d = info_d->next; - - free(info_d); - info_d = d; - } - free(this); - this = rest; - break; - } + info_d = xcalloc(1, sizeof(*info_d)); info_d->next = this->devs; this->devs = info_d; @@ -6778,7 +6691,6 @@ return rest; } - static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, int failed, int look_in_map) { @@ -6787,7 +6699,7 @@ map = get_imsm_map(dev, look_in_map); if (!failed) - return map->map_state == IMSM_T_STATE_UNINITIALIZED ? + return map->map_state == IMSM_T_STATE_UNINITIALIZED ? IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL; switch (get_imsm_raid_level(map)) { @@ -6817,7 +6729,7 @@ struct imsm_disk *disk; /* reset the potential in-sync count on even-numbered - * slots. num_copies is always 2 for imsm raid10 + * slots. num_copies is always 2 for imsm raid10 */ if ((i & 1) == 0) insync = 2; @@ -6910,7 +6822,7 @@ struct imsm_super *mpb = super->anchor; if (atoi(inst) >= mpb->num_raid_devs) { - fprintf(stderr, "%s: subarry index %d, out of range\n", + pr_err("%s: subarry index %d, out of range\n", __func__, atoi(inst)); return -ENODEV; } @@ -7008,6 +6920,12 @@ if (!super->missing) return; + /* When orom adds replacement for missing disk it does + * not remove entry of missing disk, but just updates map with + * new added disk. 
So it is not enough just to test if there is + * any missing disk, we have to look if there are any failed disks + * in map to stop migration */ + dprintf("imsm: mark missing\n"); /* end process for initialization and rebuild only */ @@ -7018,7 +6936,8 @@ failed = imsm_count_failed(super, dev, MAP_0); map_state = imsm_check_degraded(super, dev, failed, MAP_0); - end_migration(dev, super, map_state); + if (failed) + end_migration(dev, super, map_state); } for (dl = super->missing; dl; dl = dl->next) mark_missing(dev, &dl->disk, dl->index); @@ -7283,7 +7202,7 @@ __u8 map_state; if (n > map->num_members) - fprintf(stderr, "imsm: set_disk %d out of range 0..%d\n", + pr_err("imsm: set_disk %d out of range 0..%d\n", n, map->num_members - 1); if (n < 0) @@ -7587,7 +7506,6 @@ return dl; } - static int imsm_rebuild_allowed(struct supertype *cont, int dev_idx, int failed) { struct imsm_dev *dev2; @@ -7731,12 +7649,9 @@ dl = imsm_add_spare(super, i, a, 1, rv); if (!dl) continue; - + /* found a usable disk with enough space */ - di = malloc(sizeof(*di)); - if (!di) - continue; - memset(di, 0, sizeof(*di)); + di = xcalloc(1, sizeof(*di)); /* dl->index will be -1 in the case we are activating a * pristine spare. 
imsm_process_update() will create a @@ -7774,24 +7689,9 @@ * Create a metadata_update record to update the * disk_ord_tbl for the array */ - mu = malloc(sizeof(*mu)); - if (mu) { - mu->buf = malloc(sizeof(struct imsm_update_activate_spare) * num_spares); - if (mu->buf == NULL) { - free(mu); - mu = NULL; - } - } - if (!mu) { - while (rv) { - struct mdinfo *n = rv->next; - - free(rv); - rv = n; - } - return NULL; - } - + mu = xmalloc(sizeof(*mu)); + mu->buf = xcalloc(num_spares, + sizeof(struct imsm_update_activate_spare)); mu->space = NULL; mu->space_list = NULL; mu->len = sizeof(struct imsm_update_activate_spare) * num_spares; @@ -7833,7 +7733,6 @@ return 0; } - static struct dl *get_disk_super(struct intel_super *super, int major, int minor) { struct dl *dl = NULL; @@ -7911,7 +7810,6 @@ return check_degraded; } - static int apply_reshape_migration_update(struct imsm_update_reshape_migration *u, struct intel_super *super, void ***space_list) @@ -8071,7 +7969,6 @@ return ret_val; } - static int apply_update_activate_spare(struct imsm_update_activate_spare *u, struct intel_super *super, struct active_array *active_array) @@ -8101,7 +7998,7 @@ break; if (!dl) { - fprintf(stderr, "error: imsm_activate_spare passed " + pr_err("error: imsm_activate_spare passed " "an unknown disk (index: %d)\n", u->dl->index); return 0; @@ -8407,9 +8304,9 @@ * the arrays for general migration and convert selected spares * into active devices. * update_activate_spare - a spare device has replaced a failed - * device in an array, update the disk_ord_tbl. If this disk is - * present in all member arrays then also clear the SPARE_DISK - * flag + * device in an array, update the disk_ord_tbl. 
If this disk is + * present in all member arrays then also clear the SPARE_DISK + * flag * update_create_array * update_kill_array * update_rename_array @@ -8486,7 +8383,7 @@ break; } case update_activate_spare: { - struct imsm_update_activate_spare *u = (void *) update->buf; + struct imsm_update_activate_spare *u = (void *) update->buf; if (apply_update_activate_spare(u, super, st->arrays)) super->updates_pending++; break; @@ -8684,7 +8581,7 @@ break; } default: - fprintf(stderr, "error: unsuported process update type:" + pr_err("error: unsuported process update type:" "(type: %d)\n", type); } } @@ -8721,15 +8618,10 @@ int num_members = map->num_members; void *space; int size, i; - int err = 0; /* allocate memory for added disks */ for (i = 0; i < num_members; i++) { size = sizeof(struct dl); - space = malloc(size); - if (!space) { - err++; - break; - } + space = xmalloc(size); *tail = space; tail = space; *tail = NULL; @@ -8737,24 +8629,11 @@ /* allocate memory for new device */ size = sizeof_imsm_dev(super->devlist->dev, 0) + (num_members * sizeof(__u32)); - space = malloc(size); - if (!space) - err++; - else { - *tail = space; - tail = space; - *tail = NULL; - } - if (!err) { - len = disks_to_mpb_size(num_members * 2); - } else { - /* if allocation didn't success, free buffer */ - while (update->space_list) { - void **sp = update->space_list; - update->space_list = *sp; - free(sp); - } - } + space = xmalloc(size); + *tail = space; + tail = space; + *tail = NULL; + len = disks_to_mpb_size(num_members * 2); } break; @@ -8780,9 +8659,7 @@ if (u->new_raid_disks > u->old_raid_disks) size += sizeof(__u32)*2* (u->new_raid_disks - u->old_raid_disks); - s = malloc(size); - if (!s) - break; + s = xmalloc(size); *space_tail = s; space_tail = s; *space_tail = NULL; @@ -8816,9 +8693,7 @@ if (u->new_raid_disks > u->old_raid_disks) size += sizeof(__u32)*2* (u->new_raid_disks - u->old_raid_disks); - s = malloc(size); - if (!s) - break; + s = xmalloc(size); *space_tail = s; 
space_tail = s; *space_tail = NULL; @@ -8831,12 +8706,7 @@ /* add space for disk in update */ size = sizeof(struct dl); - s = malloc(size); - if (!s) { - free(update->space_list); - update->space_list = NULL; - break; - } + s = xmalloc(size); *space_tail = s; space_tail = s; *space_tail = NULL; @@ -8895,16 +8765,9 @@ inf = get_disk_info(u); len = sizeof_imsm_dev(dev, 1); /* allocate a new super->devlist entry */ - dv = malloc(sizeof(*dv)); - if (dv) { - dv->dev = malloc(len); - if (dv->dev) - update->space = dv; - else { - free(dv); - update->space = NULL; - } - } + dv = xmalloc(sizeof(*dv)); + dv->dev = xmalloc(len); + update->space = dv; /* count how many spares will be converted to members */ for (i = 0; i < map->num_members; i++) { @@ -9069,7 +8932,6 @@ } } - /******************************************************************************* * Function: open_backup_targets * Description: Function opens file descriptors for all devices given in @@ -9110,7 +8972,7 @@ sd->disk.minor, 1); raid_fds[sd->disk.raid_disk] = dev_open(dn, O_RDWR); if (raid_fds[sd->disk.raid_disk] < 0) { - fprintf(stderr, "cannot open component\n"); + pr_err("cannot open component\n"); continue; } opened++; @@ -9121,7 +8983,7 @@ imsm_get_allowed_degradation(info->new_level, raid_disks, super, dev)) { - fprintf(stderr, "Not enough disks can be opened.\n"); + pr_err("Not enough disks can be opened.\n"); close_targets(raid_fds, raid_disks); return -2; } @@ -9181,7 +9043,6 @@ migr_rec->post_migr_vol_cap = dev->size_low; migr_rec->post_migr_vol_cap_hi = dev->size_high; - /* Find the smallest dev */ for (sd = info->devs ; sd ; sd = sd->next) { sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); @@ -9236,16 +9097,12 @@ unsigned long long start; int data_disks = imsm_num_data_members(dev, MAP_0); - targets = malloc(new_disks * sizeof(int)); - if (!targets) - goto abort; + targets = xmalloc(new_disks * sizeof(int)); for (i = 0; i < new_disks; i++) targets[i] = -1; - target_offsets = malloc(new_disks 
* sizeof(unsigned long long)); - if (!target_offsets) - goto abort; + target_offsets = xcalloc(new_disks, sizeof(unsigned long long)); start = info->reshape_progress * 512; for (i = 0; i < new_disks; i++) { @@ -9276,7 +9133,7 @@ start, length, buf) != 0) { - fprintf(stderr, Name ": Error restoring stripes\n"); + pr_err("Error restoring stripes\n"); goto abort; } @@ -9407,13 +9264,10 @@ unit_len = __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512; if (posix_memalign((void **)&buf, 512, unit_len) != 0) goto abort; - targets = malloc(new_disks * sizeof(int)); - if (!targets) - goto abort; + targets = xcalloc(new_disks, sizeof(int)); if (open_backup_targets(info, new_disks, targets, super, id->dev)) { - fprintf(stderr, - Name ": Cannot open some devices belonging to array.\n"); + pr_err("Cannot open some devices belonging to array.\n"); goto abort; } @@ -9423,30 +9277,26 @@ continue; } if (lseek64(targets[i], read_offset, SEEK_SET) < 0) { - fprintf(stderr, - Name ": Cannot seek to block: %s\n", - strerror(errno)); + pr_err("Cannot seek to block: %s\n", + strerror(errno)); skipped_disks++; continue; } if ((unsigned)read(targets[i], buf, unit_len) != unit_len) { - fprintf(stderr, - Name ": Cannot read copy area block: %s\n", - strerror(errno)); + pr_err("Cannot read copy area block: %s\n", + strerror(errno)); skipped_disks++; continue; } if (lseek64(targets[i], write_offset, SEEK_SET) < 0) { - fprintf(stderr, - Name ": Cannot seek to block: %s\n", - strerror(errno)); + pr_err("Cannot seek to block: %s\n", + strerror(errno)); skipped_disks++; continue; } if ((unsigned)write(targets[i], buf, unit_len) != unit_len) { - fprintf(stderr, - Name ": Cannot restore block: %s\n", - strerror(errno)); + pr_err("Cannot restore block: %s\n", + strerror(errno)); skipped_disks++; continue; } @@ -9456,9 +9306,8 @@ new_disks, super, id->dev)) { - fprintf(stderr, - Name ": Cannot restore data from backup." - " Too many failed disks\n"); + pr_err("Cannot restore data from backup." 
+ " Too many failed disks\n"); goto abort; } @@ -9503,30 +9352,29 @@ drv = "isci"; else if (hba && hba->type == SYS_DEV_SATA) drv = "ahci"; - else + else drv = "unknown"; dprintf("path: %s hba: %s attached: %s\n", path, (hba) ? hba->path : "NULL", drv); free(path); - if (hba) - free_sys_dev(&hba); } return drv; } -static int imsm_find_array_minor_by_subdev(int subdev, int container, int *minor) +static char *imsm_find_array_devnm_by_subdev(int subdev, char *container) { + static char devnm[32]; char subdev_name[20]; struct mdstat_ent *mdstat; sprintf(subdev_name, "%d", subdev); mdstat = mdstat_by_subdev(subdev_name, container); if (!mdstat) - return -1; + return NULL; - *minor = mdstat->devnum; + strcpy(devnm, mdstat->devnm); free_mdstat(mdstat); - return 0; + return devnm; } static int imsm_reshape_is_allowed_on_container(struct supertype *st, @@ -9543,10 +9391,9 @@ int devices_that_can_grow = 0; dprintf("imsm: imsm_reshape_is_allowed_on_container(ENTER): " - "st->devnum = (%i)\n", - st->devnum); + "st->devnm = (%s)\n", st->devnm); - if (geo->size != -1 || + if (geo->size > 0 || geo->level != UnSet || geo->layout != UnSet || geo->chunksize != 0 || @@ -9564,8 +9411,7 @@ info = container_content_imsm(st, NULL); for (member = info; member; member = member->next) { - int result; - int minor; + char *result; dprintf("imsm: checking device_num: %i\n", member->container_member); @@ -9622,10 +9468,9 @@ * so they need to be assembled. We have already * checked that no recovery etc is happening. */ - result = imsm_find_array_minor_by_subdev(member->container_member, - st->container_dev, - &minor); - if (result < 0) { + result = imsm_find_array_devnm_by_subdev(member->container_member, + st->container_devnm); + if (result == NULL) { dprintf("imsm: cannot find array\n"); break; } @@ -9645,10 +9490,10 @@ /* Function: get_spares_for_grow * Description: Allocates memory and creates list of spare devices - * avaliable in container. Checks if spare drive size is acceptable. 
+ * avaliable in container. Checks if spare drive size is acceptable. * Parameters: Pointer to the supertype structure * Returns: Pointer to the list of spare devices (mdinfo structure) on success, - * NULL if fail + * NULL if fail */ static struct mdinfo *get_spares_for_grow(struct supertype *st) { @@ -9687,12 +9532,7 @@ /* now add space for spare disks that we need to add. */ update_memory_size += sizeof(u->new_disks[0]) * (delta_disks - 1); - u = calloc(1, update_memory_size); - if (u == NULL) { - dprintf("error: " - "cannot get memory for imsm_update_reshape update\n"); - return 0; - } + u = xcalloc(1, update_memory_size); u->type = update_reshape_container_disks; u->old_raid_disks = old_raid_disks; u->new_raid_disks = geo->raid_disks; @@ -9703,7 +9543,7 @@ if (spares == NULL || delta_disks > spares->array.spare_disks) { - fprintf(stderr, Name ": imsm: ERROR: Cannot get spare devices " + pr_err("imsm: ERROR: Cannot get spare devices " "for %s.\n", geo->dev_name); i = -1; goto abort; @@ -9746,7 +9586,6 @@ return 0; } - /****************************************************************************** * function: imsm_create_metadata_update_for_size_change() * Creates update for IMSM array for array size change. 
@@ -9767,12 +9606,7 @@ /* size of all update data without anchor */ update_memory_size = sizeof(struct imsm_update_size_change); - u = calloc(1, update_memory_size); - if (u == NULL) { - dprintf("error: cannot get memory for " - "imsm_create_metadata_update_for_size_change\n"); - return 0; - } + u = xcalloc(1, update_memory_size); u->type = update_size_change; u->subdev = super->current_vol; u->new_size = geo->size; @@ -9805,12 +9639,7 @@ /* size of all update data without anchor */ update_memory_size = sizeof(struct imsm_update_reshape_migration); - u = calloc(1, update_memory_size); - if (u == NULL) { - dprintf("error: cannot get memory for " - "imsm_create_metadata_update_for_migration\n"); - return 0; - } + u = xcalloc(1, update_memory_size); u->type = update_reshape_migration; u->subdev = super->current_vol; u->new_level = geo->level; @@ -9881,7 +9710,7 @@ /*************************************************************************** * Function: imsm_analyze_change * Description: Function analyze change for single volume -* and validate if transition is supported +* and validate if transition is supported * Parameters: Geometry parameters, supertype structure, * metadata change direction (apply/rollback) * Returns: Operation type code on success, -1 if fail @@ -9901,9 +9730,9 @@ int data_disks; struct imsm_dev *dev; struct intel_super *super; - long long current_size; + unsigned long long current_size; unsigned long long free_size; - long long max_size; + unsigned long long max_size; int rv; getinfo_super_imsm_volume(st, &info, NULL); @@ -9915,10 +9744,9 @@ if (geo->level == 5) { change = CH_MIGRATION; if (geo->layout != ALGORITHM_LEFT_ASYMMETRIC) { - fprintf(stderr, - Name " Error. Requested Layout " - "not supported (left-asymmetric layout " - "is supported only)!\n"); + pr_err("Error. 
Requested Layout " + "not supported (left-asymmetric layout " + "is supported only)!\n"); change = -1; goto analyse_change_exit; } @@ -9943,10 +9771,9 @@ break; } if (change == -1) { - fprintf(stderr, - Name " Error. Level Migration from %d to %d " - "not supported!\n", - info.array.level, geo->level); + pr_err("Error. Level Migration from %d to %d " + "not supported!\n", + info.array.level, geo->level); goto analyse_change_exit; } } else @@ -9966,10 +9793,9 @@ geo->layout = 0; geo->level = 5; } else { - fprintf(stderr, - Name " Error. Layout Migration from %d to %d " - "not supported!\n", - info.array.layout, geo->layout); + pr_err("Error. Layout Migration from %d to %d " + "not supported!\n", + info.array.layout, geo->layout); change = -1; goto analyse_change_exit; } @@ -9994,28 +9820,33 @@ */ current_size = info.custom_array_size / data_disks; - if (geo->size > 0) { + if ((geo->size > 0) && (geo->size != MAX_SIZE)) { /* align component size */ geo->size = imsm_component_size_aligment_check( get_imsm_raid_level(dev->vol.map), chunk * 1024, geo->size * 2); + if (geo->size == 0) { + pr_err("Error. Size expansion is " \ + "supported only (current size is %llu, " \ + "requested size /rounded/ is 0).\n", + current_size); + goto analyse_change_exit; + } } - if ((current_size != geo->size) && (geo->size >= 0)) { + if ((current_size != geo->size) && (geo->size > 0)) { if (change != -1) { - fprintf(stderr, - Name " Error. Size change should be the only " + pr_err("Error. Size change should be the only " "one at a time.\n"); change = -1; goto analyse_change_exit; } if ((super->current_vol + 1) != super->anchor->num_raid_devs) { - fprintf(stderr, - Name " Error. The last volume in container " - "can be expanded only (%i/%i).\n", - super->current_vol, st->devnum); + pr_err("Error. 
The last volume in container " + "can be expanded only (%i/%s).\n", + super->current_vol, st->devnm); goto analyse_change_exit; } /* check the maximum available size @@ -10035,11 +9866,11 @@ chunk * 1024, max_size); } - if (geo->size == 0) { + if (geo->size == MAX_SIZE) { /* requested size change to the maximum available size */ if (max_size == 0) { - fprintf(stderr, Name " Error. Cannot find " + pr_err("Error. Cannot find " "maximum available space.\n"); change = -1; goto analyse_change_exit; @@ -10058,20 +9889,18 @@ dprintf("Prepare update for size change to %llu\n", geo->size ); if (current_size >= geo->size) { - fprintf(stderr, - Name " Error. Size expansion is " - "supported only (current size is %llu, " - "requested size /rounded/ is %llu).\n", - current_size, geo->size); + pr_err("Error. Size expansion is " + "supported only (current size is %llu, " + "requested size /rounded/ is %llu).\n", + current_size, geo->size); goto analyse_change_exit; } if (max_size && geo->size > max_size) { - fprintf(stderr, - Name " Error. Requested size is larger " - "than maximum available size (maximum " - "available size is %llu, " - "requested size /rounded/ is %llu).\n", - max_size, geo->size); + pr_err("Error. Requested size is larger " + "than maximum available size (maximum " + "available size is %llu, " + "requested size /rounded/ is %llu).\n", + max_size, geo->size); goto analyse_change_exit; } } @@ -10084,7 +9913,7 @@ imsm_layout, geo->raid_disks + devNumChange, &chunk, - geo->size, + geo->size, INVALID_SECTORS, 0, 0, 1)) change = -1; @@ -10093,11 +9922,10 @@ struct imsm_super *mpb = super->anchor; if (mpb->num_raid_devs > 1) { - fprintf(stderr, - Name " Error. Cannot perform operation on %s" - "- for this operation it MUST be single " - "array in container\n", - geo->dev_name); + pr_err("Error. 
Cannot perform operation on %s" + "- for this operation it MUST be single " + "array in container\n", + geo->dev_name); change = -1; } } @@ -10117,9 +9945,7 @@ struct intel_super *super = st->sb; struct imsm_update_takeover *u; - u = malloc(sizeof(struct imsm_update_takeover)); - if (u == NULL) - return 1; + u = xmalloc(sizeof(struct imsm_update_takeover)); u->type = update_takeover; u->subarray = super->current_vol; @@ -10145,7 +9971,8 @@ return 0; } -static int imsm_reshape_super(struct supertype *st, long long size, int level, +static int imsm_reshape_super(struct supertype *st, unsigned long long size, + int level, int layout, int chunksize, int raid_disks, int delta_disks, char *backup, char *dev, int direction, int verbose) @@ -10158,7 +9985,7 @@ memset(&geo, 0, sizeof(struct geo_params)); geo.dev_name = dev; - geo.dev_id = st->devnum; + strcpy(geo.devnm, st->devnm); geo.size = size; geo.level = level; geo.layout = layout; @@ -10173,7 +10000,7 @@ if (experimental() == 0) return ret_val; - if (st->container_dev == st->devnum) { + if (strcmp(st->container_devnm, st->devnm) == 0) { /* On container level we can only increase number of devices. 
*/ dprintf("imsm: info: Container operation\n"); int old_raid_disks = 0; @@ -10201,7 +10028,7 @@ free(u); } else { - fprintf(stderr, Name ": (imsm) Operation " + pr_err("(imsm) Operation " "is not allowed on this container\n"); } } else { @@ -10212,19 +10039,20 @@ */ struct intel_super *super = st->sb; struct intel_dev *dev = super->devlist; - int change, devnum; + int change; dprintf("imsm: info: Volume operation\n"); /* find requested device */ while (dev) { - if (imsm_find_array_minor_by_subdev( - dev->index, st->container_dev, &devnum) == 0 - && devnum == geo.dev_id) + char *devnm = + imsm_find_array_devnm_by_subdev( + dev->index, st->container_devnm); + if (devnm && strcmp(devnm, geo.devnm) == 0) break; dev = dev->next; } if (dev == NULL) { - fprintf(stderr, Name " Cannot find %s (%i) subarray\n", - geo.dev_name, geo.dev_id); + pr_err("Cannot find %s (%s) subarray\n", + geo.dev_name, geo.devnm); goto exit_imsm_reshape_super; } super->current_vol = dev->index; @@ -10297,7 +10125,7 @@ ******************************************************************************/ int wait_for_reshape_imsm(struct mdinfo *sra, int ndata) { - int fd = sysfs_get_fd(sra, NULL, "reshape_position"); + int fd = sysfs_get_fd(sra, NULL, "sync_completed"); unsigned long long completed; /* to_complete : new sync_max position */ unsigned long long to_complete = sra->reshape_progress; @@ -10316,10 +10144,10 @@ return 0; } - if (completed > to_complete) { + if (completed > position_to_set) { dprintf("imsm: wait_for_reshape_imsm() " "wrong next position to set %llu (%llu)\n", - to_complete, completed); + to_complete, position_to_set); close(fd); return -1; } @@ -10335,10 +10163,7 @@ do { char action[20]; - fd_set rfds; - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - select(fd+1, &rfds, NULL, NULL, NULL); + sysfs_wait(fd, NULL); if (sysfs_get_str(sra, NULL, "sync_action", action, 20) > 0 && strncmp(action, "reshape", 7) != 0) @@ -10349,7 +10174,7 @@ close(fd); return 1; } - } while (completed < 
to_complete); + } while (completed < position_to_set); close(fd); return 0; @@ -10370,8 +10195,10 @@ int degraded) { unsigned long long new_degraded; - sysfs_get_ll(info, NULL, "degraded", &new_degraded); - if (new_degraded != (unsigned long long)degraded) { + int rv; + + rv = sysfs_get_ll(info, NULL, "degraded", &new_degraded); + if ((rv == -1) || (new_degraded != (unsigned long long)degraded)) { /* check each device to ensure it is still working */ struct mdinfo *sd; new_degraded = 0; @@ -10461,7 +10288,7 @@ } /* Only one volume can migrate at the same time */ if (migr_vol_qan != 1) { - fprintf(stderr, Name " : %s", migr_vol_qan ? + pr_err(": %s", migr_vol_qan ? "Number of migrating volumes greater than 1\n" : "There is no volume during migrationg\n"); goto abort; @@ -10617,6 +10444,8 @@ dprintf("wait_for_reshape_imsm returned error!\n"); goto abort; } + if (sigterm) + goto abort; if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) { /* ignore error == 2, this can mean end of reshape here @@ -10651,6 +10480,7 @@ .add_to_super = add_to_super_imsm, .remove_from_super = remove_from_super_imsm, .detail_platform = detail_platform_imsm, + .export_detail_platform = export_detail_platform_imsm, .kill_subarray = kill_subarray_imsm, .update_subarray = update_subarray_imsm, .load_container = load_container_imsm, @@ -10659,6 +10489,7 @@ .reshape_super = imsm_reshape_super, .manage_reshape = imsm_manage_reshape, .recover_backup = recover_backup_imsm, + .copy_metadata = copy_metadata_imsm, #endif .match_home = match_home_imsm, .uuid_from_super= uuid_from_super_imsm, @@ -10678,7 +10509,6 @@ .match_metadata_desc = match_metadata_desc_imsm, .container_content = container_content_imsm, - .external = 1, .name = "imsm", diff -Nru mdadm-3.2.5/super-mbr.c mdadm-3.3/super-mbr.c --- mdadm-3.2.5/super-mbr.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/super-mbr.c 2013-09-03 04:47:47.000000000 +0000 @@ -81,25 +81,23 @@ free_mbr(st); if (posix_memalign((void**)&super, 512, 512) 
!= 0) { - fprintf(stderr, Name ": %s could not allocate superblock\n", + pr_err("%s could not allocate superblock\n", __func__); return 1; } - ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */ - lseek(fd, 0, 0); if (read(fd, super, sizeof(*super)) != sizeof(*super)) { if (devname) - fprintf(stderr, Name ": Cannot read partition table on %s\n", + pr_err("Cannot read partition table on %s\n", devname); free(super); return 1; } - + if (super->magic != MBR_SIGNATURE_MAGIC) { if (devname) - fprintf(stderr, Name ": No partition table found on %s\n", + pr_err("No partition table found on %s\n", devname); free(super); return 1; @@ -121,13 +119,11 @@ struct MBR *old, *super; if (posix_memalign((void**)&old, 512, 512) != 0) { - fprintf(stderr, Name ": %s could not allocate superblock\n", + pr_err("%s could not allocate superblock\n", __func__); return 1; } - ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */ - lseek(fd, 0, 0); if (read(fd, old, sizeof(*old)) != sizeof(*old)) { free(old); @@ -158,7 +154,7 @@ for (i = 0; i < MBR_PARTITIONS ; i++) if (sb->parts[i].blocks_num) { - unsigned long last = + unsigned long last = (unsigned long)__le32_to_cpu(sb->parts[i].blocks_num) + (unsigned long)__le32_to_cpu(sb->parts[i].first_sect_lba); if (last > info->component_size) @@ -174,9 +170,7 @@ if (strcmp(arg, "mbr") != 0) return NULL; - st = malloc(sizeof(*st)); - if (!st) - return st; + st = xmalloc(sizeof(*st)); st->ss = &mbr; st->info = NULL; st->minor_version = 0; @@ -189,10 +183,11 @@ static int validate_geometry(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, char *subdev, unsigned long long *freesize, int verbose) { - fprintf(stderr, Name ": mbr metadata cannot be used this way\n"); + pr_err("mbr metadata cannot be used this way\n"); return 0; } #endif diff -Nru mdadm-3.2.5/swap_super.c mdadm-3.3/swap_super.c --- mdadm-3.2.5/swap_super.c 2012-05-18 07:10:03.000000000 +0000 
+++ mdadm-3.3/swap_super.c 2013-09-03 04:47:47.000000000 +0000 @@ -47,7 +47,6 @@ exit(1); } - for (i=0; i < 4096 ; i+=4) { char t = super[i]; super[i] = super[i+3]; @@ -69,7 +68,6 @@ super[32*4+10*4 +i] = t; } - if (lseek64(fd, offset, 0) < 0LL) { perror("lseek64"); exit(1); @@ -81,5 +79,3 @@ exit(0); } - - diff -Nru mdadm-3.2.5/sysfs.c mdadm-3.3/sysfs.c --- mdadm-3.2.5/sysfs.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/sysfs.c 2013-09-03 04:47:47.000000000 +0000 @@ -57,16 +57,12 @@ } } -int sysfs_open(int devnum, char *devname, char *attr) +int sysfs_open(char *devnm, char *devname, char *attr) { char fname[50]; int fd; - char *mdname = devnum2devname(devnum); - if (!mdname) - return -1; - - sprintf(fname, "/sys/block/%s/md/", mdname); + sprintf(fname, "/sys/block/%s/md/", devnm); if (devname) { strcat(fname, devname); strcat(fname, "/"); @@ -75,41 +71,36 @@ fd = open(fname, O_RDWR); if (fd < 0 && errno == EACCES) fd = open(fname, O_RDONLY); - free(mdname); return fd; } -void sysfs_init(struct mdinfo *mdi, int fd, int devnum) +void sysfs_init(struct mdinfo *mdi, int fd, char *devnm) { mdi->sys_name[0] = 0; if (fd >= 0) { mdu_version_t vers; if (ioctl(fd, RAID_VERSION, &vers) != 0) return; - devnum = fd2devnum(fd); + devnm = fd2devnm(fd); } - if (devnum == NoMdDev) + if (devnm == NULL) return; - fmt_devname(mdi->sys_name, devnum); + strcpy(mdi->sys_name, devnm); } - -struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) +struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) { char fname[PATH_MAX]; char buf[PATH_MAX]; char *base; char *dbase; struct mdinfo *sra; - struct mdinfo *dev; + struct mdinfo *dev, **devp; DIR *dir = NULL; struct dirent *de; - sra = malloc(sizeof(*sra)); - if (sra == NULL) - return sra; - memset(sra, 0, sizeof(*sra)); - sysfs_init(sra, fd, devnum); + sra = xcalloc(1, sizeof(*sra)); + sysfs_init(sra, fd, devnm); if (sra->sys_name[0] == 0) { free(sra); return NULL; @@ -179,8 +170,10 @@ if (options & 
GET_CACHE) { strcpy(base, "stripe_cache_size"); if (load_sys(fname, buf)) - goto abort; - sra->cache_size = strtoul(buf, NULL, 0); + /* Probably level doesn't support it */ + sra->cache_size = 0; + else + sra->cache_size = strtoul(buf, NULL, 0); } if (options & GET_MISMATCH) { strcpy(base, "mismatch_cnt"); @@ -241,6 +234,8 @@ goto abort; sra->array.spare_disks = 0; + devp = &sra->devs; + sra->devs = NULL; while ((de = readdir(dir)) != NULL) { char *ep; if (de->d_ino == 0 || @@ -250,9 +245,7 @@ dbase = base + strlen(base); *dbase++ = '/'; - dev = malloc(sizeof(*dev)); - if (!dev) - goto abort; + dev = xmalloc(sizeof(*dev)); /* Always get slot, major, minor */ strcpy(dbase, "slot"); @@ -274,7 +267,7 @@ free(dev); goto abort; } - + } strcpy(dev->sys_name, de->d_name); dev->disk.raid_disk = strtoul(buf, &ep, 10); @@ -288,6 +281,7 @@ free(dev); continue; } + sra->array.nr_disks++; sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); /* special case check for block devices that can go 'offline' */ @@ -299,14 +293,20 @@ } /* finally add this disk to the array */ - dev->next = sra->devs; - sra->devs = dev; + *devp = dev; + devp = & dev->next; + dev->next = NULL; if (options & GET_OFFSET) { strcpy(dbase, "offset"); if (load_sys(fname, buf)) goto abort; dev->data_offset = strtoull(buf, NULL, 0); + strcpy(dbase, "new_offset"); + if (load_sys(fname, buf) == 0) + dev->new_data_offset = strtoull(buf, NULL, 0); + else + dev->new_data_offset = dev->data_offset; } if (options & GET_SIZE) { strcpy(dbase, "size"); @@ -455,7 +455,7 @@ return -1; } return 0; -} +} int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name) { @@ -513,6 +513,49 @@ return n; } +int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2) +{ + /* two numbers in this sysfs file, either + * NNN (NNN) + * or + * NNN / NNN + */ + char buf[80]; + int n; + char *ep, *ep2; + + lseek(fd, 0, 0); + n = read(fd, buf, sizeof(buf)); + if (n <= 0) + return -2; + buf[n] = 0; + 
*v1 = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return -1; + while (*ep == ' ' || *ep == '/' || *ep == '(') + ep++; + *v2 = strtoull(ep, &ep2, 0); + if (ep2 == ep || (*ep2 != 0 && *ep2 != '\n' && *ep2 != ' ' && *ep2 != ')')) { + *v2 = *v1; + return 1; + } + return 2; +} + +int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *v1, unsigned long long *v2) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_two(fd, v1, v2); + close(fd); + return n; +} + int sysfs_fd_get_str(int fd, char *val, int size) { int n; @@ -580,7 +623,7 @@ if ((vers % 100) < 2 || sysfs_set_str(info, NULL, "metadata_version", ver) < 0) { - fprintf(stderr, Name ": This kernel does not " + pr_err("This kernel does not " "support external metadata.\n"); return 1; } @@ -601,7 +644,7 @@ rc = sysfs_set_num(info, NULL, "array_size", info->custom_array_size/2); if (rc && errno == ENOENT) { - fprintf(stderr, Name ": This kernel does not " + pr_err("This kernel does not " "have the md/array_size attribute, " "the array may be larger than expected\n"); rc = 0; @@ -639,13 +682,7 @@ return rv; memset(nm, 0, sizeof(nm)); - sprintf(dv, "/sys/dev/block/%d:%d", sd->disk.major, sd->disk.minor); - rv = readlink(dv, nm, sizeof(nm)-1); - if (rv <= 0) - return -1; - nm[rv] = '\0'; - dname = strrchr(nm, '/'); - if (dname) dname++; + dname = devid2kname(makedev(sd->disk.major, sd->disk.minor)); strcpy(sd->sys_name, "dev-"); strcpy(sd->sys_name+4, dname); @@ -776,32 +813,33 @@ return 0; } - -int sysfs_unique_holder(int devnum, long rdev) +int sysfs_unique_holder(char *devnm, long rdev) { - /* Check that devnum is a holder of rdev, + /* Check that devnm is a holder of rdev, * and is the only holder. * we should be locked against races by - * an O_EXCL on devnum + * an O_EXCL on devnm + * Return values: + * 0 - not unique, not even a holder + * 1 - unique, this is the only holder. 
+ * 2/3 - not unique, there is another holder + * -1 - error, cannot find the holders */ DIR *dir; struct dirent *de; char dirname[100]; char l; - int found = 0; + int ret = 0; sprintf(dirname, "/sys/dev/block/%d:%d/holders", major(rdev), minor(rdev)); dir = opendir(dirname); - errno = ENOENT; if (!dir) - return 0; + return -1; l = strlen(dirname); while ((de = readdir(dir)) != NULL) { - char buf[10]; + char buf[100]; + char *sl; int n; - int mj, mn; - char c; - int fd; if (de->d_ino == 0) continue; @@ -809,36 +847,22 @@ continue; strcpy(dirname+l, "/"); strcat(dirname+l, de->d_name); - strcat(dirname+l, "/dev"); - fd = open(dirname, O_RDONLY); - if (fd < 0) { - errno = ENOENT; - break; - } - n = read(fd, buf, sizeof(buf)-1); - close(fd); - if (n < 0) + n = readlink(dirname, buf, sizeof(buf)-1); + if (n <= 0) continue; buf[n] = 0; - if (sscanf(buf, "%d:%d%c", &mj, &mn, &c) != 3 || - c != '\n') { - errno = ENOENT; - break; - } - if (mj != MD_MAJOR) - mn = -1-(mn>>6); + sl = strrchr(buf, '/'); + if (!sl) + continue; + sl++; - if (devnum != mn) { - errno = EEXIST; - break; - } - found = 1; + if (strcmp(devnm, sl) == 0) + ret |= 1; + else + ret |= 2; } closedir(dir); - if (de) - return 0; - else - return found; + return ret; } int sysfs_freeze_array(struct mdinfo *sra) @@ -857,9 +881,41 @@ if (strcmp(buf, "frozen\n") == 0) /* Already frozen */ return 0; - if (strcmp(buf, "idle\n") != 0) + if (strcmp(buf, "idle\n") != 0 && strcmp(buf, "recover\n") != 0) return -1; if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0) return 0; return 1; } + +int sysfs_wait(int fd, int *msec) +{ + /* Wait up to '*msec' for fd to have an exception condition. + * if msec == NULL, wait indefinitely. 
+ */ + fd_set fds; + int n; + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (msec == NULL) + n = select(fd+1, NULL, NULL, &fds, NULL); + else if (*msec < 0) + n = 0; + else { + struct timeval start, end, tv; + gettimeofday(&start, NULL); + if (*msec < 1000) { + tv.tv_sec = 0; + tv.tv_usec = (*msec)*1000; + } else { + tv.tv_sec = (*msec)/1000; + tv.tv_usec = 0; + } + n = select(fd+1, NULL, NULL, &fds, &tv); + gettimeofday(&end, NULL); + end.tv_sec -= start.tv_sec; + *msec -= (end.tv_sec * 1000 + end.tv_usec/1000 + - start.tv_usec/1000) + 1; + } + return n; +} diff -Nru mdadm-3.2.5/systemd/mdmon@.service mdadm-3.3/systemd/mdmon@.service --- mdadm-3.2.5/systemd/mdmon@.service 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/systemd/mdmon@.service 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,18 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=MD Metadata Monitor on /dev/%I +DefaultDependencies=no +Before=initrd-switch-root.target + +[Service] +ExecStart=/sbin/mdmon --foreground %I +StandardInput=null +StandardOutput=null +StandardError=null +KillMode=none diff -Nru mdadm-3.2.5/test mdadm-3.3/test --- mdadm-3.2.5/test 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/test 2013-09-03 04:47:47.000000000 +0000 @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # run test suite for mdadm user=`id -un` @@ -8,9 +8,6 @@ fi prefix='[0-9][0-9]' -if [ -n "$1" ] -then prefix=$1 -fi dir=`pwd` mdadm=$dir/mdadm @@ -19,6 +16,25 @@ echo >&2 "test: $mdadm isn't usable." 
fi +testdir="tests" +logdir="$testdir/logs" +logsave=0 +exitonerror=1 + +echo "Testing on linux-$(uname -r) kernel" + +# Check whether to run multipath tests +modprobe multipath 2> /dev/null +if grep -s 'Personalities : .*multipath' > /dev/null /proc/mdstat ; then + MULTIPATH="yes" +fi +INTEGRITY=yes +DEVTYPE=loop +LVM_VOLGROUP=mdtest + +# make sure to test local mdmon, not system one +export MDADM_NO_SYSTEMCTL=1 + # assume md0, md1, md2 exist in /dev md0=/dev/md0 md1=/dev/md1 md2=/dev/md2 mdp0=/dev/md_d0 @@ -53,44 +69,75 @@ cleanup() { udevadm settle $mdadm -Ssq 2> /dev/null - for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 - do + case $DEVTYPE in + loop) + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d rm -f /dev/disk/by-path/loop* - done + done + ;; + lvm) + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + eval "lvremove --quiet -f \$dev$d" + done + ;; + esac } -trap cleanup 0 1 2 3 15 +ctrl_c() { + exitonerror=1 +} -devlist= -for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 -do - sz=$size - if [ $d -gt 7 ]; then sz=$ddfsize ; fi - [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1 - [ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d - if [ $d -eq 7 ] - then - losetup /dev/loop$d $targetdir/mdtest6 # for multipath use - else - losetup /dev/loop$d $targetdir/mdtest$d - fi - eval dev$d=/dev/loop$d - eval file$d=$targetdir/mdtest$d - eval devlist=\"\$devlist \$dev$d\" +do_setup() { + trap cleanup 0 1 3 15 + trap ctrl_c 2 + + devlist= + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + sz=$size + if [ $d -gt 7 ]; then sz=$ddfsize ; fi + case $DEVTYPE in + loop) + [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1 + [ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d + if [ $d -eq 7 ] + then + losetup /dev/loop$d $targetdir/mdtest6 # for multipath use + else + losetup /dev/loop$d $targetdir/mdtest$d + fi + eval dev$d=/dev/loop$d + eval 
file$d=$targetdir/mdtest$d + ;; + lvm) + unset MULTIPATH + eval dev$d=/dev/mapper/${LVM_VOLGROUP}-mdtest$d + if ! lvcreate --quiet -L ${sz}K -n mdtest$d $LVM_VOLGROUP; then + trap '' 0 # make sure lvremove is not called + eval echo error creating \$dev$d + exit 129 + fi + ;; + ram) + unset MULTIPATH + eval dev$d=/dev/ram$d + ;; + esac + eval devlist=\"\$devlist \$dev$d\" + eval devlist$d=\"\$devlist\" #" <-- add this quote to un-confuse vim syntax highlighting -done -path0=$dev6 -path1=$dev7 - -ulimit -c unlimited -[ -f /proc/mdstat ] || modprobe md_mod -echo 2000 > /proc/sys/dev/raid/speed_limit_max -echo 0 > /sys/module/md_mod/parameters/start_ro - -if [ " $1" = " setup" ] -then trap 0 ; exit 0 -fi + done + path0=$dev6 + path1=$dev7 + + ulimit -c unlimited + [ -f /proc/mdstat ] || modprobe md_mod + echo 2000 > /proc/sys/dev/raid/speed_limit_max + echo 0 > /sys/module/md_mod/parameters/start_ro +} # mdadm always adds --quiet, and we want to see any unexpected messages mdadm() { @@ -124,6 +171,10 @@ grep -s "active $1 " /proc/mdstat > /dev/null || { echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;} ;; + algorithm ) + grep -s " algorithm $2 " /proc/mdstat > /dev/null || { + echo >&2 "ERROR algorithm $2 not found"; cat /proc/mdstat; exit 1;} + ;; resync | recovery | reshape) sleep 0.5 grep -s $1 /proc/mdstat > /dev/null || { @@ -138,10 +189,13 @@ ;; wait ) + p=`cat /proc/sys/dev/raid/speed_limit_max` + echo 2000000 > /proc/sys/dev/raid/speed_limit_max sleep 0.1 while grep -E '(resync|recovery|reshape|check|repair) *=' > /dev/null /proc/mdstat - do sleep 2; + do sleep 0.5; done + echo $p > /proc/sys/dev/raid/speed_limit_max ;; state ) @@ -198,28 +252,151 @@ fi } +fast_sync() { + echo 200000 > /proc/sys/dev/raid/speed_limit_max +} + rotest() { dev=$1 fsck -fn $dev >&2 } -for script in tests/$prefix tests/$prefix*[^~] -do - if [ -f "$script" ] +do_test() { + _script=$1 + _basename=`basename $_script` + if [ -f "$_script" ] then - rm -f $targetdir/stderr 
- # stop all arrays, just incase some script left an array active. - $mdadm -Ssq 2> /dev/null - mdadm --zero $devlist 2> /dev/null - mdadm --zero $devlist 2> /dev/null - # source script in a subshell, so it has access to our - # namespace, but cannot change it. - echo -ne "$script... " - if ( set -ex ; . $script ) 2> $targetdir/log - then echo "succeeded" - else echo "FAILED - see $targetdir/log for details" - exit 1 - fi + rm -f $targetdir/stderr + # stop all arrays, just incase some script left an array active. + $mdadm -Ssq 2> /dev/null + mdadm --zero $devlist 2> /dev/null + mdadm --zero $devlist 2> /dev/null + # this might have been reset: restore the default. + echo 2000 > /proc/sys/dev/raid/speed_limit_max + # source script in a subshell, so it has access to our + # namespace, but cannot change it. + echo -ne "$_script... " + if ( set -ex ; . $_script ) &> $targetdir/log + then + echo "succeeded" + _fail=0 + else + log=log + cat $targetdir/stderr >> $targetdir/log + if [ $exitonerror == 0 ]; then + log=log-`basename $_script` + mv $targetdir/log $targetdir/$log + fi + echo "FAILED - see $targetdir/$log for details" + _fail=1 + fi + if [ "$savelogs" == "1" ]; then + cp $targetdir/log $logdir/$_basename.log + fi + if [ "$_fail" == "1" -a "$exitonerror" == "1" ]; then + exit 1 + fi fi +} + +do_help() { + echo "Usage: $0 [options]" + echo " Options:" + echo " --tests= Comma separated list of tests to run" + echo " --disable-multipath Disable any tests involving multipath" + echo " --disable-integrity Disable slow tests of RAID[56] consistency" + echo " --logdir= Directory to save logfiles in" + echo " --save-logs Save all logs in " + echo " --keep-going Don't stop on error, ie. 
run all tests" + echo " --dev=[loop|lvm|ram] Use loop devices (default), LVM, or RAM disk" + echo " --volgroup= LVM volume group for LVM test" + echo " setup Setup test environment and exit" + echo " cleanup Cleanup test environment" + echo " Run tests with " +} + +parse_args() { + for i in $* + do + case $i in + [0-9]*) + prefix=$i + ;; + setup) + echo "mdadm test environment setup" + do_setup + trap 0; exit 0 + ;; + cleanup) + cleanup + exit 0 + ;; + --tests=*) + TESTLIST=`expr "x$i" : 'x[^=]*=\(.*\)' | sed -e 's/,/ /g'` + ;; + --logdir=*) + logdir=`expr "x$i" : 'x[^=]*=\(.*\)'` + ;; + --save-logs) + savelogs=1 + if [ ! -d $logdir ] ; then + mkdir $logdir + if [ $? -ne 0 ] ; then + exit 1; + fi + fi + ;; + --keep-going | --no-error) + exitonerror=0 + ;; + --disable-multipath) + unset MULTIPATH + ;; + --disable-integrity) + unset INTEGRITY + ;; + --dev=loop) + DEVTYPE=loop + ;; + --dev=lvm) + DEVTYPE=lvm + ;; + --dev=ram) + DEVTYPE=ram + ;; + --volgroup=*) + LVM_VOLGROUP=`expr "x$i" : 'x[^=]*=\(.*\)'` + ;; + --help) + do_help + exit 0; + ;; + -*) + echo " $0: Unknown argument: $i" + do_help + exit 0; + ;; + esac done +} + +parse_args $@ + +do_setup + +if [ "$savelogs" == "1" ]; then + echo "Saving logs to $logdir" +fi + +if [ "x$TESTLIST" != "x" ]; then + for script in $TESTLIST + do + do_test $testdir/$script + done +else + for script in $testdir/$prefix $testdir/$prefix*[^~] + do + do_test $script + done +fi exit 0 diff -Nru mdadm-3.2.5/tests/00linear mdadm-3.3/tests/00linear --- mdadm-3.2.5/tests/00linear 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/00linear 2013-09-03 04:47:47.000000000 +0000 @@ -23,4 +23,3 @@ check linear testdev $md0 5 $size 64 mdadm -S $md0 - diff -Nru mdadm-3.2.5/tests/00multipath mdadm-3.3/tests/00multipath --- mdadm-3.2.5/tests/00multipath 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/00multipath 2013-09-03 04:47:47.000000000 +0000 @@ -2,6 +2,11 @@ # # create a multipath, and fail and stuff +if [ "$MULTIPATH" != 
"yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi + mdadm -CR $md1 -l multipath -n2 $path0 $path1 testdev $md1 1 $mdsize12 1 diff -Nru mdadm-3.2.5/tests/00names mdadm-3.3/tests/00names --- mdadm-3.2.5/tests/00names 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/00names 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,13 @@ +set -x -e + +# create arrays with non-numeric names +conf=$targetdir/mdadm.conf +echo "CREATE names=yes" > $conf + +for i in linear raid0 raid1 raid4 raid5 raid6 +do + mdadm -CR --config $conf /dev/md/$i -l $i -n 4 $dev4 $dev3 $dev2 $dev1 + check $i + [ -d /sys/class/block/md_$i/md ] + mdadm -S md_$i +done diff -Nru mdadm-3.2.5/tests/01r5integ mdadm-3.3/tests/01r5integ --- mdadm-3.2.5/tests/01r5integ 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/01r5integ 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,12 @@ # Check integrity of raid5 in degraded mode # Create a 4 disk raid5, create a filesystem and -# sh1sum it with each device failed +# sha1sum it with each device failed + +if [ "$INTEGRITY" != "yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi for layout in ls rs la ra do @@ -16,9 +21,9 @@ mdadm $md0 -r $i blockdev --flushbufs $md0 sum1=`sha1sum $md0` - if [ $sum != $sum1 ] + if [ "$sum" != "$sum1" ] then - echo $sum does not matc $sum1 with $i missing + echo $sum does not match $sum1 with $i missing exit 1 fi mdadm $md0 -a $i @@ -26,4 +31,3 @@ done mdadm -S $md0 done - diff -Nru mdadm-3.2.5/tests/01raid6integ mdadm-3.3/tests/01raid6integ --- mdadm-3.2.5/tests/01raid6integ 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/01raid6integ 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,12 @@ # Check integrity of raid6 in degraded modes # Create a 5 disk raid6, dump some data to it, then -# sh1sum it with different pairs of devices failed +# sha1sum it with different pairs of devices failed + +if [ "$INTEGRITY" != "yes" ]; then + echo -ne 'skipping... 
' + exit 0 +fi layouts='ls rs la ra' lv=`uname -r` @@ -25,9 +30,9 @@ mdadm $md0 -r $second blockdev --flushbufs $md0 sum1=`sha1sum $md0` - if [ $sum != $sum1 ] + if [ "$sum" != "$sum1" ] then - echo $sum does not matc $sum1 with $second missing + echo $sum does not match $sum1 with $second missing exit 1 fi for first in $totest @@ -36,9 +41,9 @@ mdadm $md0 -r $first blockdev --flushbufs $md0 sum1=`sha1sum $md0` - if [ $sum != $sum1 ] + if [ "$sum" != "$sum1" ] then - echo $sum does not matc $sum1 with $first and $second missing + echo $sum does not match $sum1 with $first and $second missing exit 1 fi mdadm $md0 -a $first @@ -50,4 +55,3 @@ done mdadm -S $md0 done - diff -Nru mdadm-3.2.5/tests/01replace mdadm-3.3/tests/01replace --- mdadm-3.2.5/tests/01replace 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/01replace 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,52 @@ +set -x -e + +## test --replace for raid5 raid6 raid1 and raid10 +#1/ after replace, can remove replaced device +#2/ after --replace-with cannot remove the 'with' device +#3/ preserve integrity with concurrent failure + +for level in 1 5 6 10 +do + dd if=/dev/zero of=$dev4 bs=1M || true + dd if=/dev/zero of=$dev5 bs=1M || true + mdadm -CR $md0 -l $level -n4 -x2 $devlist5 + dd if=/dev/urandom of=$md0 bs=1M || true + sum=`sha1sum < $md0` + check wait + mdadm $md0 --replace $dev1 + check wait + mdadm $md0 --remove $dev1 + mdadm $md0 --remove $dev5 && exit 1 + mdadm -S $md0 + dd if=/dev/zero of=$dev4 bs=1M || true + dd if=/dev/zero of=$dev5 bs=1M || true + mdadm -CR $md0 -l $level -n4 -x2 $devlist5 + check wait + sum1=`sha1sum < $md0` + [ "$sum" == "$sum1" ] + + mdadm $md0 --replace $dev1 --with $dev4 + check wait + mdadm $md0 --remove $dev1 + mdadm $md0 --remove $dev5 + mdadm $md0 --remove $dev4 && exit 1 + + mdadm $md0 --add $dev1 $dev5 + mdadm $md0 --replace $dev0 + sleep 1 + mdadm $md0 --fail $dev2 + check wait + sum2=`sha1sum < $md0` + [ "$sum" == "$sum2" ] + + mdadm $md0 --remove $dev0 $dev2 
+ mdadm $md0 --add $dev0 $dev2 + mdadm $md0 --replace $dev3 + sleep 1 + mdadm $md0 --fail $dev0 $dev2 + check wait + sum3=`sha1sum < $md0` + [ "$sum" == "$sum3" ] + + mdadm -S $md0 +done diff -Nru mdadm-3.2.5/tests/02r5grow mdadm-3.3/tests/02r5grow --- mdadm-3.2.5/tests/02r5grow 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/02r5grow 2013-09-03 04:47:47.000000000 +0000 @@ -2,7 +2,7 @@ # create a small raid5 array, make it larger. Then make it smaller -mdadm -CR $md0 -e0.90 --level raid5 --chunk=32 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +mdadm -CR $md0 -e0.90 --level raid5 --chunk=64 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 check wait check state UUU testdev $md0 2 $[size/2] 32 diff -Nru mdadm-3.2.5/tests/02r6grow mdadm-3.3/tests/02r6grow --- mdadm-3.2.5/tests/02r6grow 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/02r6grow 2013-09-03 04:47:47.000000000 +0000 @@ -2,7 +2,7 @@ # create a small raid6 array, make it larger. Then make it smaller -mdadm -CR $md0 -e 0.90 --level raid6 --chunk=32 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +mdadm -CR $md0 -e 0.90 --level raid6 --chunk=64 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 check wait check state UUUU testdev $md0 2 $[size/2] 32 diff -Nru mdadm-3.2.5/tests/03assem-incr mdadm-3.3/tests/03assem-incr --- mdadm-3.2.5/tests/03assem-incr 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/03assem-incr 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,17 @@ +set -x -e + +# Test interaction between -I and -A +# there are locking issue too, but those are hard to test for. 
+# +# Here just test that a partly "-I" assembled array can +# be completed with "-A" + +for l in 0 1 5 linear +do + mdadm -CR $md0 -l $l -n5 $dev0 $dev1 $dev2 $dev3 $dev4 --assume-clean + mdadm -S md0 + mdadm -I $dev1 + mdadm -I $dev3 + mdadm -A /dev/md0 $dev0 $dev1 $dev2 $dev3 $dev4 + mdadm -S /dev/md0 +done diff -Nru mdadm-3.2.5/tests/03r5assemV1 mdadm-3.3/tests/03r5assemV1 --- mdadm-3.2.5/tests/03r5assemV1 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/03r5assemV1 2013-09-03 04:47:47.000000000 +0000 @@ -60,7 +60,8 @@ ### Now with a missing device # We don't want the recovery to complete while we are # messing about here. -echo 1000 > /proc/sys/dev/raid/speed_limit_max +echo 100 > /proc/sys/dev/raid/speed_limit_max +echo 100 > /proc/sys/dev/raid/speed_limit_min mdadm -AR $md1 $dev0 $dev2 $dev3 $dev4 # check state U_U @@ -124,3 +125,4 @@ mdadm -I -c $conf $dev2 eval $tst echo 2000 > /proc/sys/dev/raid/speed_limit_max +echo 1000 > /proc/sys/dev/raid/speed_limit_min diff -Nru mdadm-3.2.5/tests/04r5swap mdadm-3.3/tests/04r5swap --- mdadm-3.2.5/tests/04r5swap 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/04r5swap 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,18 @@ + +# make a raid5 array, byte swap the superblocks, then assemble... 
+ +mdadm -CR $md0 -e 0.90 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +sleep 4 +mdadm -S $md0 + +mdadm -E --metadata=0 $dev1 > $targetdir/d1 +for d in $dev0 $dev1 $dev2 $dev3 +do $dir/swap_super $d +done +mdadm -E --metadata=0.swap $dev1 > $targetdir/d1s +diff -u $targetdir/d1 $targetdir/d1s + +mdadm --assemble --update=byteorder $md0 $dev0 $dev1 $dev2 $dev3 +sleep 3 +check recovery +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/04update-metadata mdadm-3.3/tests/04update-metadata --- mdadm-3.2.5/tests/04update-metadata 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/04update-metadata 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,48 @@ +set -xe + +# test converting v0.90 to v1.0 +# check for different levels +# check it fails for non-v0.90 +# check it fails during reshape or recovery +# check it fails when bitmap is present + +dlist="$dev0 $dev1 $dev2 $dev3" + +for ls in raid0/4 linear/4 raid1/1 raid5/3 raid6/2 +do + s=${ls#*/} l=${ls%/*} + mdadm -CR --assume-clean -e 0.90 $md0 --level $l -n 4 -c 64 $dlist + testdev $md0 $s 19904 64 + mdadm -S $md0 + mdadm -A $md0 --update=metadata $dlist + testdev $md0 $s 19904 64 check + mdadm -S $md0 +done + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail with v1.0 metadata + exit 1 +fi + +mdadm -CR -e 0.90 $md0 --level=6 -n4 -c32 $dlist +mdadm -S $md0 + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail during resync + exit 1 +fi +mdadm -A $md0 $dlist +mdadm --wait $md0 +mdadm -S $md0 + +# should succeed now +mdadm -A $md0 --update=metadata $dlist + +mdadm -S /dev/md0 +mdadm -CR --assume-clean -e 0.90 $md0 --level=6 -n4 -c32 $dlist --bitmap=internal +mdadm -S $md0 + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail when bitmap present + exit 1 +fi diff -Nru mdadm-3.2.5/tests/04update-uuid mdadm-3.3/tests/04update-uuid --- mdadm-3.2.5/tests/04update-uuid 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/04update-uuid 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,82 @@ 
+set -x + +# create an array, then change the uuid. + +mdadm -CR --assume-clean $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -S /dev/md0 + +# try v1 superblock + +mdadm -CR --assume-clean -e1 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -S /dev/md0 + + +# now if we have a bitmap, that needs updating too. +rm -f $targetdir/bitmap +mdadm -CR --assume-clean -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || + mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 +then : ; else + echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; +fi +mdadm -S /dev/md0 + +# and bitmap for version1 +rm -f $targetdir/bitmap +mdadm -CR --assume-clean -e1.1 -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +# -X cannot tell which byteorder to use for the UUID, so allow both. 
+if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || + mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 +then : ; else + echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; +fi +mdadm -S /dev/md0 + +# Internal bitmaps too. +mdadm -CR --assume-clean -b internal --bitmap-chunk 4 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -X $dev0; exit 2; +} +mdadm -S /dev/md0 + +mdadm -CR --assume-clean -e1.2 -b internal --bitmap-chunk=4 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -X $dev0; exit 2; +} +mdadm -S /dev/md0 diff -Nru mdadm-3.2.5/tests/05r1-add-internalbitmap mdadm-3.3/tests/05r1-add-internalbitmap --- mdadm-3.2.5/tests/05r1-add-internalbitmap 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/05r1-add-internalbitmap 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. 
This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/05r1-add-internalbitmap-v1a mdadm-3.3/tests/05r1-add-internalbitmap-v1a --- mdadm-3.2.5/tests/05r1-add-internalbitmap-v1a 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/05r1-add-internalbitmap-v1a 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/05r1-add-internalbitmap-v1b mdadm-3.3/tests/05r1-add-internalbitmap-v1b --- mdadm-3.2.5/tests/05r1-add-internalbitmap-v1b 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/05r1-add-internalbitmap-v1b 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. 
This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/05r1-add-internalbitmap-v1c mdadm-3.3/tests/05r1-add-internalbitmap-v1c --- mdadm-3.2.5/tests/05r1-add-internalbitmap-v1c 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/05r1-add-internalbitmap-v1c 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/05r1-bitmapfile mdadm-3.3/tests/05r1-bitmapfile --- mdadm-3.2.5/tests/05r1-bitmapfile 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/05r1-bitmapfile 2013-09-03 04:47:47.000000000 +0000 @@ -33,6 +33,7 @@ mdadm --assemble -R $md0 --bitmap=$bmf $dev2 dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +mdadm --zero $dev1 # force --add, not --re-add mdadm $md0 --add $dev1 #it is too fast# check recovery diff -Nru mdadm-3.2.5/tests/05r1-remove-internalbitmap 
mdadm-3.3/tests/05r1-remove-internalbitmap --- mdadm-3.2.5/tests/05r1-remove-internalbitmap 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/05r1-remove-internalbitmap 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/05r1-remove-internalbitmap-v1a mdadm-3.3/tests/05r1-remove-internalbitmap-v1a --- mdadm-3.2.5/tests/05r1-remove-internalbitmap-v1a 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/05r1-remove-internalbitmap-v1a 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/05r1-remove-internalbitmap-v1b mdadm-3.3/tests/05r1-remove-internalbitmap-v1b --- mdadm-3.2.5/tests/05r1-remove-internalbitmap-v1b 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/05r1-remove-internalbitmap-v1b 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm 
--create --run $md0 --metadata=1.1 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/05r1-remove-internalbitmap-v1c mdadm-3.3/tests/05r1-remove-internalbitmap-v1c --- mdadm-3.2.5/tests/05r1-remove-internalbitmap-v1c 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/05r1-remove-internalbitmap-v1c 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/05r5-bitmapfile mdadm-3.3/tests/05r5-bitmapfile --- mdadm-3.2.5/tests/05r5-bitmapfile 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/05r5-bitmapfile 2013-09-03 04:47:47.000000000 +0000 @@ -32,6 +32,7 @@ mdadm -S $md0 mdadm --assemble -R $md0 --bitmap=$bmf $dev2 $dev3 +mdadm --zero $dev1 # force add, not re-add mdadm $md0 --add $dev1 check recovery diff -Nru mdadm-3.2.5/tests/05r5-internalbitmap mdadm-3.3/tests/05r5-internalbitmap --- mdadm-3.2.5/tests/05r5-internalbitmap 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/05r5-internalbitmap 2013-09-03 04:47:47.000000000 +0000 @@ -30,6 +30,7 @@ mdadm -S $md0 mdadm --assemble -R $md0 $dev2 $dev3 +mdadm --zero $dev1 # force --add, not --re-add mdadm $md0 --add $dev1 check 
recovery diff -Nru mdadm-3.2.5/tests/05r6-bitmapfile mdadm-3.3/tests/05r6-bitmapfile --- mdadm-3.2.5/tests/05r6-bitmapfile 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/05r6-bitmapfile 2013-09-03 04:47:47.000000000 +0000 @@ -32,6 +32,7 @@ mdadm -S $md0 mdadm --assemble -R $md0 --bitmap=$bmf $dev1 $dev2 $dev4 +mdadm --zero $dev3 # force --add, not --re-add mdadm $md0 --add $dev3 check recovery diff -Nru mdadm-3.2.5/tests/05r6tor0 mdadm-3.3/tests/05r6tor0 --- mdadm-3.2.5/tests/05r6tor0 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/05r6tor0 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,27 @@ +set -x -e + +# reshape a RAID6 to RAID5 and then RAID0. +# then reshape back up to RAID5 and RAID5 + +mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check wait +check raid6 +testdev $md0 3 19456 512 +mdadm -G $md0 -l5 +check wait +check raid5 +testdev $md0 3 19456 512 +mdadm -G $md0 -l0 +check wait +check raid0 +testdev $md0 3 19456 512 +mdadm -G $md0 -l5 --add $dev3 $dev4 +check wait +check raid5 +check algorithm 2 +testdev $md0 3 19456 512 +mdadm -G $md0 -l 6 +check wait +check raid6 +check algorithm 2 +testdev $md0 3 19456 512 diff -Nru mdadm-3.2.5/tests/06name mdadm-3.3/tests/06name --- mdadm-3.2.5/tests/06name 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/06name 2013-09-03 04:47:47.000000000 +0000 @@ -10,4 +10,3 @@ mdadm -A $md0 --name="Fred" $devlist #mdadm -Db $md0 mdadm -S $md0 - diff -Nru mdadm-3.2.5/tests/06r5swap mdadm-3.3/tests/06r5swap --- mdadm-3.2.5/tests/06r5swap 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/06r5swap 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ - -# make a raid5 array, byte swap the superblocks, then assemble... 
- -mdadm -CR $md0 -e 0.90 -l5 -n4 $dev0 $dev1 $dev2 $dev3 -sleep 4 -mdadm -S $md0 - -mdadm -E --metadata=0 $dev1 > $targetdir/d1 -for d in $dev0 $dev1 $dev2 $dev3 -do $dir/swap_super $d -done -mdadm -E --metadata=0.swap $dev1 > $targetdir/d1s -diff -u $targetdir/d1 $targetdir/d1s - -mdadm --assemble --update=byteorder $md0 $dev0 $dev1 $dev2 $dev3 -sleep 3 -check recovery -mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/06update-uuid mdadm-3.3/tests/06update-uuid --- mdadm-3.2.5/tests/06update-uuid 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/06update-uuid 1970-01-01 00:00:00.000000000 +0000 @@ -1,82 +0,0 @@ -set -x - -# create an array, then change the uuid. - -mdadm -CR --assume-clean $md0 -l5 -n3 $dev0 $dev1 $dev2 -mdadm -S /dev/md0 -mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 -no_errors -mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { - echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; -} -mdadm -S /dev/md0 - -# try v1 superblock - -mdadm -CR --assume-clean -e1 $md0 -l5 -n3 $dev0 $dev1 $dev2 -mdadm -S /dev/md0 -mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 -no_errors -mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { - echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; -} -mdadm -S /dev/md0 - - -# now if we have a bitmap, that needs updating too. 
-rm -f $targetdir/bitmap -mdadm -CR --assume-clean -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 -mdadm -S /dev/md0 -mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 -no_errors -mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { - echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; -} -if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || - mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 -then : ; else - echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; -fi -mdadm -S /dev/md0 - -# and bitmap for version1 -rm -f $targetdir/bitmap -mdadm -CR --assume-clean -e1.1 -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 -mdadm -S /dev/md0 -mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 -no_errors -mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { - echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; -} -# -X cannot tell which byteorder to use for the UUID, so allow both. -if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || - mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 -then : ; else - echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; -fi -mdadm -S /dev/md0 - -# Internal bitmaps too. 
-mdadm -CR --assume-clean -b internal --bitmap-chunk 4 $md0 -l5 -n3 $dev0 $dev1 $dev2 -mdadm -S /dev/md0 -mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 -no_errors -mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { - echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; -} -mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { - echo Wrong uuid; mdadm -X $dev0; exit 2; -} -mdadm -S /dev/md0 - -mdadm -CR --assume-clean -e1.2 -b internal --bitmap-chunk=4 $md0 -l5 -n3 $dev0 $dev1 $dev2 -mdadm -S /dev/md0 -mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 -no_errors -mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { - echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; -} -mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { - echo Wrong uuid; mdadm -X $dev0; exit 2; -} -mdadm -S /dev/md0 diff -Nru mdadm-3.2.5/tests/06wrmostly mdadm-3.3/tests/06wrmostly --- mdadm-3.2.5/tests/06wrmostly 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/06wrmostly 2013-09-03 04:47:47.000000000 +0000 @@ -11,4 +11,3 @@ mdadm -CR $md0 -l1 -n3 --write-behind --bitmap=internal --bitmap-chunk=4 $dev0 $dev1 --write-mostly $dev2 testdev $md0 1 $mdsize1a 64 mdadm -S $md0 - diff -Nru mdadm-3.2.5/tests/07changelevelintr mdadm-3.3/tests/07changelevelintr --- mdadm-3.2.5/tests/07changelevelintr 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/07changelevelintr 2013-09-03 04:47:47.000000000 +0000 @@ -48,13 +48,13 @@ restart checkgeo md0 raid5 5 $[128*1024] 3 -mdadm -G $md0 --array-size 59136 +mdadm -G $md0 --array-size 58368 mdadm -G $md0 --raid-disks 4 -c 64 --backup-file=$bu restart checkgeo md0 raid5 4 $[64*1024] 3 devs="$dev0 $dev1 $dev2 $dev3" -mdadm -G $md0 --array-size 19712 +mdadm -G $md0 --array-size 19456 mdadm -G $md0 -n 2 -c 256 --backup-file=$bu restart checkgeo md0 raid5 2 $[256*1024] 3 diff -Nru 
mdadm-3.2.5/tests/07changelevels mdadm-3.3/tests/07changelevels --- mdadm-3.2.5/tests/07changelevels 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/07changelevels 2013-09-03 04:47:47.000000000 +0000 @@ -59,13 +59,16 @@ mdadm -G /dev/md0 --array-size 39936 mdadm -G $md0 -n4 --backup-file $bu +checkgeo md0 raid6 4 $[32*1024] dotest 2 mdadm -G $md0 -l5 --backup-file $bu +checkgeo md0 raid5 3 $[32*1024] dotest 2 mdadm -G /dev/md0 --array-size 19968 mdadm -G $md0 -n2 --backup-file $bu +checkgeo md0 raid5 2 $[32*1024] dotest 1 mdadm -G --level=1 $md0 @@ -79,9 +82,10 @@ mdadm $md0 --fail $dev0 -mdadm -G /dev/md0 --array-size 39936 +mdadm -G /dev/md0 --array-size 37888 mdadm -G $md0 -n4 --backup-file $bu dotest 2 +checkgeo md0 raid6 4 $[512*1024] mdadm $md0 --fail $dev4 mdadm $md0 --fail $dev3 @@ -98,10 +102,12 @@ mdadm -G $md0 -l5 --backup-file $bu dotest 2 -mdadm -G /dev/md0 --array-size 19968 +mdadm -G /dev/md0 --array-size 18944 mdadm -G $md0 -n2 --backup-file $bu dotest 1 +checkgeo md0 raid5 2 $[512*1024] mdadm $md0 --fail $dev2 mdadm -G --level=1 $md0 dotest 1 +checkgeo md0 raid1 2 diff -Nru mdadm-3.2.5/tests/07reshape5intr mdadm-3.3/tests/07reshape5intr --- mdadm-3.2.5/tests/07reshape5intr 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/07reshape5intr 2013-09-03 04:47:47.000000000 +0000 @@ -20,6 +20,7 @@ mdadm -CR $md0 -amd -l5 -c $chunk -n$disks --assume-clean $devs mdadm $md0 --add $dev6 + echo 20 > /proc/sys/dev/raid/speed_limit_min echo 20 > /proc/sys/dev/raid/speed_limit_max mdadm --grow $md0 -n $[disks+1] check reshape @@ -27,6 +28,7 @@ mdadm --stop $md0 mdadm --assemble $md0 $devs $dev6 check reshape + echo 1000 > /proc/sys/dev/raid/speed_limit_min echo 2000 > /proc/sys/dev/raid/speed_limit_max check wait echo check > /sys/block/md0/md/sync_action diff -Nru mdadm-3.2.5/tests/07revert-grow mdadm-3.3/tests/07revert-grow --- mdadm-3.2.5/tests/07revert-grow 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/07revert-grow 2013-09-03 
04:47:47.000000000 +0000 @@ -0,0 +1,52 @@ +set -e -x + +# revert a reshape that is increasing the number of devices, +# raid5, raid6, and raid10 + +# metadate 0.90 cannot handle RAID10 growth +# metadata 1.0 doesn't get a default headspace, is don't try it either. + +for metadata in 0.90 1.1 1.2 +do +# RAID5 +mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4 --metadata=$metadata +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID6 +mdadm -CR --assume-clean $md0 -l6 -n4 -x1 $devlist4 --metadata=$metadata +check raid6 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +if [ $metadata = 0.90 ]; then continue; fi + +# RAID10 +mdadm -CR --assume-clean $md0 -l10 -n4 -x1 $devlist4 --metadata=$metadata +check raid10 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist4 +check wait +check raid10 +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +done diff -Nru mdadm-3.2.5/tests/07revert-inplace mdadm-3.3/tests/07revert-inplace --- mdadm-3.2.5/tests/07revert-inplace 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/07revert-inplace 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,44 @@ +set -e -x + +# revert a reshape that is not changing the number of data devices, +# raid5, raid6, and raid10 + +# RAID5 -> RAID6 +mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4 +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -l 6 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +check algorithm 18 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID6 -> 
RAID5 +mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4 +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -l 5 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID10 - decrease chunk size +mdadm -CR --assume-clean $md0 -l10 -n6 -c 64 $devlist5 +check raid10 +testdev $md0 3 $mdsize1 64 +mdadm -G $md0 -c 32 +sleep 3 +mdadm -S $md0 +strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist5 +check wait +check raid10 +testdev $md0 3 $mdsize1 64 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/07revert-shrink mdadm-3.3/tests/07revert-shrink --- mdadm-3.2.5/tests/07revert-shrink 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/07revert-shrink 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,56 @@ +set -e -x + +# revert a reshape that is decreasing the number of devices, +# raid5, raid6, and raid10 + +bu=$targetdir/md-backup +rm -f $bu +# RAID5 +mdadm -CR --assume-clean $md0 -l5 -n5 $devlist4 +check raid5 +testdev $md0 4 $mdsize1 512 +mdadm --grow $md0 --array-size 56832 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -n 4 --backup=$bu +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu +check wait +check raid5 +fsck -f -n $md0 +testdev $md0 4 $mdsize1 512 +mdadm -S $md0 + +#FIXME +rm -f $bu +# RAID6 +mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4 +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm --grow $md0 --array-size 37888 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 4 --backup=$bu +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu +check wait +check raid6 +fsck -f -n $md0 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID10 +mdadm -CR --assume-clean $md0 -l10 -n6 $devlist5 +check raid10 +testdev $md0 3 $mdsize1 512 +mdadm --grow $md0 --array-size 36864 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 4 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 
--update=revert-reshape $devlist5 +check wait +check raid10 +fsck -f -n $md0 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 diff -Nru mdadm-3.2.5/tests/07testreshape5 mdadm-3.3/tests/07testreshape5 --- mdadm-3.2.5/tests/07testreshape5 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/07testreshape5 2013-09-03 04:47:47.000000000 +0000 @@ -43,4 +43,3 @@ done done exit 0 - diff -Nru mdadm-3.2.5/tests/08imsm-overlap mdadm-3.3/tests/08imsm-overlap --- mdadm-3.2.5/tests/08imsm-overlap 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/08imsm-overlap 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ - -. tests/env-imsm-template - -# create raid arrays with varying degress of overlap -mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 -imsm_check container 6 - -size=1910 -level=1 -num_disks=2 -mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size -mdadm -CR $member1 $dev1 $dev2 -n $num_disks -l $level -z $size -mdadm -CR $member2 $dev2 $dev3 -n $num_disks -l $level -z $size -mdadm -CR $member3 $dev3 $dev4 -n $num_disks -l $level -z $size -mdadm -CR $member4 $dev4 $dev5 -n $num_disks -l $level -z $size - -udevadm settle - -offset=0 -imsm_check member $member0 $num_disks $level $size 1024 $offset -offset=$((offset+size+2048)) -imsm_check member $member1 $num_disks $level $size 1024 $offset -offset=$((offset+size+2048)) -imsm_check member $member2 $num_disks $level $size 1024 $offset -# at this point there should be more freespace at the start of the disk -# than the end -offset=0 -imsm_check member $member3 $num_disks $level $size 1024 $offset -offset=$((offset+size+2048)) -imsm_check member $member4 $num_disks $level $size 1024 $offset diff -Nru mdadm-3.2.5/tests/09imsm-create-fail-rebuild mdadm-3.3/tests/09imsm-create-fail-rebuild --- mdadm-3.2.5/tests/09imsm-create-fail-rebuild 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/09imsm-create-fail-rebuild 2013-09-03 04:47:47.000000000 +0000 @@ -76,4 +76,3 @@ mdadm --add 
$container $dev4 check wait imsm_check_hold $container $dev4 - diff -Nru mdadm-3.2.5/tests/09imsm-overlap mdadm-3.3/tests/09imsm-overlap --- mdadm-3.2.5/tests/09imsm-overlap 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/09imsm-overlap 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,30 @@ + +. tests/env-imsm-template + +# create raid arrays with varying degress of overlap +mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 +imsm_check container 6 + +size=1910 +level=1 +num_disks=2 +mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size +mdadm -CR $member1 $dev1 $dev2 -n $num_disks -l $level -z $size +mdadm -CR $member2 $dev2 $dev3 -n $num_disks -l $level -z $size +mdadm -CR $member3 $dev3 $dev4 -n $num_disks -l $level -z $size +mdadm -CR $member4 $dev4 $dev5 -n $num_disks -l $level -z $size + +udevadm settle + +offset=0 +imsm_check member $member0 $num_disks $level $size 1024 $offset +offset=$((offset+size+2048)) +imsm_check member $member1 $num_disks $level $size 1024 $offset +offset=$((offset+size+2048)) +imsm_check member $member2 $num_disks $level $size 1024 $offset +# at this point there should be more freespace at the start of the disk +# than the end +offset=0 +imsm_check member $member3 $num_disks $level $size 1024 $offset +offset=$((offset+size+2048)) +imsm_check member $member4 $num_disks $level $size 1024 $offset diff -Nru mdadm-3.2.5/tests/10ddf-create mdadm-3.3/tests/10ddf-create --- mdadm-3.2.5/tests/10ddf-create 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/10ddf-create 2013-09-03 04:47:47.000000000 +0000 @@ -2,7 +2,8 @@ # Test basic DDF functionality. 
# # Create a container with 5 drives -# create a small raid0 across them all, then a 2disk raid1 +# create a small raid0 across them all, +# then a small raid10 using 4 drives, then a 2disk raid1 # and a 3disk raid5 using the remaining space # # add some data, tear down the array, reassemble @@ -10,21 +11,25 @@ set -e mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12 -mdadm -CR r0 -l0 -n5 /dev/md/ddf0 -z 5000 -if mdadm -CR r0 -l1 -n2 /dev/md/ddf0 -z 5000 +mdadm -CR r5 -l5 -n5 /dev/md/ddf0 -z 5000 +if mdadm -CR r5 -l1 -n2 /dev/md/ddf0 -z 5000 then echo >&2 create with same name should fail ; exit 1 fi +mdadm -CR r10 -l10 -n4 -pn2 /dev/md/ddf0 -z 5000 mdadm -CR r1 -l1 -n2 /dev/md/ddf0 -mdadm -CR r5 -l5 -n3 /dev/md/ddf0 -testdev /dev/md/r0 5 5000 512 -# r0 will use 4608 due to chunk size, so that leaves 28160 for the rest -testdev /dev/md/r1 1 28160 64 -testdev /dev/md/r5 2 28160 512 +mdadm -CR r0 -l0 -n3 /dev/md/ddf0 +testdev /dev/md/r5 4 5000 512 +testdev /dev/md/r10 2 5000 512 +# r0/r10 will use 4608 due to chunk size, so that leaves 23552 for the rest +testdev /dev/md/r1 1 23552 64 +testdev /dev/md/r0 3 23552 512 dd if=/dev/sda of=/dev/md/r0 || true +dd if=/dev/sda of=/dev/md/r10 || true dd if=/dev/sda of=/dev/md/r1 || true dd if=/dev/sda of=/dev/md/r5 || true s0=`sha1sum /dev/md/r0` +s10=`sha1sum /dev/md/r10` s1=`sha1sum /dev/md/r1` s5=`sha1sum /dev/md/r5` @@ -35,12 +40,16 @@ udevadm settle s0a=`sha1sum /dev/md/r0` +s10a=`sha1sum /dev/md/r10` s1a=`sha1sum /dev/md/r1` s5a=`sha1sum /dev/md/r5` if [ "$s0" != "$s0a" ]; then echo r0 did not match ; exit 1; fi +if [ "$s10" != "$s10a" ]; then + echo r10 did not match ; exit 1; +fi if [ "$s1" != "$s1a" ]; then echo r1 did not match ; exit 1; fi @@ -50,6 +59,7 @@ # failure status just means it has completed already, so ignore it. 
mdadm --wait /dev/md/r1 || true +mdadm --wait /dev/md/r10 || true mdadm --wait /dev/md/r5 || true mdadm -Dbs > /var/tmp/mdadm.conf @@ -67,11 +77,7 @@ # and now assemble fully incrementally. for i in $dev8 $dev9 $dev10 $dev11 $dev12 do - #./mdadm -I $i -vv 2>&1 | wc -l > /tmp/cnt - ./mdadm -I $i 2> /tmp/thing - wc -l < /tmp/thing > /tmp/cnt - # should find container and 2 devices, so 3 lines. - [ `cat /tmp/cnt` -eq 3 ] + mdadm -I $i -c /var/tmp/mdadm.conf done check nosync udevadm settle diff -Nru mdadm-3.2.5/tests/10ddf-create-fail-rebuild mdadm-3.3/tests/10ddf-create-fail-rebuild --- mdadm-3.2.5/tests/10ddf-create-fail-rebuild 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/10ddf-create-fail-rebuild 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,77 @@ +# sanity check array creation + +ddf_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +ddf_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +. 
tests/env-ddf-template + +num_disks=2 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 +ddf_check container $num_disks + +# RAID0 + RAID1 +size=9000 +level=0 +chunk=64 +offset=0 +layout=0 +mdadm -CR $member0 $dev8 $dev9 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout +testdev $member0 $num_disks $size $chunk + +offset=$(((size & ~(chunk - 1)))) +size=4000 +level=1 +chunk=0 +mdadm -CR $member1 $dev8 $dev9 -n $num_disks -l $level -z $size +ddf_check member $member1 $num_disks $level $size $size $offset $chunk $layout +testdev $member1 1 $size 1 +check wait + +mdadm -Ss + +# RAID10 + RAID5 +num_disks=4 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11 +ddf_check container $num_disks + +size=9000 +level=10 +chunk=64 +offset=0 +layout=2 +mdadm -CR $member0 $dev8 $dev9 $dev10 $dev11 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout +testdev $member0 $((num_disks-2)) $size $chunk + +offset=$(((size & ~(chunk - 1)))) +size=4000 +level=5 +mdadm -CR $member1 $dev8 $dev9 $dev10 $dev11 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member1 $num_disks $level $size $((size*3)) $offset $chunk $layout +testdev $member1 $((num_disks-1)) $size $chunk +check wait + +# FAIL / REBUILD +ddf_check_hold $container $dev8 +mdadm --fail $member0 $dev8 +mdadm --wait-clean --scan +ddf_check_removal $container $dev8 +mdadm --add $container $dev12 +check wait +ddf_check_hold $container $dev12 diff -Nru mdadm-3.2.5/tests/10ddf-fail-create-race mdadm-3.3/tests/10ddf-fail-create-race --- mdadm-3.2.5/tests/10ddf-fail-create-race 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/10ddf-fail-create-race 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,66 @@ +# This test creates a RAID1, fails a disk, and immediately +# (simultaneously) creates a new array. 
This tests for a possible +# race where the meta data reflecting the disk failure may not +# be written when the 2nd array is created. +. tests/env-ddf-template + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +mdadm -CR $container -e ddf -l container -n 2 $dev11 $dev12 +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container -z 10000 >/tmp/mdmon.txt 2>&1 +mdadm -CR $member0 -l raid1 -n 2 $container -z 10000 +check wait +fail0=$dev11 +mdadm --fail $member0 $fail0 & + +# The test can succeed two ways: +# 1) mdadm -C member1 fails - in this case the meta data +# was already on disk when the create attempt was made +# 2) mdadm -C succeeds in the first place (meta data not on disk yet), +# but mdmon detects the problem and sets the disk faulty. + +if mdadm -CR $member1 -l raid1 -n 2 $container; then + + echo create should have failed / race condition? + + check wait + set -- $(get_raiddisks $member0) + d0=$1 + ret=0 + if [ $1 = $fail0 -o $2 = $fail0 ]; then + ret=1 + else + set -- $(get_raiddisks $member1) + if [ $1 = $fail0 -o $2 = $fail0 ]; then + ret=1 + fi + fi + if [ $ret -eq 1 ]; then + echo ERROR: failed disk $fail0 is still a RAID member + echo $member0: $(get_raiddisks $member0) + echo $member1: $(get_raiddisks $member1) + fi + tmp=$(mktemp /tmp/mdest-XXXXXX) + mdadm -E $d0 >$tmp + if [ x$(grep -c 'state\[[01]\] : Degraded' $tmp) != x2 ]; then + echo ERROR: non-degraded array found + mdadm -E $d0 + ret=1 + fi + if ! 
grep -q '^ *0 *[0-9a-f]\{8\} .*Offline, Failed' $tmp; then + echo ERROR: disk 0 not marked as failed in meta data + mdadm -E $d0 + ret=1 + fi + rm -f $tmp +else + ret=0 +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} + +[ $ret -eq 0 ] + diff -Nru mdadm-3.2.5/tests/10ddf-fail-spare mdadm-3.3/tests/10ddf-fail-spare --- mdadm-3.2.5/tests/10ddf-fail-spare 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/10ddf-fail-spare 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,86 @@ +# Test suggested by Albert Pauw: Create, fail one disk, have mdmon +# activate the spare, +# then run create again. Shouldn't use the failed disk for Create, +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -CR $container -e ddf -l container -n 5 $dev8 $dev9 $dev10 $dev11 $dev12 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm --fail $member0 $fail0 + +# To make sure the spare is activated, we may have to sleep +# 2s has always been enough for me +sleep 2 +check wait + +# This test can succeed both ways - if spare was activated +# before new array was created, we see only member 0. +# otherwise, we see both, adn member0 is degraded because the +# new array grabbed the spare +# which case occurs depends on the sleep time above. +ret=0 +if mdadm -CR $member1 -l raid5 -n 3 $container; then + # Creation successful - must have been quicker than spare activation + + check wait + set -- $(get_raiddisks $member1) + if [ $1 = $fail0 -o $2 = $fail0 -o $3 = $fail0 ]; then + echo ERROR: $member1 must not contain $fail0: $@ + ret=1 + fi + d1=$1 + mdadm -E $d1 >$tmp + if ! 
grep -q 'state\[1\] : Optimal, Consistent' $tmp; then + echo ERROR: member 1 should be optimal in meta data + ret=1 + fi + state0=Degraded +else + # Creation unsuccessful - spare was used for member 0 + state0=Optimal +fi + +# need to delay a little bit, sometimes the meta data aren't +# up-to-date yet +sleep 0.5 +set -- $(get_raiddisks $member0) +if [ $1 = $fail0 -o $2 = $fail0 ]; then + echo ERROR: $member0 must not contain $fail0: $@ + ret=1 +fi +d0=$1 + +[ -f $tmp ] || mdadm -E $d0 >$tmp + +if ! grep -q 'state\[0\] : '$state0', Consistent' $tmp; then + echo ERROR: member 0 should be $state0 in meta data + ret=1 +fi +if ! grep -q 'Offline, Failed' $tmp; then + echo ERROR: Failed disk expected in meta data + ret=1 +fi +if [ $ret -eq 1 ]; then + cat /proc/mdstat + mdadm -E $d0 + mdadm -E $d1 + mdadm -E $fail0 +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} + +rm -f $tmp +[ $ret -eq 0 ] diff -Nru mdadm-3.2.5/tests/10ddf-fail-twice mdadm-3.3/tests/10ddf-fail-twice --- mdadm-3.2.5/tests/10ddf-fail-twice 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/10ddf-fail-twice 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,59 @@ +. 
tests/env-ddf-template + +num_disks=5 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11 $dev12 +ddf_check container $num_disks + +mdadm -CR $member0 -n 2 -l 1 $container +mdadm -CR $member1 -n 3 -l 5 $container + +mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 +set -- $(get_raiddisks $member1) +fail1=$1 +mdadm $member1 --fail $fail1 + +mdadm $container --add $dev13 + +mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 + + +devs0="$(get_raiddisks $member0)" +devs1="$(get_raiddisks $member1)" + +present=$(($(get_present $member0) + $(get_present $member1))) +[ $present -eq 4 ] || { + echo expected 4 present disks, got $present + devices for $member0: $devs0 + devices for $member1: $devs1 + exit 1 +} + +if echo "$devs0" | grep -q MISSING; then + good=1 + bad=0 +else + good=0 + bad=1 +fi + +# find a good device +eval "set -- \$devs$good" +check=$1 + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +mdadm -E $check >$tmp + +{ grep -q 'state\['$bad'\] : Degraded, Consistent' $tmp && + grep -q 'state\['$good'\] : Optimal, Consistent' $tmp; } || { + echo unexpected meta data state on $check + mdadm -E $check + rm -f $tmp + exit 1 +} + +rm -f $tmp +exit 0 diff -Nru mdadm-3.2.5/tests/10ddf-fail-two-spares mdadm-3.3/tests/10ddf-fail-two-spares --- mdadm-3.2.5/tests/10ddf-fail-two-spares 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/10ddf-fail-two-spares 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,85 @@ +# Simulate two disks failing shorty after each other +. 
tests/env-ddf-template +tmp=$(mktemp /tmp/mdtest-XXXXXX) + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -CR $container -e ddf -l container -n 6 \ + $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +#fast_sync + +mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 +#$dir/mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 \ +# >/tmp/mdmon.txt 2>&1 +mdadm -CR $member1 -l raid10 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 + +dd if=/dev/sda of=$member0 bs=1M +dd if=/dev/sda of=$member1 bs=1M skip=16 + +check wait + +sum0=$(sha1sum $member0) +sum1=$(sha1sum $member1) + +mdadm --fail $member1 $dev11 +sleep 1 +mdadm --fail $member1 $dev12 + +# We will have 4 resync procedures, 2 spares for 2 arrays. +mdadm --wait $member1 $member0 +mdadm --wait $member1 $member0 + +devs0="$(get_raiddisks $member0)" +devs1="$(get_raiddisks $member1)" +expected="$dev10 +$dev13 +$dev8 +$dev9" + +ret=0 +if [ "$(echo "$devs0" | sort)" != "$expected" \ + -o "$(echo "$devs1" | sort)" != "$expected" ]; then + echo ERROR: unexpected members + echo $member0: $devs0 + echo $member1: $devs1 + ret=1 +fi + +mdadm -E $dev10 >$tmp +if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: $member0 should be optimal in meta data + ret=1 +fi +if ! 
grep -q 'state\[1\] : Optimal, Consistent' $tmp; then + echo ERROR: $member1 should be optimal in meta data + ret=1 +fi +if [ x"$(grep -c active/Online $tmp)" != x4 ]; then + echo ERROR: expected 4 online disks + ret=1 +fi +if [ x"$(grep -c "Offline, Failed" $tmp)" != x2 ]; then + echo ERROR: expected 2 failed disks + ret=1 +fi + +sum0a=$(sha1sum $member0) +sum1a=$(sha1sum $member1) + +if [ "$sum0" != "$sum0a" -o "$sum1" != "$sum1a" ]; then + echo ERROR: checksum mismatch + ret=1 +fi + +if [ $ret -eq 1 ]; then + cat /proc/mdstat + cat $tmp +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} +rm -f $tmp + +[ $ret -eq 0 ] diff -Nru mdadm-3.2.5/tests/10ddf-geometry mdadm-3.3/tests/10ddf-geometry --- mdadm-3.2.5/tests/10ddf-geometry 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/10ddf-geometry 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,82 @@ +# +# Test various RAID geometries, creation and deletion of subarrays +# + +assert_fail() { + if mdadm "$@"; then + echo mdadm "$@" must fail + return 1 + else + return 0 + fi +} + +assert_kill() { + local dev=$1 n=$2 + mdadm -S $dev + mdadm --kill-subarray=$n /dev/md/ddf0 + if mdadm -Dbs | grep -q $dev; then + echo >&2 $dev should be deleted + return 1 + fi + return 0 +} + +set -e +mdadm -CR /dev/md/ddf0 -e ddf -n 6 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +# RAID1 geometries +# Use different sizes to make offset calculation harder +mdadm -CR l1s -l1 -n2 /dev/md/ddf0 -z 8000 +mdadm -CR l1m -l1 -n3 $dev8 $dev9 $dev10 -z 10000 +assert_fail -CR badl1 -l1 -n4 /dev/md/ddf0 + +# RAID10 geometries +assert_fail -CR badl10 -l10 -n3 /dev/md/ddf0 +assert_fail -CR badl10 -l10 -n5 /dev/md/ddf0 +assert_fail -CR badl10 -l10 -n4 -pn3 /dev/md/ddf0 +mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 +mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 5000 + +assert_fail -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 +assert_kill /dev/md/l10_2 2 +# gone now, must be able to create it again +mdadm -CR l10_2 -l10 
-n6 -pn2 /dev/md/ddf0 -z 5000 + +# Now stop and reassemble +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +# Same as above, on inactive container +assert_fail -CR l10_3 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 +# Kill subarray without having started anything (no mdmon) +mdadm --kill-subarray=3 /dev/md/ddf0 +mdadm -I /dev/md/ddf0 +mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 5000 + +assert_kill /dev/md/l10_2 2 +assert_kill /dev/md/l10_3 3 + +# RAID5 geometries +mdadm -CR l5la -l5 -n3 --layout=ddf-N-restart /dev/md/ddf0 -z 5000 +mdadm -CR l5ra -l5 -n3 --layout=ddf-zero-restart /dev/md/ddf0 -z 5000 +mdadm -CR l5ls -l5 -n3 --layout=ddf-N-continue /dev/md/ddf0 -z 5000 +assert_fail -CR l5rs -l5 -n3 -prs /dev/md/ddf0 -z 5000 + +# Stop and reassemble +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -I /dev/md/ddf0 + +assert_kill /dev/md/l5la 2 +assert_kill /dev/md/l5ls 4 +assert_kill /dev/md/l5ra 3 + +# RAID6 geometries +assert_fail -CR l6la -l6 -n3 -pla /dev/md/ddf0 -z 5000 +assert_fail -CR l6rs -l5 -n4 -prs /dev/md/ddf0 -z 5000 +mdadm -CR l6la -l6 -n4 --layout=ddf-N-restart /dev/md/ddf0 -z 5000 +mdadm -CR l6ra -l6 -n4 --layout=ddf-zero-restart $dev8 $dev9 $dev10 $dev11 -z 5000 +mdadm -CR l6ls -l6 -n4 --layout=ddf-N-continue $dev13 $dev8 $dev9 $dev12 -z 5000 + +mdadm -Ss diff -Nru mdadm-3.2.5/tests/14imsm-r0_r0_2d-takeover-r10_4d mdadm-3.3/tests/14imsm-r0_r0_2d-takeover-r10_4d --- mdadm-3.2.5/tests/14imsm-r0_r0_2d-takeover-r10_4d 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/14imsm-r0_r0_2d-takeover-r10_4d 2013-09-03 04:47:47.000000000 +0000 @@ -28,4 +28,3 @@ vol0_new_chunk=64 . 
tests/imsm-grow-template 1 1 - diff -Nru mdadm-3.2.5/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k mdadm-3.3/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k --- mdadm-3.2.5/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k 2013-09-03 04:47:47.000000000 +0000 @@ -18,7 +18,4 @@ vol0_new_num_comps=$vol0_num_comps vol0_new_chunk=256 - . tests/imsm-grow-template 0 1 - - diff -Nru mdadm-3.2.5/tests/19raid6auto-repair mdadm-3.3/tests/19raid6auto-repair --- mdadm-3.2.5/tests/19raid6auto-repair 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/19raid6auto-repair 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,43 @@ +number_of_disks=5 +chunksize_in_kib=512 +chunksize_in_b=$[chunksize_in_kib*1024] +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev0 $dev1 $dev2 $dev3 $dev4" + +# default 2048 sectors +data_offset_in_kib=$[2048/2] + +# make a raid5 from a file +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib +mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs +dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib +blockdev --flushbufs $md0; sync +check wait +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + +# wipe out 5 chunks on each device +dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0] +dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5] +dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10] +dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15] +dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] 
seek=$[data_offset_in_kib+chunksize_in_kib*20] + +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches + +$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + +$dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; } +blockdev --flushbufs $md0 $devs; sync +echo 3 > /proc/sys/vm/drop_caches + +$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } +cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + +mdadm -S $md0 +udevadm settle +blockdev --flushbufs $md0 $devs; sync +echo 3 > /proc/sys/vm/drop_caches diff -Nru mdadm-3.2.5/tests/19raid6repair mdadm-3.3/tests/19raid6repair --- mdadm-3.2.5/tests/19raid6repair 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/19raid6repair 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,47 @@ +number_of_disks=4 +chunksize_in_kib=512 +chunksize_in_b=$[chunksize_in_kib*1024] +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev1 $dev2 $dev3 $dev4" + +# default 2048 sectors +data_offset_in_kib=$[2048/2] + +for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \ + "$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" "$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do + failure_split=( $failure ) + device_with_error=${failure_split[0]} + stripe_with_error=${failure_split[1]} + repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}" + start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error] + + # make a raid5 from a file + dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib + mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs + dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib + blockdev --flushbufs $md0; sync + + check wait + blockdev --flushbufs $devs; sync + echo 3 
> /proc/sys/vm/drop_caches + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + + dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib + blockdev --flushbufs $device_with_error; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + + $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; } + blockdev --flushbufs $md0 $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + mdadm -S $md0 + udevadm settle + blockdev --flushbufs $md0 $devs; sync + echo 3 > /proc/sys/vm/drop_caches +done diff -Nru mdadm-3.2.5/tests/19repair-does-not-destroy mdadm-3.3/tests/19repair-does-not-destroy --- mdadm-3.2.5/tests/19repair-does-not-destroy 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/19repair-does-not-destroy 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,29 @@ +number_of_disks=7 +chunksize_in_kib=512 +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6" + +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib +mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs +dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib +blockdev --flushbufs $md0; sync +check wait +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +$dir/raid6check $md0 repair 1 2 3 > /dev/null # D D +$dir/raid6check $md0 repair 8 2 5 > /dev/null # D P +$dir/raid6check $md0 repair 15 4 6 > /dev/null # D Q +$dir/raid6check $md0 repair 22 5 6 > /dev/null # P Q +$dir/raid6check $md0 repair 3 4 0 > /dev/null # Q D +$dir/raid6check $md0 repair 3 3 1 > 
/dev/null # P D +$dir/raid6check $md0 repair 6 4 5 > /dev/null # D /dev/null # D>D +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } +cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo should not mess up correct stripe ; exit 2; } + +mdadm -S $md0 +udevadm settle +blockdev --flushbufs $md0 $devs; sync diff -Nru mdadm-3.2.5/tests/env-ddf-template mdadm-3.3/tests/env-ddf-template --- mdadm-3.2.5/tests/env-ddf-template 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/tests/env-ddf-template 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,97 @@ +get_sysdir() { + local mddev=$1 + [ -L $mddev ] && mddev=$(readlink -f $mddev) + echo "/sys/class/block/$(basename $mddev)/md" +} + +get_raiddisks() { + sysdir=$(get_sysdir "$1") + for i in $(seq 0 $(($(cat $sysdir/raid_disks)-1))); do + if [ -d $sysdir/rd$i ]; then + readlink -f /dev/block/$(cat $sysdir/rd$i/block/dev) + else + echo MISSING + fi + done +} + +get_present() { + get_raiddisks $1 | grep -vc MISSING +} + +ddf_check() { + udevadm settle + case $1 in + container ) + grep -s "blocks super external:ddf" /proc/mdstat > /dev/null || { + echo >&2 "**Fatal** Correctly formed container not found"; cat /proc/mdstat; exit 1; } + ;; + member ) + t_member=$2 + t_num_disks=$3 + t_level=$4 + t_rd_size=$5 + t_size=$6 + t_offset=$7 + t_chunk=$8 + t_layout=$9 + + if [ $t_chunk -ne 0 ]; then + t_rd_size=$((t_rd_size & ~(t_chunk - 1))) + fi + case $t_level in + 0) t_size=$((t_num_disks*$t_rd_size));; + 1) t_size=$t_rd_size;; + 4|5) t_size=$(((t_num_disks-1)*$t_rd_size));; + 6) t_size=$(((t_num_disks-2)*$t_rd_size));; + 10) t_size=$((t_num_disks*$t_rd_size/t_layout));; + esac + + err=0 + + eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member` + sysfs=/sys/dev/block/${major}:${minor} + if [ ! 
-f ${sysfs}/md/array_state ]; then + echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1 + fi + _chunk=`cat ${sysfs}/md/chunk_size` + if [ $t_chunk -ne $((_chunk/1024)) ]; then + echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $_chunk" >&2 + err=$((err + 1)) + fi + for i in `seq 0 $((t_num_disks - 1))`; do + _offset=`cat ${sysfs}/md/rd${i}/offset` + if [ $t_offset -ne $((_offset / 2)) ]; then + echo "**Error**: Offset mismatch - expected $t_offset, actual $((_offset/2))" >&2 + err=$((err + 1)) + fi + _rd_size=`cat ${sysfs}/md/rd${i}/size` + if [ $t_rd_size -ne $_rd_size ]; then + echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2 + err=$((err + 1)) + fi + done + _size=`cat ${sysfs}/md/array_size` + [ o$_size = odefault ] && _size=$(($(cat ${sysfs}/size)/2)) + if [ $t_size -ne $_size ]; then + echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2 + err=$((err + 1)) + fi + if [ $err -gt 0 ]; then + echo "$t_member failed check" >&2 + cat /proc/mdstat >&2 + mdadm -E /dev/loop8 >&2 + exit 1 + fi + ;; + * ) + echo >&2 "**Error** unknown check $1"; exit 1; + esac +} + +container=/dev/md/ddf +member0=/dev/md/vol0 +member1=/dev/md/vol1 +member2=/dev/md/vol2 +member3=/dev/md/vol3 +member4=/dev/md/vol4 diff -Nru mdadm-3.2.5/tests/testdev mdadm-3.3/tests/testdev --- mdadm-3.2.5/tests/testdev 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/tests/testdev 2013-09-03 04:47:47.000000000 +0000 @@ -11,4 +11,3 @@ echo "ERROR: size is wrong for $dev: $cnt * $size (chunk=$chunk) = $rasize, not `/sbin/blockdev --getsize $dev`" exit 1; fi - diff -Nru mdadm-3.2.5/udev-md-raid-arrays.rules mdadm-3.3/udev-md-raid-arrays.rules --- mdadm-3.2.5/udev-md-raid-arrays.rules 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/udev-md-raid-arrays.rules 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,35 @@ +# do not edit this file, it will be overwritten on update + +SUBSYSTEM!="block", 
GOTO="md_end" + +# handle md arrays +ACTION!="add|change", GOTO="md_end" +KERNEL!="md*", GOTO="md_end" + +# partitions have no md/{array_state,metadata_version}, but should not +# for that reason be ignored. +ENV{DEVTYPE}=="partition", GOTO="md_ignore_state" + +# container devices have a metadata version of e.g. 'external:ddf' and +# never leave state 'inactive' +ATTR{md/metadata_version}=="external:[A-Za-z]*", ATTR{md/array_state}=="inactive", GOTO="md_ignore_state" +TEST!="md/array_state", ENV{SYSTEMD_READY}="0", GOTO="md_end" +ATTR{md/array_state}=="|clear|inactive", ENV{SYSTEMD_READY}="0", GOTO="md_end" +LABEL="md_ignore_state" + +IMPORT{program}="/sbin/mdadm --detail --export $devnode" +ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}" +ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}" +ENV{DEVTYPE}=="partition", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}-part%n", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}-part%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n" + +IMPORT{builtin}="blkid" +OPTIONS+="link_priority=100" +OPTIONS+="watch" +ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" +ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" + +LABEL="md_end" diff -Nru mdadm-3.2.5/udev-md-raid-assembly.rules mdadm-3.3/udev-md-raid-assembly.rules --- mdadm-3.2.5/udev-md-raid-assembly.rules 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/udev-md-raid-assembly.rules 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,19 @@ +# do not edit 
this file, it will be overwritten on update + +# assemble md arrays + +SUBSYSTEM!="block", GOTO="md_inc_end" + +# handle potential components of arrays (the ones supported by md) +ENV{ID_FS_TYPE}=="ddf_raid_member|isw_raid_member|linux_raid_member", GOTO="md_inc" +GOTO="md_inc_end" + +LABEL="md_inc" + +# remember you can limit what gets auto/incrementally assembled by +# mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY' +ACTION=="add", RUN+="/sbin/mdadm --incremental $devnode --offroot" +ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="/sbin/mdadm -If $name --path $env{ID_PATH}" +ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="/sbin/mdadm -If $name" + +LABEL="md_inc_end" diff -Nru mdadm-3.2.5/udev-md-raid.rules mdadm-3.3/udev-md-raid.rules --- mdadm-3.2.5/udev-md-raid.rules 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/udev-md-raid.rules 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -# do not edit this file, it will be overwritten on update - -SUBSYSTEM!="block", GOTO="md_end" - -# handle potential components of arrays (the ones supported by md) -ENV{ID_FS_TYPE}=="ddf_raid_member|isw_raid_member|linux_raid_member", GOTO="md_inc" -GOTO="md_inc_skip" - -LABEL="md_inc" - -# remember you can limit what gets auto/incrementally assembled by -# mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY' -ACTION=="add", RUN+="/sbin/mdadm --incremental $tempnode" -ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="/sbin/mdadm -If $name --path $env{ID_PATH}" -ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="/sbin/mdadm -If $name" - -LABEL="md_inc_skip" - -# handle md arrays -ACTION!="add|change", GOTO="md_end" -KERNEL!="md*", GOTO="md_end" - -# partitions have no md/{array_state,metadata_version}, but should not -# for that reason be ignored. -ENV{DEVTYPE}=="partition", GOTO="md_ignore_state" - -# container devices have a metadata version of e.g. 
'external:ddf' and -# never leave state 'inactive' -ATTR{md/metadata_version}=="external:[A-Za-z]*", ATTR{md/array_state}=="inactive", GOTO="md_ignore_state" -TEST!="md/array_state", GOTO="md_end" -ATTR{md/array_state}=="|clear|inactive", GOTO="md_end" -LABEL="md_ignore_state" - -IMPORT{program}="/sbin/mdadm --detail --export $tempnode" -ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace" -ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}" -ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}" -ENV{DEVTYPE}=="partition", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}-part%n", OPTIONS+="string_escape=replace" -ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}-part%n" -ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n" -ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n" - -IMPORT{program}="/sbin/blkid -o udev -p $tempnode" -OPTIONS+="link_priority=100" -OPTIONS+="watch" -ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" -ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" - -LABEL="md_end" diff -Nru mdadm-3.2.5/util.c mdadm-3.3/util.c --- mdadm-3.2.5/util.c 2012-05-18 07:10:03.000000000 +0000 +++ mdadm-3.3/util.c 2013-09-03 04:47:47.000000000 +0000 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2001-2013 Neil Brown * * * This program is free software; you can redistribute it and/or modify @@ -28,12 +28,13 @@ #include #include #include +#include +#include +#include #include #include #include -int __offroot; - /* * following taken from linux/blkpg.h because they aren't * anywhere else and it isn't safe to #include linux/ * stuff. @@ -43,10 +44,10 @@ /* The argument structure */ struct blkpg_ioctl_arg { - int op; - int flags; - int datalen; - void *data; + int op; + int flags; + int datalen; + void *data; }; /* The subfunctions (for the op field) */ @@ -89,15 +90,16 @@ int hit = 0; /* number of Hex digIT */ int i; char c; - for (i=0; i<4; i++) uuid[i]=0; + for (i = 0; i < 4; i++) + uuid[i] = 0; - while ((c= *str++)) { + while ((c = *str++) != 0) { int n; - if (c>='0' && c<='9') + if (c >= '0' && c <= '9') n = c-'0'; - else if (c>='a' && c <= 'f') + else if (c >= 'a' && c <= 'f') n = 10 + c - 'a'; - else if (c>='A' && c <= 'F') + else if (c >= 'A' && c <= 'F') n = 10 + c - 'A'; else if (strchr(":. -", c)) continue; @@ -114,7 +116,6 @@ return 0; } - /* * Get the md version number. 
* We use the RAID_VERSION ioctl if it is supported @@ -127,21 +128,21 @@ int md_get_version(int fd) { - struct stat stb; - mdu_version_t vers; + struct stat stb; + mdu_version_t vers; - if (fstat(fd, &stb)<0) - return -1; - if ((S_IFMT&stb.st_mode) != S_IFBLK) - return -1; + if (fstat(fd, &stb)<0) + return -1; + if ((S_IFMT&stb.st_mode) != S_IFBLK) + return -1; - if (ioctl(fd, RAID_VERSION, &vers) == 0) - return (vers.major*10000) + (vers.minor*100) + vers.patchlevel; - if (errno == EACCES) - return -1; - if (major(stb.st_rdev) == MD_MAJOR) - return (3600); - return -1; + if (ioctl(fd, RAID_VERSION, &vers) == 0) + return (vers.major*10000) + (vers.minor*100) + vers.patchlevel; + if (errno == EACCES) + return -1; + if (major(stb.st_rdev) == MD_MAJOR) + return (3600); + return -1; } int get_linux_version() @@ -188,13 +189,13 @@ return (a*1000000)+(b*1000)+c; } -long long parse_size(char *size) +unsigned long long parse_size(char *size) { /* parse 'size' which should be a number optionally * followed by 'K', 'M', or 'G'. * Without a suffix, K is assumed. * Number returned is in sectors (half-K) - * -1 returned on error. + * INVALID_SECTORS returned on error. 
*/ char *c; long long s = strtoll(size, &c, 10); @@ -213,10 +214,14 @@ c++; s *= 1024 * 1024 * 2; break; + case 's': /* sectors */ + c++; + break; } - } + } else + s = INVALID_SECTORS; if (*c) - s = -1; + s = INVALID_SECTORS; return s; } @@ -244,7 +249,7 @@ { /* Parse the layout string for 'faulty' */ int ln = strcspn(layout, "0123456789"); - char *m = strdup(layout); + char *m = xstrdup(layout); int mode; m[ln] = 0; mode = map_name(faultylayout, m); @@ -253,6 +258,17 @@ return mode | (atoi(layout+ln)<< ModeShift); } + +long parse_num(char *num) +{ + /* Either return a valid number, or -1 */ + char *c; + long rv = strtol(num, &c, 10); + if (rv < 0 || *c || !num[0]) + return -1; + else + return rv; +} #endif void remove_partitions(int fd) @@ -269,7 +285,7 @@ a.datalen = sizeof(p); a.flags = 0; memset(a.data, 0, a.datalen); - for (p.pno=0; p.pno < 16; p.pno++) + for (p.pno = 0; p.pno < 16; p.pno++) ioctl(fd, BLKPG, &a); #endif } @@ -327,19 +343,20 @@ * which actual disks are present. */ copies = (layout&255)* ((layout>>8) & 255); - first=0; + first = 0; do { /* there must be one of the 'copies' form 'first' */ int n = copies; - int cnt=0; + int cnt = 0; + int this = first; while (n--) { - if (avail[first]) + if (avail[this]) cnt++; - first = (first+1) % raid_disks; + this = (this+1) % raid_disks; } if (cnt == 0) return 0; - + first = (first+(layout&255)) % raid_disks; } while (first != 0); return 1; @@ -377,8 +394,8 @@ if (ioctl(fd, GET_ARRAY_INFO, &array) != 0 || array.raid_disks <= 0) return 0; - avail = calloc(array.raid_disks, 1); - for (i=0; i < MAX_DISKS && array.nr_disks > 0; i++) { + avail = xcalloc(array.raid_disks, 1); + for (i = 0; i < MAX_DISKS && array.nr_disks > 0; i++) { disk.number = i; if (ioctl(fd, GET_DISK_INFO, &disk) != 0) continue; @@ -400,7 +417,6 @@ return rv; } - const int uuid_zero[4] = { 0, 0, 0, 0 }; int same_uuid(int a[4], int b[4], int swapuuid) @@ -413,7 +429,7 @@ unsigned char *ac = (unsigned char *)a; unsigned char *bc = (unsigned 
char *)b; int i; - for (i=0; i<16; i+= 4) { + for (i = 0; i < 16; i += 4) { if (ac[i+0] != bc[i+3] || ac[i+1] != bc[i+2] || ac[i+2] != bc[i+1] || @@ -430,6 +446,7 @@ return 0; } } + void copy_uuid(void *a, int b[4], int swapuuid) { if (swapuuid) { @@ -440,7 +457,7 @@ unsigned char *ac = (unsigned char *)a; unsigned char *bc = (unsigned char *)b; int i; - for (i=0; i<16; i+= 4) { + for (i = 0; i < 16; i += 4) { ac[i+0] = bc[i+3]; ac[i+1] = bc[i+2]; ac[i+2] = bc[i+1]; @@ -494,7 +511,8 @@ */ unsigned char sb[1024]; time_t mtime; - int size, bsize; + unsigned long long size; + int bsize; if (lseek(fd, 1024,0)!= 1024) return 0; if (read(fd, sb, 1024)!= 1024) @@ -505,10 +523,10 @@ mtime = sb[44]|(sb[45]|(sb[46]|sb[47]<<8)<<8)<<8; bsize = sb[24]|(sb[25]|(sb[26]|sb[27]<<8)<<8)<<8; size = sb[4]|(sb[5]|(sb[6]|sb[7]<<8)<<8)<<8; - fprintf(stderr, Name ": %s appears to contain an ext2fs file system\n", + size <<= bsize; + pr_err("%s appears to contain an ext2fs file system\n", name); - fprintf(stderr," size=%dK mtime=%s", - size*(1<ignore_hw_compat = 1; + if (!st) + return 0; st->ss->load_super(st, fd, name); /* Looks like a raid array .. 
*/ - fprintf(stderr, Name ": %s appears to be part of a raid array:\n", + pr_err("%s appears to be part of a raid array:\n", name); st->ss->getinfo_super(st, &info, NULL); st->ss->free_super(st); crtime = info.array.ctime; level = map_num(pers, info.array.level); if (!level) level = "-unknown-"; - fprintf(stderr, " level=%s devices=%d ctime=%s", - level, info.array.raid_disks, ctime(&crtime)); + cont_err("level=%s devices=%d ctime=%s", + level, info.array.raid_disks, ctime(&crtime)); return 1; } @@ -563,7 +581,7 @@ { char *add = ""; int i; - for (i=0; i<5; i++) { + for (i = 0; i < 5; i++) { char buf[100]; fprintf(stderr, "%s%s", mesg, add); fflush(stderr); @@ -575,7 +593,7 @@ return 0; add = "(y/n) "; } - fprintf(stderr, Name ": assuming 'no'\n"); + pr_err("assuming 'no'\n"); return 0; } #endif /* MDASSEMBLE */ @@ -590,18 +608,18 @@ * 0 if not a standard name. */ char *d = strrchr(dev, '/'); - int type=0; + int type = 0; int num; if (!d) return 0; - if (strncmp(d, "/d",2)==0) - d += 2, type=1; /* /dev/md/dN{pM} */ - else if (strncmp(d, "/md_d", 5)==0) - d += 5, type=1; /* /dev/md_dN{pM} */ - else if (strncmp(d, "/md", 3)==0) - d += 3, type=-1; /* /dev/mdN */ - else if (d-dev > 3 && strncmp(d-2, "md/", 3)==0) - d += 1, type=-1; /* /dev/md/N */ + if (strncmp(d, "/d",2) == 0) + d += 2, type = 1; /* /dev/md/dN{pM} */ + else if (strncmp(d, "/md_d", 5) == 0) + d += 5, type = 1; /* /dev/md_dN{pM} */ + else if (strncmp(d, "/md", 3) == 0) + d += 3, type = -1; /* /dev/mdN */ + else if (d-dev > 3 && strncmp(d-2, "md/", 3) == 0) + d += 1, type = -1; /* /dev/md/N */ else return 0; if (!*d) @@ -623,13 +641,13 @@ unsigned int csum; unsigned int *superc = (unsigned int*) super; - for(i=0; i>32); #ifdef __alpha__ /* The in-kernel checksum calculation is always 16bit on * the alpha, though it is 32 bit on i386... - * I wonder what it is elsewhere... (it uses and API in + * I wonder what it is elsewhere... (it uses an API in * a way that it shouldn't). 
*/ csum = (csum & 0xffff) + (csum >> 16); @@ -653,7 +671,7 @@ */ if (bytes < 5000*1024) - buf[0]=0; + buf[0] = 0; else if (bytes < 2*1024LL*1024LL*1024LL) { long cMiB = (bytes / ( (1LL<<20) / 200LL ) +1) /2; long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; @@ -670,24 +688,49 @@ return buf; } -char *human_size_brief(long long bytes) +char *human_size_brief(long long bytes, int prefix) { static char buf[30]; + /* We convert bytes to either centi-M{ega,ibi}bytes or + * centi-G{igi,ibi}bytes, with appropriate rounding, + * and then print 1/100th of those as a decimal. + * We allow upto 2048Megabytes before converting to + * gigabytes, as that shows more precision and isn't + * too large a number. + * Terabytes are not yet handled. + * + * If prefix == IEC, we mean prefixes like kibi,mebi,gibi etc. + * If prefix == JEDEC, we mean prefixes like kilo,mega,giga etc. + */ + if (bytes < 5000*1024) - snprintf(buf, sizeof(buf), "%ld.%02ldKiB", - (long)(bytes>>10), (long)(((bytes&1023)*100+512)/1024) - ); - else if (bytes < 2*1024LL*1024LL*1024LL) - snprintf(buf, sizeof(buf), "%ld.%02ldMiB", - (long)(bytes>>20), - (long)((bytes&0xfffff)+0x100000/200)/(0x100000/100) - ); + buf[0] = 0; + else if (prefix == IEC) { + if (bytes < 2*1024LL*1024LL*1024LL) { + long cMiB = (bytes / ( (1LL<<20) / 200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldMiB", + cMiB/100 , cMiB % 100); + } else { + long cGiB = (bytes / ( (1LL<<30) / 200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldGiB", + cGiB/100 , cGiB % 100); + } + } + else if (prefix == JEDEC) { + if (bytes < 2*1024LL*1024LL*1024LL) { + long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldMB", + cMB/100, cMB % 100); + } else { + long cGB = (bytes / (1000000000LL/200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldGB", + cGB/100 , cGB % 100); + } + } else - snprintf(buf, sizeof(buf), "%ld.%02ldGiB", - (long)(bytes>>30), - (long)(((bytes>>10)&0xfffff)+0x100000/200)/(0x100000/100) - ); + buf[0] 
= 0; + return buf; } @@ -722,11 +765,15 @@ { int data_disks = 0; switch (level) { - case 0: data_disks = raid_disks; break; - case 1: data_disks = 1; break; + case 0: data_disks = raid_disks; + break; + case 1: data_disks = 1; + break; case 4: - case 5: data_disks = raid_disks - 1; break; - case 6: data_disks = raid_disks - 2; break; + case 5: data_disks = raid_disks - 1; + break; + case 6: data_disks = raid_disks - 2; + break; case 10: data_disks = raid_disks / (layout & 255) / ((layout>>8)&255); break; } @@ -734,43 +781,79 @@ return data_disks; } +int devnm2devid(char *devnm) +{ + /* First look in /sys/block/$DEVNM/dev for %d:%d + * If that fails, try parsing out a number + */ + char path[100]; + char *ep; + int fd; + int mjr,mnr; + + sprintf(path, "/sys/block/%s/dev", devnm); + fd = open(path, O_RDONLY); + if (fd >= 0) { + char buf[20]; + int n = read(fd, buf, sizeof(buf)); + close(fd); + if (n > 0) + buf[n] = 0; + if (n > 0 && sscanf(buf, "%d:%d\n", &mjr, &mnr) == 2) + return makedev(mjr, mnr); + } + if (strncmp(devnm, "md_d", 4) == 0 && + isdigit(devnm[4]) && + (mnr = strtoul(devnm+4, &ep, 10)) >= 0 && + ep > devnm && *ep == 0) + return makedev(get_mdp_major(), mnr << MdpMinorShift); + + if (strncmp(devnm, "md", 2) == 0 && + isdigit(devnm[2]) && + (mnr = strtoul(devnm+2, &ep, 10)) >= 0 && + ep > devnm && *ep == 0) + return makedev(MD_MAJOR, mnr); + + return 0; +} + #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) -char *get_md_name(int dev) +char *get_md_name(char *devnm) { /* find /dev/md%d or /dev/md/%d or make a device /dev/.tmp.md%d */ /* if dev < 0, want /dev/md/d%d or find mdp in /proc/devices ... 
*/ + static char devname[50]; struct stat stb; - dev_t rdev; + dev_t rdev = devnm2devid(devnm); char *dn; - if (dev < 0) { - int mdp = get_mdp_major(); - if (mdp < 0) return NULL; - rdev = makedev(mdp, (-1-dev)<<6); - snprintf(devname, sizeof(devname), "/dev/md/d%d", -1-dev); - if (stat(devname, &stb) == 0 - && (S_IFMT&stb.st_mode) == S_IFBLK - && (stb.st_rdev == rdev)) - return devname; - } else { - rdev = makedev(MD_MAJOR, dev); - snprintf(devname, sizeof(devname), "/dev/md%d", dev); - if (stat(devname, &stb) == 0 - && (S_IFMT&stb.st_mode) == S_IFBLK - && (stb.st_rdev == rdev)) - return devname; - - snprintf(devname, sizeof(devname), "/dev/md/%d", dev); + if (rdev == 0) + return 0; + if (strncmp(devnm, "md_", 3) == 0) { + snprintf(devname, sizeof(devname), "/dev/md/%s", + devnm + 3); if (stat(devname, &stb) == 0 && (S_IFMT&stb.st_mode) == S_IFBLK && (stb.st_rdev == rdev)) return devname; } + snprintf(devname, sizeof(devname), "/dev/%s", devnm); + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + + snprintf(devname, sizeof(devname), "/dev/md/%s", devnm+2); + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + dn = map_dev(major(rdev), minor(rdev), 0); if (dn) return dn; - snprintf(devname, sizeof(devname), "/dev/.tmp.md%d", dev); + snprintf(devname, sizeof(devname), "/dev/.tmp.%s", devnm); if (mknod(devname, S_IFBLK | 0600, rdev) == -1) if (errno != EEXIST) return NULL; @@ -785,32 +868,9 @@ void put_md_name(char *name) { - if (strncmp(name, "/dev/.tmp.md", 12)==0) + if (strncmp(name, "/dev/.tmp.md", 12) == 0) unlink(name); } - -int find_free_devnum(int use_partitions) -{ - int devnum; - for (devnum = 127; devnum != 128; - devnum = devnum ? devnum-1 : (1<<20)-1) { - char *dn; - int _devnum; - - _devnum = use_partitions ? 
(-1-devnum) : devnum; - if (mddev_busy(_devnum)) - continue; - /* make sure it is new to /dev too, at least as a - * non-standard */ - dn = map_dev(dev2major(_devnum), dev2minor(_devnum), 0); - if (dn && ! is_standard(dn, NULL)) - continue; - break; - } - if (devnum == 128) - return NoMdDev; - return use_partitions ? (-1-devnum) : devnum; -} #endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */ int dev_open(char *dev, int flags) @@ -831,21 +891,17 @@ if (e > dev && *e == ':' && e[1] && (minor = strtoul(e+1, &e, 0)) >= 0 && *e == 0) { - char *path = map_dev(major, minor, 0); - if (path) - fd = open(path, flags); - if (fd < 0) { - snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d", - (int)getpid(), major, minor); - if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) { - fd = open(devname, flags); - unlink(devname); - } + snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d", + (int)getpid(), major, minor); + if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) { + fd = open(devname, flags); + unlink(devname); } if (fd < 0) { + /* Try /tmp as /dev appear to be read-only */ snprintf(devname, sizeof(devname), "/tmp/.tmp.md.%d:%d:%d", (int)getpid(), major, minor); - if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) { + if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) { fd = open(devname, flags); unlink(devname); } @@ -855,27 +911,31 @@ return fd; } -int open_dev_flags(int devnum, int flags) +int open_dev_flags(char *devnm, int flags) { + int devid; char buf[20]; - sprintf(buf, "%d:%d", dev2major(devnum), dev2minor(devnum)); + devid = devnm2devid(devnm); + sprintf(buf, "%d:%d", major(devid), minor(devid)); return dev_open(buf, flags); } -int open_dev(int devnum) +int open_dev(char *devnm) { - return open_dev_flags(devnum, O_RDONLY); + return open_dev_flags(devnm, O_RDONLY); } -int open_dev_excl(int devnum) +int open_dev_excl(char *devnm) { char buf[20]; int i; int flags = O_RDWR; + int 
devid = devnm2devid(devnm); + long delay = 1000; - sprintf(buf, "%d:%d", dev2major(devnum), dev2minor(devnum)); - for (i=0 ; i<25 ; i++) { + sprintf(buf, "%d:%d", major(devid), minor(devid)); + for (i = 0 ; i < 25 ; i++) { int fd = dev_open(buf, flags|O_EXCL); if (fd >= 0) return fd; @@ -885,7 +945,9 @@ } if (errno != EBUSY) return fd; - usleep(200000); + usleep(delay); + if (delay < 200000) + delay *= 2; } return -1; } @@ -908,18 +970,21 @@ { int i; struct stat stb_want; + long delay = 1000; if (fstat(fd, &stb_want) != 0 || (stb_want.st_mode & S_IFMT) != S_IFBLK) return; - for (i=0 ; i<25 ; i++) { + for (i = 0 ; i < 25 ; i++) { struct stat stb; if (stat(dev, &stb) == 0 && (stb.st_mode & S_IFMT) == S_IFBLK && (stb.st_rdev == stb_want.st_rdev)) return; - usleep(200000); + usleep(delay); + if (delay < 200000) + delay *= 2; } if (i == 25) dprintf("%s: timeout waiting for %s\n", __func__, dev); @@ -945,9 +1010,9 @@ char version[20]; int i; char *subarray = NULL; - int container = NoMdDev; + char container[32] = ""; - sra = sysfs_read(fd, 0, GET_VERSION); + sra = sysfs_read(fd, NULL, GET_VERSION); if (sra) { vers = sra->array.major_version; @@ -971,9 +1036,9 @@ subarray = strchr(dev, '/'); if (subarray) { *subarray++ = '\0'; - subarray = strdup(subarray); + subarray = xstrdup(subarray); } - container = devname2devnum(dev); + strcpy(container, dev); if (sra) sysfs_free(sra); sra = sysfs_read(-1, container, GET_VERSION); @@ -992,8 +1057,8 @@ st->sb = NULL; if (subarrayp) *subarrayp = subarray; - st->container_dev = container; - st->devnum = fd2devnum(fd); + strcpy(st->container_devnm, container); + strcpy(st->devnm, fd2devnm(fd)); } else free(subarray); @@ -1024,13 +1089,12 @@ if (!orig) return orig; - st = malloc(sizeof(*st)); - if (!st) - return st; - memset(st, 0, sizeof(*st)); + st = xcalloc(1, sizeof(*st)); st->ss = orig->ss; st->max_devs = orig->max_devs; st->minor_version = orig->minor_version; + st->ignore_hw_compat = orig->ignore_hw_compat; + st->data_offset = 
orig->data_offset; st->sb = NULL; st->info = NULL; return st; @@ -1047,11 +1111,10 @@ int bestsuper = -1; int i; - st = malloc(sizeof(*st)); - memset(st, 0, sizeof(*st)); - st->container_dev = NoMdDev; + st = xcalloc(1, sizeof(*st)); + st->container_devnm[0] = 0; - for (i=0 ; superlist[i]; i++) { + for (i = 0 ; superlist[i]; i++) { int rv; ss = superlist[i]; if (guess_type == guess_array && ss->add_to_super == NULL) @@ -1079,7 +1142,6 @@ rv = superlist[bestsuper]->load_super(st, fd, NULL); if (rv == 0) { superlist[bestsuper]->free_super(st); - st->ignore_hw_compat = 0; return st; } } @@ -1106,7 +1168,7 @@ ldsize <<= 9; } else { if (dname) - fprintf(stderr, Name ": Cannot get size of %s: %s\b", + pr_err("Cannot get size of %s: %s\b", dname, strerror(errno)); return 0; } @@ -1169,7 +1231,7 @@ part = (struct GPT_part_entry *)buf; - for (part_nr=0; part_nr < all_partitions; part_nr++) { + for (part_nr = 0; part_nr < all_partitions; part_nr++) { /* read partition entry */ if (read(fd, buf, entry_size) != (ssize_t)entry_size) return 0; @@ -1213,7 +1275,7 @@ /* found the correct signature */ part = boot_sect.parts; - for (part_nr=0; part_nr < MBR_PARTITIONS; part_nr++) { + for (part_nr = 0; part_nr < MBR_PARTITIONS; part_nr++) { /* check for GPT type */ if (part->part_type == MBR_GPT_PARTITION_TYPE) { retval = get_gpt_last_partition_end(fd, endofpart); @@ -1248,40 +1310,25 @@ /* There appears to be a partition table here */ if (freesize == 0) { /* partitions will not be visible in new device */ - fprintf(stderr, - Name ": partition table exists on %s but will be lost or\n" - " meaningless after creating array\n", - dname); + pr_err("partition table exists on %s but will be lost or\n" + " meaningless after creating array\n", + dname); return 1; } else if (endofpart > freesize) { /* last partition overlaps metadata */ - fprintf(stderr, - Name ": metadata will over-write last partition on %s.\n", - dname); + pr_err("metadata will over-write last partition on %s.\n", + 
dname); return 1; } else if (size && endofpart > size) { /* partitions will be truncated in new device */ - fprintf(stderr, - Name ": array size is too small to cover all partitions on %s.\n", - dname); + pr_err("array size is too small to cover all partitions on %s.\n", + dname); return 1; } } return 0; } -void get_one_disk(int mdfd, mdu_array_info_t *ainf, mdu_disk_info_t *disk) -{ - int d; - - ioctl(mdfd, GET_ARRAY_INFO, ainf); - for (d = 0 ; d < MAX_DISKS ; d++) { - if (ioctl(mdfd, GET_DISK_INFO, disk) == 0 && - (disk->major || disk->minor)) - return; - } -} - int open_container(int fd) { /* 'fd' is a block device. Find out if it is in use @@ -1310,6 +1357,20 @@ continue; if (de->d_name[0] == '.') continue; + /* Need to make sure it is a container and not a volume */ + sprintf(e, "/%s/md/metadata_version", de->d_name); + dfd = open(path, O_RDONLY); + if (dfd < 0) + continue; + n = read(dfd, buf, sizeof(buf)); + close(dfd); + if (n <= 0 || (unsigned)n >= sizeof(buf)) + continue; + buf[n] = 0; + if (strncmp(buf, "external", 8) != 0 || + n < 10 || + buf[9] == '/') + continue; sprintf(e, "/%s/dev", de->d_name); dfd = open(path, O_RDONLY); if (dfd < 0) @@ -1346,13 +1407,47 @@ return NULL; } +int metadata_container_matches(char *metadata, char *devnm) +{ + /* Check if 'devnm' is the container named in 'metadata' + * which is + * /containername/componentname or + * -containername/componentname + */ + int l; + if (*metadata != '/' && *metadata != '-') + return 0; + l = strlen(devnm); + if (strncmp(metadata+1, devnm, l) != 0) + return 0; + if (metadata[l+1] != '/') + return 0; + return 1; +} + +int metadata_subdev_matches(char *metadata, char *devnm) +{ + /* Check if 'devnm' is the subdev named in 'metadata' + * which is + * /containername/subdev or + * -containername/subdev + */ + char *sl; + if (*metadata != '/' && *metadata != '-') + return 0; + sl = strchr(metadata+1, '/'); + if (!sl) + return 0; + if (strcmp(sl+1, devnm) == 0) + return 1; + return 0; +} + int 
is_container_member(struct mdstat_ent *mdstat, char *container) { if (mdstat->metadata_version == NULL || strncmp(mdstat->metadata_version, "external:", 9) != 0 || - !is_subarray(mdstat->metadata_version+9) || - strncmp(mdstat->metadata_version+10, container, strlen(container)) != 0 || - mdstat->metadata_version[10+strlen(container)] != '/') + !metadata_container_matches(mdstat->metadata_version+9, container)) return 0; return 1; @@ -1385,71 +1480,70 @@ struct mdinfo *mdi; struct mdinfo *info; int fd, err = 1; + char *_devnm; fd = open(dev, O_RDWR|O_EXCL); if (fd < 0) { if (!quiet) - fprintf(stderr, Name ": Couldn't open %s, aborting\n", + pr_err("Couldn't open %s, aborting\n", dev); return -1; } - st->devnum = fd2devnum(fd); - if (st->devnum == NoMdDev) { + _devnm = fd2devnm(fd); + if (_devnm == NULL) { if (!quiet) - fprintf(stderr, - Name ": Failed to determine device number for %s\n", - dev); + pr_err("Failed to determine device number for %s\n", + dev); goto close_fd; } + strcpy(st->devnm, _devnm); - mdi = sysfs_read(fd, st->devnum, GET_VERSION|GET_LEVEL); + mdi = sysfs_read(fd, st->devnm, GET_VERSION|GET_LEVEL); if (!mdi) { if (!quiet) - fprintf(stderr, Name ": Failed to read sysfs for %s\n", + pr_err("Failed to read sysfs for %s\n", dev); goto close_fd; } if (mdi->array.level != UnSet) { if (!quiet) - fprintf(stderr, Name ": %s is not a container\n", dev); + pr_err("%s is not a container\n", dev); goto free_sysfs; } st->ss = version_to_superswitch(mdi->text_version); if (!st->ss) { if (!quiet) - fprintf(stderr, - Name ": Operation not supported for %s metadata\n", - mdi->text_version); + pr_err("Operation not supported for %s metadata\n", + mdi->text_version); goto free_sysfs; } - st->devname = devnum2devname(st->devnum); - if (!st->devname) { + if (st->devnm[0] == 0) { if (!quiet) - fprintf(stderr, Name ": Failed to allocate device name\n"); + pr_err("Failed to allocate device name\n"); goto free_sysfs; } if (!st->ss->load_container) { if (!quiet) - 
fprintf(stderr, Name ": %s is not a container\n", dev); - goto free_name; + pr_err("%s is not a container\n", dev); + goto free_sysfs; } if (st->ss->load_container(st, fd, NULL)) { if (!quiet) - fprintf(stderr, Name ": Failed to load metadata for %s\n", + pr_err("Failed to load metadata for %s\n", dev); - goto free_name; + goto free_sysfs; } info = st->ss->container_content(st, subarray); if (!info) { if (!quiet) - fprintf(stderr, Name ": Failed to find subarray-%s in %s\n", + pr_err("Failed to find subarray-%s in %s\n", subarray, dev); goto free_super; } @@ -1460,9 +1554,6 @@ free_super: if (err) st->ss->free_super(st); - free_name: - if (err) - free(st->devname); free_sysfs: sysfs_free(mdi); close_fd: @@ -1493,7 +1584,7 @@ if (sd2 == info) break; if (sd2 == NULL) { - sd2 = malloc(sizeof(*sd2)); + sd2 = xmalloc(sizeof(*sd2)); *sd2 = *info; sd2->next = sra->devs; sra->devs = sd2; @@ -1560,16 +1651,14 @@ return recovery_start; } -int mdmon_pid(int devnum) +int mdmon_pid(char *devnm) { char path[100]; char pid[10]; int fd; int n; - char *devname = devnum2devname(devnum); - sprintf(path, "%s/%s.pid", MDMON_DIR, devname); - free(devname); + sprintf(path, "%s/%s.pid", MDMON_DIR, devnm); fd = open(path, O_RDONLY | O_NOATIME, 0); @@ -1582,9 +1671,9 @@ return atoi(pid); } -int mdmon_running(int devnum) +int mdmon_running(char *devnm) { - int pid = mdmon_pid(devnum); + int pid = mdmon_pid(devnm); if (pid <= 0) return 0; if (kill(pid, 0) == 0) @@ -1592,17 +1681,17 @@ return 0; } -int start_mdmon(int devnum) +int start_mdmon(char *devnm) { int i, skipped; int len; - pid_t pid; + pid_t pid; int status; char pathbuf[1024]; char *paths[4] = { pathbuf, "/sbin/mdmon", - "mdmon", + "./mdmon", NULL }; @@ -1622,50 +1711,72 @@ } else pathbuf[0] = '\0'; + /* First try to run systemctl */ + if (!check_env("MDADM_NO_SYSTEMCTL")) + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? 
*/ + skipped = 0; + for (i = 3; skipped < 20; i++) + if (close(i) < 0) + skipped++; + else + skipped = 0; + + /* Don't want to see error messages from + * systemctl. If the service doesn't exist, + * we start mdmon ourselves. + */ + close(2); + open("/dev/null", O_WRONLY); + snprintf(pathbuf, sizeof(pathbuf), "mdmon@%s.service", + devnm); + status = execl("/usr/bin/systemctl", "systemctl", + "start", + pathbuf, NULL); + status = execl("/bin/systemctl", "systemctl", "start", + pathbuf, NULL); + exit(1); + case -1: pr_err("cannot run mdmon. " + "Array remains readonly\n"); + return -1; + default: /* parent - good */ + pid = wait(&status); + if (pid >= 0 && status == 0) + return 0; + } + + /* That failed, try running mdmon directly */ switch(fork()) { case 0: /* FIXME yuk. CLOSE_EXEC?? */ skipped = 0; - for (i=3; skipped < 20; i++) + for (i = 3; skipped < 20; i++) if (close(i) < 0) skipped++; else skipped = 0; - for (i=0; paths[i]; i++) + for (i = 0; paths[i]; i++) if (paths[i][0]) { - if (__offroot) { - execl(paths[i], "mdmon", "--offroot", - devnum2devname(devnum), - NULL); - } else { - execl(paths[i], "mdmon", - devnum2devname(devnum), - NULL); - } + execl(paths[i], paths[i], + devnm, NULL); } exit(1); - case -1: fprintf(stderr, Name ": cannot run mdmon. " + case -1: pr_err("cannot run mdmon. " "Array remains readonly\n"); return -1; default: /* parent - good */ pid = wait(&status); - if (pid < 0 || status != 0) + if (pid < 0 || status != 0) { + pr_err("failed to launch mdmon. 
" + "Array remains readonly\n"); return -1; + } } return 0; } -int check_env(char *name) -{ - char *val = getenv(name); - - if (val && atoi(val) == 1) - return 1; - - return 0; -} - __u32 random32(void) { __u32 rv; @@ -1686,7 +1797,7 @@ return -1; } - sfd = connect_monitor(devnum2devname(st->container_dev)); + sfd = connect_monitor(st->container_devnm); if (sfd < 0) return -1; @@ -1709,7 +1820,7 @@ void append_metadata_update(struct supertype *st, void *buf, int len) { - struct metadata_update *mu = malloc(sizeof(*mu)); + struct metadata_update *mu = xmalloc(sizeof(*mu)); mu->buf = buf; mu->len = len; @@ -1731,7 +1842,7 @@ if (check_env("MDADM_EXPERIMENTAL")) return 1; else { - fprintf(stderr, Name ": To use this feature MDADM_EXPERIMENTAL" + pr_err("To use this feature MDADM_EXPERIMENTAL" " environment variable has to be defined.\n"); return 0; } @@ -1773,7 +1884,7 @@ found = 1; /* check if domain matches */ if (found && domlist) { - struct dev_policy *pol = devnum_policy(dev); + struct dev_policy *pol = devid_policy(dev); if (spare_group) pol_add(&pol, pol_domain, spare_group, NULL); @@ -1797,3 +1908,47 @@ } return disks; } + +/* Checks if paths point to the same device + * Returns 0 if they do. + * Returns 1 if they don't. + * Returns -1 if something went wrong, + * e.g. 
paths are empty or the files + * they point to don't exist */ +int compare_paths (char* path1, char* path2) +{ + struct stat st1,st2; + + if (path1 == NULL || path2 == NULL) + return -1; + if (stat(path1,&st1) != 0) + return -1; + if (stat(path2,&st2) != 0) + return -1; + if ((st1.st_ino == st2.st_ino) && (st1.st_dev == st2.st_dev)) + return 0; + return 1; +} + +/* Make sure we can open as many devices as needed */ +void enable_fds(int devices) +{ + unsigned int fds = 20 + devices; + struct rlimit lim; + if (getrlimit(RLIMIT_NOFILE, &lim) != 0 + || lim.rlim_cur >= fds) + return; + if (lim.rlim_max < fds) + lim.rlim_max = fds; + lim.rlim_cur = fds; + setrlimit(RLIMIT_NOFILE, &lim); +} + +int in_initrd(void) +{ + /* This is based on similar function in systemd. */ + struct statfs s; + return statfs("/", &s) >= 0 && + ((unsigned long)s.f_type == TMPFS_MAGIC || + (unsigned long)s.f_type == RAMFS_MAGIC); +} diff -Nru mdadm-3.2.5/xmalloc.c mdadm-3.3/xmalloc.c --- mdadm-3.2.5/xmalloc.c 1970-01-01 00:00:00.000000000 +0000 +++ mdadm-3.3/xmalloc.c 2013-09-03 04:47:47.000000000 +0000 @@ -0,0 +1,72 @@ +/* mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +/*#include +#include +#include +#include +#include +#include +#include +*/ + +void *xmalloc(size_t len) +{ + void *rv = malloc(len); + char *msg; + if (rv) + return rv; + msg = Name ": memory allocation failure - aborting\n"; + exit(4+!!write(2, msg, strlen(msg))); +} + +void *xrealloc(void *ptr, size_t len) +{ + void *rv = realloc(ptr, len); + char *msg; + if (rv) + return rv; + msg = Name ": memory allocation failure - aborting\n"; + exit(4+!!write(2, msg, strlen(msg))); +} + +void *xcalloc(size_t num, size_t size) +{ + void *rv = calloc(num, size); + char *msg; + if (rv) + return rv; + msg = Name ": memory allocation failure - aborting\n"; + exit(4+!!write(2, msg, strlen(msg))); +} + +char *xstrdup(const char *str) +{ + char *rv = strdup(str); + char *msg; + if (rv) + return rv; + msg = Name ": memory allocation failure - aborting\n"; + exit(4+!!write(2, msg, strlen(msg))); +}