diff -Nru zfs-linux-0.7.0-rc2/cmd/arc_summary/arc_summary.py zfs-linux-0.7.0-rc3/cmd/arc_summary/arc_summary.py --- zfs-linux-0.7.0-rc2/cmd/arc_summary/arc_summary.py 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/arc_summary/arc_summary.py 2017-01-20 18:18:28.000000000 +0000 @@ -705,8 +705,8 @@ ) sys.stdout.write("\n") - if arc["l2_arc_evicts"]['lock_retries'] + \ - arc["l2_arc_evicts"]["reading"] > 0: + if arc["l2_arc_evicts"]['lock_retries'] != '0' or \ + arc["l2_arc_evicts"]["reading"] != '0': sys.stdout.write("L2 ARC Evicts:\n") sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" % arc["l2_arc_evicts"]['lock_retries']) diff -Nru zfs-linux-0.7.0-rc2/cmd/mount_zfs/mount_zfs.c zfs-linux-0.7.0-rc3/cmd/mount_zfs/mount_zfs.c --- zfs-linux-0.7.0-rc2/cmd/mount_zfs/mount_zfs.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/mount_zfs/mount_zfs.c 2017-01-20 18:18:28.000000000 +0000 @@ -367,7 +367,7 @@ if (zfs_prop_get(zhp, zpt, context, sizeof (context), NULL, NULL, 0, B_FALSE) == 0) { if (strcmp(context, "none") != 0) - append_mntopt(name, context, mntopts, mtabopt, B_TRUE); + append_mntopt(name, context, mntopts, mtabopt, B_TRUE); } } @@ -600,7 +600,7 @@ gettext("filesystem '%s' (v%d) is not " "supported by this implementation of " "ZFS (max v%d).\n"), dataset, - (int) zfs_version, (int) ZPL_VERSION); + (int)zfs_version, (int)ZPL_VERSION); } else { (void) fprintf(stderr, gettext("filesystem '%s' mount " diff -Nru zfs-linux-0.7.0-rc2/cmd/raidz_test/raidz_bench.c zfs-linux-0.7.0-rc3/cmd/raidz_test/raidz_bench.c --- zfs-linux-0.7.0-rc2/cmd/raidz_test/raidz_bench.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/raidz_test/raidz_bench.c 2017-01-20 18:18:28.000000000 +0000 @@ -53,18 +53,18 @@ /* * To permit larger column sizes these have to be done - * allocated using aligned alloc instead of zio_data_buf_alloc + * allocated using aligned alloc instead of zio_abd_buf_alloc */ - zio_bench.io_data = raidz_alloc(max_data_size); + zio_bench.io_abd = raidz_alloc(max_data_size); - init_zio_data(&zio_bench); + init_zio_abd(&zio_bench); } static void bench_fini_raidz_maps(void) { /* tear down golden zio */ - raidz_free(zio_bench.io_data, max_data_size); + raidz_free(zio_bench.io_abd, max_data_size); bzero(&zio_bench, sizeof (zio_t)); } @@ -93,7 +93,7 @@ start = gethrtime(); for (iter = 0; iter < iter_cnt; iter++) vdev_raidz_generate_parity(rm_bench); - elapsed = NSEC2SEC((double) (gethrtime() - start)); + elapsed = NSEC2SEC((double)(gethrtime() - start)); disksize = (1ULL << ds) / rto_opts.rto_dcols; d_bw = (double)iter_cnt * (double)disksize; @@ -106,7 +106,7 @@ (1ULL<rto_dcols, /* -d */ ilog2(opts->rto_dsize), /* -s */ opts->rto_sweep ? "yes" : "no", /* -S */ - verbose /* -v */ - ); + verbose); /* -v */ } } @@ -98,25 +97,24 @@ FILE *fp = requested ? stdout : stderr; (void) fprintf(fp, "Usage:\n" - "\t[-a zio ashift (default: %zu)]\n" - "\t[-o zio offset, exponent radix 2 (default: %zu)]\n" - "\t[-d number of raidz data columns (default: %zu)]\n" - "\t[-s zio size, exponent radix 2 (default: %zu)]\n" - "\t[-S parameter sweep (default: %s)]\n" - "\t[-t timeout for parameter sweep test]\n" - "\t[-B benchmark all raidz implementations]\n" - "\t[-v increase verbosity (default: %zu)]\n" - "\t[-h (print help)]\n" - "\t[-T test the test, see if failure would be detected]\n" - "\t[-D debug (attach gdb on SIGSEGV)]\n" - "", - o->rto_ashift, /* -a */ - ilog2(o->rto_offset), /* -o */ - o->rto_dcols, /* -d */ - ilog2(o->rto_dsize), /* -s */ - rto_opts.rto_sweep ? 
"yes" : "no", /* -S */ - o->rto_v /* -d */ - ); + "\t[-a zio ashift (default: %zu)]\n" + "\t[-o zio offset, exponent radix 2 (default: %zu)]\n" + "\t[-d number of raidz data columns (default: %zu)]\n" + "\t[-s zio size, exponent radix 2 (default: %zu)]\n" + "\t[-S parameter sweep (default: %s)]\n" + "\t[-t timeout for parameter sweep test]\n" + "\t[-B benchmark all raidz implementations]\n" + "\t[-v increase verbosity (default: %zu)]\n" + "\t[-h (print help)]\n" + "\t[-T test the test, see if failure would be detected]\n" + "\t[-D debug (attach gdb on SIGSEGV)]\n" + "", + o->rto_ashift, /* -a */ + ilog2(o->rto_offset), /* -o */ + o->rto_dcols, /* -d */ + ilog2(o->rto_dsize), /* -s */ + rto_opts.rto_sweep ? "yes" : "no", /* -S */ + o->rto_v); /* -d */ exit(requested ? 0 : 1); } @@ -181,10 +179,10 @@ } } -#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_data) +#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) #define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) -#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_data) +#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) #define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) static int @@ -195,10 +193,9 @@ VERIFY(parity >= 1 && parity <= 3); for (i = 0; i < parity; i++) { - if (0 != memcmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i), - CODE_COL_SIZE(rm, i))) { + if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) + != 0) { ret++; - LOG_OPT(D_DEBUG, opts, "\nParity block [%d] different!\n", i); } @@ -213,8 +210,8 @@ int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); for (i = 0; i < dcols; i++) { - if (0 != memcmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i), - DATA_COL_SIZE(opts->rm_golden, i))) { + if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) + != 0) { ret++; LOG_OPT(D_DEBUG, opts, @@ -224,37 +221,41 @@ return (ret); } +static int +init_rand(void *data, size_t size, void *private) +{ + int i; + int *dst = (int *)data; + + for (i = 0; i < size / sizeof (int); i++) + dst[i] = rand_data[i]; + + return (0); +} + static void corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) { int i; - int *dst; raidz_col_t *col; for (i = 0; i < cnt; i++) { col = &rm->rm_col[tgts[i]]; - dst = col->rc_data; - for (i = 0; i < col->rc_size / sizeof (int); i++) - dst[i] = rand(); + abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); } } void -init_zio_data(zio_t *zio) +init_zio_abd(zio_t *zio) { - int i; - int *dst = (int *) zio->io_data; - - for (i = 0; i < zio->io_size / sizeof (int); i++) { - dst[i] = rand_data[i]; - } + abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); } static void fini_raidz_map(zio_t **zio, raidz_map_t **rm) { vdev_raidz_map_free(*rm); - raidz_free((*zio)->io_data, (*zio)->io_size); + raidz_free((*zio)->io_abd, (*zio)->io_size); umem_free(*zio, sizeof (zio_t)); *zio = NULL; @@ -279,11 +280,11 @@ opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; - opts->zio_golden->io_data = raidz_alloc(opts->rto_dsize); - zio_test->io_data = raidz_alloc(opts->rto_dsize); + opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); + zio_test->io_abd = raidz_alloc(opts->rto_dsize); - init_zio_data(opts->zio_golden); - init_zio_data(zio_test); + init_zio_abd(opts->zio_golden); + init_zio_abd(zio_test); VERIFY0(vdev_raidz_impl_set("original")); @@ -326,11 +327,11 @@ (*zio)->io_offset = 0; (*zio)->io_size = alloc_dsize; - (*zio)->io_data = 
raidz_alloc(alloc_dsize); - init_zio_data(*zio); + (*zio)->io_abd = raidz_alloc(alloc_dsize); + init_zio_abd(*zio); rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, - total_ncols, parity); + total_ncols, parity); VERIFY(rm); /* Make sure code columns are destroyed */ @@ -473,18 +474,15 @@ } } else { /* can reconstruct 3 failed data disk */ - for (x0 = 0; - x0 < opts->rto_dcols; x0++) { + for (x0 = 0; x0 < opts->rto_dcols; x0++) { if (x0 >= rm->rm_cols - raidz_parity(rm)) continue; - for (x1 = x0 + 1; - x1 < opts->rto_dcols; x1++) { + for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { if (x1 >= rm->rm_cols - raidz_parity(rm)) continue; - for (x2 = x1 + 1; - x2 < opts->rto_dcols; x2++) { + for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { if (x2 >= - rm->rm_cols - raidz_parity(rm)) + rm->rm_cols - raidz_parity(rm)) continue; /* Check if should stop */ @@ -501,7 +499,7 @@ if (!opts->rto_sanity) vdev_raidz_reconstruct(rm, - tgtidx, 3); + tgtidx, 3); if (cmp_data(opts, rm) != 0) { err++; @@ -552,7 +550,7 @@ for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { LOG(D_INFO, "\t\tTesting method [%s] ...", - raidz_rec_name[fn]); + raidz_rec_name[fn]); if (run_rec_check_impl(opts, rm_test, fn) != 0) { LOG(D_INFO, "[FAIL]\n"); @@ -604,7 +602,7 @@ sweep_thread(void *arg) { int err = 0; - raidz_test_opts_t *opts = (raidz_test_opts_t *) arg; + raidz_test_opts_t *opts = (raidz_test_opts_t *)arg; VERIFY(opts != NULL); err = run_test(opts); @@ -705,7 +703,7 @@ opts->rto_v = 0; /* be quiet */ VERIFY3P(zk_thread_create(NULL, 0, - (thread_func_t) sweep_thread, + (thread_func_t)sweep_thread, (void *) opts, TS_RUN, NULL, 0, 0, PTHREAD_CREATE_JOINABLE), !=, NULL); } @@ -762,7 +760,7 @@ kernel_init(FREAD); /* setup random data because rand() is not reentrant */ - rand_data = (int *) umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); srand((unsigned)time(NULL) * getpid()); for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++) rand_data[i] = rand(); diff -Nru zfs-linux-0.7.0-rc2/cmd/raidz_test/raidz_test.h zfs-linux-0.7.0-rc3/cmd/raidz_test/raidz_test.h --- zfs-linux-0.7.0-rc2/cmd/raidz_test/raidz_test.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/raidz_test/raidz_test.h 2017-01-20 18:18:28.000000000 +0000 @@ -34,6 +34,8 @@ "sse2", "ssse3", "avx2", + "avx512f", + "avx512bw", "aarch64_neon", "aarch64_neonx2", NULL @@ -102,11 +104,11 @@ #define SEP "----------------\n" -#define raidz_alloc(size) zio_data_buf_alloc(size) -#define raidz_free(p, size) zio_data_buf_free(p, size) +#define raidz_alloc(size) abd_alloc(size, B_FALSE) +#define raidz_free(p, size) abd_free(p) -void init_zio_data(zio_t *zio); +void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); diff -Nru zfs-linux-0.7.0-rc2/cmd/zdb/zdb.c zfs-linux-0.7.0-rc3/cmd/zdb/zdb.c --- zfs-linux-0.7.0-rc2/cmd/zdb/zdb.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zdb/zdb.c 2017-01-20 18:18:28.000000000 +0000 @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -76,7 +77,7 @@ if (type < DMU_OT_NUMTYPES) return (dmu_ot[type].ot_name); else if ((type & DMU_OT_NEWTYPE) && - ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) + ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); else return ("UNKNOWN"); @@ -2464,7 +2465,7 @@ zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); 
mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -2530,7 +2531,7 @@ if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); + abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ @@ -2543,7 +2544,7 @@ spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(NULL, spa, bp, data, size, + zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } @@ -2642,10 +2643,21 @@ if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; + + /* + * We are going to be changing the meaning of the metaslab's + * ms_tree. Ensure that the allocator doesn't try to + * use the tree. + */ + spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; + spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; + ASSERTV(metaslab_group_t *mg = vd->vdev_mg); for (m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(msp->ms_group, ==, mg); mutex_enter(&msp->ms_lock); metaslab_unload(msp); @@ -2666,8 +2678,6 @@ (longlong_t)m, (longlong_t)vd->vdev_ms_count); - msp->ms_ops = &zdb_metaslab_ops; - /* * We don't want to spend the CPU * manipulating the size-ordered @@ -2677,7 +2687,9 @@ msp->ms_tree->rt_ops = NULL; VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC)); - msp->ms_loaded = B_TRUE; + + if (!msp->ms_loaded) + msp->ms_loaded = B_TRUE; } mutex_exit(&msp->ms_lock); } @@ -2701,8 +2713,10 @@ vdev_t *rvd = spa->spa_root_vdev; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; + ASSERTV(metaslab_group_t *mg = vd->vdev_mg); for (m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(mg, ==, msp->ms_group); mutex_enter(&msp->ms_lock); /* @@ -2716,7 +2730,9 @@ * from the ms_tree. */ range_tree_vacate(msp->ms_tree, zdb_leak, vd); - msp->ms_loaded = B_FALSE; + + if (msp->ms_loaded) + msp->ms_loaded = B_FALSE; mutex_exit(&msp->ms_lock); } @@ -3248,7 +3264,7 @@ (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); #ifdef _LITTLE_ENDIAN - /* correct the endianess */ + /* correct the endianness */ do_bswap = !do_bswap; #endif for (i = 0; i < nwords; i += 2) { @@ -3270,7 +3286,7 @@ * child[.child]* - For example: 0.1.1 * * The second form can be used to specify arbitrary vdevs anywhere - * in the heirarchy. For example, in a pool with a mirror of + * in the hierarchy. For example, in a pool with a mirror of * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . */ static vdev_t * @@ -3321,6 +3337,13 @@ return (NULL); } +/* ARGSUSED */ +static int +random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused) +{ + return (random_get_pseudo_bytes(buf, len)); +} + /* * Read a block from a pool and print it out. 
The syntax of the * block descriptor is: @@ -3352,7 +3375,8 @@ uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; - void *pbuf, *lbuf, *buf; + abd_t *pabd; + void *lbuf, *buf; char *s, *p, *dup, *vdev, *flagstr; int i, error; @@ -3425,8 +3449,7 @@ psize = size; lsize = size; - /* Some 4K native devices require 4K buffer alignment */ - pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, PAGESIZE, UMEM_NOFAIL); + pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); @@ -3454,15 +3477,15 @@ /* * Treat this as a normal block read. */ - zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ - zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); @@ -3485,13 +3508,13 @@ void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); - bcopy(pbuf, pbuf2, psize); + abd_copy_to_buf(pbuf2, pabd, psize); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, + random_get_pseudo_bytes_cb, NULL)); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, + SPA_MAXBLOCKSIZE - psize)); /* * XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB, @@ -3506,10 +3529,10 @@ "Trying %05llx -> %05llx (%s)\n", (u_longlong_t)psize, (u_longlong_t)lsize, zio_compress_table[c].ci_name); - if (zio_decompress_data(c, pbuf, lbuf, - psize, lsize) == 0 && - zio_decompress_data(c, pbuf2, lbuf2, - psize, lsize) == 0 && + if (zio_decompress_data(c, pabd, + lbuf, psize, lsize) == 0 && + zio_decompress_data_buf(c, pbuf2, + lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } @@ -3527,7 +3550,7 @@ buf = lbuf; size = lsize; } else { - buf = pbuf; + buf = abd_to_buf(pabd); size = psize; } @@ -3545,7 +3568,7 @@ zdb_dump_block(thing, buf, size, flags); out: - umem_free(pbuf, SPA_MAXBLOCKSIZE); + abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } diff -Nru zfs-linux-0.7.0-rc2/cmd/zdb/zdb_il.c zfs-linux-0.7.0-rc3/cmd/zdb/zdb_il.c --- zfs-linux-0.7.0-rc2/cmd/zdb/zdb_il.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zdb/zdb_il.c 2017-01-20 18:18:28.000000000 +0000 @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 
*/ /* @@ -42,6 +42,7 @@ #include #include #include +#include extern uint8_t dump_opt[256]; @@ -120,13 +121,29 @@ } /* ARGSUSED */ +static int +zil_prt_rec_write_cb(void *data, size_t len, void *unused) +{ + char *cdata = data; + int i; + + for (i = 0; i < len; i++) { + if (isprint(*cdata)) + (void) printf("%c ", *cdata); + else + (void) printf("%2X", *cdata); + cdata++; + } + return (0); +} + +/* ARGSUSED */ static void zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) { - char *data, *dlimit; + abd_t *data; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; - char *buf; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -137,9 +154,6 @@ if (txtype == TX_WRITE2 || verbose < 5) return; - if ((buf = malloc(SPA_MAXBLOCKSIZE)) == NULL) - return; - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", prefix, !BP_IS_HOLE(bp) && @@ -150,43 +164,38 @@ if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); - bzero(buf, SPA_MAXBLOCKSIZE); (void) printf("%s\n", prefix); - goto exit; + return; } if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { (void) printf("%s\n", prefix); - goto exit; + return; } SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + bp, data, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); if (error) - goto exit; - data = buf; + goto out; } else { - data = (char *)(lr + 1); + /* data is stored after the end of the lr_write record */ + data = abd_alloc(lr->lr_length, B_FALSE); + abd_copy_from_buf(data, lr + 1, lr->lr_length); } - dlimit = data + MIN(lr->lr_length, - (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)); - (void) printf("%s", prefix); - while (data < dlimit) { - if (isprint(*data)) - (void) printf("%c ", *data); - else - (void) printf("%2hhX", *data); - data++; - } + (void) abd_iterate_func(data, + 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)), + zil_prt_rec_write_cb, NULL); (void) printf("\n"); -exit: - free(buf); + +out: + abd_free(data); } /* ARGSUSED */ diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/fmd_api.c zfs-linux-0.7.0-rc3/cmd/zed/agents/fmd_api.c --- zfs-linux-0.7.0-rc2/cmd/zed/agents/fmd_api.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/fmd_api.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. 
+ */ + +/* + * This file imlements the minimal FMD module API required to support the + * fault logic modules in ZED. This support includes module registration, + * memory allocation, module property accessors, basic case management, + * one-shot timers and SERD engines. + * + * In the ZED runtime, the modules are called from a single thread so no + * locking is required in this emulated FMD environment. + */ + +#include +#include +#include +#include +#include +#include + +#include "fmd_api.h" +#include "fmd_serd.h" + +#include "zfs_agents.h" +#include "../zed_log.h" + +typedef struct fmd_modstat { + fmd_stat_t ms_accepted; /* total events accepted by module */ + fmd_stat_t ms_caseopen; /* cases currently open */ + fmd_stat_t ms_casesolved; /* total cases solved by module */ + fmd_stat_t ms_caseclosed; /* total cases closed by module */ +} fmd_modstat_t; + +typedef struct fmd_module { + const char *mod_name; /* basename of module (ro) */ + const fmd_hdl_info_t *mod_info; /* module info registered with handle */ + void *mod_spec; /* fmd_hdl_get/setspecific data value */ + fmd_stat_t *mod_ustat; /* module specific custom stats */ + uint_t mod_ustat_cnt; /* count of ustat stats */ + fmd_modstat_t mod_stats; /* fmd built-in per-module statistics */ + fmd_serd_hash_t mod_serds; /* hash of serd engs owned by module */ + char *mod_vers; /* a copy of module version string */ +} fmd_module_t; + +/* + * ZED has two FMD hardwired module instances + */ +fmd_module_t zfs_retire_module; +fmd_module_t zfs_diagnosis_module; + +/* + * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. + */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +/* + * Register a module with fmd and finish module initialization. + * Returns an integer indicating whether it succeeded (zero) or + * failed (non-zero). 
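+ *
+ * Illustrative only (a hypothetical module, not code from this file):
+ * a client passes its ops vector and module info during init, e.g.
+ *
+ *	static const fmd_hdl_ops_t example_ops =
+ *	    { example_recv, NULL, NULL, NULL, NULL };
+ *	static const fmd_hdl_info_t example_info =
+ *	    { "ZFS Example Module", "1.0", &example_ops, NULL };
+ *
+ *	if (fmd_hdl_register(hdl, FMD_API_VERSION, &example_info) != 0)
+ *		return;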
+ */ +int +fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_info = mip; + mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */ + mp->mod_spec = NULL; + + /* bare minimum module stats */ + (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted"); + (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen"); + (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved"); + (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed"); + + fmd_serd_hash_create(&mp->mod_serds); + + fmd_hdl_debug(hdl, "register module"); + + return (0); +} + +void +fmd_hdl_unregister(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_modstat_t *msp = &mp->mod_stats; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + /* dump generic module stats */ + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name, + msp->ms_accepted.fmds_value.ui64); + if (ops->fmdo_close != NULL) { + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name, + msp->ms_caseopen.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name, + msp->ms_casesolved.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name, + msp->ms_caseclosed.fmds_value.ui64); + } + + /* dump module specific stats */ + if (mp->mod_ustat != NULL) { + int i; + + for (i = 0; i < mp->mod_ustat_cnt; i++) { + fmd_hdl_debug(hdl, "%s: %llu", + mp->mod_ustat[i].fmds_name, + mp->mod_ustat[i].fmds_value.ui64); + } + } + + fmd_serd_hash_destroy(&mp->mod_serds); + + fmd_hdl_debug(hdl, "unregister module"); +} + +/* + * fmd_hdl_setspecific() is used to associate a data pointer with + * the specified handle for the duration of the module's lifetime. + * This pointer can be retrieved using fmd_hdl_getspecific(). + */ +void +fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_spec = spec; +} + +/* + * Return the module-specific data pointer previously associated + * with the handle using fmd_hdl_setspecific(). + */ +void * +fmd_hdl_getspecific(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_spec); +} + +void * +fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_alloc(size, flags)); +} + +void * +fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_zalloc(size, flags)); +} + +void +fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size) +{ + umem_free(data, size); +} + +/* + * Record a module debug message using the specified format. + */ +void +fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...) +{ + char message[256]; + va_list vargs; + fmd_module_t *mp = (fmd_module_t *)hdl; + + va_start(vargs, format); + (void) vsnprintf(message, sizeof (message), format, vargs); + va_end(vargs); + + /* prefix message with module name */ + zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message); +} + +/* Property Retrieval */ + +int32_t +fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. 
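+ *
+ * Example (hypothetical caller): the diagnosis engine would read its
+ * SERD parameters through these accessors, yielding the hard-coded
+ * defaults below, e.g.
+ *
+ *	uint_t n = fmd_prop_get_int32(hdl, "io_N");	(10 events)
+ *	hrtime_t t = fmd_prop_get_int64(hdl, "io_T");	(10 minutes)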
+ */ + if (strcmp(name, "spare_on_remove") == 0) + return (1); + + if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) + return (10); /* N = 10 events */ + + return (0); +} + +int64_t +fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. + */ + if (strcmp(name, "remove_timeout") == 0) + return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ + + if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) + return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ + + return (0); +} + +/* FMD Statistics */ + +fmd_stat_t * +fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (flags == FMD_STAT_NOALLOC) { + mp->mod_ustat = statv; + mp->mod_ustat_cnt = nstats; + } + + return (statv); +} + +/* Case Management */ + +fmd_case_t * +fmd_case_open(fmd_hdl_t *hdl, void *data) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + uuid_t uuid; + + fmd_case_t *cp; + + cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP); + cp->ci_mod = hdl; + cp->ci_state = FMD_CASE_UNSOLVED; + cp->ci_flags = FMD_CF_DIRTY; + cp->ci_data = data; + cp->ci_bufptr = NULL; + cp->ci_bufsiz = 0; + + uuid_generate(uuid); + uuid_unparse(uuid, cp->ci_uuid); + + fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid); + mp->mod_stats.ms_caseopen.fmds_value.ui64++; + + return (cp); +} + +void +fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + /* + * For ZED, the event was already sent from fmd_case_add_suspect() + */ + + if (cp->ci_state >= FMD_CASE_SOLVED) + fmd_hdl_debug(hdl, "case is already solved or closed"); + + cp->ci_state = FMD_CASE_SOLVED; + + fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid); + mp->mod_stats.ms_casesolved.fmds_value.ui64++; +} + +void +fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid); + + if (ops->fmdo_close != NULL) + ops->fmdo_close(hdl, cp); + + mp->mod_stats.ms_caseopen.fmds_value.ui64--; + mp->mod_stats.ms_caseclosed.fmds_value.ui64++; + + if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0) + fmd_hdl_free(hdl, cp->ci_bufptr, cp->ci_bufsiz); + + fmd_hdl_free(hdl, cp, sizeof (fmd_case_t)); +} + +void +fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid) +{ + fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid); +} + +int +fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return ((cp->ci_state >= FMD_CASE_SOLVED) ? 
FMD_B_TRUE : FMD_B_FALSE); +} + +void +fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep) +{ +} + +static void +zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code) +{ + nvlist_t *rsrc; + char *strval; + uint64_t guid; + uint8_t byte; + + zed_log_msg(LOG_INFO, "\nzed_fault_event:"); + + if (uuid != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid); + if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval); + if (code != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code); + if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte); + if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) { + if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME, + strval); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL, + guid); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV, + guid); + } +} + +static const char * +fmd_fault_mkcode(nvlist_t *fault) +{ + char *class, *code = "-"; + + /* + * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po + */ + if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) { + if (strcmp(class, "fault.fs.zfs.vdev.io") == 0) + code = "ZFS-8000-FD"; + else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0) + code = "ZFS-8000-GH"; + else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0) + code = "ZFS-8000-HC"; + else if (strcmp(class, "fault.fs.zfs.io_failure_continue") == 0) + code = "ZFS-8000-JQ"; + else if (strcmp(class, "fault.fs.zfs.log_replay") == 0) + code = "ZFS-8000-K4"; + else if (strcmp(class, "fault.fs.zfs.pool") == 0) + code = "ZFS-8000-CS"; + else if (strcmp(class, "fault.fs.zfs.device") == 0) + code = "ZFS-8000-D3"; + + } + return (code); +} + +void +fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault) +{ + nvlist_t *nvl; + const char *code = fmd_fault_mkcode(fault); + int64_t tod[2]; + int err = 0; + + /* + * payload derived from fmd_protocol_list() + */ + + (void) gettimeofday(&cp->ci_tv, NULL); + tod[0] = cp->ci_tv.tv_sec; + tod[1] = cp->ci_tv.tv_usec; + + nvl = fmd_nvl_alloc(hdl, FMD_SLEEP); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS); + err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid); + err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code); + err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2); + err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1); + err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &fault, 1); + + if (err) + zed_log_die("failed to populate nvlist"); + + zed_log_fault(fault, cp->ci_uuid, code); + zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl); + + nvlist_free(nvl); + nvlist_free(fault); +} + +void +fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data) +{ + cp->ci_data = data; +} + +void * +fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return (cp->ci_data); +} + +void +fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr == NULL); + assert(size < (1024 * 1024)); + + cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP); + cp->ci_bufsiz = size; +} + +void 
+fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(size <= cp->ci_bufsiz); + + bcopy(cp->ci_bufptr, buf, size); +} + +void +fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, const void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(cp->ci_bufsiz >= size); + + bcopy(buf, cp->ci_bufptr, size); +} + +/* SERD Engines */ + +void +fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) { + zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': " + " name already exists", name); + return; + } + + (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t); +} + +void +fmd_serd_destroy(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + fmd_serd_eng_delete(&mp->mod_serds, name); + + fmd_hdl_debug(hdl, "serd_destroy %s", name); +} + +int +fmd_serd_exists(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); +} + +void +fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + return; + } + + fmd_serd_eng_reset(sgp); + + fmd_hdl_debug(hdl, "serd_reset %s", name); +} + +int +fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + int err; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", + name); + return (FMD_B_FALSE); + } + err = fmd_serd_eng_record(sgp, ep->ev_hrt); + + return (err); +} + +/* FMD Timers */ + +static void +_timer_notify(union sigval sv) +{ + fmd_timer_t *ftp = sv.sival_ptr; + fmd_hdl_t *hdl = ftp->ft_hdl; + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + struct itimerspec its; + + fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + + /* disarm the timer */ + bzero(&its, sizeof (struct itimerspec)); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + /* Note that the fmdo_timeout can remove this timer */ + if (ops->fmdo_timeout != NULL) + ops->fmdo_timeout(hdl, ftp, ftp->ft_arg); +} + +/* + * Install a new timer which will fire at least delta nanoseconds after the + * current time. After the timeout has expired, the module's fmdo_timeout + * entry point is called. 
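+ *
+ * Example (hypothetical caller): the retire agent could arm a one-shot
+ * timer from its "remove_timeout" property (15 seconds, see
+ * fmd_prop_get_int64() above):
+ *
+ *	hrtime_t delta = fmd_prop_get_int64(hdl, "remove_timeout");
+ *	fmd_timer_t *ftp = fmd_timer_install(hdl, arg, NULL, delta);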
+ */ +fmd_timer_t * +fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta) +{ + struct sigevent sev; + struct itimerspec its; + fmd_timer_t *ftp; + + ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP); + ftp->ft_arg = arg; + ftp->ft_hdl = hdl; + + its.it_value.tv_sec = delta / 1000000000; + its.it_value.tv_nsec = delta % 1000000000; + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + sev.sigev_notify = SIGEV_THREAD; + sev.sigev_notify_function = _timer_notify; + sev.sigev_notify_attributes = NULL; + sev.sigev_value.sival_ptr = ftp; + + timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + fmd_hdl_debug(hdl, "installing timer for %d secs (%p)", + (int)its.it_value.tv_sec, ftp->ft_tid); + + return (ftp); +} + +void +fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp) +{ + fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid); + + timer_delete(ftp->ft_tid); + + fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t)); +} + +/* Name-Value Pair Lists */ + +nvlist_t * +fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty, + nvlist_t *asru, nvlist_t *fru, nvlist_t *resource) +{ + nvlist_t *nvl; + int err = 0; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + zed_log_die("failed to xalloc fault nvlist"); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_FAULT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, class); + err |= nvlist_add_uint8(nvl, FM_FAULT_CERTAINTY, certainty); + + if (asru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_ASRU, asru); + if (fru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_FRU, fru); + if (resource != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_RESOURCE, resource); + + if (err) + zed_log_die("failed to populate nvlist: %s\n", strerror(err)); + + return (nvl); +} + +/* + * sourced from fmd_string.c + */ +static int +fmd_strmatch(const char *s, const char *p) +{ + char c; + + if (p == NULL) + return (0); + + if (s == NULL) + s = ""; /* treat NULL string as the empty string */ + + do { + if ((c = *p++) == '\0') + return (*s == '\0'); + + if (c == '*') { + while (*p == '*') + p++; /* consecutive *'s can be collapsed */ + + if (*p == '\0') + return (1); + + while (*s != '\0') { + if (fmd_strmatch(s++, p) != 0) + return (1); + } + + return (0); + } + } while (c == *s++); + + return (0); +} + +int +fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern) +{ + char *class; + + return (nvl != NULL && + nvlist_lookup_string(nvl, FM_CLASS, &class) == 0 && + fmd_strmatch(class, pattern)); +} + +nvlist_t * +fmd_nvl_alloc(fmd_hdl_t *hdl, int flags) +{ + nvlist_t *nvl = NULL; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + return (nvl); +} + + +/* + * ZED Agent specific APIs + */ + +fmd_hdl_t * +fmd_module_hdl(const char *name) +{ + if (strcmp(name, "zfs-retire") == 0) + return ((fmd_hdl_t *)&zfs_retire_module); + if (strcmp(name, "zfs-diagnosis") == 0) + return ((fmd_hdl_t *)&zfs_diagnosis_module); + + return (NULL); +} + +boolean_t +fmd_module_initialized(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_info != NULL); +} + +/* + * fmd_module_recv is called for each event that is received by + * the fault manager that has a class that matches one of the + * module's subscriptions. 
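+ *
+ * Example (hypothetical dispatch, in the spirit of the hard-coded
+ * subscriptions in zfs_agent_dispatch()): routing an I/O ereport to
+ * the diagnosis engine would look like
+ *
+ *	fmd_hdl_t *hdl = fmd_module_hdl("zfs-diagnosis");
+ *	if (fmd_module_initialized(hdl))
+ *		fmd_module_recv(hdl, nvl, "ereport.fs.zfs.io");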
+ */ +void +fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + fmd_event_t faux_event = {0}; + int64_t *tv; + uint_t n; + + /* + * Will need to normalized this if we persistently store the case data + */ + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0) + faux_event.ev_hrt = tv[0] * NANOSEC + tv[1]; + else + faux_event.ev_hrt = 0; + + ops->fmdo_recv(hdl, &faux_event, nvl, class); + + mp->mod_stats.ms_accepted.fmds_value.ui64++; + + /* TBD - should we initiate fm_module_gc() periodically? */ +} diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/fmd_api.h zfs-linux-0.7.0-rc3/cmd/zed/agents/fmd_api.h --- zfs-linux-0.7.0-rc2/cmd/zed/agents/fmd_api.h 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/fmd_api.h 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef _FMD_API_H +#define _FMD_API_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Fault Management Daemon Client Interfaces + */ + +#define FMD_API_VERSION 5 + +typedef struct fmd_hdl fmd_hdl_t; + +typedef struct fmd_timer { + timer_t ft_tid; + void *ft_arg; + fmd_hdl_t *ft_hdl; +} fmd_timer_t; + +#define id_t fmd_timer_t * + + +typedef struct fmd_event { + hrtime_t ev_hrt; /* event time used by SERD engines */ +} fmd_event_t; + +typedef struct fmd_case { + char ci_uuid[48]; /* uuid string for this case */ + fmd_hdl_t *ci_mod; /* module that owns this case */ + void *ci_data; /* data from fmd_case_setspecific() */ + ushort_t ci_state; /* case state (see below) */ + ushort_t ci_flags; /* case flags (see below) */ + struct timeval ci_tv; /* time of original diagnosis */ + void *ci_bufptr; /* case data serialization buffer */ + size_t ci_bufsiz; +} fmd_case_t; + + +#define FMD_B_FALSE 0 /* false value for booleans as int */ +#define FMD_B_TRUE 1 /* true value for booleans as int */ + + +#define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */ +#define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */ +#define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */ +#define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */ +#define FMD_CASE_REPAIRED 4 /* case is repaired */ +#define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */ + +#define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */ +#define FMD_CF_SOLVED 0x02 /* case has been solved */ +#define FMD_CF_ISOLATED 0x04 /* case has been isolated */ +#define FMD_CF_REPAIRED 0x08 /* case has been repaired */ +#define FMD_CF_RESOLVED 0x10 /* case has been resolved */ + + +#define FMD_TYPE_BOOL 0 /* int */ +#define FMD_TYPE_INT32 1 /* int32_t */ +#define FMD_TYPE_UINT32 2 /* uint32_t */ +#define FMD_TYPE_INT64 3 /* int64_t */ +#define FMD_TYPE_UINT64 4 /* uint64_t */ +#define FMD_TYPE_TIME 5 /* uint64_t */ +#define FMD_TYPE_SIZE 6 /* uint64_t */ + +typedef struct fmd_prop { + const char *fmdp_name; /* property name */ + uint_t fmdp_type; /* property type (see above) */ + const char *fmdp_defv; /* default value */ +} fmd_prop_t; + +typedef struct fmd_stat { + char fmds_name[32]; /* statistic name */ + uint_t fmds_type; /* statistic type (see above) */ + char fmds_desc[64]; /* statistic description */ + union { + int bool; /* FMD_TYPE_BOOL */ + int32_t i32; /* FMD_TYPE_INT32 */ + uint32_t ui32; /* FMD_TYPE_UINT32 */ + int64_t i64; /* FMD_TYPE_INT64 */ + uint64_t ui64; /* FMD_TYPE_UINT64 */ + } fmds_value; +} fmd_stat_t; + +typedef struct fmd_hdl_ops { + void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *); + void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *); + void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *); + void (*fmdo_stats)(fmd_hdl_t *); + void (*fmdo_gc)(fmd_hdl_t *); +} fmd_hdl_ops_t; + +#define FMD_SEND_SUCCESS 0 /* fmdo_send queued event */ +#define FMD_SEND_FAILED 1 /* fmdo_send unrecoverable error */ +#define FMD_SEND_RETRY 2 /* fmdo_send requests retry */ + +typedef struct fmd_hdl_info { + const char *fmdi_desc; /* fmd client description string */ + const char *fmdi_vers; /* fmd client version string */ + const fmd_hdl_ops_t *fmdi_ops; /* ops vector for client */ + const fmd_prop_t *fmdi_props; /* array of configuration props */ +} fmd_hdl_info_t; + +extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *); +extern void fmd_hdl_unregister(fmd_hdl_t *); + +extern void 
fmd_hdl_setspecific(fmd_hdl_t *, void *); +extern void *fmd_hdl_getspecific(fmd_hdl_t *); + +#define FMD_SLEEP UMEM_NOFAIL + +extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int); +extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int); +extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t); + +extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int); +extern void fmd_hdl_strfree(fmd_hdl_t *, char *); + +extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); +extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); + +extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); +extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); + +#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */ +#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */ + +extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *); +extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *); +extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *); + +extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *); +extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *); + +extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *); +extern void fmd_case_uuclose(fmd_hdl_t *, const char *); +extern int fmd_case_uuclosed(fmd_hdl_t *, const char *); +extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *); +extern void fmd_case_uuresolved(fmd_hdl_t *, const char *); + +extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *); +extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *); +extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *); +extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *); + +extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *); +extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *); + +extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t); +extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *); +extern void fmd_buf_read(fmd_hdl_t *, fmd_case_t *, + const char *, void *, size_t); +extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *, + const char *, const void *, size_t); +extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); + +extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); +extern void fmd_serd_destroy(fmd_hdl_t *, const char *); +extern int fmd_serd_exists(fmd_hdl_t *, const char *); +extern void fmd_serd_reset(fmd_hdl_t *, const char *); +extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); +extern int fmd_serd_fired(fmd_hdl_t *, const char *); +extern int fmd_serd_empty(fmd_hdl_t *, const char *); + +extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); +extern void fmd_timer_remove(fmd_hdl_t *, id_t); + +extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *, + const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *); + +extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *); + +#define FMD_HAS_FAULT_FRU 0 +#define FMD_HAS_FAULT_ASRU 1 +#define FMD_HAS_FAULT_RESOURCE 2 + +extern void fmd_repair_fru(fmd_hdl_t *, const char *); +extern int fmd_repair_asru(fmd_hdl_t *, 
const char *); + +extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int); +extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int); + +/* + * ZED Specific Interfaces + */ + +extern fmd_hdl_t *fmd_module_hdl(const char *); +extern boolean_t fmd_module_initialized(fmd_hdl_t *); +extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *); + +/* ZFS FMA Retire Agent */ +extern void _zfs_retire_init(fmd_hdl_t *); +extern void _zfs_retire_fini(fmd_hdl_t *); + +/* ZFS FMA Diagnosis Engine */ +extern void _zfs_diagnosis_init(fmd_hdl_t *); +extern void _zfs_diagnosis_fini(fmd_hdl_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_API_H */ diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/fmd_serd.c zfs-linux-0.7.0-rc3/cmd/zed/agents/fmd_serd.c --- zfs-linux-0.7.0-rc2/cmd/zed/agents/fmd_serd.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/fmd_serd.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include + +#include "fmd_api.h" +#include "fmd_serd.h" +#include "../zed_log.h" + + +#define FMD_STR_BUCKETS 211 + + +#ifdef SERD_ENG_DEBUG +#define serd_log_msg(fmt, ...) \ + zed_log_msg(LOG_INFO, fmt, __VA_ARGS__) +#else +#define serd_log_msg(fmt, ...) +#endif + + +/* + * SERD Engine Backend + */ + +/* + * Compute the delta between events in nanoseconds. To account for very old + * events which are replayed, we must handle the case where time is negative. + * We convert the hrtime_t's to unsigned 64-bit integers and then handle the + * case where 'old' is greater than 'new' (i.e. high-res time has wrapped). + */ +static hrtime_t +fmd_event_delta(hrtime_t t1, hrtime_t t2) +{ + uint64_t old = t1; + uint64_t new = t2; + + return (new >= old ? 
new - old : (UINT64_MAX - old) + new + 1); +} + +static fmd_serd_eng_t * +fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t) +{ + fmd_serd_eng_t *sgp; + + sgp = malloc(sizeof (fmd_serd_eng_t)); + bzero(sgp, sizeof (fmd_serd_eng_t)); + + sgp->sg_name = strdup(name); + sgp->sg_flags = FMD_SERD_DIRTY; + sgp->sg_n = n; + sgp->sg_t = t; + + list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t), + offsetof(fmd_serd_elem_t, se_list)); + + return (sgp); +} + +static void +fmd_serd_eng_free(fmd_serd_eng_t *sgp) +{ + fmd_serd_eng_reset(sgp); + free(sgp->sg_name); + list_destroy(&sgp->sg_list); + free(sgp); +} + +/* + * sourced from fmd_string.c + */ +static ulong_t +fmd_strhash(const char *key) +{ + ulong_t g, h = 0; + const char *p; + + for (p = key; *p != '\0'; p++) { + h = (h << 4) + *p; + + if ((g = (h & 0xf0000000)) != 0) { + h ^= (g >> 24); + h ^= g; + } + } + + return (h); +} + +void +fmd_serd_hash_create(fmd_serd_hash_t *shp) +{ + shp->sh_hashlen = FMD_STR_BUCKETS; + shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *)); + shp->sh_count = 0; +} + +void +fmd_serd_hash_destroy(fmd_serd_hash_t *shp) +{ + fmd_serd_eng_t *sgp, *ngp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) { + ngp = sgp->sg_next; + fmd_serd_eng_free(sgp); + } + } + + free(shp->sh_hash); + bzero(shp, sizeof (fmd_serd_hash_t)); +} + +void +fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg) +{ + fmd_serd_eng_t *sgp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = sgp->sg_next) + func(sgp, arg); + } +} + +fmd_serd_eng_t * +fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name, + uint_t n, hrtime_t t) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t); + + serd_log_msg(" SERD Engine: inserting %s N %d T %llu", + name, (int)n, (long long unsigned)t); + + sgp->sg_next = shp->sh_hash[h]; + shp->sh_hash[h] = sgp; + shp->sh_count++; + + return (sgp); +} + +fmd_serd_eng_t * +fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp; + + for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(name, sgp->sg_name) == 0) + return (sgp); + } + + return (NULL); +} + +void +fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h]; + + serd_log_msg(" SERD Engine: deleting %s", name); + + for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(sgp->sg_name, name) != 0) + pp = &sgp->sg_next; + else + break; + } + + if (sgp != NULL) { + *pp = sgp->sg_next; + fmd_serd_eng_free(sgp); + assert(shp->sh_count != 0); + shp->sh_count--; + } +} + +static void +fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep) +{ + list_remove(&sgp->sg_list, sep); + sgp->sg_count--; + + serd_log_msg(" SERD Engine: discarding %s, %d remaining", + sgp->sg_name, (int)sgp->sg_count); + + free(sep); +} + +int +fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt) +{ + fmd_serd_elem_t *sep, *oep; + + /* + * If the fired flag is already set, return false and discard the + * event. This means that the caller will only see the engine "fire" + * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired() + * function can also be used in combination with fmd_serd_eng_record(). 
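+ *
+ * For example, with the default N=10, T=10min engine the tenth event
+ * recorded inside a ten minute window returns FMD_B_TRUE exactly once;
+ * subsequent calls return FMD_B_FALSE until fmd_serd_eng_reset().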
+ */ + if (sgp->sg_flags & FMD_SERD_FIRED) { + serd_log_msg(" SERD Engine: record %s already fired!", + sgp->sg_name); + return (FMD_B_FALSE); + } + + while (sgp->sg_count >= sgp->sg_n) + fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list)); + + sep = malloc(sizeof (fmd_serd_elem_t)); + sep->se_hrt = hrt; + + list_insert_head(&sgp->sg_list, sep); + sgp->sg_count++; + + serd_log_msg(" SERD Engine: recording %s of %d (%llu)", + sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt); + + /* + * Pick up the oldest element pointer for comparison to 'sep'. We must + * do this after adding 'sep' because 'oep' and 'sep' can be the same. + */ + oep = list_tail(&sgp->sg_list); + + if (sgp->sg_count >= sgp->sg_n && + fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) { + sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY; + serd_log_msg(" SERD Engine: fired %s", sgp->sg_name); + return (FMD_B_TRUE); + } + + sgp->sg_flags |= FMD_SERD_DIRTY; + return (FMD_B_FALSE); +} + +int +fmd_serd_eng_fired(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_flags & FMD_SERD_FIRED); +} + +int +fmd_serd_eng_empty(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_count == 0); +} + +void +fmd_serd_eng_reset(fmd_serd_eng_t *sgp) +{ + serd_log_msg(" SERD Engine: reseting %s", sgp->sg_name); + + while (sgp->sg_count != 0) + fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list)); + + sgp->sg_flags &= ~FMD_SERD_FIRED; + sgp->sg_flags |= FMD_SERD_DIRTY; +} + +void +fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +{ + fmd_serd_elem_t *sep, *nep; + hrtime_t hrt; + + if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED)) + return; /* no garbage collection needed if empty or fired */ + + sep = list_head(&sgp->sg_list); + if (sep == NULL) + return; + + hrt = sep->se_hrt - sgp->sg_t; + + for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) { + if (sep->se_hrt >= hrt) + break; /* sep and subsequent events are all within T */ + + nep = list_next(&sgp->sg_list, sep); + fmd_serd_eng_discard(sgp, sep); + sgp->sg_flags |= FMD_SERD_DIRTY; + } +} diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/fmd_serd.h zfs-linux-0.7.0-rc3/cmd/zed/agents/fmd_serd.h --- zfs-linux-0.7.0-rc2/cmd/zed/agents/fmd_serd.h 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/fmd_serd.h 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef _FMD_SERD_H +#define _FMD_SERD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +typedef struct fmd_serd_elem { + list_node_t se_list; /* linked list forward/back pointers */ + hrtime_t se_hrt; /* upper bound on event hrtime */ +} fmd_serd_elem_t; + +typedef struct fmd_serd_eng { + char *sg_name; /* string name for this engine */ + struct fmd_serd_eng *sg_next; /* next engine on hash chain */ + list_t sg_list; /* list of fmd_serd_elem_t's */ + uint_t sg_count; /* count of events in sg_list */ + uint_t sg_flags; /* engine flags (see below) */ + uint_t sg_n; /* engine N parameter (event count) */ + hrtime_t sg_t; /* engine T parameter (nanoseconds) */ +} fmd_serd_eng_t; + +#define FMD_SERD_FIRED 0x1 /* error rate has exceeded threshold */ +#define FMD_SERD_DIRTY 0x2 /* engine needs to be checkpointed */ + +typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *); + +typedef struct fmd_serd_hash { + fmd_serd_eng_t **sh_hash; /* hash bucket array for buffers */ + uint_t sh_hashlen; /* length of hash bucket array */ + uint_t sh_count; /* count of engines in hash */ +} fmd_serd_hash_t; + +extern void fmd_serd_hash_create(fmd_serd_hash_t *); +extern void fmd_serd_hash_destroy(fmd_serd_hash_t *); +extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *); + +extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *, + const char *, uint32_t, hrtime_t); + +extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *); +extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *); + +extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t); +extern int fmd_serd_eng_fired(fmd_serd_eng_t *); +extern int fmd_serd_eng_empty(fmd_serd_eng_t *); + +extern void fmd_serd_eng_reset(fmd_serd_eng_t *); +extern void fmd_serd_eng_gc(fmd_serd_eng_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_SERD_H */ diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/README.md zfs-linux-0.7.0-rc3/cmd/zed/agents/README.md --- zfs-linux-0.7.0-rc2/cmd/zed/agents/README.md 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/README.md 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,112 @@ +## Fault Management Logic for ZED ## + +The integration of Fault Management Daemon (FMD) logic from illumos +is being deployed in three phases. This logic is encapsulated in +several software modules inside ZED. + +### ZED+FM Phase 1 ### + +All the phase 1 work is in current Master branch. Phase I work includes: + +* Add new paths to the persistent VDEV label for device matching. +* Add a disk monitor for generating _disk-add_ and _disk-change_ events. +* Add support for automated VDEV auto-online, auto-replace and auto-expand. +* Expand the statechange event to include all VDEV state transitions. + +### ZED+FM Phase 2 (WIP) ### + +The phase 2 work primarily entails the _Diagnosis Engine_ and the +_Retire Agent_ modules. It also includes infrastructure to support a +crude FMD environment to host these modules. For additional +information see the **FMD Components in ZED** and **Implementation +Notes** sections below. + +### ZED+FM Phase 3 ### + +Future work will add additional functionality and will likely include: + +* Add FMD module garbage collection (periodically call `fmd_module_gc()`). +* Add real module property retrieval (currently hard-coded in accessors). +* Additional diagnosis telemetry (like latency outliers and SMART data). +* Export FMD module statistics. +* Zedlet parallel execution and resiliency (add watchdog). 
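+
+As a minimal sketch of the emulated module API introduced above
+(illustrative only; the `example_recv` callback, the engine name and
+the fault class choice are assumptions, not code from ZED), a module
+would feed ereports into a SERD engine and solve a case when it fires:
+
+```c
+#include "fmd_api.h"
+
+static void
+example_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
+    const char *class)
+{
+	/* N = 10 events within a T = 10 minute window (illumos defaults) */
+	if (!fmd_serd_exists(hdl, "example_io_serd"))
+		fmd_serd_create(hdl, "example_io_serd",
+		    fmd_prop_get_int32(hdl, "io_N"),
+		    fmd_prop_get_int64(hdl, "io_T"));
+
+	/* returns FMD_B_TRUE once, when the engine first fires */
+	if (fmd_serd_record(hdl, "example_io_serd", ep)) {
+		fmd_case_t *cp = fmd_case_open(hdl, NULL);
+		nvlist_t *fault = fmd_nvl_create_fault(hdl,
+		    "fault.fs.zfs.device", 100, NULL, NULL, NULL);
+
+		/* posting the suspect also emits the list.suspect event */
+		fmd_case_add_suspect(hdl, cp, fault);
+		fmd_case_solve(hdl, cp);
+	}
+}
+```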
+
+### ZFS Fault Management Overview ###
+
+The primary purpose of ZFS fault management is automated diagnosis
+and isolation of VDEV faults. A fault is something we can associate
+with an impact (e.g. loss of data redundancy) and a corrective action
+(e.g. offline or replace a disk). A typical ZFS fault management stack
+is composed of _error detectors_ (e.g. `zfs_ereport_post()`), a _disk
+monitor_, a _diagnosis engine_ and _response agents_.
+
+After detecting a software error, the ZFS kernel module sends error
+events to the ZED user daemon which in turn routes the events to its
+internal FMA modules based on their event subscriptions. Likewise, if
+a disk is added or changed in the system, the disk monitor sends disk
+events which are consumed by a response agent.
+
+### FMD Components in ZED ###
+
+There are three FMD modules (aka agents) that are now built into ZED.
+
+ 1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`)
+ 2. A _Retire Agent_ module (`agents/zfs_retire.c`)
+ 3. A _Disk Add Agent_ module (`agents/zfs_mod.c`)
+
+To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum
+ereports and feeds them into a Soft Error Rate Discrimination (SERD)
+algorithm which will generate a corresponding fault diagnosis when the
+tracked VDEV encounters **N** events in a given **T** time window. The
+initial N and T values for the SERD algorithm are estimates inherited
+from illumos (10 errors in 10 minutes).
+
+In turn, a **Retire Agent** responds to diagnosed faults by isolating
+the faulty VDEV. It will notify the ZFS kernel module of the new VDEV
+state (degraded or faulted). The retire agent is also responsible for
+managing hot spares across all pools. When it encounters a device fault
+or a device removal, it will replace the device with an appropriate
+spare if available.
+
+Finally, a **Disk Add Agent** responds to events from a libudev disk
+monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or
+expand the associated VDEV. This agent is also known as the `zfs_mod`
+or Sysevent Loadable Module (SLM) on the illumos platform. The added
+disk is matched to a specific VDEV using its device id, physical path
+or VDEV GUID.
+
+Note that the _auto-replace_ feature (aka hot plug) is opt-in and you
+must set the pool's `autoreplace` property to enable it. The new disk
+will be matched to the corresponding leaf VDEV by physical location
+and labeled with a GPT partition before replacing the original VDEV
+in the pool.
+
+### Implementation Notes ###
+
+* The FMD module API required for logic modules is emulated and implemented
+  in the `fmd_api.c` and `fmd_serd.c` source files. This support includes
+  module registration, memory allocation, module property accessors, basic
+  case management, one-shot timers and SERD engines.
+  For detailed information on the FMD module API, see the document --
+  _"Fault Management Daemon Programmer's Reference Manual"_.
+
+* The event subscriptions for the modules (located in a module-specific
+  configuration file on illumos) are currently hard-coded into the ZED
+  `zfs_agent_dispatch()` function (a simplified sketch of this matching
+  appears below).
+
+* The FMD modules are called one at a time from a single thread that
+  consumes events queued to the modules. These events are sourced from
+  the normal ZED events and also include events posted from the diagnosis
+  engine and the libudev disk event monitor.
+
+* The FMD code modules have minimal changes and were intentionally left
+  as similar as possible to their upstream source files.
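+
+As a rough illustration of that subscription matching (the table and
+helper below are hypothetical; the real tests are hard-coded in
+`zfs_agent_dispatch()`):
+
+```c
+#include <stdio.h>
+#include <string.h>
+
+typedef struct subscription {
+	const char *module;	/* consuming agent */
+	const char *stem;	/* event class prefix subscribed to */
+} subscription_t;
+
+static const subscription_t subs[] = {
+	{ "zfs-diagnosis",	"ereport.fs.zfs." },
+	{ "zfs-diagnosis",	"resource.fs.zfs." },
+	{ "zfs-retire",		"resource.fs.zfs.removed" },
+	{ "zfs-retire",		"list.suspect" },
+};
+
+/* deliver an event class to every module whose stem prefixes it */
+static void
+dispatch_sketch(const char *class)
+{
+	size_t i;
+
+	for (i = 0; i < sizeof (subs) / sizeof (subs[0]); i++)
+		if (strncmp(class, subs[i].stem,
+		    strlen(subs[i].stem)) == 0)
+			printf("%s -> %s\n", class, subs[i].module);
+}
+
+int
+main(void)
+{
+	dispatch_sketch("ereport.fs.zfs.checksum");	/* diagnosis */
+	dispatch_sketch("resource.fs.zfs.removed");	/* both modules */
+	return (0);
+}
+```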
+ +* The sysevent namespace in ZED differs from illumos. For example: + * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"` + * Linux uses `"sysevent.fs.zfs.vdev_remove"` + +* The FMD Modules port was produced by Intel Federal, LLC under award + number B609815 between the U.S. Department of Energy (DOE) and Intel + Federal, LLC. + diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_agents.c zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_agents.c --- zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_agents.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_agents.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,368 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_agents.h" +#include "fmd_api.h" +#include "../zed_log.h" + +/* + * agent dispatch code + */ + +static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER; +static list_t agent_events; /* list of pending events */ +static int agent_exiting; + +typedef struct agent_event { + char ae_class[64]; + char ae_subclass[32]; + nvlist_t *ae_nvl; + list_node_t ae_node; +} agent_event_t; + +pthread_t g_agents_tid; + +libzfs_handle_t *g_zfs_hdl; + +/* guid search data */ +typedef struct guid_search { + uint64_t gs_pool_guid; + uint64_t gs_vdev_guid; + char *gs_devid; +} guid_search_t; + +static void +zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) +{ + guid_search_t *gsp = arg; + char *path = NULL; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. 
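+	 * Interior vdevs (mirror, raidz) carry no devid of their own,
+	 * so a devid match can only be found at a leaf.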
+ */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_agent_iter_vdev(zhp, child[c], gsp); + return; + } + /* + * On a devid match, grab the vdev guid + */ + if ((gsp->gs_vdev_guid == 0) && + (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && + (strcmp(gsp->gs_devid, path) == 0)) { + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &gsp->gs_vdev_guid); + } +} + +static int +zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg) +{ + guid_search_t *gsp = arg; + nvlist_t *config, *nvl; + + /* + * For each vdev in this pool, look for a match by devid + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvl) == 0) { + zfs_agent_iter_vdev(zhp, nvl, gsp); + } + } + /* + * if a match was found then grab the pool guid + */ + if (gsp->gs_vdev_guid) { + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &gsp->gs_pool_guid); + } + + zpool_close(zhp); + return (gsp->gs_vdev_guid != 0); +} + +void +zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + agent_event_t *event; + + if (subclass == NULL) + subclass = ""; + + event = malloc(sizeof (agent_event_t)); + if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) { + if (event) + free(event); + return; + } + + if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) { + class = EC_ZFS; + subclass = ESC_ZFS_VDEV_CHECK; + } + + /* + * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED + * ereport from vdev_disk layer after a hot unplug. Fortunately we + * get a EC_DEV_REMOVE from our disk monitor and it is a suitable + * proxy so we remap it here for the benefit of the diagnosis engine. + */ + if ((strcmp(class, EC_DEV_REMOVE) == 0) && + (strcmp(subclass, ESC_DISK) == 0) && + (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) || + nvlist_exists(nvl, DEV_IDENTIFIER))) { + nvlist_t *payload = event->ae_nvl; + struct timeval tv; + int64_t tod[2]; + uint64_t pool_guid = 0, vdev_guid = 0; + + class = "resource.fs.zfs.removed"; + subclass = ""; + + (void) nvlist_add_string(payload, FM_CLASS, class); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); + + /* + * For multipath, ZFS_EV_VDEV_GUID is missing so find it. + */ + if (vdev_guid == 0) { + guid_search_t search = { 0 }; + + (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, + &search.gs_devid); + + (void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, + &search); + pool_guid = search.gs_pool_guid; + vdev_guid = search.gs_vdev_guid; + } + + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid); + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid); + + (void) gettimeofday(&tv, NULL); + tod[0] = tv.tv_sec; + tod[1] = tv.tv_usec; + (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2); + + zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'", + EC_DEV_REMOVE, class); + } + + (void) strlcpy(event->ae_class, class, sizeof (event->ae_class)); + (void) strlcpy(event->ae_subclass, subclass, + sizeof (event->ae_subclass)); + + (void) pthread_mutex_lock(&agent_lock); + list_insert_tail(&agent_events, event); + (void) pthread_mutex_unlock(&agent_lock); + + (void) pthread_cond_signal(&agent_cond); +} + +static void +zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl) +{ + /* + * The diagnosis engine subscribes to the following events. 
+	 * On illumos these subscriptions reside in:
+	 *   /usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
+	 */
+	if (strstr(class, "ereport.fs.zfs.") != NULL ||
+	    strstr(class, "resource.fs.zfs.") != NULL ||
+	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 ||
+	    strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
+	    strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) {
+		fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
+	}
+
+	/*
+	 * The retire agent subscribes to the following events.
+	 * On illumos these subscriptions reside in:
+	 *   /usr/lib/fm/fmd/plugins/zfs-retire.conf
+	 *
+	 * NOTE: fault events come directly from our diagnosis engine
+	 * and will not pass through the zfs kernel module.
+	 */
+	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
+	    strcmp(class, "resource.fs.zfs.removed") == 0 ||
+	    strcmp(class, "resource.fs.zfs.statechange") == 0 ||
+	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
+		fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
+	}
+
+	/*
+	 * The SLM module only consumes disk events and vdev check events.
+	 *
+	 * NOTE: disk events come directly from disk monitor and will
+	 * not pass through the zfs kernel module.
+	 */
+	if (strstr(class, "EC_dev_") != NULL ||
+	    strcmp(class, EC_ZFS) == 0) {
+		(void) zfs_slm_event(class, subclass, nvl);
+	}
+}
+
+/*
+ * Events are consumed and dispatched from this thread. An agent can
+ * also post an event, so the event list lock is not held when calling
+ * an agent. One event is consumed at a time.
+ */
+static void *
+zfs_agent_consumer_thread(void *arg)
+{
+	for (;;) {
+		agent_event_t *event;
+
+		(void) pthread_mutex_lock(&agent_lock);
+
+		/* wait for an event to show up */
+		while (!agent_exiting && list_is_empty(&agent_events))
+			(void) pthread_cond_wait(&agent_cond, &agent_lock);
+
+		if (agent_exiting) {
+			(void) pthread_mutex_unlock(&agent_lock);
+			zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
+			    "exiting");
+			return (NULL);
+		}
+
+		if ((event = (list_head(&agent_events))) != NULL) {
+			list_remove(&agent_events, event);
+
+			(void) pthread_mutex_unlock(&agent_lock);
+
+			/* dispatch to all event subscribers */
+			zfs_agent_dispatch(event->ae_class, event->ae_subclass,
+			    event->ae_nvl);
+
+			nvlist_free(event->ae_nvl);
+			free(event);
+			continue;
+		}
+
+		(void) pthread_mutex_unlock(&agent_lock);
+	}
+
+	return (NULL);
+}
+
+void
+zfs_agent_init(libzfs_handle_t *zfs_hdl)
+{
+	fmd_hdl_t *hdl;
+
+	g_zfs_hdl = zfs_hdl;
+
+	if (zfs_slm_init() != 0)
+		zed_log_die("Failed to initialize zfs slm");
+	zed_log_msg(LOG_INFO, "Add Agent: init");
+
+	hdl = fmd_module_hdl("zfs-diagnosis");
+	_zfs_diagnosis_init(hdl);
+	if (!fmd_module_initialized(hdl))
+		zed_log_die("Failed to initialize zfs diagnosis");
+
+	hdl = fmd_module_hdl("zfs-retire");
+	_zfs_retire_init(hdl);
+	if (!fmd_module_initialized(hdl))
+		zed_log_die("Failed to initialize zfs retire");
+
+	list_create(&agent_events, sizeof (agent_event_t),
+	    offsetof(struct agent_event, ae_node));
+
+	if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
+	    NULL) != 0) {
+		list_destroy(&agent_events);
+		zed_log_die("Failed to initialize agents");
+	}
+}
+
+void
+zfs_agent_fini(void)
+{
+	fmd_hdl_t *hdl;
+	agent_event_t *event;
+
+	agent_exiting = 1;
+	(void) pthread_cond_signal(&agent_cond);
+
+	/* wait for the agent consumer thread to complete */
+	(void) pthread_join(g_agents_tid, NULL);
+
+	/* drain any pending events */
+	while ((event = (list_head(&agent_events))) != NULL) {
+		list_remove(&agent_events, event);
+		nvlist_free(event->ae_nvl);
+		free(event);
+	}
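+
+	/*
+	 * With the consumer thread joined and the queue drained, the
+	 * event list and the modules can be torn down safely.
+	 */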
+ + list_destroy(&agent_events); + + if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) { + _zfs_retire_fini(hdl); + fmd_hdl_unregister(hdl); + } + if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) { + _zfs_diagnosis_fini(hdl); + fmd_hdl_unregister(hdl); + } + + zed_log_msg(LOG_INFO, "Add Agent: fini"); + zfs_slm_fini(); + + g_zfs_hdl = NULL; +} + +/* + * In ZED context, all the FMA agents run in the same thread + * and do not require a unique libzfs instance. Modules should + * use these stubs. + */ +libzfs_handle_t * +__libzfs_init(void) +{ + return (g_zfs_hdl); +} + +void +__libzfs_fini(libzfs_handle_t *hdl) +{ +} diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_agents.h zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_agents.h --- zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_agents.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_agents.h 2017-01-20 18:18:28.000000000 +0000 @@ -26,29 +26,25 @@ #endif /* - * Agents from ZFS FMA and syseventd - linked directly into ZED daemon binary + * Agent abstraction presented to ZED */ +extern void zfs_agent_init(libzfs_handle_t *); +extern void zfs_agent_fini(void); +extern void zfs_agent_post_event(const char *, const char *, nvlist_t *); /* * ZFS Sysevent Linkable Module (SLM) */ -extern int zfs_slm_init(libzfs_handle_t *zfs_hdl); +extern int zfs_slm_init(void); extern void zfs_slm_fini(void); extern void zfs_slm_event(const char *, const char *, nvlist_t *); /* - * ZFS FMA Retire Agent + * In ZED context, all the FMA agents run in the same thread + * and do not require a unique libzfs instance. */ -extern int zfs_retire_init(libzfs_handle_t *zfs_hdl); -extern void zfs_retire_fini(void); -extern void zfs_retire_recv(nvlist_t *nvl, const char *class); - -/* - * ZFS FMA Diagnosis Engine - */ -extern int zfs_diagnosis_init(libzfs_handle_t *zfs_hdl); -extern void zfs_diagnosis_fini(void); -extern void zfs_diagnosis_recv(nvlist_t *nvl, const char *class); +extern libzfs_handle_t *__libzfs_init(void); +extern void __libzfs_fini(libzfs_handle_t *); #ifdef __cplusplus } diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_diagnosis.c zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_diagnosis.c --- zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_diagnosis.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_diagnosis.c 2017-01-20 18:18:28.000000000 +0000 @@ -21,27 +21,1022 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, Intel Corporation. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "zfs_agents.h" -#include "../zed_log.h" +#include "fmd_api.h" +/* + * Our serd engines are named 'zfs___{checksum,io}'. This + * #define reserves enough space for two 64-bit hex values plus the length of + * the longest string. + */ +#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum")) -/*ARGSUSED*/ -void -zfs_diagnosis_recv(nvlist_t *nvl, const char *class) +/* + * On-disk case structure. This must maintain backwards compatibility with + * previous versions of the DE. By default, any members appended to the end + * will be filled with zeros if they don't exist in a previous version. 
+ */ +typedef struct zfs_case_data { + uint64_t zc_version; + uint64_t zc_ena; + uint64_t zc_pool_guid; + uint64_t zc_vdev_guid; + int zc_pool_state; + char zc_serd_checksum[MAX_SERDLEN]; + char zc_serd_io[MAX_SERDLEN]; + int zc_has_remove_timer; +} zfs_case_data_t; + +/* + * Time-of-day + */ +typedef struct er_timeval { + uint64_t ertv_sec; + uint64_t ertv_nsec; +} er_timeval_t; + +/* + * In-core case structure. + */ +typedef struct zfs_case { + boolean_t zc_present; + uint32_t zc_version; + zfs_case_data_t zc_data; + fmd_case_t *zc_case; + uu_list_node_t zc_node; + id_t zc_remove_timer; + char *zc_fru; + er_timeval_t zc_when; +} zfs_case_t; + +#define CASE_DATA "data" +#define CASE_FRU "fru" +#define CASE_DATA_VERSION_INITIAL 1 +#define CASE_DATA_VERSION_SERD 2 + +typedef struct zfs_de_stats { + fmd_stat_t old_drops; + fmd_stat_t dev_drops; + fmd_stat_t vdev_drops; + fmd_stat_t import_drops; + fmd_stat_t resource_drops; +} zfs_de_stats_t; + +zfs_de_stats_t zfs_stats = { + { "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" }, + { "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"}, + { "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"}, + { "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" }, + { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" } +}; + +static hrtime_t zfs_remove_timeout; + +uu_list_pool_t *zfs_case_pool; +uu_list_t *zfs_cases; + +#define ZFS_MAKE_RSRC(type) \ + FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type +#define ZFS_MAKE_EREPORT(type) \ + FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type + +/* + * Write out the persistent representation of an active case. + */ +static void +zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp) { + zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD; +} + +/* + * Read back the persistent representation of an active case. + */ +static zfs_case_t * +zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + zfs_case_t *zcp; + + zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP); + zcp->zc_case = cp; + + fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data, + sizeof (zcp->zc_data)); + + if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) { + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + return (NULL); + } + + /* + * fmd_buf_read() will have already zeroed out the remainder of the + * buffer, so we don't have to do anything special if the version + * doesn't include the SERD engine name. + */ + + if (zcp->zc_data.zc_has_remove_timer) + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, + NULL, zfs_remove_timeout); + + uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool); + (void) uu_list_insert_before(zfs_cases, NULL, zcp); + + fmd_case_setspecific(hdl, cp, zcp); + + return (zcp); +} + +/* + * Iterate over any active cases. If any cases are associated with a pool or + * vdev which is no longer present on the system, close the associated case. + */ +static void +zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded) +{ + uint64_t vdev_guid; + uint_t c, children; + nvlist_t **child; + zfs_case_t *zcp; + int ret; + + ret = nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid); + assert(ret == 0); + + /* + * Mark any cases associated with this (pool, vdev) pair. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == vdev_guid) { + zcp->zc_present = B_TRUE; + zcp->zc_when = *loaded; + } + } + + /* + * Iterate over all children. 
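+	 * The same marking is applied below to the L2ARC and spare
+	 * device lists, which are kept outside ZPOOL_CONFIG_CHILDREN.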
+ */ + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } } /*ARGSUSED*/ -int -zfs_diagnosis_init(libzfs_handle_t *zfs_hdl) +static int +zfs_mark_pool(zpool_handle_t *zhp, void *unused) +{ + zfs_case_t *zcp; + uint64_t pool_guid; + uint64_t *tod; + er_timeval_t loaded = { 0 }; + nvlist_t *config, *vd; + uint_t nelem = 0; + int ret; + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + /* + * Mark any cases associated with just this pool. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) + zcp->zc_present = B_TRUE; + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + (void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem); + if (nelem == 2) { + loaded.ertv_sec = tod[0]; + loaded.ertv_nsec = tod[1]; + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) { + zcp->zc_when = loaded; + } + } + } + + ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd); + assert(ret == 0); + + zfs_mark_vdev(pool_guid, vd, &loaded); + + zpool_close(zhp); + + return (0); +} + +struct load_time_arg { + uint64_t lt_guid; + er_timeval_t *lt_time; + boolean_t lt_found; +}; + +static int +zpool_find_load_time(zpool_handle_t *zhp, void *arg) { + struct load_time_arg *lta = arg; + uint64_t pool_guid; + uint64_t *tod; + nvlist_t *config; + uint_t nelem; + + if (lta->lt_found) { + zpool_close(zhp); + return (0); + } + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + if (pool_guid != lta->lt_guid) { + zpool_close(zhp); + return (0); + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem) == 0 && nelem == 2) { + lta->lt_found = B_TRUE; + lta->lt_time->ertv_sec = tod[0]; + lta->lt_time->ertv_nsec = tod[1]; + } + + zpool_close(zhp); + return (0); } +static void +zfs_purge_cases(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + + /* + * There is no way to open a pool by GUID, or lookup a vdev by GUID. No + * matter what we do, we're going to have to stomach an O(vdevs * cases) + * algorithm. In reality, both quantities are likely so small that + * neither will matter. Given that iterating over pools is more + * expensive than iterating over the in-memory case list, we opt for a + * 'present' flag in each case that starts off cleared. We then iterate + * over all pools, marking those that are still present, and removing + * those that aren't found. + * + * Note that we could also construct an FMRI and rely on + * fmd_nvl_fmri_present(), but this would end up doing the same search. + */ + + /* + * Mark the cases as not present. 
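+	 * zfs_mark_pool() turns the flag back on for each pool and
+	 * vdev still found on the system; the remainder are closed.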
+ */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) + zcp->zc_present = B_FALSE; + + /* + * Iterate over all pools and mark the pools and vdevs found. If this + * fails (most probably because we're out of memory), then don't close + * any of the cases and we cannot be sure they are accurate. + */ + if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0) + return; + + /* + * Remove those cases which were not found. + */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + if (!zcp->zc_present) + fmd_case_close(hdl, zcp->zc_case); + } + uu_list_walk_end(walk); +} + +/* + * Construct the name of a serd engine given the pool/vdev GUID and type (io or + * checksum). + */ +static void +zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, + const char *type) +{ + (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", + (long long unsigned int)pool_guid, + (long long unsigned int)vdev_guid, type); +} + +/* + * Solve a given ZFS case. This first checks to make sure the diagnosis is + * still valid, as well as cleaning up any pending timer associated with the + * case. + */ +static void +zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname, + boolean_t checkunusable) +{ + nvlist_t *detector, *fault; + boolean_t serialize; + nvlist_t *fru = NULL; +#ifdef _HAS_FMD_TOPO + nvlist_t *fmri; + topo_hdl_t *thp; + int err; +#endif + fmd_hdl_debug(hdl, "solving fault '%s'", faultname); + + /* + * Construct the detector from the case data. The detector is in the + * ZFS scheme, and is either the pool or the vdev, depending on whether + * this is a vdev or pool fault. + */ + detector = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0); + (void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL, + zcp->zc_data.zc_pool_guid); + if (zcp->zc_data.zc_vdev_guid != 0) { + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV, + zcp->zc_data.zc_vdev_guid); + } + +#ifdef _HAS_FMD_TOPO + /* + * We also want to make sure that the detector (pool or vdev) properly + * reflects the diagnosed state, when the fault corresponds to internal + * ZFS state (i.e. not checksum or I/O error-induced). Otherwise, a + * device which was unavailable early in boot (because the driver/file + * wasn't available) and is now healthy will be mis-diagnosed. + */ + if (!fmd_nvl_fmri_present(hdl, detector) || + (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) { + fmd_case_close(hdl, zcp->zc_case); + nvlist_free(detector); + return; + } + + + fru = NULL; + if (zcp->zc_fru != NULL && + (thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) != NULL) { + /* + * If the vdev had an associated FRU, then get the FRU nvlist + * from the topo handle and use that in the suspect list. We + * explicitly lookup the FRU because the fmri reported from the + * kernel may not have up to date details about the disk itself + * (serial, part, etc). + */ + if (topo_fmri_str2nvl(thp, zcp->zc_fru, &fmri, &err) == 0) { + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + + /* + * If the disk is part of the system chassis, but the + * FRU indicates a different chassis ID than our + * current system, then ignore the error. This + * indicates that the device was part of another + * cluster head, and for obvious reasons cannot be + * imported on this system. 
+ */ + if (libzfs_fru_notself(zhdl, zcp->zc_fru)) { + fmd_case_close(hdl, zcp->zc_case); + nvlist_free(fmri); + fmd_hdl_topo_rele(hdl, thp); + nvlist_free(detector); + return; + } + + /* + * If the device is no longer present on the system, or + * topo_fmri_fru() fails for other reasons, then fall + * back to the fmri specified in the vdev. + */ + if (topo_fmri_fru(thp, fmri, &fru, &err) != 0) + fru = fmd_nvl_dup(hdl, fmri, FMD_SLEEP); + nvlist_free(fmri); + } + + fmd_hdl_topo_rele(hdl, thp); + } +#endif + fault = fmd_nvl_create_fault(hdl, faultname, 100, detector, + fru, detector); + fmd_case_add_suspect(hdl, zcp->zc_case, fault); + + nvlist_free(fru); + + fmd_case_solve(hdl, zcp->zc_case); + + serialize = B_FALSE; + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + serialize = B_TRUE; + } + if (serialize) + zfs_case_serialize(hdl, zcp); + + nvlist_free(detector); +} + +static boolean_t +timeval_earlier(er_timeval_t *a, er_timeval_t *b) +{ + return (a->ertv_sec < b->ertv_sec || + (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec)); +} + +/*ARGSUSED*/ +static void +zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when) +{ + int64_t *tod; + uint_t nelem; + + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod, + &nelem) == 0 && nelem == 2) { + when->ertv_sec = tod[0]; + when->ertv_nsec = tod[1]; + } else { + when->ertv_sec = when->ertv_nsec = UINT64_MAX; + } +} + +/* + * Main fmd entry point. + */ /*ARGSUSED*/ +static void +zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) +{ + zfs_case_t *zcp, *dcp; + int32_t pool_state; + uint64_t ena, pool_guid, vdev_guid; + er_timeval_t pool_load; + er_timeval_t er_when; + nvlist_t *detector; + boolean_t pool_found = B_FALSE; + boolean_t isresource; + char *type; + + /* + * We subscribe to notifications for vdev or pool removal. In these + * cases, there may be cases that no longer apply. Purge any cases + * that no longer apply. + */ + if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) { + fmd_hdl_debug(hdl, "purging orphaned cases from %s", + strrchr(class, '.') + 1); + zfs_purge_cases(hdl); + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*"); + + if (isresource) { + /* + * For resources, we don't have a normal payload. + */ + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + pool_state = SPA_LOAD_OPEN; + else + pool_state = SPA_LOAD_NONE; + detector = NULL; + } else { + (void) nvlist_lookup_nvlist(nvl, + FM_EREPORT_DETECTOR, &detector); + (void) nvlist_lookup_int32(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state); + } + + /* + * We also ignore all ereports generated during an import of a pool, + * since the only possible fault (.pool) would result in import failure, + * and hence no persistent fault. Some day we may want to do something + * with these ereports, so we continue generating them internally. + */ + if (pool_state == SPA_LOAD_IMPORT) { + zfs_stats.import_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "ignoring '%s' during import", class); + return; + } + + /* + * Device I/O errors are ignored during pool open. 
+ */ + if (pool_state == SPA_LOAD_OPEN && + (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) { + fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class); + zfs_stats.dev_drops.fmds_value.ui64++; + return; + } + + /* + * We ignore ereports for anything except disks and files. + */ + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + &type) == 0) { + if (strcmp(type, VDEV_TYPE_DISK) != 0 && + strcmp(type, VDEV_TYPE_FILE) != 0) { + zfs_stats.vdev_drops.fmds_value.ui64++; + return; + } + } + + /* + * Determine if this ereport corresponds to an open case. + * Each vdev or pool can have a single case. + */ + (void) nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid); + if (nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + vdev_guid = 0; + if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0) + ena = 0; + + zfs_ereport_when(hdl, nvl, &er_when); + + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid) { + pool_found = B_TRUE; + pool_load = zcp->zc_when; + } + if (zcp->zc_data.zc_vdev_guid == vdev_guid) + break; + } + + /* + * Avoid falsely accusing a pool of being faulty. Do so by + * not replaying ereports that were generated prior to the + * current import. If the failure that generated them was + * transient because the device was actually removed but we + * didn't receive the normal asynchronous notification, we + * don't want to mark it as faulted and potentially panic. If + * there is still a problem we'd expect not to be able to + * import the pool, or that new ereports will be generated + * once the pool is used. + */ + if (pool_found && timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, er_when.ertv_nsec, + pool_load.ertv_sec, pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + + if (!pool_found) { + /* + * Haven't yet seen this pool, but same situation + * may apply. + */ + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + struct load_time_arg la; + + la.lt_guid = pool_guid; + la.lt_time = &pool_load; + la.lt_found = B_FALSE; + + if (zhdl != NULL && + zpool_iter(zhdl, zpool_find_load_time, &la) == 0 && + la.lt_found == B_TRUE) { + pool_found = B_TRUE; + + if (timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, " + "pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, + er_when.ertv_nsec, pool_load.ertv_sec, + pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + } + } + + if (zcp == NULL) { + fmd_case_t *cs; + zfs_case_data_t data = { 0 }; + + /* + * If this is one of our 'fake' resource ereports, and there is + * no case open, simply discard it. 
+ */ + if (isresource) { + zfs_stats.resource_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "discarding '%s for vdev %llu", + class, vdev_guid); + return; + } + + /* + * Skip tracking some ereports + */ + if (strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Open a new case. + */ + cs = fmd_case_open(hdl, NULL); + + fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'", + vdev_guid, class); + + /* + * Initialize the case buffer. To commonize code, we actually + * create the buffer with existing data, and then call + * zfs_case_unserialize() to instantiate the in-core structure. + */ + fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t)); + + data.zc_version = CASE_DATA_VERSION_SERD; + data.zc_ena = ena; + data.zc_pool_guid = pool_guid; + data.zc_vdev_guid = vdev_guid; + data.zc_pool_state = (int)pool_state; + + fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data)); + + zcp = zfs_case_unserialize(hdl, cs); + assert(zcp != NULL); + if (pool_found) + zcp->zc_when = pool_load; + } + + if (isresource) { + fmd_hdl_debug(hdl, "resource event '%s'", class); + + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) { + /* + * The 'resource.fs.zfs.autoreplace' event indicates + * that the pool was loaded with the 'autoreplace' + * property set. In this case, any pending device + * failures should be ignored, as the asynchronous + * autoreplace handling will take care of them. + */ + fmd_case_close(hdl, zcp->zc_case); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) { + /* + * The 'resource.fs.zfs.removed' event indicates that + * device removal was detected, and the device was + * closed asynchronously. If this is the case, we + * assume that any recent I/O errors were due to the + * device removal, not any fault of the device itself. + * We reset the SERD engine, and cancel any pending + * timers. + */ + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + zfs_case_serialize(hdl, zcp); + } + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_checksum); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { + uint64_t state = 0; + + if (zcp != NULL && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 && + state == VDEV_STATE_HEALTHY) { + fmd_hdl_debug(hdl, "closing case after a " + "device statechange to healthy"); + fmd_case_close(hdl, zcp->zc_case); + } + } + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Associate the ereport with this case. + */ + fmd_case_add_ereport(hdl, zcp->zc_case, ep); + + /* + * Don't do anything else if this case is already solved. + */ + if (fmd_case_solved(hdl, zcp->zc_case)) + return; + + fmd_hdl_debug(hdl, "error event '%s'", class); + + /* + * Determine if we should solve the case and generate a fault. We solve + * a case if: + * + * a. A pool failed to open (ereport.fs.zfs.pool) + * b. A device failed to open (ereport.fs.zfs.pool) while a pool + * was up and running. + * + * We may see a series of ereports associated with a pool open, all + * chained together by the same ENA. 
If the pool open succeeds, then + * we'll see no further ereports. To detect when a pool open has + * succeeded, we associate a timer with the event. When it expires, we + * close the case. + */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) { + /* + * Pool level fault. Before solving the case, go through and + * close any open device cases that may be pending. + */ + for (dcp = uu_list_first(zfs_cases); dcp != NULL; + dcp = uu_list_next(zfs_cases, dcp)) { + if (dcp->zc_data.zc_pool_guid == + zcp->zc_data.zc_pool_guid && + dcp->zc_data.zc_vdev_guid != 0) + fmd_case_close(hdl, dcp->zc_case); + } + + zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) { + /* + * Pool level fault for reading the intent logs. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) { + /* + * Device fault. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { + char *failmode = NULL; + boolean_t checkremove = B_FALSE; + + /* + * If this is a checksum or I/O error, then toss it into the + * appropriate SERD engine and check to see if it has fired. + * Ideally, we want to do something more sophisticated, + * (persistent errors for a single data block, etc). For now, + * a single SERD engine is sufficient. + */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) { + if (zcp->zc_data.zc_serd_io[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_io, + pool_guid, vdev_guid, "io"); + fmd_serd_create(hdl, zcp->zc_data.zc_serd_io, + fmd_prop_get_int32(hdl, "io_N"), + fmd_prop_get_int64(hdl, "io_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) + checkremove = B_TRUE; + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { + if (zcp->zc_data.zc_serd_checksum[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_checksum, + pool_guid, vdev_guid, "checksum"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_checksum, + fmd_prop_get_int32(hdl, "checksum_N"), + fmd_prop_get_int64(hdl, "checksum_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, + zcp->zc_data.zc_serd_checksum, ep)) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.checksum", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) && + (nvlist_lookup_string(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) && + failmode != NULL) { + if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE, + strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_continue", + B_FALSE); + } else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT, + strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_wait", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { +#ifndef __linux__ + /* This causes an unexpected fault diagnosis on linux */ + checkremove = B_TRUE; +#endif + } + + /* + * Because I/O errors may be due to device removal, 
we postpone + * any diagnosis until we're sure that we aren't about to + * receive a 'resource.fs.zfs.removed' event. + */ + if (checkremove) { + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL, + zfs_remove_timeout); + if (!zcp->zc_data.zc_has_remove_timer) { + zcp->zc_data.zc_has_remove_timer = 1; + zfs_case_serialize(hdl, zcp); + } + } + } +} + +/* + * The timeout is fired when we diagnosed an I/O error, and it was not due to + * device removal (which would cause the timeout to be cancelled). + */ +/* ARGSUSED */ +static void +zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data) +{ + zfs_case_t *zcp = data; + + if (id == zcp->zc_remove_timer) + zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE); +} + +/* + * The specified case has been closed and any case-specific + * data structures should be deallocated. + */ +static void +zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) +{ + zfs_case_t *zcp = fmd_case_getspecific(hdl, cs); + + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); +} + +/* + * We use the fmd gc entry point to look for old cases that no longer apply. + * This allows us to keep our set of case data small in a long running system. + */ +static void +zfs_fm_gc(fmd_hdl_t *hdl) +{ + zfs_purge_cases(hdl); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_fm_recv, /* fmdo_recv */ + zfs_fm_timeout, /* fmdo_timeout */ + zfs_fm_close, /* fmdo_close */ + NULL, /* fmdo_stats */ + zfs_fm_gc, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "checksum_N", FMD_TYPE_UINT32, "10" }, + { "checksum_T", FMD_TYPE_TIME, "10min" }, + { "io_N", FMD_TYPE_UINT32, "10" }, + { "io_T", FMD_TYPE_TIME, "10min" }, + { "remove_timeout", FMD_TYPE_TIME, "15sec" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props +}; + void -zfs_diagnosis_fini(void) +_zfs_diagnosis_init(fmd_hdl_t *hdl) { + libzfs_handle_t *zhdl; + + if ((zhdl = __libzfs_init()) == NULL) + return; + + if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool", + sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node), + NULL, UU_LIST_POOL_DEBUG)) == NULL) { + __libzfs_fini(zhdl); + return; + } + + if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, + UU_LIST_DEBUG)) == NULL) { + uu_list_pool_destroy(zfs_case_pool); + __libzfs_fini(zhdl); + return; + } + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + __libzfs_fini(zhdl); + return; + } + + fmd_hdl_setspecific(hdl, zhdl); + + (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / + sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); + + zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); +} + +void +_zfs_diagnosis_fini(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl; + + /* + * Remove all active cases. 
+ */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + fmd_hdl_debug(hdl, "removing case ena %llu", + (long long unsigned)zcp->zc_data.zc_ena); + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + } + uu_list_walk_end(walk); + + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + + zhdl = fmd_hdl_getspecific(hdl); + __libzfs_fini(zhdl); } diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_mod.c zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_mod.c --- zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_mod.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_mod.c 2017-01-20 18:18:28.000000000 +0000 @@ -168,7 +168,7 @@ * operation when finished). If this succeeds, then we're done. If it fails, * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, * but that the label was not what we expected. If the 'autoreplace' property - * is not set, then we relabel the disk (if specified), and attempt a 'zpool + * is enabled, then we relabel the disk (if specified), and attempt a 'zpool * replace'. If the online is successful, but the new state is something else * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of * race, and we should avoid attempting to relabel the disk. @@ -215,9 +215,7 @@ if (offline) return; /* don't intervene if it was taken offline */ -#ifdef HAVE_LIBDEVMAPPER is_dm = zfs_dev_is_dm(path); -#endif zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'" " wholedisk %d, dm %d (%llu)", zpool_get_name(zhp), path, physpath ? physpath : "NULL", wholedisk, is_dm, @@ -261,16 +259,15 @@ } /* - * If the pool doesn't have the autoreplace property set, then attempt - * a true online (without the unspare flag), which will trigger a FMA - * fault. + * If the pool doesn't have the autoreplace property set, then use + * vdev online to trigger a FMA fault by posting an ereport. 
*/ - if (!is_dm && (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || - !wholedisk || physpath == NULL)) { + if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || + !(wholedisk || is_dm) || (physpath == NULL)) { (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); - zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", - fullpath, libzfs_error_description(g_zfshdl)); + zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or " + "not a whole disk for '%s'", fullpath); return; } @@ -291,12 +288,6 @@ return; } - if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL)) { - zed_log_msg(LOG_INFO, "%s: Autoreplace is not enabled on this" - " pool, ignore disk.", __func__); - return; - } - /* Only autoreplace bad disks */ if ((vs->vs_state != VDEV_STATE_DEGRADED) && (vs->vs_state != VDEV_STATE_FAULTED) && @@ -352,7 +343,7 @@ list_insert_tail(&g_device_list, device); zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", - leafname, (u_longlong_t) guid); + leafname, (u_longlong_t)guid); return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ @@ -369,16 +360,20 @@ found = B_TRUE; break; } + zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s", + physpath, device->pd_physpath); } if (!found) { /* unexpected partition slice encountered */ + zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", + fullpath); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); return; } zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", - physpath, (u_longlong_t) guid); + physpath, (u_longlong_t)guid); (void) snprintf(devpath, sizeof (devpath), "%s%s", DEV_BYID_PATH, new_devid); @@ -404,8 +399,8 @@ nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 || (physpath != NULL && nvlist_add_string(newvd, ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || - nvlist_add_string(newvd, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, - enc_sysfs_path) != 0 || + (enc_sysfs_path != NULL && nvlist_add_string(newvd, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) || nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, @@ -482,7 +477,7 @@ } else if (dp->dd_compare != NULL) { /* * NOTE: On Linux there is an event for partition, so unlike - * illumos, substring matching is not required to accomodate + * illumos, substring matching is not required to accommodate * the partition suffix. An exact match will be present in * the dp->dd_compare value. */ @@ -656,14 +651,10 @@ * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). * * For disks, we only want to pay attention to vdevs marked as whole - * disks. For multipath devices does whole disk apply? (TBD). + * disks or are a multipath device. 
*/ - if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) { - if (!is_slice) { - (void) devphys_iter(devpath, devid, zfs_process_add, - is_slice); - } - } + if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) + (void) devphys_iter(devpath, devid, zfs_process_add, is_slice); return (0); } @@ -849,9 +840,9 @@ * For now, each agent has it's own libzfs instance */ int -zfs_slm_init(libzfs_handle_t *zfs_hdl) +zfs_slm_init() { - if ((g_zfshdl = libzfs_init()) == NULL) + if ((g_zfshdl = __libzfs_init()) == NULL) return (-1); /* @@ -863,6 +854,7 @@ if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) { list_destroy(&g_pool_list); + __libzfs_fini(g_zfshdl); return (-1); } @@ -903,19 +895,12 @@ } list_destroy(&g_device_list); - libzfs_fini(g_zfshdl); + __libzfs_fini(g_zfshdl); } void zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl) { - static pthread_mutex_t serialize = PTHREAD_MUTEX_INITIALIZER; - - /* - * Serialize incoming events from zfs or libudev sources - */ - (void) pthread_mutex_lock(&serialize); zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass); (void) zfs_slm_deliver_event(class, subclass, nvl); - (void) pthread_mutex_unlock(&serialize); } diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_retire.c zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_retire.c --- zfs-linux-0.7.0-rc2/cmd/zed/agents/zfs_retire.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/agents/zfs_retire.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,26 +20,623 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. */ +/* + * The ZFS retire agent is responsible for managing hot spares across all pools. + * When we see a device fault or a device removal, we try to open the associated + * pool and look for any hot spares. We iterate over any available hot spares + * and attempt a 'zpool replace' for each one. + * + * For vdevs diagnosed as faulty, the agent is also responsible for proactively + * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). + */ + +#include +#include +#include +#include +#include + #include "zfs_agents.h" -#include "../zed_log.h" +#include "fmd_api.h" -/*ARGSUSED*/ -void -zfs_retire_recv(nvlist_t *nvl, const char *class) + +typedef struct zfs_retire_repaired { + struct zfs_retire_repaired *zrr_next; + uint64_t zrr_pool; + uint64_t zrr_vdev; +} zfs_retire_repaired_t; + +typedef struct zfs_retire_data { + libzfs_handle_t *zrd_hdl; + zfs_retire_repaired_t *zrd_repaired; +} zfs_retire_data_t; + +static void +zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) { + zfs_retire_repaired_t *zrp; + + while ((zrp = zdp->zrd_repaired) != NULL) { + zdp->zrd_repaired = zrp->zrr_next; + fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); + } } -/*ARGSUSED*/ -int -zfs_retire_init(libzfs_handle_t *zfs_hdl) +/* + * Find a pool with a matching GUID. + */ +typedef struct find_cbdata { + uint64_t cb_guid; + const char *cb_fru; + zpool_handle_t *cb_zhp; + nvlist_t *cb_vdev; +} find_cbdata_t; + +static int +find_pool(zpool_handle_t *zhp, void *data) { + find_cbdata_t *cbp = data; + + if (cbp->cb_guid == + zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); return (0); } +/* + * Find a vdev within a tree with a matching GUID. 
+ */ +static nvlist_t * +find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, const char *search_fru, + uint64_t search_guid) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + nvlist_t *ret; + char *fru; + + if (search_fru != NULL) { + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &fru) == 0 && + libzfs_fru_compare(zhdl, fru, search_fru)) + return (nv); + } else { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + guid == search_guid) { + fmd_hdl_debug(fmd_module_hdl("zfs-retire"), + "matched vdev %llu", guid); + return (nv); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_fru, + search_guid)) != NULL) + return (ret); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_fru, + search_guid)) != NULL) + return (ret); + } + + return (NULL); +} + +/* + * Given a (pool, vdev) GUID pair, find the matching pool and vdev. + */ +static zpool_handle_t * +find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp) +{ + find_cbdata_t cb; + zpool_handle_t *zhp; + nvlist_t *config, *nvroot; + + /* + * Find the corresponding pool and make sure the vdev still exists. + */ + cb.cb_guid = pool_guid; + if (zpool_iter(zhdl, find_pool, &cb) != 1) + return (NULL); + + zhp = cb.cb_zhp; + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) { + zpool_close(zhp); + return (NULL); + } + + if (vdev_guid != 0) { + if ((*vdevp = find_vdev(zhdl, nvroot, NULL, + vdev_guid)) == NULL) { + zpool_close(zhp); + return (NULL); + } + } + + return (zhp); +} + +#ifdef _HAS_FMD_TOPO +static int +search_pool(zpool_handle_t *zhp, void *data) +{ + find_cbdata_t *cbp = data; + nvlist_t *config; + nvlist_t *nvroot; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) { + zpool_close(zhp); + return (0); + } + + if ((cbp->cb_vdev = find_vdev(zpool_get_handle(zhp), nvroot, + cbp->cb_fru, 0)) != NULL) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +/* + * Given a FRU FMRI, find the matching pool and vdev. + */ +static zpool_handle_t * +find_by_fru(libzfs_handle_t *zhdl, const char *fru, nvlist_t **vdevp) +{ + find_cbdata_t cb; + + cb.cb_fru = fru; + cb.cb_zhp = NULL; + if (zpool_iter(zhdl, search_pool, &cb) != 1) + return (NULL); + + *vdevp = cb.cb_vdev; + return (cb.cb_zhp); +} +#endif /* _HAS_FMD_TOPO */ + +/* + * Given a vdev, attempt to replace it with every known spare until one + * succeeds. + */ +static void +replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) +{ + nvlist_t *config, *nvroot, *replacement; + nvlist_t **spares; + uint_t s, nspares; + char *dev_name; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) + return; + + /* + * Find out if there are any hot spares available in the pool. 
+ */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + return; + + replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT); + + dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); + + /* + * Try to replace each spare, ending when we successfully + * replace it. + */ + for (s = 0; s < nspares; s++) { + char *spare_name; + + if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, + &spare_name) != 0) + continue; + + (void) nvlist_add_nvlist_array(replacement, + ZPOOL_CONFIG_CHILDREN, &spares[s], 1); + + fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", + dev_name, basename(spare_name)); + + if (zpool_vdev_attach(zhp, dev_name, spare_name, + replacement, B_TRUE) == 0) + break; + } + + free(dev_name); + nvlist_free(replacement); +} + +/* + * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and + * ASRU is now usable. ZFS has found the device to be present and + * functioning. + */ /*ARGSUSED*/ +static void +zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + zfs_retire_repaired_t *zrp; + uint64_t pool_guid, vdev_guid; +#ifdef _HAS_FMD_TOPO + nvlist_t *asru; +#endif + + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + return; + + /* + * Before checking the state of the ASRU, go through and see if we've + * already made an attempt to repair this ASRU. This list is cleared + * whenever we receive any kind of list event, and is designed to + * prevent us from generating a feedback loop when we attempt repairs + * against a faulted pool. The problem is that checking the unusable + * state of the ASRU can involve opening the pool, which can post + * statechange events but otherwise leave the pool in the faulted + * state. This list allows us to detect when a statechange event is + * due to our own request. + */ + for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { + if (zrp->zrr_pool == pool_guid && + zrp->zrr_vdev == vdev_guid) + return; + } + +#ifdef _HAS_FMD_TOPO + asru = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_uint8(asru, FM_VERSION, ZFS_SCHEME_VERSION0); + (void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); + (void) nvlist_add_uint64(asru, FM_FMRI_ZFS_POOL, pool_guid); + (void) nvlist_add_uint64(asru, FM_FMRI_ZFS_VDEV, vdev_guid); + + /* + * We explicitly check for the unusable state here to make sure we + * aren't responding to a transient state change. As part of opening a + * vdev, it's possible to see the 'statechange' event, only to be + * followed by a vdev failure later. If we don't check the current + * state of the vdev (or pool) before marking it repaired, then we risk + * generating spurious repair events followed immediately by the same + * diagnosis. + * + * This assumes that the ZFS scheme code associated unusable (i.e. + * isolated) with its own definition of faulty state. In the case of a + * DEGRADED leaf vdev (due to checksum errors), this is not the case. + * This works, however, because the transient state change is not + * posted in this case. This could be made more explicit by not + * relying on the scheme's unusable callback and instead directly + * checking the vdev state, where we could correctly account for + * DEGRADED state. 
+ */ + if (!fmd_nvl_fmri_unusable(hdl, asru) && fmd_nvl_fmri_has_fault(hdl, + asru, FMD_HAS_FAULT_ASRU, NULL)) { + topo_hdl_t *thp; + char *fmri = NULL; + int err; + + thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); + if (topo_fmri_nvl2str(thp, asru, &fmri, &err) == 0) + (void) fmd_repair_asru(hdl, fmri); + fmd_hdl_topo_rele(hdl, thp); + + topo_hdl_strfree(thp, fmri); + } + nvlist_free(asru); +#endif + zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); + zrp->zrr_next = zdp->zrd_repaired; + zrp->zrr_pool = pool_guid; + zrp->zrr_vdev = vdev_guid; + zdp->zrd_repaired = zrp; + + fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu", + vdev_guid, pool_guid); +} + +/*ARGSUSED*/ +static void +zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, + const char *class) +{ + uint64_t pool_guid, vdev_guid; + zpool_handle_t *zhp; + nvlist_t *resource, *fault; + nvlist_t **faults; + uint_t f, nfaults; + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + libzfs_handle_t *zhdl = zdp->zrd_hdl; + boolean_t fault_device, degrade_device; + boolean_t is_repair; + char *scheme; + nvlist_t *vdev = NULL; + char *uuid; + int repair_done = 0; + boolean_t retire; + boolean_t is_disk; + vdev_aux_t aux; + uint64_t state = 0; + + fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); + + /* + * If this is a resource notifying us of device removal, then simply + * check for an available spare and continue. + */ + if (strcmp(class, "resource.fs.zfs.removed") == 0) { + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + return; + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + return; + + if (fmd_prop_get_int32(hdl, "spare_on_remove")) + replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); + return; + } + + if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) + return; + + /* + * Note: on zfsonlinux statechange events are more than just + * healthy ones so we need to confirm the actual state value. + */ + if (strcmp(class, "resource.fs.zfs.statechange") == 0 && + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, + &state) == 0 && state == VDEV_STATE_HEALTHY) { + zfs_vdev_repair(hdl, nvl); + return; + } + if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + zfs_vdev_repair(hdl, nvl); + return; + } + + zfs_retire_clear_data(hdl, zdp); + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) + is_repair = B_TRUE; + else + is_repair = B_FALSE; + + /* + * We subscribe to zfs faults as well as all repair events. + */ + if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, + &faults, &nfaults) != 0) + return; + + for (f = 0; f < nfaults; f++) { + fault = faults[f]; + + fault_device = B_FALSE; + degrade_device = B_FALSE; + is_disk = B_FALSE; + + if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, + &retire) == 0 && retire == 0) + continue; + + /* + * While we subscribe to fault.fs.zfs.*, we only take action + * for faults targeting a specific vdev (open failure or SERD + * failure). We also subscribe to fault.io.* events, so that + * faulty disks will be faulted in the ZFS configuration. 
+ */ + if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { + fault_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.checksum")) { + degrade_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.device")) { + fault_device = B_FALSE; + } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { + is_disk = B_TRUE; + fault_device = B_TRUE; + } else { + continue; + } + + if (is_disk) { +#ifdef _HAS_FMD_TOPO + /* + * This is a disk fault. Lookup the FRU, convert it to + * an FMRI string, and attempt to find a matching vdev. + */ + if (nvlist_lookup_nvlist(fault, FM_FAULT_FRU, + &fru) != 0 || + nvlist_lookup_string(fru, FM_FMRI_SCHEME, + &scheme) != 0) + continue; + + if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) + continue; + + thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); + if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) { + fmd_hdl_topo_rele(hdl, thp); + continue; + } + + zhp = find_by_fru(zhdl, fmri, &vdev); + topo_hdl_strfree(thp, fmri); + fmd_hdl_topo_rele(hdl, thp); + + if (zhp == NULL) + continue; + + (void) nvlist_lookup_uint64(vdev, + ZPOOL_CONFIG_GUID, &vdev_guid); + aux = VDEV_AUX_EXTERNAL; +#else + continue; +#endif + } else { + /* + * This is a ZFS fault. Lookup the resource, and + * attempt to find the matching vdev. + */ + if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, + &resource) != 0 || + nvlist_lookup_string(resource, FM_FMRI_SCHEME, + &scheme) != 0) + continue; + + if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, + &pool_guid) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, + &vdev_guid) != 0) { + if (is_repair) + vdev_guid = 0; + else + continue; + } + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + continue; + + aux = VDEV_AUX_ERR_EXCEEDED; + } + + if (vdev_guid == 0) { + /* + * For pool-level repair events, clear the entire pool. + */ + fmd_hdl_debug(hdl, "zpool_clear of pool '%s'", + zpool_get_name(zhp)); + (void) zpool_clear(zhp, NULL, NULL); + zpool_close(zhp); + continue; + } + + /* + * If this is a repair event, then mark the vdev as repaired and + * continue. + */ + if (is_repair) { + repair_done = 1; + fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu", + zpool_get_name(zhp), vdev_guid); + (void) zpool_vdev_clear(zhp, vdev_guid); + zpool_close(zhp); + continue; + } + + /* + * Actively fault the device if needed. + */ + if (fault_device) + (void) zpool_vdev_fault(zhp, vdev_guid, aux); + if (degrade_device) + (void) zpool_vdev_degrade(zhp, vdev_guid, aux); + + if (fault_device || degrade_device) + fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'", + fault_device ? "fault" : "degrade", vdev_guid, + zpool_get_name(zhp)); + + /* + * Attempt to substitute a hot spare. 
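Stepping back, zfs_retire_recv() is a small class-to-action dispatcher. An illustrative summary, not part of the patch: the class strings are the ones matched above, and the action text paraphrases the libzfs calls made for each.

/* Illustrative summary of the dispatch above. */
static const struct {
	const char *event_class;
	const char *action;
} retire_dispatch[] = {
	{ "resource.fs.zfs.removed",	 "replace_with_spare() if spare_on_remove" },
	{ "resource.fs.zfs.statechange", "zfs_vdev_repair() when VDEV_STATE_HEALTHY" },
	{ "sysevent.fs.zfs.vdev_remove", "zfs_vdev_repair()" },
	{ "fault.fs.zfs.vdev.io",	 "zpool_vdev_fault(), then try a spare" },
	{ "fault.fs.zfs.vdev.checksum",	 "zpool_vdev_degrade(), then try a spare" },
	{ "list.repaired",		 "zpool_vdev_clear(), or zpool_clear() pool-wide" },
};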
+ */ + replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); + } + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && + nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) + fmd_case_uuresolved(hdl, uuid); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_retire_recv, /* fmdo_recv */ + NULL, /* fmdo_timeout */ + NULL, /* fmdo_close */ + NULL, /* fmdo_stats */ + NULL, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "spare_on_remove", FMD_TYPE_BOOL, "true" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props +}; + void -zfs_retire_fini(void) +_zfs_retire_init(fmd_hdl_t *hdl) { + zfs_retire_data_t *zdp; + libzfs_handle_t *zhdl; + + if ((zhdl = __libzfs_init()) == NULL) + return; + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + libzfs_fini(zhdl); + return; + } + + zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); + zdp->zrd_hdl = zhdl; + + fmd_hdl_setspecific(hdl, zdp); +} + +void +_zfs_retire_fini(fmd_hdl_t *hdl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + + if (zdp != NULL) { + zfs_retire_clear_data(hdl, zdp); + __libzfs_fini(zdp->zrd_hdl); + fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); + } } diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/Makefile.am zfs-linux-0.7.0-rc3/cmd/zed/Makefile.am --- zfs-linux-0.7.0-rc2/cmd/zed/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -27,10 +27,15 @@ zed_strings.h FMA_SRC = \ + agents/zfs_agents.c \ agents/zfs_agents.h \ agents/zfs_diagnosis.c \ agents/zfs_mod.c \ - agents/zfs_retire.c + agents/zfs_retire.c \ + agents/fmd_api.c \ + agents/fmd_api.h \ + agents/fmd_serd.c \ + agents/fmd_serd.h zed_SOURCES = $(ZED_SRC) $(FMA_SRC) @@ -38,10 +43,13 @@ $(top_builddir)/lib/libavl/libavl.la \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libspl/libspl.la \ + $(top_builddir)/lib/libuutil/libuutil.la \ $(top_builddir)/lib/libzpool/libzpool.la \ $(top_builddir)/lib/libzfs/libzfs.la \ $(top_builddir)/lib/libzfs_core/libzfs_core.la +zed_LDFLAGS = -lrt -pthread + zedconfdir = $(sysconfdir)/zfs/zed.d dist_zedconf_DATA = \ @@ -53,28 +61,22 @@ dist_zedexec_SCRIPTS = \ zed.d/all-debug.sh \ zed.d/all-syslog.sh \ - zed.d/checksum-notify.sh \ - zed.d/checksum-spare.sh \ zed.d/data-notify.sh \ zed.d/generic-notify.sh \ - zed.d/io-notify.sh \ - zed.d/io-spare.sh \ zed.d/resilver_finish-notify.sh \ zed.d/scrub_finish-notify.sh \ zed.d/statechange-led.sh \ + zed.d/statechange-notify.sh \ zed.d/vdev_clear-led.sh zedconfdefaults = \ all-syslog.sh \ - checksum-notify.sh \ - checksum-spare.sh \ data-notify.sh \ - io-notify.sh \ - io-spare.sh \ resilver_finish-notify.sh \ scrub_finish-notify.sh \ - statechange-blinkled.sh \ - vdev_clear-blinkled.sh + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh install-data-hook: $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.c zfs-linux-0.7.0-rc3/cmd/zed/zed.c --- zfs-linux-0.7.0-rc2/cmd/zed/zed.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.c 2017-01-20 18:18:28.000000000 +0000 @@ -255,7 +255,7 @@ zed_log_msg(LOG_NOTICE, "ZFS Event Daemon %s-%s (PID %d)", - ZFS_META_VERSION, ZFS_META_RELEASE, (int) getpid()); + ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid()); if (zed_conf_open_state(zcp) < 0) exit(EXIT_FAILURE); diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed_conf.c zfs-linux-0.7.0-rc3/cmd/zed/zed_conf.c --- 
zfs-linux-0.7.0-rc2/cmd/zed/zed_conf.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed_conf.c 2017-01-20 18:18:28.000000000 +0000 @@ -513,7 +513,7 @@ /* * Write PID file. */ - n = snprintf(buf, sizeof (buf), "%d\n", (int) getpid()); + n = snprintf(buf, sizeof (buf), "%d\n", (int)getpid()); if ((n < 0) || (n >= sizeof (buf))) { errno = ERANGE; zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s", @@ -637,7 +637,7 @@ "Failed to read state file: %s", strerror(errno)); return (-1); } - if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t) -1) { + if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) { zed_log_msg(LOG_WARNING, "Failed to reposition state file offset: %s", strerror(errno)); @@ -687,7 +687,7 @@ "Failed to write state file: %s", strerror(errno)); return (-1); } - if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t) -1) { + if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) { zed_log_msg(LOG_WARNING, "Failed to reposition state file offset: %s", strerror(errno)); diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/checksum-notify.sh zfs-linux-0.7.0-rc3/cmd/zed/zed.d/checksum-notify.sh --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/checksum-notify.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/checksum-notify.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,64 +0,0 @@ -#!/bin/sh -# -# Send notification in response to a CHECKSUM, DATA, or IO error. -# -# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given -# class/pool/[vdev] combination. This protects against spamming the recipient -# should multiple events occur together in time for the same pool/[vdev]. -# -# Exit codes: -# 0: notification sent -# 1: notification failed -# 2: notification not configured -# 3: notification suppressed -# 9: internal error - -[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" -. "${ZED_ZEDLET_DIR}/zed-functions.sh" - -[ -n "${ZEVENT_POOL}" ] || exit 9 -[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 - -if [ "${ZEVENT_SUBCLASS}" != "checksum" ] \ - && [ "${ZEVENT_SUBCLASS}" != "data" ] \ - && [ "${ZEVENT_SUBCLASS}" != "io" ]; then - zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" - exit 9 -fi - -rate_limit_tag="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify" -zed_rate_limit "${rate_limit_tag}" || exit 3 - -umask 077 -note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)" -note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" -{ - [ "${ZEVENT_SUBCLASS}" = "io" ] && article="an" || article="a" - - echo "ZFS has detected ${article} ${ZEVENT_SUBCLASS} error:" - echo - echo " eid: ${ZEVENT_EID}" - echo " class: ${ZEVENT_SUBCLASS}" - echo " host: $(hostname)" - echo " time: ${ZEVENT_TIME_STRING}" - - [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" - [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" - [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" - - [ -n "${ZEVENT_VDEV_CKSUM_ERRORS}" ] \ - && echo " cksum: ${ZEVENT_VDEV_CKSUM_ERRORS}" - - [ -n "${ZEVENT_VDEV_READ_ERRORS}" ] \ - && echo " read: ${ZEVENT_VDEV_READ_ERRORS}" - - [ -n "${ZEVENT_VDEV_WRITE_ERRORS}" ] \ - && echo " write: ${ZEVENT_VDEV_WRITE_ERRORS}" - - echo " pool: ${ZEVENT_POOL}" - -} > "${note_pathname}" - -zed_notify "${note_subject}" "${note_pathname}"; rv=$? 
-rm -f "${note_pathname}" -exit "${rv}" diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/checksum-spare.sh zfs-linux-0.7.0-rc3/cmd/zed/zed.d/checksum-spare.sh --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/checksum-spare.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/checksum-spare.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,239 +0,0 @@ -#!/bin/sh -# -# Replace a device with a hot spare in response to IO or CHECKSUM errors. -# The following actions will be performed automatically when the number -# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or -# ZED_SPARE_ON_CHECKSUM_ERRORS. -# -# 1) FAULT the device on IO errors, no futher IO will be attempted. -# DEGRADE the device on checksum errors, the device is still -# functional and can be used to service IO requests. -# 2) Set the SES fault beacon for the device. -# 3) Replace the device with a hot spare if any are available. -# -# Once the hot sparing operation is complete either the failed device or -# the hot spare must be manually retired using the 'zpool detach' command. -# The 'autoreplace' functionality which would normally take care of this -# under Illumos has not yet been implemented. -# -# Full support for autoreplace is planned, but it requires that the full -# ZFS Diagnosis Engine be ported. In the meanwhile this script provides -# the majority of the expected hot spare functionality. -# -# Exit codes: -# 0: hot spare replacement successful -# 1: hot spare device not available -# 2: hot sparing disabled or threshold not reached -# 3: device already faulted or degraded -# 9: internal error - -[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" -. "${ZED_ZEDLET_DIR}/zed-functions.sh" - -# Disabled by default. Enable in the zed.rc file. -: "${ZED_SPARE_ON_CHECKSUM_ERRORS:=0}" -: "${ZED_SPARE_ON_IO_ERRORS:=0}" - - -# query_vdev_status (pool, vdev) -# -# Given a [pool] and [vdev], return the matching vdev path & status on stdout. -# -# Warning: This function does not handle the case of [pool] or [vdev] -# containing whitespace. Beware of ShellCheck SC2046. Caveat emptor. -# -# Arguments -# pool: pool name -# vdev: virtual device name -# -# StdOut -# arg1: vdev pathname -# arg2: vdev status -# -query_vdev_status() -{ - local pool="$1" - local vdev="$2" - local t - - vdev="$(basename -- "${vdev}")" - ([ -n "${pool}" ] && [ -n "${vdev}" ]) || return - t="$(printf '\t')" - - "${ZPOOL}" status "${pool}" 2>/dev/null | sed -n -e \ - "s,^[ $t]*\(.*${vdev}\(-part[0-9]\+\)\?\)[ $t]*\([A-Z]\+\).*,\1 \3,p" \ - | tail -1 -} - - -# notify (old_vdev, new_vdev, num_errors) -# -# Send a notification regarding the hot spare replacement. 
-# -# Arguments -# old_vdev: path of old vdev that has failed -# new_vdev: path of new vdev used as the hot spare replacement -# num_errors: number of errors that triggered this replacement -# -notify() -{ - local old_vdev="$1" - local new_vdev="$2" - local num_errors="$3" - local note_subject - local note_pathname - local s - local rv - - umask 077 - note_subject="ZFS hot spare replacement for ${ZEVENT_POOL} on $(hostname)" - note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" - { - [ "${num_errors}" -ne 1 ] 2>/dev/null && s="s" - - echo "ZFS has replaced a failing device with a hot spare after" \ - "${num_errors} ${ZEVENT_SUBCLASS} error${s}:" - echo - echo " eid: ${ZEVENT_EID}" - echo " class: ${ZEVENT_SUBCLASS}" - echo " host: $(hostname)" - echo " time: ${ZEVENT_TIME_STRING}" - echo " old: ${old_vdev}" - echo " new: ${new_vdev}" - - "${ZPOOL}" status "${ZEVENT_POOL}" - - } > "${note_pathname}" - - zed_notify "${note_subject}" "${note_pathname}"; rv=$? - rm -f "${note_pathname}" - return "${rv}" -} - - -# main -# -# Arguments -# none -# -# Return -# see above -# -main() -{ - local num_errors - local action - local lockfile - local vdev_path - local vdev_status - local spare - local spare_path - local spare_status - local zpool_err - local zpool_rv - local rv - - # Avoid hot-sparing a hot-spare. - # - # Note: ZEVENT_VDEV_PATH is not defined for ZEVENT_VDEV_TYPE=spare. - # - [ "${ZEVENT_VDEV_TYPE}" = "spare" ] && exit 2 - - [ -n "${ZEVENT_POOL}" ] || exit 9 - [ -n "${ZEVENT_VDEV_GUID}" ] || exit 9 - [ -n "${ZEVENT_VDEV_PATH}" ] || exit 9 - - zed_check_cmd "${ZPOOL}" "${ZINJECT}" || exit 9 - - # Fault the device after a given number of I/O errors. - # - if [ "${ZEVENT_SUBCLASS}" = "io" ]; then - if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ]; then - num_errors=$((ZEVENT_VDEV_READ_ERRORS + ZEVENT_VDEV_WRITE_ERRORS)) - [ "${num_errors}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ] \ - && action="fault" - fi 2>/dev/null - - # Degrade the device after a given number of checksum errors. - # - elif [ "${ZEVENT_SUBCLASS}" = "checksum" ]; then - if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ]; then - num_errors="${ZEVENT_VDEV_CKSUM_ERRORS}" - [ "${num_errors}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ] \ - && action="degrade" - fi 2>/dev/null - - else - zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" - exit 9 - fi - - # Error threshold not reached. - # - if [ -z "${action}" ]; then - exit 2 - fi - - lockfile="zed.spare.lock" - zed_lock "${lockfile}" - - # shellcheck disable=SC2046 - set -- $(query_vdev_status "${ZEVENT_POOL}" "${ZEVENT_VDEV_PATH}") - vdev_path="$1" - vdev_status="$2" - - # Device is already FAULTED or DEGRADED. - # - if [ "${vdev_status}" = "FAULTED" ] \ - || [ "${vdev_status}" = "DEGRADED" ]; then - rv=3 - - else - rv=1 - - # 1) FAULT or DEGRADE the device. - # - "${ZINJECT}" -d "${ZEVENT_VDEV_GUID}" -A "${action}" "${ZEVENT_POOL}" - - # 2) Set the SES fault beacon. - # - # TODO: Set the 'fault' or 'ident' beacon for the device. This can - # be done through the sg_ses utility. The only hard part is to map - # the sd device to its corresponding enclosure and slot. We may - # be able to leverage the existing vdev_id scripts for this. - # - # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 - # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 - - # 3) Replace the device with a hot spare. - # - # Round-robin through the spares trying those that are available. 
- # - for spare in ${ZEVENT_VDEV_SPARE_PATHS}; do - - # shellcheck disable=SC2046 - set -- $(query_vdev_status "${ZEVENT_POOL}" "${spare}") - spare_path="$1" - spare_status="$2" - - [ "${spare_status}" = "AVAIL" ] || continue - - zpool_err="$("${ZPOOL}" replace "${ZEVENT_POOL}" \ - "${ZEVENT_VDEV_GUID}" "${spare_path}" 2>&1)"; zpool_rv=$? - - if [ "${zpool_rv}" -ne 0 ]; then - [ -n "${zpool_err}" ] && zed_log_err "zpool ${zpool_err}" - else - notify "${vdev_path}" "${spare_path}" "${num_errors}" - rv=0 - break - fi - done - fi - - zed_unlock "${lockfile}" - exit "${rv}" -} - - -main "$@" diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/data-notify.sh zfs-linux-0.7.0-rc3/cmd/zed/zed.d/data-notify.sh --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/data-notify.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/data-notify.sh 2017-01-20 18:18:28.000000000 +0000 @@ -1,6 +1,6 @@ #!/bin/sh # -# Send notification in response to a CHECKSUM, DATA, or IO error. +# Send notification in response to a DATA error. # # Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given # class/pool/[vdev] combination. This protects against spamming the recipient @@ -18,13 +18,7 @@ [ -n "${ZEVENT_POOL}" ] || exit 9 [ -n "${ZEVENT_SUBCLASS}" ] || exit 9 - -if [ "${ZEVENT_SUBCLASS}" != "checksum" ] \ - && [ "${ZEVENT_SUBCLASS}" != "data" ] \ - && [ "${ZEVENT_SUBCLASS}" != "io" ]; then - zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" - exit 9 -fi +[ -n "${ZED_NOTIFY_DATA}" ] || exit 3 rate_limit_tag="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify" zed_rate_limit "${rate_limit_tag}" || exit 3 @@ -33,30 +27,15 @@ note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)" note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" { - [ "${ZEVENT_SUBCLASS}" = "io" ] && article="an" || article="a" - - echo "ZFS has detected ${article} ${ZEVENT_SUBCLASS} error:" + echo "ZFS has detected a data error:" echo echo " eid: ${ZEVENT_EID}" echo " class: ${ZEVENT_SUBCLASS}" echo " host: $(hostname)" echo " time: ${ZEVENT_TIME_STRING}" - - [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" - [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" - [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" - - [ -n "${ZEVENT_VDEV_CKSUM_ERRORS}" ] \ - && echo " cksum: ${ZEVENT_VDEV_CKSUM_ERRORS}" - - [ -n "${ZEVENT_VDEV_READ_ERRORS}" ] \ - && echo " read: ${ZEVENT_VDEV_READ_ERRORS}" - - [ -n "${ZEVENT_VDEV_WRITE_ERRORS}" ] \ - && echo " write: ${ZEVENT_VDEV_WRITE_ERRORS}" - + echo " error: ${ZEVENT_ZIO_ERR}" + echo " objid: ${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}" echo " pool: ${ZEVENT_POOL}" - } > "${note_pathname}" zed_notify "${note_subject}" "${note_pathname}"; rv=$? diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/io-notify.sh zfs-linux-0.7.0-rc3/cmd/zed/zed.d/io-notify.sh --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/io-notify.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/io-notify.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,64 +0,0 @@ -#!/bin/sh -# -# Send notification in response to a CHECKSUM, DATA, or IO error. -# -# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given -# class/pool/[vdev] combination. This protects against spamming the recipient -# should multiple events occur together in time for the same pool/[vdev]. 
-# -# Exit codes: -# 0: notification sent -# 1: notification failed -# 2: notification not configured -# 3: notification suppressed -# 9: internal error - -[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" -. "${ZED_ZEDLET_DIR}/zed-functions.sh" - -[ -n "${ZEVENT_POOL}" ] || exit 9 -[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 - -if [ "${ZEVENT_SUBCLASS}" != "checksum" ] \ - && [ "${ZEVENT_SUBCLASS}" != "data" ] \ - && [ "${ZEVENT_SUBCLASS}" != "io" ]; then - zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" - exit 9 -fi - -rate_limit_tag="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify" -zed_rate_limit "${rate_limit_tag}" || exit 3 - -umask 077 -note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)" -note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" -{ - [ "${ZEVENT_SUBCLASS}" = "io" ] && article="an" || article="a" - - echo "ZFS has detected ${article} ${ZEVENT_SUBCLASS} error:" - echo - echo " eid: ${ZEVENT_EID}" - echo " class: ${ZEVENT_SUBCLASS}" - echo " host: $(hostname)" - echo " time: ${ZEVENT_TIME_STRING}" - - [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" - [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" - [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" - - [ -n "${ZEVENT_VDEV_CKSUM_ERRORS}" ] \ - && echo " cksum: ${ZEVENT_VDEV_CKSUM_ERRORS}" - - [ -n "${ZEVENT_VDEV_READ_ERRORS}" ] \ - && echo " read: ${ZEVENT_VDEV_READ_ERRORS}" - - [ -n "${ZEVENT_VDEV_WRITE_ERRORS}" ] \ - && echo " write: ${ZEVENT_VDEV_WRITE_ERRORS}" - - echo " pool: ${ZEVENT_POOL}" - -} > "${note_pathname}" - -zed_notify "${note_subject}" "${note_pathname}"; rv=$? -rm -f "${note_pathname}" -exit "${rv}" diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/io-spare.sh zfs-linux-0.7.0-rc3/cmd/zed/zed.d/io-spare.sh --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/io-spare.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/io-spare.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,239 +0,0 @@ -#!/bin/sh -# -# Replace a device with a hot spare in response to IO or CHECKSUM errors. -# The following actions will be performed automatically when the number -# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or -# ZED_SPARE_ON_CHECKSUM_ERRORS. -# -# 1) FAULT the device on IO errors, no futher IO will be attempted. -# DEGRADE the device on checksum errors, the device is still -# functional and can be used to service IO requests. -# 2) Set the SES fault beacon for the device. -# 3) Replace the device with a hot spare if any are available. -# -# Once the hot sparing operation is complete either the failed device or -# the hot spare must be manually retired using the 'zpool detach' command. -# The 'autoreplace' functionality which would normally take care of this -# under Illumos has not yet been implemented. -# -# Full support for autoreplace is planned, but it requires that the full -# ZFS Diagnosis Engine be ported. In the meanwhile this script provides -# the majority of the expected hot spare functionality. -# -# Exit codes: -# 0: hot spare replacement successful -# 1: hot spare device not available -# 2: hot sparing disabled or threshold not reached -# 3: device already faulted or degraded -# 9: internal error - -[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" -. "${ZED_ZEDLET_DIR}/zed-functions.sh" - -# Disabled by default. Enable in the zed.rc file. 
-: "${ZED_SPARE_ON_CHECKSUM_ERRORS:=0}" -: "${ZED_SPARE_ON_IO_ERRORS:=0}" - - -# query_vdev_status (pool, vdev) -# -# Given a [pool] and [vdev], return the matching vdev path & status on stdout. -# -# Warning: This function does not handle the case of [pool] or [vdev] -# containing whitespace. Beware of ShellCheck SC2046. Caveat emptor. -# -# Arguments -# pool: pool name -# vdev: virtual device name -# -# StdOut -# arg1: vdev pathname -# arg2: vdev status -# -query_vdev_status() -{ - local pool="$1" - local vdev="$2" - local t - - vdev="$(basename -- "${vdev}")" - ([ -n "${pool}" ] && [ -n "${vdev}" ]) || return - t="$(printf '\t')" - - "${ZPOOL}" status "${pool}" 2>/dev/null | sed -n -e \ - "s,^[ $t]*\(.*${vdev}\(-part[0-9]\+\)\?\)[ $t]*\([A-Z]\+\).*,\1 \3,p" \ - | tail -1 -} - - -# notify (old_vdev, new_vdev, num_errors) -# -# Send a notification regarding the hot spare replacement. -# -# Arguments -# old_vdev: path of old vdev that has failed -# new_vdev: path of new vdev used as the hot spare replacement -# num_errors: number of errors that triggered this replacement -# -notify() -{ - local old_vdev="$1" - local new_vdev="$2" - local num_errors="$3" - local note_subject - local note_pathname - local s - local rv - - umask 077 - note_subject="ZFS hot spare replacement for ${ZEVENT_POOL} on $(hostname)" - note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" - { - [ "${num_errors}" -ne 1 ] 2>/dev/null && s="s" - - echo "ZFS has replaced a failing device with a hot spare after" \ - "${num_errors} ${ZEVENT_SUBCLASS} error${s}:" - echo - echo " eid: ${ZEVENT_EID}" - echo " class: ${ZEVENT_SUBCLASS}" - echo " host: $(hostname)" - echo " time: ${ZEVENT_TIME_STRING}" - echo " old: ${old_vdev}" - echo " new: ${new_vdev}" - - "${ZPOOL}" status "${ZEVENT_POOL}" - - } > "${note_pathname}" - - zed_notify "${note_subject}" "${note_pathname}"; rv=$? - rm -f "${note_pathname}" - return "${rv}" -} - - -# main -# -# Arguments -# none -# -# Return -# see above -# -main() -{ - local num_errors - local action - local lockfile - local vdev_path - local vdev_status - local spare - local spare_path - local spare_status - local zpool_err - local zpool_rv - local rv - - # Avoid hot-sparing a hot-spare. - # - # Note: ZEVENT_VDEV_PATH is not defined for ZEVENT_VDEV_TYPE=spare. - # - [ "${ZEVENT_VDEV_TYPE}" = "spare" ] && exit 2 - - [ -n "${ZEVENT_POOL}" ] || exit 9 - [ -n "${ZEVENT_VDEV_GUID}" ] || exit 9 - [ -n "${ZEVENT_VDEV_PATH}" ] || exit 9 - - zed_check_cmd "${ZPOOL}" "${ZINJECT}" || exit 9 - - # Fault the device after a given number of I/O errors. - # - if [ "${ZEVENT_SUBCLASS}" = "io" ]; then - if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ]; then - num_errors=$((ZEVENT_VDEV_READ_ERRORS + ZEVENT_VDEV_WRITE_ERRORS)) - [ "${num_errors}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ] \ - && action="fault" - fi 2>/dev/null - - # Degrade the device after a given number of checksum errors. - # - elif [ "${ZEVENT_SUBCLASS}" = "checksum" ]; then - if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ]; then - num_errors="${ZEVENT_VDEV_CKSUM_ERRORS}" - [ "${num_errors}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ] \ - && action="degrade" - fi 2>/dev/null - - else - zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" - exit 9 - fi - - # Error threshold not reached. 
- # - if [ -z "${action}" ]; then - exit 2 - fi - - lockfile="zed.spare.lock" - zed_lock "${lockfile}" - - # shellcheck disable=SC2046 - set -- $(query_vdev_status "${ZEVENT_POOL}" "${ZEVENT_VDEV_PATH}") - vdev_path="$1" - vdev_status="$2" - - # Device is already FAULTED or DEGRADED. - # - if [ "${vdev_status}" = "FAULTED" ] \ - || [ "${vdev_status}" = "DEGRADED" ]; then - rv=3 - - else - rv=1 - - # 1) FAULT or DEGRADE the device. - # - "${ZINJECT}" -d "${ZEVENT_VDEV_GUID}" -A "${action}" "${ZEVENT_POOL}" - - # 2) Set the SES fault beacon. - # - # TODO: Set the 'fault' or 'ident' beacon for the device. This can - # be done through the sg_ses utility. The only hard part is to map - # the sd device to its corresponding enclosure and slot. We may - # be able to leverage the existing vdev_id scripts for this. - # - # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 - # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 - - # 3) Replace the device with a hot spare. - # - # Round-robin through the spares trying those that are available. - # - for spare in ${ZEVENT_VDEV_SPARE_PATHS}; do - - # shellcheck disable=SC2046 - set -- $(query_vdev_status "${ZEVENT_POOL}" "${spare}") - spare_path="$1" - spare_status="$2" - - [ "${spare_status}" = "AVAIL" ] || continue - - zpool_err="$("${ZPOOL}" replace "${ZEVENT_POOL}" \ - "${ZEVENT_VDEV_GUID}" "${spare_path}" 2>&1)"; zpool_rv=$? - - if [ "${zpool_rv}" -ne 0 ]; then - [ -n "${zpool_err}" ] && zed_log_err "zpool ${zpool_err}" - else - notify "${vdev_path}" "${spare_path}" "${num_errors}" - rv=0 - break - fi - done - fi - - zed_unlock "${lockfile}" - exit "${rv}" -} - - -main "$@" diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/statechange-led.sh zfs-linux-0.7.0-rc3/cmd/zed/zed.d/statechange-led.sh --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/statechange-led.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/statechange-led.sh 2017-01-20 18:18:28.000000000 +0000 @@ -12,8 +12,6 @@ # Linux SCSI enclosure services (ses) driver. The script will do nothing # if you have no enclosure, or if your enclosure isn't supported. # -# This script also requires ZFS to be built with libdevmapper support. -# # Exit codes: # 0: enclosure led successfully set # 1: enclosure leds not not available diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/statechange-notify.sh zfs-linux-0.7.0-rc3/cmd/zed/zed.d/statechange-notify.sh --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/statechange-notify.sh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/statechange-notify.sh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,74 @@ +#!/bin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# CDDL HEADER END +# + +# +# Send notification in response to a fault induced statechange +# +# ZEVENT_SUBCLASS: 'statechange' +# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED' +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: statechange not relevant +# 4: statechange string missing (unexpected) + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_VDEV_STATE_STR}" ] || exit 4 + +if [ "${ZEVENT_VDEV_STATE_STR}" != "FAULTED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "DEGRADED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ]; then + exit 3 +fi + +umask 077 +note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + if [ "${ZEVENT_VDEV_STATE_STR}" == "FAULTED" ] ; then + echo "The number of I/O errors associated with a ZFS device exceeded" + echo "acceptable levels. ZFS has marked the device as faulted." + elif [ "${ZEVENT_VDEV_STATE_STR}" == "DEGRADED" ] ; then + echo "The number of checksum errors associated with a ZFS device" + echo "exceeded acceptable levels. ZFS has marked the device as" + echo "degraded." + else + echo "ZFS has detected that a device was removed." + fi + + echo + echo " impact: Fault tolerance of the pool may be compromised." + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " state: ${ZEVENT_VDEV_STATE_STR}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" + [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" + [ -n "${ZEVENT_VDEV_PHYSPATH}" ] && echo " vphys: ${ZEVENT_VDEV_PHYSPATH}" + [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" + [ -n "${ZEVENT_VDEV_DEVID}" ] && echo " devid: ${ZEVENT_VDEV_DEVID}" + + echo " pool: ${ZEVENT_POOL_GUID}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? + +rm -f "${note_pathname}" +exit "${rv}" diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/vdev_attach-led.sh zfs-linux-0.7.0-rc3/cmd/zed/zed.d/vdev_attach-led.sh --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/vdev_attach-led.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/vdev_attach-led.sh 2017-01-20 18:18:28.000000000 +0000 @@ -12,8 +12,6 @@ # Linux SCSI enclosure services (ses) driver. The script will do nothing # if you have no enclosure, or if your enclosure isn't supported. # -# This script also requires ZFS to be built with libdevmapper support. -# # Exit codes: # 0: enclosure led successfully set # 1: enclosure leds not not available diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/vdev_clear-led.sh zfs-linux-0.7.0-rc3/cmd/zed/zed.d/vdev_clear-led.sh --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/vdev_clear-led.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/vdev_clear-led.sh 2017-01-20 18:18:28.000000000 +0000 @@ -12,8 +12,6 @@ # Linux SCSI enclosure services (ses) driver. The script will do nothing # if you have no enclosure, or if your enclosure isn't supported. # -# This script also requires ZFS to be built with libdevmapper support. -# # Exit codes: # 0: enclosure led successfully set # 1: enclosure leds not not available diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed.d/zed.rc zfs-linux-0.7.0-rc3/cmd/zed/zed.d/zed.rc --- zfs-linux-0.7.0-rc2/cmd/zed/zed.d/zed.rc 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed.d/zed.rc 2017-01-20 18:18:28.000000000 +0000 @@ -51,6 +51,12 @@ #ZED_NOTIFY_VERBOSE=0 ## +# Send notifications for 'ereport.fs.zfs.data' events. +# Disabled by default +# +#ZED_NOTIFY_DATA=1 + +## # Pushbullet access token. # This grants full access to your account -- protect it accordingly! # @@ -74,18 +80,6 @@ #ZED_RUNDIR="/var/run" ## -# Replace a device with a hot spare after N checksum errors are detected. 
-# Disabled by default; uncomment to enable. -# -#ZED_SPARE_ON_CHECKSUM_ERRORS=10 - -## -# Replace a device with a hot spare after N I/O errors are detected. -# Disabled by default; uncomment to enable. -# -#ZED_SPARE_ON_IO_ERRORS=1 - -## # Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for # device mapper and multipath devices as well. Your enclosure must be # supported by the Linux SES driver for this to work. diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed_disk_event.c zfs-linux-0.7.0-rc3/cmd/zed/zed_disk_event.c --- zfs-linux-0.7.0-rc2/cmd/zed/zed_disk_event.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed_disk_event.c 2017-01-20 18:18:28.000000000 +0000 @@ -80,7 +80,7 @@ if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval); - (void) zfs_slm_event(class, subclass, nvl); + (void) zfs_agent_post_event(class, subclass, nvl); } /* @@ -185,7 +185,7 @@ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); /* - * Strongly typed device is the prefered filter + * Strongly typed device is the preferred filter */ type = udev_device_get_property_value(dev, "ID_FS_TYPE"); if (type != NULL && type[0] != '\0') { @@ -213,8 +213,6 @@ strcmp(type, "disk") == 0 && part != NULL && part[0] != '\0') { /* skip and wait for partition event */ - zed_log_msg(LOG_INFO, "zed_udev_monitor: %s waiting " - "for slice", udev_device_get_devnode(dev)); udev_device_unref(dev); continue; } @@ -285,7 +283,7 @@ if (strcmp(class, EC_DEV_STATUS) == 0 && udev_device_get_property_value(dev, "DM_UUID") && udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) { - tmp = (char *) udev_device_get_devnode(dev); + tmp = (char *)udev_device_get_devnode(dev); tmp2 = zfs_get_underlying_path(tmp); if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) { /* @@ -297,12 +295,19 @@ * dev are the same name (i.e. /dev/dm-5), then * there is no real underlying disk for this * multipath device, and so this "change" event - * really a multipath removal. + * really is a multipath removal. */ class = EC_DEV_ADD; subclass = ESC_DISK; } else { - /* multipath remove, ignore it. 
*/ + tmp = (char *) + udev_device_get_property_value(dev, + "DM_NR_VALID_PATHS"); + /* treat as a multipath remove */ + if (tmp != NULL && strcmp(tmp, "0") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } } free(tmp2); } diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed_event.c zfs-linux-0.7.0-rc3/cmd/zed/zed_event.c --- zfs-linux-0.7.0-rc2/cmd/zed/zed_event.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed_event.c 2017-01-20 18:18:28.000000000 +0000 @@ -55,12 +55,8 @@ zed_log_die("Failed to open \"%s\": %s", ZFS_DEV, strerror(errno)); - if (zfs_slm_init(zcp->zfs_hdl) != 0) - zed_log_die("Failed to initialize zfs slm"); - if (zfs_diagnosis_init(zcp->zfs_hdl) != 0) - zed_log_die("Failed to initialize zfs diagnosis"); - if (zfs_retire_init(zcp->zfs_hdl) != 0) - zed_log_die("Failed to initialize zfs retire"); + zfs_agent_init(zcp->zfs_hdl); + if (zed_disk_event_init() != 0) zed_log_die("Failed to initialize disk events"); } @@ -75,9 +71,7 @@ zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL)); zed_disk_event_fini(); - zfs_retire_fini(); - zfs_diagnosis_fini(); - zfs_slm_fini(); + zfs_agent_fini(); if (zcp->zevent_fd >= 0) { if (close(zcp->zevent_fd) < 0) @@ -270,6 +264,13 @@ *dstp++ = '='; buflen--; + if (buflen <= 0) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } + va_start(vargs, fmt); n = vsnprintf(dstp, buflen, fmt, vargs); va_end(vargs); @@ -495,7 +496,7 @@ name = nvpair_name(nvp); (void) nvpair_value_int64_array(nvp, &i64p, &nelem); for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { - n = snprintf(p, buflen, "%lld ", (u_longlong_t) i64p[i]); + n = snprintf(p, buflen, "%lld ", (u_longlong_t)i64p[i]); if ((n < 0) || (n >= buflen)) return (_zed_event_add_array_err(eid, name)); p += n; @@ -527,7 +528,7 @@ fmt = _zed_event_value_is_hex(name) ? 
"0x%.16llX " : "%llu "; (void) nvpair_value_uint64_array(nvp, &u64p, &nelem); for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { - n = snprintf(p, buflen, fmt, (u_longlong_t) u64p[i]); + n = snprintf(p, buflen, fmt, (u_longlong_t)u64p[i]); if ((n < 0) || (n >= buflen)) return (_zed_event_add_array_err(eid, name)); p += n; @@ -609,7 +610,7 @@ _zed_event_add_var(eid, zsp, prefix, name, "%d", i8); break; case DATA_TYPE_INT8: - (void) nvpair_value_int8(nvp, (int8_t *) &i8); + (void) nvpair_value_int8(nvp, (int8_t *)&i8); _zed_event_add_var(eid, zsp, prefix, name, "%d", i8); break; case DATA_TYPE_UINT8: @@ -617,7 +618,7 @@ _zed_event_add_var(eid, zsp, prefix, name, "%u", i8); break; case DATA_TYPE_INT16: - (void) nvpair_value_int16(nvp, (int16_t *) &i16); + (void) nvpair_value_int16(nvp, (int16_t *)&i16); _zed_event_add_var(eid, zsp, prefix, name, "%d", i16); break; case DATA_TYPE_UINT16: @@ -625,7 +626,7 @@ _zed_event_add_var(eid, zsp, prefix, name, "%u", i16); break; case DATA_TYPE_INT32: - (void) nvpair_value_int32(nvp, (int32_t *) &i32); + (void) nvpair_value_int32(nvp, (int32_t *)&i32); _zed_event_add_var(eid, zsp, prefix, name, "%d", i32); break; case DATA_TYPE_UINT32: @@ -633,15 +634,15 @@ _zed_event_add_var(eid, zsp, prefix, name, "%u", i32); break; case DATA_TYPE_INT64: - (void) nvpair_value_int64(nvp, (int64_t *) &i64); + (void) nvpair_value_int64(nvp, (int64_t *)&i64); _zed_event_add_var(eid, zsp, prefix, name, - "%lld", (longlong_t) i64); + "%lld", (longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); _zed_event_add_var(eid, zsp, prefix, name, (_zed_event_value_is_hex(name) ? "0x%.16llX" : "%llu"), - (u_longlong_t) i64); + (u_longlong_t)i64); /* * shadow readable strings for vdev state pairs */ @@ -659,9 +660,9 @@ _zed_event_add_var(eid, zsp, prefix, name, "%g", d); break; case DATA_TYPE_HRTIME: - (void) nvpair_value_hrtime(nvp, (hrtime_t *) &i64); + (void) nvpair_value_hrtime(nvp, (hrtime_t *)&i64); _zed_event_add_var(eid, zsp, prefix, name, - "%llu", (u_longlong_t) i64); + "%llu", (u_longlong_t)i64); break; case DATA_TYPE_NVLIST: _zed_event_add_var(eid, zsp, prefix, name, @@ -832,17 +833,6 @@ } } -static void -_zed_internal_event(const char *class, nvlist_t *nvl) -{ - /* - * NOTE: only vdev check is handled for now - */ - if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) { - (void) zfs_slm_event("EC_zfs", "ESC_ZFS_vdev_check", nvl); - } -} - /* * Service the next zevent, blocking until one is available. 
*/ @@ -894,7 +884,7 @@ "Failed to lookup zevent class (eid=%llu)", eid); } else { /* let internal modules see this event first */ - _zed_internal_event(class, nvl); + zfs_agent_post_event(class, NULL, nvl); zsp = zed_strings_create(); @@ -906,7 +896,7 @@ _zed_event_add_env_preserve(eid, zsp); _zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "PID", - "%d", (int) getpid()); + "%d", (int)getpid()); _zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "ZEDLET_DIR", "%s", zcp->zedlet_dir); subclass = _zed_event_get_subclass(class); diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed_exec.c zfs-linux-0.7.0-rc3/cmd/zed/zed_exec.c --- zfs-linux-0.7.0-rc2/cmd/zed/zed_exec.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed_exec.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "zed_file.h" #include "zed_log.h" @@ -53,7 +54,7 @@ if (!buf) return (NULL); - pp = (char **) buf; + pp = (char **)buf; p = buf + (num_ptrs * sizeof (char *)); i = 0; for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) { @@ -65,12 +66,12 @@ } pp[i] = NULL; assert(buf + buflen == p); - return ((char **) buf); + return ((char **)buf); } /* * Fork a child process to handle event [eid]. The program [prog] - * in directory [dir] is executed with the envionment [env]. + * in directory [dir] is executed with the environment [env]. * * The file descriptor [zfd] is the zevent_fd used to track the * current cursor location within the zevent nvlist. @@ -115,19 +116,39 @@ zed_file_close_from(ZEVENT_FILENO + 1); execle(path, prog, NULL, env); _exit(127); - } else { - zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d", - prog, eid, pid); - /* FIXME: Timeout rogue child processes with sigalarm? */ -restart: - wpid = waitpid(pid, &status, 0); - if (wpid == (pid_t) -1) { + } + + /* parent process */ + + zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d", + prog, eid, pid); + + /* FIXME: Timeout rogue child processes with sigalarm? */ + + /* + * Wait for child process using WNOHANG to limit + * the time spent waiting to 10 seconds (10,000ms). 
+ */ + for (n = 0; n < 1000; n++) { + wpid = waitpid(pid, &status, WNOHANG); + if (wpid == (pid_t)-1) { if (errno == EINTR) - goto restart; + continue; zed_log_msg(LOG_WARNING, "Failed to wait for \"%s\" eid=%llu pid=%d", prog, eid, pid); - } else if (WIFEXITED(status)) { + break; + } else if (wpid == 0) { + struct timespec t; + + /* child still running */ + t.tv_sec = 0; + t.tv_nsec = 10000000; /* 10ms */ + (void) nanosleep(&t, NULL); + continue; + } + + if (WIFEXITED(status)) { zed_log_msg(LOG_INFO, "Finished \"%s\" eid=%llu pid=%d exit=%d", prog, eid, pid, WEXITSTATUS(status)); @@ -141,6 +162,16 @@ "Finished \"%s\" eid=%llu pid=%d status=0x%X", prog, eid, (unsigned int) status); } + break; + } + + /* + * kill child process after 10 seconds + */ + if (wpid == 0) { + zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d", + prog, pid); + (void) kill(pid, SIGKILL); } } diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed_log.c zfs-linux-0.7.0-rc3/cmd/zed/zed_log.c --- zfs-linux-0.7.0-rc2/cmd/zed/zed_log.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed_log.c 2017-01-20 18:18:28.000000000 +0000 @@ -66,11 +66,11 @@ { if ((_ctx.pipe_fd[0] != -1) || (_ctx.pipe_fd[1] != -1)) zed_log_die("Invalid use of zed_log_pipe_open in PID %d", - (int) getpid()); + (int)getpid()); if (pipe(_ctx.pipe_fd) < 0) zed_log_die("Failed to create daemonize pipe in PID %d: %s", - (int) getpid(), strerror(errno)); + (int)getpid(), strerror(errno)); } /* @@ -85,12 +85,12 @@ if (_ctx.pipe_fd[0] < 0) zed_log_die( "Invalid use of zed_log_pipe_close_reads in PID %d", - (int) getpid()); + (int)getpid()); if (close(_ctx.pipe_fd[0]) < 0) zed_log_die( "Failed to close reads on daemonize pipe in PID %d: %s", - (int) getpid(), strerror(errno)); + (int)getpid(), strerror(errno)); _ctx.pipe_fd[0] = -1; } @@ -110,12 +110,12 @@ if (_ctx.pipe_fd[1] < 0) zed_log_die( "Invalid use of zed_log_pipe_close_writes in PID %d", - (int) getpid()); + (int)getpid()); if (close(_ctx.pipe_fd[1]) < 0) zed_log_die( "Failed to close writes on daemonize pipe in PID %d: %s", - (int) getpid(), strerror(errno)); + (int)getpid(), strerror(errno)); _ctx.pipe_fd[1] = -1; } @@ -135,7 +135,7 @@ if (_ctx.pipe_fd[0] < 0) zed_log_die("Invalid use of zed_log_pipe_wait in PID %d", - (int) getpid()); + (int)getpid()); for (;;) { n = read(_ctx.pipe_fd[0], &c, sizeof (c)); @@ -144,7 +144,7 @@ continue; zed_log_die( "Failed to read from daemonize pipe in PID %d: %s", - (int) getpid(), strerror(errno)); + (int)getpid(), strerror(errno)); } if (n == 0) { break; diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed_strings.c zfs-linux-0.7.0-rc3/cmd/zed/zed_strings.c --- zfs-linux-0.7.0-rc2/cmd/zed/zed_strings.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed_strings.c 2017-01-20 18:18:28.000000000 +0000 @@ -206,7 +206,7 @@ if (!zsp->iteratorp) return (NULL); - return (((zed_strings_node_t *) zsp->iteratorp)->val); + return (((zed_strings_node_t *)zsp->iteratorp)->val); } diff -Nru zfs-linux-0.7.0-rc2/cmd/zed/zed_strings.h zfs-linux-0.7.0-rc3/cmd/zed/zed_strings.h --- zfs-linux-0.7.0-rc2/cmd/zed/zed_strings.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zed/zed_strings.h 2017-01-20 18:18:28.000000000 +0000 @@ -17,16 +17,11 @@ typedef struct zed_strings zed_strings_t; -zed_strings_t * zed_strings_create(void); - +zed_strings_t *zed_strings_create(void); void zed_strings_destroy(zed_strings_t *zsp); - int zed_strings_add(zed_strings_t *zsp, const char *key, const char *s); - -const char * zed_strings_first(zed_strings_t *zsp); - 
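The new wait logic above replaces a blocking waitpid() with a poll: up to 1000 WNOHANG checks spaced 10 ms apart, then SIGKILL. Reduced to a generic helper, this is a sketch assuming only POSIX waitpid/nanosleep/kill; the function name and return values are hypothetical:

#include <errno.h>
#include <signal.h>
#include <sys/wait.h>
#include <time.h>

/* Reap [pid], waiting at most [max_ms] milliseconds before killing it. */
static int
reap_with_timeout(pid_t pid, int *status, int max_ms)
{
	struct timespec t = { 0, 10000000 };	/* 10 ms per poll */
	int n;

	for (n = 0; n < max_ms / 10; n++) {
		pid_t w = waitpid(pid, status, WNOHANG);

		if (w == pid)
			return (0);		/* child exited */
		if ((w == (pid_t)-1) && (errno != EINTR))
			return (-1);		/* wait failed */
		(void) nanosleep(&t, NULL);	/* w == 0: still running */
	}

	(void) kill(pid, SIGKILL);		/* hung: kill, then reap */
	(void) waitpid(pid, status, 0);
	return (1);
}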
-const char * zed_strings_next(zed_strings_t *zsp); - +const char *zed_strings_first(zed_strings_t *zsp); +const char *zed_strings_next(zed_strings_t *zsp); int zed_strings_count(zed_strings_t *zsp); #endif /* !ZED_STRINGS_H */ diff -Nru zfs-linux-0.7.0-rc2/cmd/zfs/zfs_iter.c zfs-linux-0.7.0-rc3/cmd/zfs/zfs_iter.c --- zfs-linux-0.7.0-rc2/cmd/zfs/zfs_iter.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zfs/zfs_iter.c 2017-01-20 18:18:28.000000000 +0000 @@ -92,7 +92,7 @@ zfs_callback(zfs_handle_t *zhp, void *data) { callback_data_t *cb = data; - boolean_t dontclose = B_FALSE; + boolean_t should_close = B_TRUE; boolean_t include_snaps = zfs_include_snapshots(zhp, cb); boolean_t include_bmarks = (cb->cb_types & ZFS_TYPE_BOOKMARK); @@ -120,7 +120,7 @@ } } uu_avl_insert(cb->cb_avl, node, idx); - dontclose = B_TRUE; + should_close = B_FALSE; } else { free(node); } @@ -146,7 +146,7 @@ cb->cb_depth--; } - if (!dontclose) + if (should_close) zfs_close(zhp); return (0); diff -Nru zfs-linux-0.7.0-rc2/cmd/zfs/zfs_main.c zfs-linux-0.7.0-rc3/cmd/zfs/zfs_main.c --- zfs-linux-0.7.0-rc2/cmd/zfs/zfs_main.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zfs/zfs_main.c 2017-01-20 18:18:28.000000000 +0000 @@ -26,6 +26,7 @@ * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Igor Kozhukhov . */ #include @@ -270,7 +271,7 @@ return (gettext("\tset ... " " ...\n")); case HELP_SHARE: - return (gettext("\tshare <-a | filesystem>\n")); + return (gettext("\tshare <-a [nfs|smb] | filesystem>\n")); case HELP_SNAPSHOT: return (gettext("\tsnapshot|snap [-r] [-o property=value] ... " "@ ...\n")); @@ -279,7 +280,7 @@ "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: return (gettext("\tunshare " - "<-a | filesystem|mountpoint>\n")); + "<-a [nfs|smb] | filesystem|mountpoint>\n")); case HELP_ALLOW: return (gettext("\tallow \n" "\tallow [-ldug] " @@ -857,7 +858,7 @@ char *strval; char msg[1024]; - if ((p = strchr(argv[0], '/'))) + if ((p = strchr(argv[0], '/')) != NULL) *p = '\0'; zpool_handle = zpool_open(g_zfs, argv[0]); if (p != NULL) @@ -2174,7 +2175,7 @@ if (cb.cb_numfailed != 0) ret = 1; } else { - /* List old-version filesytems */ + /* List old-version filesystems */ boolean_t found; (void) printf(gettext("This system is currently running " "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); @@ -2332,9 +2333,9 @@ rc = (rv64 < lv64) ? 1 : -1; } else { if ((nvlist_lookup_string(lnvl, propname, - &lvstr) == ENOENT) || + &lvstr) == ENOENT) || (nvlist_lookup_string(rnvl, propname, - &rvstr) == ENOENT)) { + &rvstr) == ENOENT)) { goto compare_nums; } rc = strcmp(lvstr, rvstr); @@ -2353,6 +2354,7 @@ if (rv64 != lv64) rc = (rv64 < lv64) ? 
1 : -1; break; + default: break; } @@ -2529,7 +2531,7 @@ namelen = strlen(name); } nameidx = us_field_index("name"); - if (namelen > cb->cb_width[nameidx]) + if (nameidx >= 0 && namelen > cb->cb_width[nameidx]) cb->cb_width[nameidx] = namelen; /* @@ -2561,12 +2563,12 @@ if (!nvlist_exists(props, "used")) (void) nvlist_add_uint64(props, "used", 0); } else if (prop == ZFS_PROP_USEROBJUSED || - prop == ZFS_PROP_GROUPOBJUSED) { + prop == ZFS_PROP_GROUPOBJUSED) { propname = "objused"; if (!nvlist_exists(props, "objquota")) (void) nvlist_add_uint64(props, "objquota", 0); } else if (prop == ZFS_PROP_USEROBJQUOTA || - prop == ZFS_PROP_GROUPOBJQUOTA) { + prop == ZFS_PROP_GROUPOBJQUOTA) { propname = "objquota"; if (!nvlist_exists(props, "objused")) (void) nvlist_add_uint64(props, "objused", 0); @@ -2574,7 +2576,7 @@ return (-1); } sizeidx = us_field_index(propname); - if (sizelen > cb->cb_width[sizeidx]) + if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx]) cb->cb_width[sizeidx] = sizelen; if (nvlist_add_uint64(props, propname, space) != 0) @@ -2637,7 +2639,7 @@ case USFIELD_NAME: if (type == DATA_TYPE_UINT64) { (void) sprintf(valstr, "%llu", - (u_longlong_t) val64); + (u_longlong_t)val64); strval = valstr; } break; @@ -2648,7 +2650,7 @@ if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", - (u_longlong_t) val64); + (u_longlong_t)val64); } else { zfs_nicenum(val64, valstr, sizeof (valstr)); @@ -2672,9 +2674,9 @@ if (scripted) (void) printf("%s", strval); else if (field == USFIELD_TYPE || field == USFIELD_NAME) - (void) printf("%-*s", (int) width[field], strval); + (void) printf("%-*s", (int)width[field], strval); else - (void) printf("%*s", (int) width[field], strval); + (void) printf("%*s", (int)width[field], strval); first = B_FALSE; cfield++; @@ -2699,10 +2701,10 @@ col = gettext(us_field_hdr[field]); if (field == USFIELD_TYPE || field == USFIELD_NAME) { (void) printf(first ? "%-*s" : " %-*s", - (int) width[field], col); + (int)width[field], col); } else { (void) printf(first ? "%*s" : " %*s", - (int) width[field], col); + (int)width[field], col); } first = B_FALSE; cfield++; @@ -3376,7 +3378,7 @@ * * -r Delete any intervening snapshots before doing rollback * -R Delete any snapshots and their clones - * -f ignored for backwards compatability + * -f ignored for backwards compatibility * * Given a filesystem, rollback to a specific snapshot, discarding any changes * since then and making it the active dataset. If more recent snapshots exist, @@ -3980,7 +3982,7 @@ static int zfs_do_receive(int argc, char **argv) { - int c, err; + int c, err = 0; recvflags_t flags = { 0 }; boolean_t abort_resumable = B_FALSE; @@ -4263,7 +4265,7 @@ } } -static int inline +static int who_type2weight(zfs_deleg_who_type_t who_type) { int res; @@ -4483,7 +4485,7 @@ uu_avl_destroy(fsperm->fsp_uge_avl); } -static void inline +static void set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, zfs_deleg_who_type_t who_type, const char *name, char locality) { @@ -4580,8 +4582,9 @@ avl_pool = fspset->fsps_who_perm_avl_pool; avl = fsperm->fsp_uge_avl; break; + default: - break; + assert(!"unhandled zfs_deleg_who_type_t"); } if (is_set) { @@ -4617,6 +4620,7 @@ if (g) nice_name = g->gr_name; break; + default: break; } @@ -4832,9 +4836,9 @@ (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : HELP_ALLOW)); (void) fprintf(fp, gettext("Options:\n")); - for (i = 0; i < (un ? 
unallow_size : allow_size); i++) { - const char *opt = opt_desc[i++]; - const char *optdsc = opt_desc[i]; + for (i = 0; i < (un ? unallow_size : allow_size); i += 2) { + const char *opt = opt_desc[i]; + const char *optdsc = opt_desc[i + 1]; (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); } @@ -4954,8 +4958,8 @@ int i; char ld[2] = { '\0', '\0' }; char who_buf[MAXNAMELEN + 32]; - char base_type = ZFS_DELEG_WHO_UNKNOWN; - char set_type = ZFS_DELEG_WHO_UNKNOWN; + char base_type = '\0'; + char set_type = '\0'; nvlist_t *base_nvl = NULL; nvlist_t *set_nvl = NULL; nvlist_t *nvl; @@ -5004,8 +5008,10 @@ ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; - default: break; + + default: + assert(set_type != '\0' && base_type != '\0'); } if (perms != NULL) { @@ -5239,7 +5245,7 @@ } } -static void inline +static void print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, const char *title) { @@ -5290,8 +5296,10 @@ case ZFS_DELEG_EVERYONE: who = gettext("everyone"); who_name = NULL; - default: break; + + default: + assert(who != NULL); } prt_who = B_FALSE; @@ -5996,10 +6004,10 @@ shared_smb = zfs_is_shared_smb(zhp, NULL); if ((shared_nfs && shared_smb) || - ((shared_nfs && strcmp(shareopts, "on") == 0) && - (strcmp(smbshareopts, "off") == 0)) || - ((shared_smb && strcmp(smbshareopts, "on") == 0) && - (strcmp(shareopts, "off") == 0))) { + (shared_nfs && strcmp(shareopts, "on") == 0 && + strcmp(smbshareopts, "off") == 0) || + (shared_smb && strcmp(smbshareopts, "on") == 0 && + strcmp(shareopts, "off") == 0)) { if (!explicit) return (0); @@ -6436,7 +6444,7 @@ char sharesmb[ZFS_MAXPROPLEN]; /* check options */ - while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) { + while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "af")) != -1) { switch (c) { case 'a': do_all = 1; @@ -6444,6 +6452,11 @@ case 'f': flags = MS_FORCE; break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -6475,6 +6488,19 @@ unshare_unmount_node_t *node; uu_avl_index_t idx; uu_avl_walk_t *walk; + char *protocol = NULL; + + if (op == OP_SHARE && argc > 0) { + if (strcmp(argv[0], "nfs") != 0 && + strcmp(argv[0], "smb") != 0) { + (void) fprintf(stderr, gettext("share type " + "must be 'nfs' or 'smb'\n")); + usage(B_FALSE); + } + protocol = argv[0]; + argc--; + argv++; + } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); @@ -6567,8 +6593,8 @@ switch (op) { case OP_SHARE: - if (zfs_unshareall_bypath(node->un_zhp, - node->un_mountp) != 0) + if (zfs_unshareall_bytype(node->un_zhp, + node->un_mountp, protocol) != 0) ret = 1; break; @@ -6769,7 +6795,7 @@ if (copy == NULL) usage(B_FALSE); - if ((atp = strchr(copy, '@'))) + if ((atp = strchr(copy, '@')) != NULL) *atp = '\0'; if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) { diff -Nru zfs-linux-0.7.0-rc2/cmd/zinject/translate.c zfs-linux-0.7.0-rc3/cmd/zinject/translate.c --- zfs-linux-0.7.0-rc2/cmd/zinject/translate.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zinject/translate.c 2017-01-20 18:18:28.000000000 +0000 @@ -436,7 +436,7 @@ { /* * A raw bookmark of the form objset:object:level:blkid, where each - * number is a hexidecimal value. + * number is a hexadecimal value. 
+ * number is a hexadecimal value. 
*/ if (sscanf(str, "%llx:%llx:%x:%llx", (u_longlong_t *)&record->zi_objset, (u_longlong_t *)&record->zi_object, &record->zi_level, diff -Nru zfs-linux-0.7.0-rc2/cmd/zinject/zinject.c zfs-linux-0.7.0-rc3/cmd/zinject/zinject.c --- zfs-linux-0.7.0-rc2/cmd/zinject/zinject.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zinject/zinject.c 2017-01-20 18:18:28.000000000 +0000 @@ -282,7 +282,7 @@ "\n" "\t\tInject an error into pool 'pool' with the numeric bookmark\n" "\t\tspecified by the remaining tuple. Each number is in\n" - "\t\thexidecimal, and only one block can be specified.\n" + "\t\thexadecimal, and only one block can be specified.\n" "\n" "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n" "\t [-a] [-m] [-u] [-f freq] \n" @@ -1091,7 +1091,7 @@ record.zi_cmd = ZINJECT_DATA_FAULT; if (translate_record(type, argv[0], range, level, &record, pool, dataset) != 0) { - libzfs_fini(g_zfs); + libzfs_fini(g_zfs); return (1); } if (!error) @@ -1105,7 +1105,7 @@ */ if (dataset[0] != '\0' && domount) { if ((zhp = zfs_open(g_zfs, dataset, - ZFS_TYPE_DATASET)) == NULL) { + ZFS_TYPE_DATASET)) == NULL) { libzfs_fini(g_zfs); return (1); } diff -Nru zfs-linux-0.7.0-rc2/cmd/zpios/zpios.h zfs-linux-0.7.0-rc3/cmd/zpios/zpios.h --- zfs-linux-0.7.0-rc2/cmd/zpios/zpios.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zpios/zpios.h 2017-01-20 18:18:28.000000000 +0000 @@ -1,7 +1,7 @@ /* * ZPIOS is a heavily modified version of the original PIOS test code. * It is designed to have the test code running in the Linux kernel - * against ZFS while still being flexibly controled from user space. + * against ZFS while still being flexibly controlled from user space. * * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). diff -Nru zfs-linux-0.7.0-rc2/cmd/zpios/zpios_main.c zfs-linux-0.7.0-rc3/cmd/zpios/zpios_main.c --- zfs-linux-0.7.0-rc2/cmd/zpios/zpios_main.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zpios/zpios_main.c 2017-01-20 18:18:28.000000000 +0000 @@ -99,44 +99,44 @@ { fprintf(stderr, "Usage: zpios\n"); fprintf(stderr, - " --threadcount -t =values\n" - " --threadcount_low -l =value\n" - " --threadcount_high -h =value\n" - " --threadcount_incr -e =value\n" - " --regioncount -n =values\n" - " --regioncount_low -i =value\n" - " --regioncount_high -j =value\n" - " --regioncount_incr -k =value\n" - " --offset -o =values\n" - " --offset_low -m =value\n" - " --offset_high -q =value\n" - " --offset_incr -r =value\n" - " --chunksize -c =values\n" - " --chunksize_low -a =value\n" - " --chunksize_high -b =value\n" - " --chunksize_incr -g =value\n" - " --regionsize -s =values\n" - " --regionsize_low -A =value\n" - " --regionsize_high -B =value\n" - " --regionsize_incr -C =value\n" - " --blocksize -S =values\n" - " --load -L =dmuio|ssf|fpp\n" - " --pool -p =pool name\n" - " --name -M =test name\n" - " --cleanup -x\n" - " --prerun -P =pre-command\n" - " --postrun -R =post-command\n" - " --log -G =log directory\n" - " --regionnoise -I =shift\n" - " --chunknoise -N =bytes\n" - " --threaddelay -T =jiffies\n" - " --verify -V\n" - " --zerocopy -z\n" - " --nowait -O\n" - " --noprefetch -f\n" - " --human-readable -H\n" - " --verbose -v =increase verbosity\n" - " --help -? 
=this help\n\n"); + " --threadcount -t =values\n" + " --threadcount_low -l =value\n" + " --threadcount_high -h =value\n" + " --threadcount_incr -e =value\n" + " --regioncount -n =values\n" + " --regioncount_low -i =value\n" + " --regioncount_high -j =value\n" + " --regioncount_incr -k =value\n" + " --offset -o =values\n" + " --offset_low -m =value\n" + " --offset_high -q =value\n" + " --offset_incr -r =value\n" + " --chunksize -c =values\n" + " --chunksize_low -a =value\n" + " --chunksize_high -b =value\n" + " --chunksize_incr -g =value\n" + " --regionsize -s =values\n" + " --regionsize_low -A =value\n" + " --regionsize_high -B =value\n" + " --regionsize_incr -C =value\n" + " --blocksize -S =values\n" + " --load -L =dmuio|ssf|fpp\n" + " --pool -p =pool name\n" + " --name -M =test name\n" + " --cleanup -x\n" + " --prerun -P =pre-command\n" + " --postrun -R =post-command\n" + " --log -G =log directory\n" + " --regionnoise -I =shift\n" + " --chunknoise -N =bytes\n" + " --threaddelay -T =jiffies\n" + " --verify -V\n" + " --zerocopy -z\n" + " --nowait -O\n" + " --noprefetch -f\n" + " --human-readable -H\n" + " --verbose -v =increase verbosity\n" + " --help -? =this help\n\n"); return (0); } @@ -517,9 +517,8 @@ dev_clear(); - cmd_size = - sizeof (zpios_cmd_t) - + ((T + N + 1) * sizeof (zpios_stats_t)); + cmd_size = sizeof (zpios_cmd_t) + + ((T + N + 1) * sizeof (zpios_stats_t)); cmd = (zpios_cmd_t *)malloc(cmd_size); if (cmd == NULL) return (ENOMEM); diff -Nru zfs-linux-0.7.0-rc2/cmd/zpios/zpios_util.c zfs-linux-0.7.0-rc3/cmd/zpios/zpios_util.c --- zfs-linux-0.7.0-rc2/cmd/zpios/zpios_util.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zpios/zpios_util.c 2017-01-20 18:18:28.000000000 +0000 @@ -1,7 +1,7 @@ /* * ZPIOS is a heavily modified version of the original PIOS test code. * It is designed to have the test code running in the Linux kernel - * against ZFS while still being flexibly controled from user space. + * against ZFS while still being flexibly controlled from user space. * * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -143,7 +143,7 @@ return (rc); } - rc = regexec(&re, string, (size_t) 0, NULL, 0); + rc = regexec(&re, string, (size_t)0, NULL, 0); regfree(&re); return (rc); @@ -224,7 +224,7 @@ if ((rc = regex_match(optarg, pattern))) { fprintf(stderr, "Error: Wrong pattern in %s, '%s'\n", - arg, optarg); + arg, optarg); return (rc); } diff -Nru zfs-linux-0.7.0-rc2/cmd/zpool/zpool_iter.c zfs-linux-0.7.0-rc3/cmd/zpool/zpool_iter.c --- zfs-linux-0.7.0-rc2/cmd/zpool/zpool_iter.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zpool/zpool_iter.c 2017-01-20 18:18:28.000000000 +0000 @@ -23,7 +23,9 @@ * Use is subject to license terms. */ - +/* + * Copyright 2016 Igor Kozhukhov . 
+ */ #include #include @@ -33,6 +35,7 @@ #include #include +#include #include "zpool_util.h" @@ -131,7 +134,8 @@ for (i = 0; i < argc; i++) { zpool_handle_t *zhp; - if ((zhp = zpool_open_canfail(g_zfs, argv[i]))) { + if ((zhp = zpool_open_canfail(g_zfs, argv[i])) != + NULL) { if (add_pool(zhp, zlp) != 0) *err = B_TRUE; } else { @@ -316,3 +320,187 @@ } return (for_each_vdev_cb(zhp, nvroot, func, data)); } + +/* Thread function run for each vdev */ +static void +vdev_run_cmd_thread(void *cb_cmd_data) +{ + vdev_cmd_data_t *data = cb_cmd_data; + char *pos = NULL; + FILE *fp; + size_t len = 0; + char cmd[_POSIX_ARG_MAX]; + + /* Set our VDEV_PATH and VDEV_UPATH env vars and run command */ + if (snprintf(cmd, sizeof (cmd), "VDEV_PATH=%s && VDEV_UPATH=%s && %s", + data->path, data->upath ? data->upath : "\"\"", data->cmd) >= + sizeof (cmd)) { + /* Our string was truncated */ + return; + } + + fp = popen(cmd, "r"); + if (fp == NULL) + return; + + data->line = NULL; + + /* Save the first line of output from the command */ + if (getline(&data->line, &len, fp) != -1) { + /* Success. Remove newline from the end, if necessary. */ + if ((pos = strchr(data->line, '\n')) != NULL) + *pos = '\0'; + } else { + data->line = NULL; + } + pclose(fp); +} + +/* For each vdev in the pool run a command */ +static int +for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl) +{ + vdev_cmd_data_list_t *vcdl = cb_vcdl; + vdev_cmd_data_t *data; + char *path = NULL; + char *vname = NULL; + int i, match = 0; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return (1); + + /* Spares show more than once if they're in use, so skip if exists */ + for (i = 0; i < vcdl->count; i++) { + if ((strcmp(vcdl->data[i].path, path) == 0) && + (strcmp(vcdl->data[i].pool, zpool_get_name(zhp)) == 0)) { + /* vdev already exists, skip it */ + return (0); + } + } + + /* Check for whitelisted vdevs here, if any */ + for (i = 0; i < vcdl->vdev_names_count; i++) { + vname = zpool_vdev_name(g_zfs, zhp, nv, vcdl->cb_name_flags); + if (strcmp(vcdl->vdev_names[i], vname) == 0) { + free(vname); + match = 1; + break; /* match */ + } + free(vname); + } + + /* If we whitelisted vdevs, and this isn't one of them, then bail out */ + if (!match && vcdl->vdev_names_count) + return (0); + + /* + * Resize our array and add in the new element. + */ + if (!(vcdl->data = realloc(vcdl->data, + sizeof (*vcdl->data) * (vcdl->count + 1)))) + return (ENOMEM); /* couldn't realloc */ + + data = &vcdl->data[vcdl->count]; + + data->pool = strdup(zpool_get_name(zhp)); + data->path = strdup(path); + data->upath = zfs_get_underlying_path(path); + data->cmd = vcdl->cmd; + + vcdl->count++; + + return (0); +} + +/* Get the names and count of the vdevs */ +static int +all_pools_for_each_vdev_gather_cb(zpool_handle_t *zhp, void *cb_vcdl) +{ + return (for_each_vdev(zhp, for_each_vdev_run_cb, cb_vcdl)); +} + +/* + * Now that vcdl is populated with our complete list of vdevs, spawn + * off the commands. + */ +static void +all_pools_for_each_vdev_run_vcdl(vdev_cmd_data_list_t *vcdl) +{ + taskq_t *t; + int i; + /* 5 * boot_ncpus selfishly chosen since it works best on LLNL's HW */ + int max_threads = 5 * boot_ncpus; + + /* + * Under Linux we use a taskq to parallelize running a command + * on each vdev. It is therefore necessary to initialize this + * functionality for the duration of the threads. 
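 * (The helpers below follow a dispatch-then-drain pattern: each vdev's
 * command is queued with taskq_dispatch(), and taskq_wait() then blocks
 * until every queued command has completed before the taskq is torn
 * down.)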
+ */ + thread_init(); + + t = taskq_create("z_pool_cmd", max_threads, defclsyspri, max_threads, + INT_MAX, 0); + if (t == NULL) + return; + + /* Spawn off the command for each vdev */ + for (i = 0; i < vcdl->count; i++) { + (void) taskq_dispatch(t, vdev_run_cmd_thread, + (void *) &vcdl->data[i], TQ_SLEEP); + } + + /* Wait for threads to finish */ + taskq_wait(t); + taskq_destroy(t); + thread_fini(); +} + +/* + * Run command 'cmd' on all vdevs in all pools in argv. Saves the first line of + * output from the command in vcdk->data[].line for all vdevs. If you want + * to run the command on only certain vdevs, fill in g_zfs, vdev_names, + * vdev_names_count, and cb_name_flags. Otherwise leave them as zero. + * + * Returns a vdev_cmd_data_list_t that must be freed with + * free_vdev_cmd_data_list(); + */ +vdev_cmd_data_list_t * +all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, + libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count, + int cb_name_flags) +{ + vdev_cmd_data_list_t *vcdl; + vcdl = safe_malloc(sizeof (vdev_cmd_data_list_t)); + vcdl->cmd = cmd; + + vcdl->vdev_names = vdev_names; + vcdl->vdev_names_count = vdev_names_count; + vcdl->cb_name_flags = cb_name_flags; + vcdl->g_zfs = g_zfs; + + /* Gather our list of all vdevs in all pools */ + for_each_pool(argc, argv, B_TRUE, NULL, + all_pools_for_each_vdev_gather_cb, vcdl); + + /* Run command on all vdevs in all pools */ + all_pools_for_each_vdev_run_vcdl(vcdl); + + return (vcdl); +} + +/* + * Free the vdev_cmd_data_list_t created by all_pools_for_each_vdev_run() + */ +void +free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl) +{ + int i; + for (i = 0; i < vcdl->count; i++) { + free(vcdl->data[i].path); + free(vcdl->data[i].pool); + free(vcdl->data[i].upath); + free(vcdl->data[i].line); + } + free(vcdl->data); + free(vcdl); +} diff -Nru zfs-linux-0.7.0-rc2/cmd/zpool/zpool_main.c zfs-linux-0.7.0-rc3/cmd/zpool/zpool_main.c --- zfs-linux-0.7.0-rc2/cmd/zpool/zpool_main.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zpool/zpool_main.c 2017-01-20 18:18:28.000000000 +0000 @@ -26,6 +26,7 @@ * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. + * Copyright 2016 Igor Kozhukhov . */ #include @@ -279,7 +280,8 @@ static uint_t timestamp_fmt = NODATE; static const char * -get_usage(zpool_help_t idx) { +get_usage(zpool_help_t idx) +{ switch (idx) { case HELP_ADD: return (gettext("\tadd [-fgLnP] [-o property=value] " @@ -312,7 +314,7 @@ "[-R root] [-F [-n]]\n" "\t [newpool]\n")); case HELP_IOSTAT: - return (gettext("\tiostat [-T d | u] [-ghHLpPvy] " + return (gettext("\tiostat [-c CMD] [-T d | u] [-ghHLpPvy] " "[[-lq]|[-r|-w]]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]] " "[interval [count]]\n")); @@ -335,8 +337,8 @@ case HELP_SCRUB: return (gettext("\tscrub [-s] ...\n")); case HELP_STATUS: - return (gettext("\tstatus [-gLPvxD] [-T d|u] [pool] ... " - "[interval [count]]\n")); + return (gettext("\tstatus [-c CMD] [-gLPvxD] [-T d|u] [pool]" + " ... 
[interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" @@ -1422,7 +1424,7 @@ uint_t c, children; int ret; - name = zpool_vdev_name(g_zfs, zhp, nv, name_flags | VDEV_NAME_TYPE_ID); + name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); if (strlen(name) + depth > max) max = strlen(name) + depth; @@ -1510,8 +1512,23 @@ boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_status; + vdev_cmd_data_list_t *vcdl; } status_cbdata_t; +/* Print output line for specific vdev in a specific pool */ +static void +zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) +{ + int i; + for (i = 0; i < vcdl->count; i++) { + if ((strcmp(vcdl->data[i].path, path) == 0) && + (strcmp(vcdl->data[i].pool, pool) == 0)) { + printf("%s", vcdl->data[i].line); + break; + } + } +} + /* * Print out configuration state as requested by status_callback. */ @@ -1528,6 +1545,7 @@ uint64_t notpresent; spare_cbdata_t spare_cb; char *state; + char *path = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) @@ -1560,7 +1578,6 @@ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { - char *path; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" was %s", path); } else if (vs->vs_aux != 0) { @@ -1641,6 +1658,13 @@ "resilvering" : "repairing"); } + if (cb->vcdl != NULL) { + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { + printf(" "); + zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); + } + } + (void) printf("\n"); for (c = 0; c < children; c++) { @@ -1887,7 +1911,7 @@ case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("status: The pool uses the following " - "feature(s) not supported on this sytem:\n")); + "feature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); break; @@ -2044,7 +2068,7 @@ (void) printf(gettext(" config:\n\n")); - cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, 0); + cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; @@ -2586,6 +2610,7 @@ boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; + vdev_cmd_data_list_t *vcdl; } iostat_cbdata_t; /* iostat labels */ @@ -2729,7 +2754,7 @@ rw_column_width = (column_width * columns) + (2 * (columns - 1)); - text_start = (int) ((rw_column_width)/columns - + text_start = (int)((rw_column_width)/columns - slen/columns); printf(" "); /* Two spaces between columns */ @@ -3067,7 +3092,7 @@ } if (cb->cb_scripted) - printf("%llu", (u_longlong_t) val); + printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); @@ -3393,6 +3418,18 @@ print_iostat_histos(cb, oldnv, newnv, scale, name); } + if (cb->vcdl != NULL) { + char *path; + if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, + &path) == 0) { + if (!(cb->cb_flags & IOS_ANYHISTO_M)) + printf(" "); + zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); + if (cb->cb_flags & IOS_ANYHISTO_M) + printf("\n"); + } + } + if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); @@ -3532,7 +3569,7 @@ &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, - cb, 0); + cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { print_iostat_separator(cb); @@ -3924,10 +3961,11 @@ /* - * zpool iostat [-ghHLpPvy] [[-lq]|[-r|-w]] [-n name] [-T d|u] + * zpool iostat [-c CMD] [-ghHLpPvy] [[-lq]|[-r|-w]] [-n name] [-T d|u] * [[ pool ...]|[pool vdev ...]|[vdev ...]] 
* [interval [count]] * + * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. @@ -3965,6 +4003,7 @@ boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; iostat_cbdata_t cb = { 0 }; + char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', @@ -3973,8 +4012,11 @@ uint64_t unsupported_flags; /* check options */ - while ((c = getopt(argc, argv, "gLPT:vyhplqrwH")) != -1) { + while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwH")) != -1) { switch (c) { + case 'c': + cmd = optarg; + break; case 'g': guid = B_TRUE; break; @@ -4015,8 +4057,13 @@ usage(B_FALSE); break; case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); + if (optopt == 'c') { + fprintf(stderr, + gettext("Missing CMD for -c\n")); + } else { + fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } usage(B_FALSE); } } @@ -4161,13 +4208,12 @@ fprintf(stderr, " -%c", flag_to_arg[idx]); } - fprintf(stderr, ". Try running a newer module.\n"), + fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } - for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); @@ -4217,8 +4263,16 @@ continue; } + if (cmd != NULL && cb.cb_verbose) + cb.vcdl = all_pools_for_each_vdev_run(argc, + argv, cmd, g_zfs, cb.cb_vdev_names, + cb.cb_vdev_names_count, cb.cb_name_flags); + pool_list_iter(list, B_FALSE, print_iostat, &cb); + if (cb.vcdl != NULL) + free_vdev_cmd_data_list(cb.vcdl); + /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), @@ -5949,7 +6003,7 @@ print_scan_status(ps); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, - cbp->cb_name_flags); + cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; @@ -6016,8 +6070,9 @@ } /* - * zpool status [-gLPvx] [-T d|u] [pool] ... [interval [count]] + * zpool status [-c CMD] [-gLPvx] [-T d|u] [pool] ... [interval [count]] * + * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. 
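The -c lifecycle wired up above is the same for both commands: gather the vdev list across the requested pools, fan the command out over the taskq, read each captured line back out of vcdl->data[], then free the structure. A minimal consumer sketch against the zpool_util.h declarations that appear later in this diff (show_vdev_cmd_output is a hypothetical helper and the output layout is illustrative):

#include <stdio.h>
#include "zpool_util.h"	/* vdev_cmd_data_list_t and the run/free protos */

/* Hypothetical helper: run 'cmd' once per vdev of the pools in argv. */
static void
show_vdev_cmd_output(int argc, char **argv, char *cmd)
{
	vdev_cmd_data_list_t *vcdl;
	int i;

	/* NULL/0 trailing arguments: no vdev whitelist, default name flags */
	vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0);

	for (i = 0; i < vcdl->count; i++) {
		/* line holds the first line of output, or NULL on failure */
		printf("%s %s: %s\n", vcdl->data[i].pool, vcdl->data[i].path,
		    vcdl->data[i].line != NULL ? vcdl->data[i].line : "-");
	}

	free_vdev_cmd_data_list(vcdl);
}

zpool status -c passes exactly these NULL/0 whitelist arguments, while zpool iostat -c forwards its parsed cb_vdev_names so the command only runs on vdevs named on the command line. One optstring nuance is also worth noting: "c:..." has no leading ':', so a bare -c makes getopt() return '?' with optopt set to 'c', which is what lets both commands emit the specific "Missing CMD for -c" diagnostic rather than the generic invalid-option message.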
@@ -6036,10 +6091,14 @@ float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; + char *cmd = NULL; /* check options */ - while ((c = getopt(argc, argv, "gLPvxDT:")) != -1) { + while ((c = getopt(argc, argv, "c:gLPvxDT:")) != -1) { switch (c) { + case 'c': + cmd = optarg; + break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; @@ -6062,8 +6121,13 @@ get_timestamp_arg(*optarg); break; case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); + if (optopt == 'c') { + fprintf(stderr, + gettext("Missing CMD for -c\n")); + } else { + fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } usage(B_FALSE); } } @@ -6083,9 +6147,16 @@ if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); + if (cmd != NULL) + cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, + NULL, NULL, 0, 0); + ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb); + if (cb.vcdl != NULL) + free_vdev_cmd_data_list(cb.vcdl); + if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) @@ -6116,7 +6187,7 @@ static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { - int zfs_version = (int) zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { @@ -6155,7 +6226,7 @@ if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to %d unsupported filesystems (max v%d).\n"), - unsupp_fs, (int) ZPL_VERSION); + unsupp_fs, (int)ZPL_VERSION); return (1); } @@ -6166,12 +6237,12 @@ if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), - zpool_get_name(zhp), (u_longlong_t) oldversion); + zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), - zpool_get_name(zhp), (u_longlong_t) oldversion, - (u_longlong_t) version); + zpool_get_name(zhp), (u_longlong_t)oldversion, + (u_longlong_t)version); } return (0); @@ -6378,14 +6449,14 @@ if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), - zpool_get_name(zhp), (u_longlong_t) cur_version); + zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), - (u_longlong_t) cbp->cb_version); + (u_longlong_t)cbp->cb_version); return (0); } @@ -6572,7 +6643,7 @@ } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), - (u_longlong_t) cb.cb_version); + (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { @@ -6663,14 +6734,14 @@ } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], - (longlong_t) fnvlist_lookup_uint64( + (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, - (longlong_t) fnvlist_lookup_uint64( + (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { @@ -7182,7 +7253,7 @@ * by a single tab. 
* -o List of columns to display. Defaults to * "name,property,value,source". - * -p Diplay values in parsable (exact) format. + * -p Display values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. @@ -7394,7 +7465,7 @@ int main(int argc, char **argv) { - int ret; + int ret = 0; int i = 0; char *cmdname; diff -Nru zfs-linux-0.7.0-rc2/cmd/zpool/zpool_util.h zfs-linux-0.7.0-rc3/cmd/zpool/zpool_util.h --- zfs-linux-0.7.0-rc2/cmd/zpool/zpool_util.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zpool/zpool_util.h 2017-01-20 18:18:28.000000000 +0000 @@ -72,6 +72,37 @@ libzfs_handle_t *g_zfs; + +typedef struct vdev_cmd_data +{ + char *line; /* cmd output */ + char *path; /* vdev path */ + char *upath; /* vdev underlying path */ + char *pool; /* Pool name */ + char *cmd; /* backpointer to cmd */ +} vdev_cmd_data_t; + +typedef struct vdev_cmd_data_list +{ + char *cmd; /* Command to run */ + unsigned int count; /* Number of vdev_cmd_data items (vdevs) */ + + /* vars to whitelist only certain vdevs, if requested */ + libzfs_handle_t *g_zfs; + char **vdev_names; + int vdev_names_count; + int cb_name_flags; + + vdev_cmd_data_t *data; /* Array of vdevs */ + +} vdev_cmd_data_list_t; + +vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv, + char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count, + int cb_name_flags); + +void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl); + #ifdef __cplusplus } #endif diff -Nru zfs-linux-0.7.0-rc2/cmd/zpool/zpool_vdev.c zfs-linux-0.7.0-rc3/cmd/zpool/zpool_vdev.c --- zfs-linux-0.7.0-rc2/cmd/zpool/zpool_vdev.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zpool/zpool_vdev.c 2017-01-20 18:18:28.000000000 +0000 @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2016 Intel Corporation. + * Copyright 2016 Igor Kozhukhov . 
*/ /* @@ -776,7 +777,9 @@ uint_t c, children; nvlist_t *nv; char *type; - replication_level_t lastrep = { 0 }, rep, *ret; + replication_level_t lastrep = {0}; + replication_level_t rep; + replication_level_t *ret; boolean_t dontreport; ret = safe_malloc(sizeof (replication_level_t)); diff -Nru zfs-linux-0.7.0-rc2/cmd/zstreamdump/zstreamdump.c zfs-linux-0.7.0-rc3/cmd/zstreamdump/zstreamdump.c --- zfs-linux-0.7.0-rc2/cmd/zstreamdump/zstreamdump.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/zstreamdump/zstreamdump.c 2017-01-20 18:18:28.000000000 +0000 @@ -265,6 +265,7 @@ exit(1); } + fletcher_4_init(); send_stream = stdin; while (read_hdr(drr, &zc)) { @@ -618,6 +619,7 @@ pcksum = zc; } free(buf); + fletcher_4_fini(); /* Print final summary */ diff -Nru zfs-linux-0.7.0-rc2/cmd/ztest/ztest.c zfs-linux-0.7.0-rc3/cmd/ztest/ztest.c --- zfs-linux-0.7.0-rc2/cmd/ztest/ztest.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/cmd/ztest/ztest.c 2017-01-20 18:18:28.000000000 +0000 @@ -114,6 +114,7 @@ #include #include #include +#include #include #include #include @@ -177,7 +178,7 @@ .zo_mirrors = 2, .zo_raidz = 4, .zo_raidz_parity = 1, - .zo_vdev_size = SPA_MINDEVSIZE * 2, + .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ .zo_datasets = 7, .zo_threads = 23, .zo_passtime = 60, /* 60 seconds */ @@ -193,6 +194,7 @@ extern uint64_t metaslab_df_alloc_threshold; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; +extern int zfs_abd_scatter_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -1129,9 +1131,8 @@ err = zfs_prop_index_to_string(prop, curval, &valname); if (err) - (void) printf("%s %s = %llu at '%s'\n", - osname, propname, (unsigned long long)curval, - setpoint); + (void) printf("%s %s = %llu at '%s'\n", osname, + propname, (unsigned long long)curval, setpoint); else (void) printf("%s %s = %s at '%s'\n", osname, propname, valname, setpoint); @@ -4477,7 +4478,7 @@ ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), - !ztest_random(2)) != 0) + !ztest_random(2)) != 0) goto out; object = od->od_object; @@ -4614,7 +4615,7 @@ ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), - !ztest_random(2)) != 0) + !ztest_random(2)) != 0) goto out; object = od->od_object; @@ -5444,7 +5445,7 @@ enum zio_checksum checksum = spa_dedup_checksum(spa); dmu_buf_t *db; dmu_tx_t *tx; - void *buf; + abd_t *abd; blkptr_t blk; int copies = 2 * ZIO_DEDUPDITTO_MIN; int i; @@ -5525,14 +5526,14 @@ * Damage the block. Dedup-ditto will save us when we read it later. */ psize = BP_GET_PSIZE(&blk); - buf = zio_buf_alloc(psize); - ztest_pattern_set(buf, psize, ~pattern); + abd = abd_alloc_linear(psize, B_TRUE); + ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, - buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - zio_buf_free(buf, psize); + abd_free(abd); (void) rw_unlock(&ztest_name_lock); umem_free(od, sizeof (ztest_od_t)); @@ -5965,6 +5966,12 @@ */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); + + /* + * Periodically change the zfs_abd_scatter_enabled setting. 
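	 * As with the zfs_compressed_arc_enabled toggle above, a 1-in-10
	 * chance per pass followed by a coin flip for the new value
	 * exercises both the linear and scatter ABD allocation paths
	 * under load.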
+ */ + if (ztest_random(10) == 0) + zfs_abd_scatter_enabled = ztest_random(2); } thread_exit(); diff -Nru zfs-linux-0.7.0-rc2/config/kernel-acl.m4 zfs-linux-0.7.0-rc3/config/kernel-acl.m4 --- zfs-linux-0.7.0-rc2/config/kernel-acl.m4 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/kernel-acl.m4 2017-01-20 18:18:28.000000000 +0000 @@ -41,6 +41,35 @@ ]) dnl # +dnl # 3.14 API change, +dnl # set_cached_acl() and forget_cached_acl() changed from inline to +dnl # EXPORT_SYMBOL. In the former case, they may not be usable because of +dnl # posix_acl_release. In the latter case, we can always use them. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE], [ + AC_MSG_CHECKING([whether set_cached_acl() is usable]) + ZFS_LINUX_TRY_COMPILE([ + #include + #include + #include + #include + + MODULE_LICENSE("$ZFS_META_LICENSE"); + ],[ + struct inode *ip = NULL; + struct posix_acl *acl = posix_acl_alloc(1, 0); + set_cached_acl(ip, ACL_TYPE_ACCESS, acl); + forget_cached_acl(ip, ACL_TYPE_ACCESS); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_CACHED_ACL_USABLE, 1, + [posix_acl_release() is usable]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # dnl # 3.1 API change, dnl # posix_acl_chmod_masq() is not exported anymore and posix_acl_chmod() dnl # was introduced to replace it. @@ -251,12 +280,37 @@ ]) dnl # +dnl # 3.14 API change, +dnl # Check if inode_operations contains the function set_acl +dnl # +AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_SET_ACL], [ + AC_MSG_CHECKING([whether iops->set_acl() exists]) + ZFS_LINUX_TRY_COMPILE([ + #include + + int set_acl_fn(struct inode *inode, struct posix_acl *acl, int type) + { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .set_acl = set_acl_fn, + }; + ],[ + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_ACL, 1, [iops->set_acl() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # dnl # 4.7 API change, dnl # The kernel get_acl will now check cache before calling i_op->get_acl and dnl # do set_cached_acl after that, so i_op->get_acl don't need to do that dnl # anymore. 
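dnl # (Hence the probe keys on uncached_acl_sentinel(), which came in with
dnl # that change; if it compiles, the kernel handles ACL caching itself.)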
dnl # -AC_DEFUN([ZFS_AC_KERNE_GET_ACL_HANDLE_CACHE], [ +AC_DEFUN([ZFS_AC_KERNEL_GET_ACL_HANDLE_CACHE], [ AC_MSG_CHECKING([whether uncached_acl_sentinel() exists]) ZFS_LINUX_TRY_COMPILE([ #include diff -Nru zfs-linux-0.7.0-rc2/config/kernel-aio-fsync.m4 zfs-linux-0.7.0-rc3/config/kernel-aio-fsync.m4 --- zfs-linux-0.7.0-rc2/config/kernel-aio-fsync.m4 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/kernel-aio-fsync.m4 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,21 @@ +dnl # +dnl # Linux 4.9-rc5+ ABI, removal of the .aio_fsync field +dnl # +AC_DEFUN([ZFS_AC_KERNEL_AIO_FSYNC], [ + AC_MSG_CHECKING([whether fops->aio_fsync() exists]) + ZFS_LINUX_TRY_COMPILE([ + #include + + static const struct file_operations + fops __attribute__ ((unused)) = { + .aio_fsync = NULL, + }; + ],[ + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILE_AIO_FSYNC, 1, [fops->aio_fsync() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + diff -Nru zfs-linux-0.7.0-rc2/config/kernel-bio-op.m4 zfs-linux-0.7.0-rc3/config/kernel-bio-op.m4 --- zfs-linux-0.7.0-rc2/config/kernel-bio-op.m4 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/kernel-bio-op.m4 2017-01-20 18:18:28.000000000 +0000 @@ -10,7 +10,7 @@ ZFS_LINUX_TRY_COMPILE([ #include ],[ - enum req_op op __attribute__ ((unused)) = REQ_OP_DISCARD; + int op __attribute__ ((unused)) = REQ_OP_DISCARD; ],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_REQ_OP_DISCARD, 1, @@ -25,10 +25,10 @@ ZFS_LINUX_TRY_COMPILE([ #include ],[ - enum req_op op __attribute__ ((unused)) = REQ_OP_SECURE_ERASE; + int op __attribute__ ((unused)) = REQ_OP_SECURE_ERASE; ],[ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_REQ_OP_SECURE_DISCARD, 1, + AC_DEFINE(HAVE_REQ_OP_SECURE_ERASE, 1, [REQ_OP_SECURE_ERASE is defined]) ],[ AC_MSG_RESULT(no) @@ -41,7 +41,7 @@ ZFS_LINUX_TRY_COMPILE([ #include ],[ - enum req_op op __attribute__ ((unused)) = REQ_OP_FLUSH; + int op __attribute__ ((unused)) = REQ_OP_FLUSH; ],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_REQ_OP_FLUSH, 1, @@ -64,4 +64,21 @@ ],[ AC_MSG_RESULT(no) ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_HAVE_BIO_SET_OP_ATTRS], [ + AC_MSG_CHECKING([whether bio_set_op_attrs is available]) + ZFS_LINUX_TRY_COMPILE([ + #include + ],[ + struct bio *bio __attribute__ ((unused)) = NULL; + + bio_set_op_attrs(bio, 0, 0); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_SET_OP_ATTRS, 1, + [bio_set_op_attrs is available]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff -Nru zfs-linux-0.7.0-rc2/config/kernel-generic_readlink.m4 zfs-linux-0.7.0-rc3/config/kernel-generic_readlink.m4 --- zfs-linux-0.7.0-rc2/config/kernel-generic_readlink.m4 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/kernel-generic_readlink.m4 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,22 @@ +dnl # +dnl # 4.10 API +dnl # +dnl # NULL inode_operations.readlink implies generic_readlink(), which +dnl # has been made static. 
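+dnl # (Consumers would typically guard an explicit .readlink =
+dnl # generic_readlink assignment on HAVE_GENERIC_READLINK and otherwise
+dnl # leave the field NULL to get the generic behavior.)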
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL], [ + AC_MSG_CHECKING([whether generic_readlink is global]) + ZFS_LINUX_TRY_COMPILE([ + #include + ],[ + int i __attribute__ ((unused)); + + i = generic_readlink(NULL, NULL, 0); + ],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_GENERIC_READLINK, 1, + [generic_readlink is global]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff -Nru zfs-linux-0.7.0-rc2/config/kernel-inode-set-flags.m4 zfs-linux-0.7.0-rc3/config/kernel-inode-set-flags.m4 --- zfs-linux-0.7.0-rc2/config/kernel-inode-set-flags.m4 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/kernel-inode-set-flags.m4 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,18 @@ +dnl # +dnl # 3.15 API change +dnl # inode_set_flags introduced to set i_flags +dnl # +AC_DEFUN([ZFS_AC_KERNEL_INODE_SET_FLAGS], [ + AC_MSG_CHECKING([whether inode_set_flags() exists]) + ZFS_LINUX_TRY_COMPILE([ + #include + ],[ + struct inode inode; + inode_set_flags(&inode, S_IMMUTABLE, S_IMMUTABLE); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_SET_FLAGS, 1, [inode_set_flags() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff -Nru zfs-linux-0.7.0-rc2/config/kernel.m4 zfs-linux-0.7.0-rc3/config/kernel.m4 --- zfs-linux-0.7.0-rc2/config/kernel.m4 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/kernel.m4 2017-01-20 18:18:28.000000000 +0000 @@ -37,6 +37,8 @@ ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BLK_PLUG ZFS_AC_KERNEL_GET_DISK_RO ZFS_AC_KERNEL_GET_GENDISK + ZFS_AC_KERNEL_HAVE_BIO_SET_OP_ATTRS + ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_DISCARD_GRANULARITY ZFS_AC_KERNEL_CONST_XATTR_HANDLER ZFS_AC_KERNEL_XATTR_HANDLER_NAME @@ -46,6 +48,7 @@ ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS ZFS_AC_KERNEL_POSIX_ACL_RELEASE + ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE ZFS_AC_KERNEL_POSIX_ACL_CHMOD ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T ZFS_AC_KERNEL_POSIX_ACL_VALID_WITH_NS @@ -54,7 +57,9 @@ ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL_WITH_FLAGS ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL - ZFS_AC_KERNE_GET_ACL_HANDLE_CACHE + ZFS_AC_KERNEL_INODE_OPERATIONS_SET_ACL + ZFS_AC_KERNEL_INODE_SET_FLAGS + ZFS_AC_KERNEL_GET_ACL_HANDLE_CACHE ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_FILE_INODE ZFS_AC_KERNEL_FILE_DENTRY @@ -64,11 +69,13 @@ ZFS_AC_KERNEL_NR_CACHED_OBJECTS ZFS_AC_KERNEL_FREE_CACHED_OBJECTS ZFS_AC_KERNEL_FALLOCATE + ZFS_AC_KERNEL_AIO_FSYNC ZFS_AC_KERNEL_MKDIR_UMODE_T ZFS_AC_KERNEL_LOOKUP_NAMEIDATA ZFS_AC_KERNEL_CREATE_NAMEIDATA ZFS_AC_KERNEL_GET_LINK ZFS_AC_KERNEL_PUT_LINK + ZFS_AC_KERNEL_TMPFILE ZFS_AC_KERNEL_TRUNCATE_RANGE ZFS_AC_KERNEL_AUTOMOUNT ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE diff -Nru zfs-linux-0.7.0-rc2/config/kernel-mkdir-umode-t.m4 zfs-linux-0.7.0-rc3/config/kernel-mkdir-umode-t.m4 --- zfs-linux-0.7.0-rc2/config/kernel-mkdir-umode-t.m4 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/kernel-mkdir-umode-t.m4 2017-01-20 18:18:28.000000000 +0000 @@ -4,7 +4,7 @@ dnl # umode_t type rather than an int. The expectation is that any backport dnl # would also change all three prototypes. However, if it turns out that dnl # some distribution doesn't backport the whole thing this could be -dnl # broken apart in to three seperate checks. +dnl # broken apart in to three separate checks. 
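dnl # (Conversely, a compile failure of the umode_t variant below implies
dnl # that all three callbacks still take an int on this kernel.)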
dnl # AC_DEFUN([ZFS_AC_KERNEL_MKDIR_UMODE_T], [ AC_MSG_CHECKING([whether iops->create()/mkdir()/mknod() take umode_t]) diff -Nru zfs-linux-0.7.0-rc2/config/kernel-tmpfile.m4 zfs-linux-0.7.0-rc3/config/kernel-tmpfile.m4 --- zfs-linux-0.7.0-rc2/config/kernel-tmpfile.m4 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/kernel-tmpfile.m4 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,23 @@ +dnl # +dnl # 3.11 API change +dnl # Add support for i_op->tmpfile +dnl # +AC_DEFUN([ZFS_AC_KERNEL_TMPFILE], [ + AC_MSG_CHECKING([whether i_op->tmpfile() exists]) + ZFS_LINUX_TRY_COMPILE([ + #include + int tmpfile(struct inode *inode, struct dentry *dentry, + umode_t mode) { return 0; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .tmpfile = tmpfile, + }; + ],[ + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_TMPFILE, 1, + [i_op->tmpfile() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff -Nru zfs-linux-0.7.0-rc2/config/kernel-xattr-handler.m4 zfs-linux-0.7.0-rc3/config/kernel-xattr-handler.m4 --- zfs-linux-0.7.0-rc2/config/kernel-xattr-handler.m4 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/kernel-xattr-handler.m4 2017-01-20 18:18:28.000000000 +0000 @@ -296,7 +296,7 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [ dnl # 4.5 API change, dnl # The xattr_handler->list() callback was changed to take only a - dnl # dentry and it only needs to return if it's accessable. + dnl # dentry and it only needs to return if it's accessible. AC_MSG_CHECKING([whether xattr_handler->list() wants simple]) ZFS_LINUX_TRY_COMPILE([ #include diff -Nru zfs-linux-0.7.0-rc2/config/user-commands.m4 zfs-linux-0.7.0-rc3/config/user-commands.m4 --- zfs-linux-0.7.0-rc2/config/user-commands.m4 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/user-commands.m4 2017-01-20 18:18:28.000000000 +0000 @@ -95,7 +95,7 @@ ]) dnl # -dnl # Linux commands, used withing 'is_linux' blocks of test scripts. +dnl # Linux commands, used within 'is_linux' blocks of test scripts. dnl # These commands may take different command line arguments. dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_COMMANDS_LINUX], [ @@ -116,6 +116,7 @@ AC_PATH_TOOL(READLINK, readlink, "") AC_PATH_TOOL(SETFACL, setfacl, "") AC_PATH_TOOL(SHARE, exportfs, "") + AC_PATH_TOOL(NET, net, "") AC_PATH_TOOL(SWAP, swapon, "") AC_PATH_TOOL(SWAPADD, swapon, "") AC_PATH_TOOL(UDEVADM, udevadm, "") @@ -144,7 +145,6 @@ AC_PATH_TOOL(DUMPADM, dumpadm, "") AC_PATH_TOOL(FORMAT, format, "") AC_PATH_TOOL(GETMAJOR, getmajor, "") - AC_PATH_TOOL(ISAINFO, isainfo, "") AC_PATH_TOOL(KSTAT, kstat, "") AC_PATH_TOOL(LOCKFS, lockfs, "") AC_PATH_TOOL(LOFIADM, lofiadm, "") diff -Nru zfs-linux-0.7.0-rc2/config/user-libdevmapper.m4 zfs-linux-0.7.0-rc3/config/user-libdevmapper.m4 --- zfs-linux-0.7.0-rc2/config/user-libdevmapper.m4 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/user-libdevmapper.m4 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -dnl # -dnl # Check for libdevmapper. libdevmapper is optional for building, but -dnl # required for auto-online/auto-replace functionality for DM/multipath -dnl # disks. 
-dnl # -AC_DEFUN([ZFS_AC_CONFIG_USER_LIBDEVMAPPER], [ - AC_CHECK_HEADER([libdevmapper.h], [ - AC_SUBST([LIBDEVMAPPER], ["-ldevmapper"]) - AC_DEFINE([HAVE_LIBDEVMAPPER], 1, [Define if you have libdevmapper]) - - user_libdevmapper=yes - ], [ - user_libdevmapper=no - ]) -]) diff -Nru zfs-linux-0.7.0-rc2/config/user.m4 zfs-linux-0.7.0-rc3/config/user.m4 --- zfs-linux-0.7.0-rc2/config/user.m4 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/config/user.m4 2017-01-20 18:18:28.000000000 +0000 @@ -12,7 +12,6 @@ ZFS_AC_CONFIG_USER_LIBTIRPC ZFS_AC_CONFIG_USER_LIBBLKID ZFS_AC_CONFIG_USER_LIBATTR - ZFS_AC_CONFIG_USER_LIBDEVMAPPER ZFS_AC_CONFIG_USER_LIBUDEV ZFS_AC_CONFIG_USER_FRAME_LARGER_THAN ZFS_AC_CONFIG_USER_RUNSTATEDIR diff -Nru zfs-linux-0.7.0-rc2/configure.ac zfs-linux-0.7.0-rc3/configure.ac --- zfs-linux-0.7.0-rc2/configure.ac 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/configure.ac 2017-01-20 18:18:28.000000000 +0000 @@ -180,6 +180,7 @@ tests/zfs-tests/tests/functional/cachefile/Makefile tests/zfs-tests/tests/functional/casenorm/Makefile tests/zfs-tests/tests/functional/checksum/Makefile + tests/zfs-tests/tests/functional/chattr/Makefile tests/zfs-tests/tests/functional/clean_mirror/Makefile tests/zfs-tests/tests/functional/cli_root/Makefile tests/zfs-tests/tests/functional/cli_root/zdb/Makefile @@ -272,6 +273,7 @@ tests/zfs-tests/tests/functional/snapused/Makefile tests/zfs-tests/tests/functional/sparse/Makefile tests/zfs-tests/tests/functional/threadsappend/Makefile + tests/zfs-tests/tests/functional/tmpfile/Makefile tests/zfs-tests/tests/functional/truncate/Makefile tests/zfs-tests/tests/functional/userquota/Makefile tests/zfs-tests/tests/functional/upgrade/Makefile @@ -303,8 +305,3 @@ AC_OUTPUT - -AS_IF([test "x$user_libdevmapper" != xyes && test "$ZFS_CONFIG" != kernel ], [ - AC_MSG_WARN([Building without libdevmapper. Auto-replace, auto-online, \ -and statechange-led.sh may not work correctly with device mapper vdevs.]) -]) diff -Nru zfs-linux-0.7.0-rc2/contrib/initramfs/scripts/zfs zfs-linux-0.7.0-rc3/contrib/initramfs/scripts/zfs --- zfs-linux-0.7.0-rc2/contrib/initramfs/scripts/zfs 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/contrib/initramfs/scripts/zfs 2017-01-20 18:18:28.000000000 +0000 @@ -130,7 +130,7 @@ if [ -n "$npools" ] then # Because we have found extra pool(s) here, which wasn't - # found 'normaly', we need to force USE_DISK_BY_ID to + # found 'normally', we need to force USE_DISK_BY_ID to # make sure we're able to actually import it/them later. USE_DISK_BY_ID='yes' @@ -195,7 +195,7 @@ # Make as sure as we can to not require '-f' to import. "${ZPOOL}" status "$pool" > /dev/null 2>&1 && return 0 - # For backwards compability, make sure that ZPOOL_IMPORT_PATH is set + # For backwards compatibility, make sure that ZPOOL_IMPORT_PATH is set # to something we can use later with the real import(s). We want to # make sure we find all by* dirs, BUT by-vdev should be first (if it # exists). @@ -485,7 +485,7 @@ # Clone snapshot $1 to destination filesystem $2 # Set 'canmount=noauto' and 'mountpoint=none' so that we get to keep -# manual controll over it's mounting (i.e., make sure it's not automatically +# manual control over it's mounting (i.e., make sure it's not automatically # mounted with a 'zfs mount -a' in the init/systemd scripts). clone_snap() { @@ -497,7 +497,7 @@ # Clone the snapshot into a dataset we can boot from # + We don't want this filesystem to be automatically mounted, we - # want controll over this here and nowhere else. 
+ # want control over this here and nowhere else. # + We don't need any mountpoint set for the same reason. # We use the 'org.zol:mountpoint' property to remember the mountpoint. ZFS_CMD="${ZFS} clone -o canmount=noauto -o mountpoint=none" @@ -585,7 +585,7 @@ echo -n " Snap nr [0-$((i-1))]? " > /dev/stderr read snapnr - # Reenable debugging. + # Re-enable debugging. if [ -n "${debug}" ]; then ZFS_DEBUG=1 set -x @@ -795,7 +795,7 @@ # supported by ZoL (whatever it's for). if [ -z "$ZFS_RPOOL" ] then - # The ${zfs-bootfs} variable is set at the kernel commmand + # The ${zfs-bootfs} variable is set at the kernel command # line, usually by GRUB, but it cannot be referenced here # directly because bourne variable names cannot contain a # hyphen. diff -Nru zfs-linux-0.7.0-rc2/debian/changelog zfs-linux-0.7.0-rc3/debian/changelog --- zfs-linux-0.7.0-rc2/debian/changelog 2016-10-27 04:23:37.000000000 +0000 +++ zfs-linux-0.7.0-rc3/debian/changelog 2017-01-20 23:37:22.000000000 +0000 @@ -1,3 +1,321 @@ +zfs-linux (0.7.0-rc3-1~trusty~1.gbp45e48b) trusty; urgency=low + + ** SNAPSHOT build @45e48bdf9bb4f9c51620bde10d8760da378a7ad4 ** + + [ Jason Zaman ] + * Add paxcheck make lint target + + [ legend-hua ] + * Update migration tests + + [ GeLiXin ] + * Fix coverity defects: CID 147509 + + [ cao ] + * Fix coverity defects: CID 152975 + * Fix coverity defects: CID 147548 + * Fix coverity defects: CID 147553 + * Fix sa_legacy_attr_count to use ARRAY_SIZE + + [ Neal Gompa (ニール・ゴンパ) ] + * Process all systemd services through the systemd scriptlets + + [ Brian Behlendorf ] + * Add TASKQID_INVALID + * Enable .zfs/snapshot for 32-bit systems + * Fix 32-bit maximum volume size + * Use vmem_size() for 32-bit systems + * Skip async_destroy_001_pos on 32-bit systems + + [ BearBabyLiu ] + * Fix dsl_prop_get_all_dsl() memory leak + + [ Romain Dolbeau ] + * Add parity generation/rebuild using AVX-512 for x86-64 + + [ Chunwei Chen ] + * Fix unlinked file cannot do xattr operations + * Add support for O_TMPFILE + + [ Romain Dolbeau ] + * Add superscalar fletcher4 + + [ Tony Hutter ] + * Allow autoreplace even when enclosure LED sysfs entries don't exist + + [ Brian Behlendorf ] + * Replace ISAINFO with is_32bit function + * Allow 16M zio buffers in user space + * Fix 'zpool import' detection issues + + [ Chunwei Chen ] + * Batch free zpl_posix_acl_release + * Use set_cached_acl and forget_cached_acl when possible + + [ cao ] + * Fix coverity defects: CID 147575, 147577, 147578, 147579 + + [ Don Brady ] + * Add illumos FMD ZFS logic to ZED -- phase 2 + + [ Brian Behlendorf ] + * Skip test suites on 32-bit TEST builders + + [ cao ] + * Fix coverity defects: CID 147626, 147628 + + [ luozhengzheng ] + * Fix coverity defects: 154021 + + [ cao ] + * Fix coverity defects: CID 147629 + * Fix coverity defects: CID 147586 + + [ Olaf Faaland ] + * Fix symlinks for {vdev_clear,statechange}-led.sh + + [ tuxoko ] + * Linux 3.14 compat: assign inode->set_acl + + [ jxiong ] + * Export symbol dmu_objset_userobjspace_upgradable + + [ Gvozden Neskovic ] + * Fix ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE check + + [ cao ] + * Fix coverity defects: CID 147540, 147542 + + [ luozhengzheng ] + * Fix coverity defects: CID 147503 + + [ Don Brady ] + * Add a statechange notify zedlet + + [ Brian Behlendorf ] + * Fix 'zpool import' detection issue + + [ HÃ¥kan Johansson ] + * Repair indent of zpool.8 man page + + [ DeHackEd ] + * Fix man page formatting in zfs-module-parameters + * Kernel 4.9 compat: file_operations->aio_fsync removal + + [ ChaoyuZhang ] + 
* Enable user_property_002_pos + + [ Giuseppe Di Natale ] + * Ensure that perf regression tests cleanup properly + + [ LOLi ] + * Allow zfs unshare -a + + [ Tony Hutter ] + * Add -c to zpool iostat & status to run command + + [ Tim Chase ] + * zstreamdump needs to initialize fletcher 4 support + + [ David Quigley ] + * DLPX-44812 integrate EP-220 large memory scalability + + [ Isaac Huang ] + * ABD page support to vdev_disk.c + + [ Gvozden Neskovic ] + * ABD changes for vectorized RAIDZ + * ABD Vectorized raidz + * ABD raidz avx512f support + + [ Romain Dolbeau ] + * ABD raidz NEON support + + [ Chunwei Chen ] + * ABD kmap to kmap_atomic + * ABD optimized page allocation code + + [ luozhengzheng ] + * Fix coverity defects: CID 154591 + + [ ChaoyuZhang ] + * Enable ro_props_001_pos + + [ Brian Behlendorf ] + * Convert zio_buf_alloc() consumers + + [ HÃ¥kan Johansson ] + * Do not force VDEV_NAME_TYPE_ID in max_width() + + [ Chunwei Chen ] + * Use system_delay_taskq for long delay tasks + * zvol: reduce linear list search + + [ cao ] + * Compile zio.h and zio_impl.h mutual include + * Remove unused sa_update_from_cb() + + [ luozhengzheng ] + * Fix incorrect operator in abd_alloc_sametype() + + [ Brian Behlendorf ] + * OpenZFS 7143 - dbuf_read() creates unnecessary zio_root() for bonus buf + * Skip zpool_scrub_004_pos on 32-bit systems + + [ ChaoyuZhang ] + * Enable mountpoint_003_pos + + [ Chunwei Chen ] + * zpool_create_minors parallel prefetch + * zvol_remove_minors do parallel zvol_free + + [ Brian Behlendorf ] + * Refactor txg history kstat + + [ Gvozden Neskovic ] + * Cache ddt_get_dedup_dspace() value if there was no ddt changes + + [ Brian Behlendorf ] + * Revert "Disable zio_dva_throttle_enabled by default" + + [ luozhengzheng ] + * Fix coverity defects: CID 154617 + + [ Chunwei Chen ] + * Don't count '@' for dataset namelen if not a snapshot + + [ liaoyuxiangqin ] + * Fix coverity defects: CID 147475 + + [ George Melikov ] + * Add CONTRIBUTING information and templates + + [ Brian Behlendorf ] + * Use cstyle -cpP in `make cstyle` check + + [ bunder2015 ] + * Fix typos in dbuf.c + + [ Brian Behlendorf ] + * Skip slow tests when kmemleak is enabled + * Skip xfstests on Ubuntu 16.04 and CentOS 7 + + [ Chunwei Chen ] + * Add ida_destroy in zvol_fini to fix memleak + * Fix i_flags issue caused by 64c688d + * Fix wrong operator in xvattr.h + + [ Gvozden Neskovic ] + * ABD: Adapt avx512bw raidz assembly + + [ cao ] + * Fix coverity defects: CID 147534 + + [ Chunwei Chen ] + * Fix fchange in zpl_ioctl_setflags + * Use inode_set_flags when available + * Add test for chattr + + [ Tony Hutter ] + * Don't run 'zpool iostat -c CMD' command on all vdevs, if vdevs specified + + [ Chunwei Chen ] + * Fix zmo leak when zfs_sb_create fails + + [ cao ] + * Fix coverity defects: CID 155008 + + [ LOLi ] + * Fix dsl_props_set_sync_impl to work with nested nvlist + + [ Chunwei Chen ] + * Use a dedicated taskq for vdev_file + + [ bunder2015 ] + * Remove extra + from zfs man page + + [ GeLiXin ] + * Fix coverity defects: CID 147587 + + [ LOLi ] + * Don't persist temporary pool name on devices + + [ Tim Chase ] + * 4.10 compat - BIO flag changes and others + + [ ka7 ] + * Fix spelling + + [ George Melikov ] + * OpenZFS 7259 - DS_FIELD_LARGE_BLOCKS is unused + + [ Johnny Stenback ] + * Fix TypeError: unorderable types: str() > int() in arc_summary.py + + [ George Melikov ] + * Further work on Github usability (issue templates) + * OpenZFS 6328 - Fix cstyle errors in zfs codebase + * OpenZFS 6637 - replacing 
"dontclose" with "should_close" + + [ Don Brady ] + * OpenZFS 7303 - dynamic metaslab selection + + [ George Melikov ] + * OpenZFS 6603 - zfeature_register() should verify ZFEATURE_FLAG_PER_DATASET implies SPA_FEATURE_EXTENSIBLE_DATASET + + [ LOLi ] + * Fix zfs-share systemd unit file + + [ George Melikov ] + * OpenZFS 7276 - zfs(1m) manpage could better describe space properties + + [ Don Brady ] + * OpenZFS 7743 - per-vdev-zaps init path for upgrade + + [ bzzz77 ] + * Add *_by-dnode routines + + [ Jörg Thalheim ] + * module/Makefile.in: use relative cp + + [ Brian Behlendorf ] + * OpenZFS 7181 - race between zfs_mount and zfs_ioc_rollback + * OpenZFS 7603 - xuio_stat_wbuf_* should be declared (void) + + [ LOLi ] + * Fix unallocated object detection for large_dnode datasets + + [ clefru ] + * Don't hardcode perl path but use env instead + + [ Brian Behlendorf ] + * OpenZFS 6551 - cmd/zpool: cleanup gcc warnings + * OpenZFS 6550 - cmd/zfs: cleanup gcc warnings + * OpenZFS 6586 - Whitespace inconsistencies in the spa feature dependency arrays in zfeature_common.c + + [ George Melikov ] + * OpenZFS 7082 - bptree_iterate() passes wrong args to zfs_dbgmsg() + * OpenZFS 7071 - lzc_snapshot does not fill in errlist on ENOENT + * OpenZFS 7256 - low probability race in zfs_get_data + * OpenZFS 7235 - remove unused func dsl_dataset_set_blkptr + * OpenZFS 7257 - zfs manpage user property length needs to be updated + * OpenZFS 7659 - Missing thread_exit() in dmu_send.c + + [ Brian Behlendorf ] + * Disable racy test cases + + [ George Melikov ] + * OpenZFS 6529 - Properly handle updates of variably-sized SA entries + + [ Chunwei Chen ] + * Suspend/resume zvol for recv and rollback + + [ Brian Behlendorf ] + * Fix unused variable warning + * Tag 0.7.0-rc3 + + -- Darik Horn Fri, 20 Jan 2017 18:35:47 -0500 + zfs-linux (0.7.0-rc2-1~trusty~2.gbp1cf5b2) trusty; urgency=low ** SNAPSHOT build @1cf5b28768d4a156cfc4719c920e50dfcd82e1fc ** diff -Nru zfs-linux-0.7.0-rc2/debian/patches/debian-changes-0.7.0-rc2-1~trusty~2.gbp1cf5b2 zfs-linux-0.7.0-rc3/debian/patches/debian-changes-0.7.0-rc2-1~trusty~2.gbp1cf5b2 --- zfs-linux-0.7.0-rc2/debian/patches/debian-changes-0.7.0-rc2-1~trusty~2.gbp1cf5b2 2016-10-27 04:23:45.000000000 +0000 +++ zfs-linux-0.7.0-rc3/debian/patches/debian-changes-0.7.0-rc2-1~trusty~2.gbp1cf5b2 1970-01-01 00:00:00.000000000 +0000 @@ -1,167 +0,0 @@ -Description: - TODO: Put a short summary on the line above and replace this paragraph - with a longer explanation of this change. Complete the meta-information - with other relevant fields (see below for details). To make it easier, the - information below has been extracted from the changelog. Adjust it or drop - it. - . - zfs-linux (0.7.0-rc2-1~trusty~2.gbp1cf5b2) trusty; urgency=low - . - ** SNAPSHOT build @1cf5b28768d4a156cfc4719c920e50dfcd82e1fc ** - . - * Add the new icp module to the packaging overlay. -Author: Darik Horn - ---- -The information above should follow the Patch Tagging Guidelines, please -checkout http://dep.debian.net/deps/dep3/ to learn about the format. 
Here -are templates for supplementary fields that you might want to add: - -Origin: , -Bug: -Bug-Debian: http://bugs.debian.org/ -Bug-Ubuntu: https://launchpad.net/bugs/ -Forwarded: -Reviewed-By: -Last-Update: - ---- zfs-linux-0.7.0-rc2.orig/rpm/generic/zfs-kmod.spec.in -+++ zfs-linux-0.7.0-rc2/rpm/generic/zfs-kmod.spec.in -@@ -186,6 +186,67 @@ chmod u+x ${RPM_BUILD_ROOT}%{kmodinstdir - rm -rf $RPM_BUILD_ROOT - - %changelog -+* Thu May 12 2016 Ned Bass - 0.6.5.7-1 -+- Fix user namespaces uid/gid mapping zfsonlinux/zfs#4177 -+- Fix ZPL miswrite of default POSIX ACL zfsonlinux/zfs#4520 -+- Linux 4.5 and 4.6 compatibility zfsonlinux/zfs#4537 zfsonlinux/zfs#4489 -+- Ensure /dev/disk/by-partlabel gets correctly populated zfsonlinux/zfs#4517 -+- Utilities now work reliably with newly created partitions zfsonlinux/zfs#3708 -+- Import now reliably uses device names stored in label zfsonlinux/zfs#3043 -+- Fix possible deadlock in zfs_secpolicy_write_perms ioctl zfsonlinux/zfs#4554 -+- Fix inverted logic on none elevator comparison zfsonlinux/zfs#4507 -+- Add 32 bit FS_IOC32_{GET|SET}FLAGS compat ioctls for PPC zfsonlinux/zfs#4477 -+* Tue Mar 22 2016 Ned Bass - 0.6.5.6-1 -+- Remove artificial architecture restrictions in packaging -+- Add support for s390[x] zfsonlinux/zfs#4425 -+- Handle negative dentries in case insensitive filesystem zfsonlinux/zfs#4243 -+- Fix casesensitivity=insensitive deadlock zfsonlinux/zfs#4136 -+- Correctly parse zdb -R flag arguments zfsonlinux/zfs#4304 -+- Fix lock order inversion with zvol_open() zfsonlinux/zfs#3681 -+- Add support for asynchronous zvol minor operations zfsonlinux/zfs#2217 -+- Make zvol minor functionality more robust zfsonlinux/zfs#4344 -+- Prevent zpool_find_vdev() from truncating vdev path zfsonlinux/zfs#4312 -+- Add -gLP to zpool subcommands for alt vdev names zfsonlinux/zfs#4341 -+- Fix zpool list -v output for spares and log devices zfsonlinux/zfs#4313 -+* Wed Mar 9 2016 Ned Bass - 0.6.5.5-1 -+- Linux 4.5 compatibility zfsonlinux/zfs#4228 -+- Create working debuginfo packages on Red Hat zfsonlinux/zfs#4224 -+- Make arc_summary.py and dbufstat.py compatible with python3 -+- musl libc compatibility for option parsing zfsonlinux/zfs#4222 -+- Prevent arc_c collapse and possible panic zfsonlinux/zfs#3904 -+- Prevent duplicated xattr between SA and dir zfsonlinux/zfs#4153 -+- Fix zsb->z_hold_mtx deadlock zfsonlinux/zfs#4106 -+- Prevent SA header corruption zfsonlinux/zfs#4150 -+* Fri Jan 8 2016 Ned Bass - 0.6.5.4-1 -+- Linux 4.4 compat -+- Assorted stability fixes -+- Fixes for NFS-exported snapshots -+- Fix kernel warning in unlock_new_inode() and deadlock -+- Fix overflow in P2ROUNDUP_TYPED macro -+- Fix write performance issue due to bad zfs_dirty_data_max calculation -+- Fix builtin kernel builds -+- Fix deadlock during direct memory reclaim -+* Tue Oct 13 2015 Ned Bass - 0.6.5.3-1 -+- Don't import all visible pools in zfs-import init script zfsonlinux/zfs#3777 -+- Fix use-after-free in vdev_disk_physio_completion zfsonlinux/zfs#3920 -+- Fix avl_is_empty(&dn->dn_dbufs) assertion zfsonlinux/zfs#3865 -+* Wed Sep 30 2015 Ned Bass - 0.6.5.2-1 -+- Init script fixes zfsonlinux/zfs#3816 -+- Fix uioskip crash when skip to end zfsonlinux/zfs#3806 zfsonlinux/zfs#3850 -+- Userspace can trigger an assertion zfsonlinux/zfs#3792 -+- Fix quota userused underflow bug zfsonlinux/zfs#3789 -+- Fix performance regression from unwanted synchronous I/O zfsonlinux/zfs#3780 -+- Fix deadlock during ARC reclaim zfsonlinux/zfs#3808 zfsonlinux/zfs#3834 -+- Fix deadlock with 
zfs receive and clamscan zfsonlinux/zfs#3719 -+- Allow NFS activity to defer snapshot unmounts zfsonlinux/zfs#3794 -+- Linux 4.3 compatibility zfsonlinux/zfs#3799 -+- Zed reload fixes zfsonlinux/zfs#3773 -+- Fix PAX Patch/Grsec SLAB_USERCOPY panic zfsonlinux/zfs#3796 -+- Always remove during dkms uninstall/update zfsonlinux/spl#476 -+* Sat Sep 19 2015 Ned Bass - 0.6.5.1-1 -+- Fix zvol corruption with TRIM/discard zfsonlinux/zfs#3798 -+- Fix NULL as mount(2) syscall data parameter zfsonlinux/zfs#3804 -+- Fix xattr=sa dataset property not honored zfsonlinux/zfs#3787 - * Fri Sep 11 2015 Brian Behlendorf - 0.6.5-1 - - Released 0.6.5-1, detailed release notes are available at: - - https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.5 ---- zfs-linux-0.7.0-rc2.orig/rpm/generic/zfs.spec.in -+++ zfs-linux-0.7.0-rc2/rpm/generic/zfs.spec.in -@@ -326,6 +326,67 @@ exit 0 - %endif - - %changelog -+* Thu May 12 2016 Ned Bass - 0.6.5.7-1 -+- Fix user namespaces uid/gid mapping zfsonlinux/zfs#4177 -+- Fix ZPL miswrite of default POSIX ACL zfsonlinux/zfs#4520 -+- Linux 4.5 and 4.6 compatibility zfsonlinux/zfs#4537 zfsonlinux/zfs#4489 -+- Ensure /dev/disk/by-partlabel gets correctly populated zfsonlinux/zfs#4517 -+- Utilities now work reliably with newly created partitions zfsonlinux/zfs#3708 -+- Import now reliably uses device names stored in label zfsonlinux/zfs#3043 -+- Fix possible deadlock in zfs_secpolicy_write_perms ioctl zfsonlinux/zfs#4554 -+- Fix inverted logic on none elevator comparison zfsonlinux/zfs#4507 -+- Add 32 bit FS_IOC32_{GET|SET}FLAGS compat ioctls for PPC zfsonlinux/zfs#4477 -+* Tue Mar 22 2016 Ned Bass - 0.6.5.6-1 -+- Remove artificial architecture restrictions in packaging -+- Add support for s390[x] zfsonlinux/zfs#4425 -+- Handle negative dentries in case insensitive filesystem zfsonlinux/zfs#4243 -+- Fix casesensitivity=insensitive deadlock zfsonlinux/zfs#4136 -+- Correctly parse zdb -R flag arguments zfsonlinux/zfs#4304 -+- Fix lock order inversion with zvol_open() zfsonlinux/zfs#3681 -+- Add support for asynchronous zvol minor operations zfsonlinux/zfs#2217 -+- Make zvol minor functionality more robust zfsonlinux/zfs#4344 -+- Prevent zpool_find_vdev() from truncating vdev path zfsonlinux/zfs#4312 -+- Add -gLP to zpool subcommands for alt vdev names zfsonlinux/zfs#4341 -+- Fix zpool list -v output for spares and log devices zfsonlinux/zfs#4313 -+* Wed Mar 9 2016 Ned Bass - 0.6.5.5-1 -+- Linux 4.5 compatibility zfsonlinux/zfs#4228 -+- Create working debuginfo packages on Red Hat zfsonlinux/zfs#4224 -+- Make arc_summary.py and dbufstat.py compatible with python3 -+- musl libc compatibility for option parsing zfsonlinux/zfs#4222 -+- Prevent arc_c collapse and possible panic zfsonlinux/zfs#3904 -+- Prevent duplicated xattr between SA and dir zfsonlinux/zfs#4153 -+- Fix zsb->z_hold_mtx deadlock zfsonlinux/zfs#4106 -+- Prevent SA header corruption zfsonlinux/zfs#4150 -+* Fri Jan 8 2016 Ned Bass - 0.6.5.4-1 -+- Linux 4.4 compat -+- Assorted stability fixes -+- Fixes for NFS-exported snapshots -+- Fix kernel warning in unlock_new_inode() and deadlock -+- Fix overflow in P2ROUNDUP_TYPED macro -+- Fix write performance issue due to bad zfs_dirty_data_max calculation -+- Fix builtin kernel builds -+- Fix deadlock during direct memory reclaim -+* Tue Oct 13 2015 Ned Bass - 0.6.5.3-1 -+- Don't import all visible pools in zfs-import init script zfsonlinux/zfs#3777 -+- Fix use-after-free in vdev_disk_physio_completion zfsonlinux/zfs#3920 -+- Fix avl_is_empty(&dn->dn_dbufs) assertion 
zfsonlinux/zfs#3865 -+* Wed Sep 30 2015 Ned Bass - 0.6.5.2-1 -+- Init script fixes zfsonlinux/zfs#3816 -+- Fix uioskip crash when skip to end zfsonlinux/zfs#3806 zfsonlinux/zfs#3850 -+- Userspace can trigger an assertion zfsonlinux/zfs#3792 -+- Fix quota userused underflow bug zfsonlinux/zfs#3789 -+- Fix performance regression from unwanted synchronous I/O zfsonlinux/zfs#3780 -+- Fix deadlock during ARC reclaim zfsonlinux/zfs#3808 zfsonlinux/zfs#3834 -+- Fix deadlock with zfs receive and clamscan zfsonlinux/zfs#3719 -+- Allow NFS activity to defer snapshot unmounts zfsonlinux/zfs#3794 -+- Linux 4.3 compatibility zfsonlinux/zfs#3799 -+- Zed reload fixes zfsonlinux/zfs#3773 -+- Fix PAX Patch/Grsec SLAB_USERCOPY panic zfsonlinux/zfs#3796 -+- Always remove during dkms uninstall/update zfsonlinux/spl#476 -+* Sat Sep 19 2015 Ned Bass - 0.6.5.1-1 -+- Fix zvol corruption with TRIM/discard zfsonlinux/zfs#3798 -+- Fix NULL as mount(2) syscall data parameter zfsonlinux/zfs#3804 -+- Fix xattr=sa dataset property not honored zfsonlinux/zfs#3787 - * Fri Sep 11 2015 Brian Behlendorf - 0.6.5-1 - - Released 0.6.5-1, detailed release notes are available at: - - https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.5 diff -Nru zfs-linux-0.7.0-rc2/debian/patches/debian-changes-0.7.0-rc3-1~trusty~1.gbp45e48b zfs-linux-0.7.0-rc3/debian/patches/debian-changes-0.7.0-rc3-1~trusty~1.gbp45e48b --- zfs-linux-0.7.0-rc2/debian/patches/debian-changes-0.7.0-rc3-1~trusty~1.gbp45e48b 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/debian/patches/debian-changes-0.7.0-rc3-1~trusty~1.gbp45e48b 2017-01-20 23:38:16.000000000 +0000 @@ -0,0 +1,477 @@ +Description: + TODO: Put a short summary on the line above and replace this paragraph + with a longer explanation of this change. Complete the meta-information + with other relevant fields (see below for details). To make it easier, the + information below has been extracted from the changelog. Adjust it or drop + it. + . + zfs-linux (0.7.0-rc3-1~trusty~1.gbp45e48b) trusty; urgency=low + . + ** SNAPSHOT build @45e48bdf9bb4f9c51620bde10d8760da378a7ad4 ** + . + [ Jason Zaman ] + * Add paxcheck make lint target + . + [ legend-hua ] + * Update migration tests + . + [ GeLiXin ] + * Fix coverity defects: CID 147509 + . + [ cao ] + * Fix coverity defects: CID 152975 + * Fix coverity defects: CID 147548 + * Fix coverity defects: CID 147553 + * Fix sa_legacy_attr_count to use ARRAY_SIZE + . + [ Neal Gompa (ニール・ゴンパ) ] + * Process all systemd services through the systemd scriptlets + . + [ Brian Behlendorf ] + * Add TASKQID_INVALID + * Enable .zfs/snapshot for 32-bit systems + * Fix 32-bit maximum volume size + * Use vmem_size() for 32-bit systems + * Skip async_destroy_001_pos on 32-bit systems + . + [ BearBabyLiu ] + * Fix dsl_prop_get_all_dsl() memory leak + . + [ Romain Dolbeau ] + * Add parity generation/rebuild using AVX-512 for x86-64 + . + [ Chunwei Chen ] + * Fix unlinked file cannot do xattr operations + * Add support for O_TMPFILE + . + [ Romain Dolbeau ] + * Add superscalar fletcher4 + . + [ Tony Hutter ] + * Allow autoreplace even when enclosure LED sysfs entries don't exist + . + [ Brian Behlendorf ] + * Replace ISAINFO with is_32bit function + * Allow 16M zio buffers in user space + * Fix 'zpool import' detection issues + . + [ Chunwei Chen ] + * Batch free zpl_posix_acl_release + * Use set_cached_acl and forget_cached_acl when possible + . + [ cao ] + * Fix coverity defects: CID 147575, 147577, 147578, 147579 + . 
+ [ Don Brady ] + * Add illumos FMD ZFS logic to ZED -- phase 2 + . + [ Brian Behlendorf ] + * Skip test suites on 32-bit TEST builders + . + [ cao ] + * Fix coverity defects: CID 147626, 147628 + . + [ luozhengzheng ] + * Fix coverity defects: 154021 + . + [ cao ] + * Fix coverity defects: CID 147629 + * Fix coverity defects: CID 147586 + . + [ Olaf Faaland ] + * Fix symlinks for {vdev_clear,statechange}-led.sh + . + [ tuxoko ] + * Linux 3.14 compat: assign inode->set_acl + . + [ jxiong ] + * Export symbol dmu_objset_userobjspace_upgradable + . + [ Gvozden Neskovic ] + * Fix ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE check + . + [ cao ] + * Fix coverity defects: CID 147540, 147542 + . + [ luozhengzheng ] + * Fix coverity defects: CID 147503 + . + [ Don Brady ] + * Add a statechange notify zedlet + . + [ Brian Behlendorf ] + * Fix 'zpool import' detection issue + . + [ Håkan Johansson ] + * Repair indent of zpool.8 man page + . + [ DeHackEd ] + * Fix man page formatting in zfs-module-parameters + * Kernel 4.9 compat: file_operations->aio_fsync removal + . + [ ChaoyuZhang ] + * Enable user_property_002_pos + . + [ Giuseppe Di Natale ] + * Ensure that perf regression tests cleanup properly + . + [ LOLi ] + * Allow zfs unshare -a + . + [ Tony Hutter ] + * Add -c to zpool iostat & status to run command + . + [ Tim Chase ] + * zstreamdump needs to initialize fletcher 4 support + . + [ David Quigley ] + * DLPX-44812 integrate EP-220 large memory scalability + . + [ Isaac Huang ] + * ABD page support to vdev_disk.c + . + [ Gvozden Neskovic ] + * ABD changes for vectorized RAIDZ + * ABD Vectorized raidz + * ABD raidz avx512f support + . + [ Romain Dolbeau ] + * ABD raidz NEON support + . + [ Chunwei Chen ] + * ABD kmap to kmap_atomic + * ABD optimized page allocation code + . + [ luozhengzheng ] + * Fix coverity defects: CID 154591 + . + [ ChaoyuZhang ] + * Enable ro_props_001_pos + . + [ Brian Behlendorf ] + * Convert zio_buf_alloc() consumers + . + [ Håkan Johansson ] + * Do not force VDEV_NAME_TYPE_ID in max_width() + . + [ Chunwei Chen ] + * Use system_delay_taskq for long delay tasks + * zvol: reduce linear list search + . + [ cao ] + * Compile zio.h and zio_impl.h mutual include + * Remove unused sa_update_from_cb() + . + [ luozhengzheng ] + * Fix incorrect operator in abd_alloc_sametype() + . + [ Brian Behlendorf ] + * OpenZFS 7143 - dbuf_read() creates unnecessary zio_root() for bonus buf + * Skip zpool_scrub_004_pos on 32-bit systems + . + [ ChaoyuZhang ] + * Enable mountpoint_003_pos + . + [ Chunwei Chen ] + * zpool_create_minors parallel prefetch + * zvol_remove_minors do parallel zvol_free + . + [ Brian Behlendorf ] + * Refactor txg history kstat + . + [ Gvozden Neskovic ] + * Cache ddt_get_dedup_dspace() value if there was no ddt changes + . + [ Brian Behlendorf ] + * Revert "Disable zio_dva_throttle_enabled by default" + . + [ luozhengzheng ] + * Fix coverity defects: CID 154617 + . + [ Chunwei Chen ] + * Don't count '@' for dataset namelen if not a snapshot + . + [ liaoyuxiangqin ] + * Fix coverity defects: CID 147475 + . + [ George Melikov ] + * Add CONTRIBUTING information and templates + . + [ Brian Behlendorf ] + * Use cstyle -cpP in `make cstyle` check + . + [ bunder2015 ] + * Fix typos in dbuf.c + . + [ Brian Behlendorf ] + * Skip slow tests when kmemleak is enabled + * Skip xfstests on Ubuntu 16.04 and CentOS 7 + . + [ Chunwei Chen ] + * Add ida_destroy in zvol_fini to fix memleak + * Fix i_flags issue caused by 64c688d + * Fix wrong operator in xvattr.h + . 
+ [ Gvozden Neskovic ] + * ABD: Adapt avx512bw raidz assembly + . + [ cao ] + * Fix coverity defects: CID 147534 + . + [ Chunwei Chen ] + * Fix fchange in zpl_ioctl_setflags + * Use inode_set_flags when available + * Add test for chattr + . + [ Tony Hutter ] + * Don't run 'zpool iostat -c CMD' command on all vdevs, if vdevs specified + . + [ Chunwei Chen ] + * Fix zmo leak when zfs_sb_create fails + . + [ cao ] + * Fix coverity defects: CID 155008 + . + [ LOLi ] + * Fix dsl_props_set_sync_impl to work with nested nvlist + . + [ Chunwei Chen ] + * Use a dedicated taskq for vdev_file + . + [ bunder2015 ] + * Remove extra + from zfs man page + . + [ GeLiXin ] + * Fix coverity defects: CID 147587 + . + [ LOLi ] + * Don't persist temporary pool name on devices + . + [ Tim Chase ] + * 4.10 compat - BIO flag changes and others + . + [ ka7 ] + * Fix spelling + . + [ George Melikov ] + * OpenZFS 7259 - DS_FIELD_LARGE_BLOCKS is unused + . + [ Johnny Stenback ] + * Fix TypeError: unorderable types: str() > int() in arc_summary.py + . + [ George Melikov ] + * Further work on Github usability (issue templates) + * OpenZFS 6328 - Fix cstyle errors in zfs codebase + * OpenZFS 6637 - replacing "dontclose" with "should_close" + . + [ Don Brady ] + * OpenZFS 7303 - dynamic metaslab selection + . + [ George Melikov ] + * OpenZFS 6603 - zfeature_register() should verify ZFEATURE_FLAG_PER_DATASET implies SPA_FEATURE_EXTENSIBLE_DATASET + . + [ LOLi ] + * Fix zfs-share systemd unit file + . + [ George Melikov ] + * OpenZFS 7276 - zfs(1m) manpage could better describe space properties + . + [ Don Brady ] + * OpenZFS 7743 - per-vdev-zaps init path for upgrade + . + [ bzzz77 ] + * Add *_by-dnode routines + . + [ Jörg Thalheim ] + * module/Makefile.in: use relative cp + . + [ Brian Behlendorf ] + * OpenZFS 7181 - race between zfs_mount and zfs_ioc_rollback + * OpenZFS 7603 - xuio_stat_wbuf_* should be declared (void) + . + [ LOLi ] + * Fix unallocated object detection for large_dnode datasets + . + [ clefru ] + * Don't hardcode perl path but use env instead + . + [ Brian Behlendorf ] + * OpenZFS 6551 - cmd/zpool: cleanup gcc warnings + * OpenZFS 6550 - cmd/zfs: cleanup gcc warnings + * OpenZFS 6586 - Whitespace inconsistencies in the spa feature dependency arrays in zfeature_common.c + . + [ George Melikov ] + * OpenZFS 7082 - bptree_iterate() passes wrong args to zfs_dbgmsg() + * OpenZFS 7071 - lzc_snapshot does not fill in errlist on ENOENT + * OpenZFS 7256 - low probability race in zfs_get_data + * OpenZFS 7235 - remove unused func dsl_dataset_set_blkptr + * OpenZFS 7257 - zfs manpage user property length needs to be updated + * OpenZFS 7659 - Missing thread_exit() in dmu_send.c + . + [ Brian Behlendorf ] + * Disable racy test cases + . + [ George Melikov ] + * OpenZFS 6529 - Properly handle updates of variably-sized SA entries + . + [ Chunwei Chen ] + * Suspend/resume zvol for recv and rollback + . + [ Brian Behlendorf ] + * Fix unused variable warning + * Tag 0.7.0-rc3 +Author: Darik Horn + +--- +The information above should follow the Patch Tagging Guidelines, please +checkout http://dep.debian.net/deps/dep3/ to learn about the format. 
Here +are templates for supplementary fields that you might want to add: + +Origin: , +Bug: +Bug-Debian: http://bugs.debian.org/ +Bug-Ubuntu: https://launchpad.net/bugs/ +Forwarded: +Reviewed-By: +Last-Update: + +--- zfs-linux-0.7.0-rc3.orig/rpm/generic/zfs-kmod.spec.in ++++ zfs-linux-0.7.0-rc3/rpm/generic/zfs-kmod.spec.in +@@ -186,6 +186,67 @@ chmod u+x ${RPM_BUILD_ROOT}%{kmodinstdir + rm -rf $RPM_BUILD_ROOT + + %changelog ++* Thu May 12 2016 Ned Bass - 0.6.5.7-1 ++- Fix user namespaces uid/gid mapping zfsonlinux/zfs#4177 ++- Fix ZPL miswrite of default POSIX ACL zfsonlinux/zfs#4520 ++- Linux 4.5 and 4.6 compatibility zfsonlinux/zfs#4537 zfsonlinux/zfs#4489 ++- Ensure /dev/disk/by-partlabel gets correctly populated zfsonlinux/zfs#4517 ++- Utilities now work reliably with newly created partitions zfsonlinux/zfs#3708 ++- Import now reliably uses device names stored in label zfsonlinux/zfs#3043 ++- Fix possible deadlock in zfs_secpolicy_write_perms ioctl zfsonlinux/zfs#4554 ++- Fix inverted logic on none elevator comparison zfsonlinux/zfs#4507 ++- Add 32 bit FS_IOC32_{GET|SET}FLAGS compat ioctls for PPC zfsonlinux/zfs#4477 ++* Tue Mar 22 2016 Ned Bass - 0.6.5.6-1 ++- Remove artificial architecture restrictions in packaging ++- Add support for s390[x] zfsonlinux/zfs#4425 ++- Handle negative dentries in case insensitive filesystem zfsonlinux/zfs#4243 ++- Fix casesensitivity=insensitive deadlock zfsonlinux/zfs#4136 ++- Correctly parse zdb -R flag arguments zfsonlinux/zfs#4304 ++- Fix lock order inversion with zvol_open() zfsonlinux/zfs#3681 ++- Add support for asynchronous zvol minor operations zfsonlinux/zfs#2217 ++- Make zvol minor functionality more robust zfsonlinux/zfs#4344 ++- Prevent zpool_find_vdev() from truncating vdev path zfsonlinux/zfs#4312 ++- Add -gLP to zpool subcommands for alt vdev names zfsonlinux/zfs#4341 ++- Fix zpool list -v output for spares and log devices zfsonlinux/zfs#4313 ++* Wed Mar 9 2016 Ned Bass - 0.6.5.5-1 ++- Linux 4.5 compatibility zfsonlinux/zfs#4228 ++- Create working debuginfo packages on Red Hat zfsonlinux/zfs#4224 ++- Make arc_summary.py and dbufstat.py compatible with python3 ++- musl libc compatibility for option parsing zfsonlinux/zfs#4222 ++- Prevent arc_c collapse and possible panic zfsonlinux/zfs#3904 ++- Prevent duplicated xattr between SA and dir zfsonlinux/zfs#4153 ++- Fix zsb->z_hold_mtx deadlock zfsonlinux/zfs#4106 ++- Prevent SA header corruption zfsonlinux/zfs#4150 ++* Fri Jan 8 2016 Ned Bass - 0.6.5.4-1 ++- Linux 4.4 compat ++- Assorted stability fixes ++- Fixes for NFS-exported snapshots ++- Fix kernel warning in unlock_new_inode() and deadlock ++- Fix overflow in P2ROUNDUP_TYPED macro ++- Fix write performance issue due to bad zfs_dirty_data_max calculation ++- Fix builtin kernel builds ++- Fix deadlock during direct memory reclaim ++* Tue Oct 13 2015 Ned Bass - 0.6.5.3-1 ++- Don't import all visible pools in zfs-import init script zfsonlinux/zfs#3777 ++- Fix use-after-free in vdev_disk_physio_completion zfsonlinux/zfs#3920 ++- Fix avl_is_empty(&dn->dn_dbufs) assertion zfsonlinux/zfs#3865 ++* Wed Sep 30 2015 Ned Bass - 0.6.5.2-1 ++- Init script fixes zfsonlinux/zfs#3816 ++- Fix uioskip crash when skip to end zfsonlinux/zfs#3806 zfsonlinux/zfs#3850 ++- Userspace can trigger an assertion zfsonlinux/zfs#3792 ++- Fix quota userused underflow bug zfsonlinux/zfs#3789 ++- Fix performance regression from unwanted synchronous I/O zfsonlinux/zfs#3780 ++- Fix deadlock during ARC reclaim zfsonlinux/zfs#3808 zfsonlinux/zfs#3834 ++- Fix deadlock with 
zfs receive and clamscan zfsonlinux/zfs#3719 ++- Allow NFS activity to defer snapshot unmounts zfsonlinux/zfs#3794 ++- Linux 4.3 compatibility zfsonlinux/zfs#3799 ++- Zed reload fixes zfsonlinux/zfs#3773 ++- Fix PAX Patch/Grsec SLAB_USERCOPY panic zfsonlinux/zfs#3796 ++- Always remove during dkms uninstall/update zfsonlinux/spl#476 ++* Sat Sep 19 2015 Ned Bass - 0.6.5.1-1 ++- Fix zvol corruption with TRIM/discard zfsonlinux/zfs#3798 ++- Fix NULL as mount(2) syscall data parameter zfsonlinux/zfs#3804 ++- Fix xattr=sa dataset property not honored zfsonlinux/zfs#3787 + * Fri Sep 11 2015 Brian Behlendorf - 0.6.5-1 + - Released 0.6.5-1, detailed release notes are available at: + - https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.5 +--- zfs-linux-0.7.0-rc3.orig/rpm/generic/zfs.spec.in ++++ zfs-linux-0.7.0-rc3/rpm/generic/zfs.spec.in +@@ -325,6 +325,67 @@ exit 0 + %endif + + %changelog ++* Thu May 12 2016 Ned Bass - 0.6.5.7-1 ++- Fix user namespaces uid/gid mapping zfsonlinux/zfs#4177 ++- Fix ZPL miswrite of default POSIX ACL zfsonlinux/zfs#4520 ++- Linux 4.5 and 4.6 compatibility zfsonlinux/zfs#4537 zfsonlinux/zfs#4489 ++- Ensure /dev/disk/by-partlabel gets correctly populated zfsonlinux/zfs#4517 ++- Utilities now work reliably with newly created partitions zfsonlinux/zfs#3708 ++- Import now reliably uses device names stored in label zfsonlinux/zfs#3043 ++- Fix possible deadlock in zfs_secpolicy_write_perms ioctl zfsonlinux/zfs#4554 ++- Fix inverted logic on none elevator comparison zfsonlinux/zfs#4507 ++- Add 32 bit FS_IOC32_{GET|SET}FLAGS compat ioctls for PPC zfsonlinux/zfs#4477 ++* Tue Mar 22 2016 Ned Bass - 0.6.5.6-1 ++- Remove artificial architecture restrictions in packaging ++- Add support for s390[x] zfsonlinux/zfs#4425 ++- Handle negative dentries in case insensitive filesystem zfsonlinux/zfs#4243 ++- Fix casesensitivity=insensitive deadlock zfsonlinux/zfs#4136 ++- Correctly parse zdb -R flag arguments zfsonlinux/zfs#4304 ++- Fix lock order inversion with zvol_open() zfsonlinux/zfs#3681 ++- Add support for asynchronous zvol minor operations zfsonlinux/zfs#2217 ++- Make zvol minor functionality more robust zfsonlinux/zfs#4344 ++- Prevent zpool_find_vdev() from truncating vdev path zfsonlinux/zfs#4312 ++- Add -gLP to zpool subcommands for alt vdev names zfsonlinux/zfs#4341 ++- Fix zpool list -v output for spares and log devices zfsonlinux/zfs#4313 ++* Wed Mar 9 2016 Ned Bass - 0.6.5.5-1 ++- Linux 4.5 compatibility zfsonlinux/zfs#4228 ++- Create working debuginfo packages on Red Hat zfsonlinux/zfs#4224 ++- Make arc_summary.py and dbufstat.py compatible with python3 ++- musl libc compatibility for option parsing zfsonlinux/zfs#4222 ++- Prevent arc_c collapse and possible panic zfsonlinux/zfs#3904 ++- Prevent duplicated xattr between SA and dir zfsonlinux/zfs#4153 ++- Fix zsb->z_hold_mtx deadlock zfsonlinux/zfs#4106 ++- Prevent SA header corruption zfsonlinux/zfs#4150 ++* Fri Jan 8 2016 Ned Bass - 0.6.5.4-1 ++- Linux 4.4 compat ++- Assorted stability fixes ++- Fixes for NFS-exported snapshots ++- Fix kernel warning in unlock_new_inode() and deadlock ++- Fix overflow in P2ROUNDUP_TYPED macro ++- Fix write performance issue due to bad zfs_dirty_data_max calculation ++- Fix builtin kernel builds ++- Fix deadlock during direct memory reclaim ++* Tue Oct 13 2015 Ned Bass - 0.6.5.3-1 ++- Don't import all visible pools in zfs-import init script zfsonlinux/zfs#3777 ++- Fix use-after-free in vdev_disk_physio_completion zfsonlinux/zfs#3920 ++- Fix avl_is_empty(&dn->dn_dbufs) assertion 
zfsonlinux/zfs#3865 ++* Wed Sep 30 2015 Ned Bass - 0.6.5.2-1 ++- Init script fixes zfsonlinux/zfs#3816 ++- Fix uioskip crash when skip to end zfsonlinux/zfs#3806 zfsonlinux/zfs#3850 ++- Userspace can trigger an assertion zfsonlinux/zfs#3792 ++- Fix quota userused underflow bug zfsonlinux/zfs#3789 ++- Fix performance regression from unwanted synchronous I/O zfsonlinux/zfs#3780 ++- Fix deadlock during ARC reclaim zfsonlinux/zfs#3808 zfsonlinux/zfs#3834 ++- Fix deadlock with zfs receive and clamscan zfsonlinux/zfs#3719 ++- Allow NFS activity to defer snapshot unmounts zfsonlinux/zfs#3794 ++- Linux 4.3 compatibility zfsonlinux/zfs#3799 ++- Zed reload fixes zfsonlinux/zfs#3773 ++- Fix PAX Patch/Grsec SLAB_USERCOPY panic zfsonlinux/zfs#3796 ++- Always remove during dkms uninstall/update zfsonlinux/spl#476 ++* Sat Sep 19 2015 Ned Bass - 0.6.5.1-1 ++- Fix zvol corruption with TRIM/discard zfsonlinux/zfs#3798 ++- Fix NULL as mount(2) syscall data parameter zfsonlinux/zfs#3804 ++- Fix xattr=sa dataset property not honored zfsonlinux/zfs#3787 + * Fri Sep 11 2015 Brian Behlendorf - 0.6.5-1 + - Released 0.6.5-1, detailed release notes are available at: + - https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.5 diff -Nru zfs-linux-0.7.0-rc2/debian/patches/series zfs-linux-0.7.0-rc3/debian/patches/series --- zfs-linux-0.7.0-rc2/debian/patches/series 2016-10-27 04:23:45.000000000 +0000 +++ zfs-linux-0.7.0-rc3/debian/patches/series 2017-01-20 23:38:16.000000000 +0000 @@ -1,4 +1,4 @@ 0001-Prevent-manual-builds-in-the-DKMS-source.patch 0002-Check-for-META-and-DCH-consistency-in-autoconf.patch 0003-Add-libuutil-to-LIBADD-for-libzfs-and-libzfs_core.patch -debian-changes-0.7.0-rc2-1~trusty~2.gbp1cf5b2 +debian-changes-0.7.0-rc3-1~trusty~1.gbp45e48b diff -Nru zfs-linux-0.7.0-rc2/etc/init.d/zfs-import.in zfs-linux-0.7.0-rc3/etc/init.d/zfs-import.in --- zfs-linux-0.7.0-rc2/etc/init.d/zfs-import.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/etc/init.d/zfs-import.in 2017-01-20 18:18:28.000000000 +0000 @@ -98,7 +98,7 @@ if [ -n "$npools" ] then # Because we have found extra pool(s) here, which wasn't - # found 'normaly', we need to force USE_DISK_BY_ID to + # found 'normally', we need to force USE_DISK_BY_ID to # make sure we're able to actually import it/them later. USE_DISK_BY_ID='yes' @@ -148,7 +148,7 @@ available_pools="$apools" fi - # For backwards compability, make sure that ZPOOL_IMPORT_PATH is set + # For backwards compatibility, make sure that ZPOOL_IMPORT_PATH is set # to something we can use later with the real import(s). We want to # make sure we find all by* dirs, BUT by-vdev should be first (if it # exists). @@ -157,7 +157,7 @@ local dirs dirs="$(for dir in $(echo /dev/disk/by-*) do - # Ignore by-vdev here - we wan't it first! + # Ignore by-vdev here - we want it first! echo "$dir" | grep -q /by-vdev && continue [ ! -d "$dir" ] && continue @@ -217,7 +217,7 @@ # Import by using ZPOOL_IMPORT_PATH (either set above or in # the config file) _or_ with the 'built in' default search - # paths. This is the prefered way. + # paths. This is the preferred way. "$ZPOOL" import -N ${ZPOOL_IMPORT_OPTS} "$pool" 2> /dev/null r="$?" 
; RET=$((RET + r)) if [ "$r" -eq 0 ] diff -Nru zfs-linux-0.7.0-rc2/etc/init.d/zfs-mount.in zfs-linux-0.7.0-rc3/etc/init.d/zfs-mount.in --- zfs-linux-0.7.0-rc2/etc/init.d/zfs-mount.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/etc/init.d/zfs-mount.in 2017-01-20 18:18:28.000000000 +0000 @@ -73,7 +73,7 @@ zfs_action "Mounting ZFS filesystem(s)" \ "$ZFS" mount -a$verbose$overlay "$MOUNT_EXTRA_OPTIONS" - # Require each volume/filesytem to have 'noauto' and no fsck + # Require each volume/filesystem to have 'noauto' and no fsck # option. This shouldn't really be necessary, as long as one # can get zfs-import to run sufficiently early on in the boot # process - before local mounts. This is just here in case/if diff -Nru zfs-linux-0.7.0-rc2/etc/systemd/system/zfs-share.service.in zfs-linux-0.7.0-rc3/etc/systemd/system/zfs-share.service.in --- zfs-linux-0.7.0-rc2/etc/systemd/system/zfs-share.service.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/etc/systemd/system/zfs-share.service.in 2017-01-20 18:18:28.000000000 +0000 @@ -9,7 +9,7 @@ [Service] Type=oneshot RemainAfterExit=yes -ExecStartPre=-@bindir@/rm -f /etc/dfs/sharetab +ExecStartPre=-/bin/rm -f /etc/dfs/sharetab ExecStart=@sbindir@/zfs share -a [Install] diff -Nru zfs-linux-0.7.0-rc2/.github/CONTRIBUTING.md zfs-linux-0.7.0-rc3/.github/CONTRIBUTING.md --- zfs-linux-0.7.0-rc2/.github/CONTRIBUTING.md 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/.github/CONTRIBUTING.md 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,152 @@ +# Contributing to ZFS on Linux +
+ +*First of all, thank you for taking the time to contribute!* + +By using the following guidelines, you can help us make ZFS on Linux even +better. + +## Table Of Contents +[What should I know before I get +started?](#what-should-i-know-before-i-get-started) + + * [Get ZFS](#get-zfs) + * [Debug ZFS](#debug-zfs) + * [Where can I ask for help?](#where-can-i-ask-for-help) + +[How Can I Contribute?](#how-can-i-contribute) + + * [Reporting Bugs](#reporting-bugs) + * [Suggesting Enhancements](#suggesting-enhancements) + * [Pull Requests](#pull-requests) + * [Testing](#testing) + +[Style Guides](#style-guides) + + * [Coding Conventions](#coding-conventions) + +Helpful resources + + * [ZFS on Linux wiki](https://github.com/zfsonlinux/zfs/wiki) + * [OpenZFS Documentation](http://open-zfs.org/wiki/Developer_resources) + +## What should I know before I get started? + +### Get ZFS +You can build zfs packages by following [these +instructions](https://github.com/zfsonlinux/zfs/wiki/Building-ZFS), +or install stable packages from [your distribution's +repository](https://github.com/zfsonlinux/zfs/wiki/Getting-Started). + +### Debug ZFS +A variety of methods and tools are available to aid ZFS developers. +It's strongly recommended that you set the `--enable-debug` +configure option when developing a patch. This enables additional correctness +checks and all the ASSERTs, which help catch potential issues quickly. + +In addition, there are numerous utilities and debugging files which +provide visibility into the inner workings of ZFS. The most useful +of these tools are discussed in detail on the [debugging ZFS wiki +page](https://github.com/zfsonlinux/zfs/wiki/Debugging). + +### Where can I ask for help? +The [mailing list](https://github.com/zfsonlinux/zfs/wiki/Mailing-Lists) +is the best place to ask for help. + +## How Can I Contribute? + +### Reporting Bugs +*Please* contact us via the [mailing +list](https://github.com/zfsonlinux/zfs/wiki/Mailing-Lists) if you aren't +certain that you are experiencing a bug. + +If you run into an issue, please search our [issue +tracker](https://github.com/zfsonlinux/zfs/issues) *first* to ensure the +issue hasn't been reported before. Open a new issue only if you haven't +found anything similar to your issue. + +You can open a new issue and search existing issues using the public [issue +tracker](https://github.com/zfsonlinux/zfs/issues). + +#### When opening a new issue, please include the following information at the top of the issue: +* What distribution (with version) you are using. +* The spl and zfs versions you are using, installation method (repository +or manual compilation). +* Describe the issue you are experiencing. +* Describe how to reproduce the issue. +* Include any warnings/errors/backtraces from the system logs. + +When a new issue is opened, it is not uncommon for developers to request +additional information. + +In general, the more detail you share about a problem, the quicker a +developer can resolve it. For example, providing a simple test case is always +exceptionally helpful. + +Be prepared to work with the developers investigating your issue. Your +assistance is crucial in providing a quick solution. They may ask for +information like: + +* Your pool configuration as reported by `zdb` or `zpool status`. +* Your hardware configuration, such as + * Number of CPUs. + * Amount of memory. + * Whether your system has ECC memory. + * Whether it is running under a VMM/Hypervisor. + * Kernel version. 
+ * Values of the spl/zfs module parameters. +* Stack traces which may be logged to `dmesg`. + +### Suggesting Enhancements +ZFS on Linux is a widely deployed production filesystem which is under +active development. The team's primary focus is on fixing known issues, +improving performance, and adding compelling new features. + +You can view the list of proposed features +by filtering the issue tracker by the ["Feature" +label](https://github.com/zfsonlinux/zfs/issues?q=is%3Aopen+is%3Aissue+label%3AFeature). +If you have an idea for a feature, first check this list. If your idea already +appears, add a +1 to the topmost comment; this helps us gauge interest +in that feature. + +Otherwise, open a new issue and describe your proposed feature. Why is this +feature needed? What problem does it solve? + +### Pull Requests +* All pull requests must be based on the current master branch and apply +without conflicts. +* Please attempt to limit pull requests to a single commit which resolves +one specific issue. +* When updating a pull request, squash multiple commits by performing a +[rebase](https://git-scm.com/docs/git-rebase) (squash). +* For large pull requests consider structuring your changes as a stack of +logically independent patches which build on each other. This makes large +changes easier to review and approve, which speeds up the merging process. +* Try to keep pull requests simple. Simple code with comments is much easier +to review and approve. +* Test cases should be provided when appropriate. +* If your pull request improves performance, please include some benchmarks. +* The pull request must pass all required [ZFS +Buildbot](http://build.zfsonlinux.org/) builders before +being accepted. If you are experiencing intermittent TEST +builder failures, you may be experiencing a [test suite +issue](https://github.com/zfsonlinux/zfs/issues?q=is%3Aissue+is%3Aopen+label%3A%22Test+Suite%22). +* All proposed changes must be approved by a ZFS on Linux organization member. + +### Testing +All help is appreciated! If you're in a position to run the latest code, +consider helping us by reporting any functional problems, performance +regressions, or other suspected issues. By running the latest code against a wide +range of realistic workloads, configurations, and architectures, we're better +able to quickly identify and resolve potential issues. + +Users can also run the [ZFS Test +Suite](https://github.com/zfsonlinux/zfs/tree/master/tests) on their systems +to verify ZFS is behaving as intended. + +## Style Guides + +### Coding Conventions +We currently use [C Style and Coding Standards for +SunOS](http://www.cis.upenn.edu/%7Elee/06cse480/data/cstyle.ms.pdf) as our +coding convention. 
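The SunOS cstyle convention named above is easiest to see in miniature. The fragment below is a hypothetical sketch, not code from this tree; the function name and logic are invented purely to illustrate the layout rules that the `cstyle -cpP` check added in this release enforces: tab indentation, the return type on a line of its own above the function name, and parenthesized return expressions.

/*
 * Hypothetical example, not part of the ZFS sources: it only
 * demonstrates the cstyle layout rules referenced above.
 */
static int
example_clamp(int val, int lo, int hi)
{
	/* Out-of-range values are clamped to the nearest bound. */
	if (val < lo)
		return (lo);
	if (val > hi)
		return (hi);

	return (val);
}

A fragment in this shape should pass the `make cstyle` target cleanly; the same rules apply to any new code submitted in a pull request.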
diff -Nru zfs-linux-0.7.0-rc2/.github/ISSUE_TEMPLATE.md zfs-linux-0.7.0-rc3/.github/ISSUE_TEMPLATE.md --- zfs-linux-0.7.0-rc2/.github/ISSUE_TEMPLATE.md 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/.github/ISSUE_TEMPLATE.md 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,46 @@ + + +### System information + +Type | Version/Name + --- | --- +Distribution Name | +Distribution Version | +Linux Kernel | +Architecture | +ZFS Version | +SPL Version | + + +### Describe the problem you're observing + +### Describe how to reproduce the problem + +### Include any warning/errors/backtraces from the system logs + diff -Nru zfs-linux-0.7.0-rc2/.github/PULL_REQUEST_TEMPLATE.md zfs-linux-0.7.0-rc3/.github/PULL_REQUEST_TEMPLATE.md --- zfs-linux-0.7.0-rc2/.github/PULL_REQUEST_TEMPLATE.md 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/.github/PULL_REQUEST_TEMPLATE.md 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,32 @@ + + +### Description + + +### Motivation and Context + + + +### How Has This Been Tested? + + + + + +### Types of changes + +- [ ] Bug fix (non-breaking change which fixes an issue) +- [ ] New feature (non-breaking change which adds functionality) +- [ ] Performance enhancement (non-breaking change which improves efficiency) +- [ ] Code cleanup (non-breaking change which makes code smaller or more readable) +- [ ] Breaking change (fix or feature that would cause existing functionality to change) + +### Checklist: + + +- [ ] My code follows the ZFS on Linux code style requirements. +- [ ] I have updated the documentation accordingly. +- [ ] I have read the **CONTRIBUTING** document. +- [ ] I have added tests to cover my changes. +- [ ] All new and existing tests passed. +- [ ] Change has been approved by a ZFS on Linux member. diff -Nru zfs-linux-0.7.0-rc2/include/libzfs.h zfs-linux-0.7.0-rc3/include/libzfs.h --- zfs-linux-0.7.0-rc2/include/libzfs.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/libzfs.h 2017-01-20 18:18:28.000000000 +0000 @@ -64,6 +64,10 @@ */ #define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ +#define IMPORT_ORDER_PREFERRED_1 1 +#define IMPORT_ORDER_PREFERRED_2 2 +#define IMPORT_ORDER_SCAN_OFFSET 10 +#define IMPORT_ORDER_DEFAULT 100 #define DEFAULT_IMPORT_PATH_SIZE 9 extern char *zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE]; @@ -728,6 +732,7 @@ extern int zfs_append_partition(char *path, size_t max_len); extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen); extern int zfs_strcmp_pathname(char *name, char *cmp_name, int wholedisk); +extern int zfs_path_order(char *path, int *order); /* * Mount support functions. 
@@ -758,6 +763,7 @@ extern int zfs_unshareall_nfs(zfs_handle_t *); extern int zfs_unshareall_smb(zfs_handle_t *); extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); +extern int zfs_unshareall_bytype(zfs_handle_t *, const char *, const char *); extern int zfs_unshareall(zfs_handle_t *); extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); @@ -832,6 +838,7 @@ extern boolean_t is_mpath_whole_disk(const char *); extern void update_vdev_config_dev_strs(nvlist_t *); extern char *zfs_strip_partition(char *); +extern char *zfs_strip_partition_path(char *); #ifdef HAVE_LIBUDEV struct udev_device; diff -Nru zfs-linux-0.7.0-rc2/include/linux/blkdev_compat.h zfs-linux-0.7.0-rc3/include/linux/blkdev_compat.h --- zfs-linux-0.7.0-rc2/include/linux/blkdev_compat.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/linux/blkdev_compat.h 2017-01-20 18:18:28.000000000 +0000 @@ -303,19 +303,58 @@ #endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */ #endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */ +#ifndef HAVE_BIO_SET_OP_ATTRS /* - * 2.6.37 API change - * The WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags have been - * introduced as a replacement for WRITE_BARRIER. This was done to - * allow richer semantics to be expressed to the block layer. It is - * the block layers responsibility to choose the correct way to - * implement these semantics. + * Kernels without bio_set_op_attrs use bi_rw for the bio flags. */ -#ifdef WRITE_FLUSH_FUA -#define VDEV_WRITE_FLUSH_FUA WRITE_FLUSH_FUA +static inline void +bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags) +{ + bio->bi_rw |= rw | flags; +} +#endif + +/* + * bio_set_flush - Set the appropriate flags in a bio to guarantee + * data are on non-volatile media on completion. + * + * 2.6.X - 2.6.36 API, + * WRITE_BARRIER - Tells the block layer to commit all previously submitted + * writes to stable storage before this one is started and that the current + * write is on stable storage upon completion. Also prevents reordering + * on both sides of the current operation. + * + * 2.6.37 - 4.8 API, + * Introduce WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags as a + * replacement for WRITE_BARRIER to allow expressing richer semantics + * to the block layer. It's up to the block layer to implement the + * semantics correctly. Use the WRITE_FLUSH_FUA flag combination. + * + * 4.8 - 4.9 API, + * REQ_FLUSH was renamed to REQ_PREFLUSH. For consistency with previous + * ZoL releases, prefer the WRITE_FLUSH_FUA flag set if it's available. + * + * 4.10 API, + * The read/write flags and their modifiers, including WRITE_FLUSH, + * WRITE_FUA and WRITE_FLUSH_FUA were removed from fs.h in + * torvalds/linux@70fd7614 and replaced by direct flag modification + * of the REQ_ flags in bio->bi_opf. Use REQ_PREFLUSH. + */ +static inline void +bio_set_flush(struct bio *bio) +{ +#if defined(WRITE_BARRIER) /* < 2.6.37 */ + bio_set_op_attrs(bio, 0, WRITE_BARRIER); +#elif defined(WRITE_FLUSH_FUA) /* >= 2.6.37 and <= 4.9 */ + bio_set_op_attrs(bio, 0, WRITE_FLUSH_FUA); +#elif defined(REQ_PREFLUSH) /* >= 4.10 */ + bio_set_op_attrs(bio, 0, REQ_PREFLUSH); #else -#define VDEV_WRITE_FLUSH_FUA WRITE_BARRIER +#error "Allowing the build will cause bio_set_flush requests to be ignored." 
+ "Please file an issue report at: " + "https://github.com/zfsonlinux/zfs/issues/new" #endif +} /* * 4.8 - 4.x API, diff -Nru zfs-linux-0.7.0-rc2/include/linux/simd_x86.h zfs-linux-0.7.0-rc3/include/linux/simd_x86.h --- zfs-linux-0.7.0-rc2/include/linux/simd_x86.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/linux/simd_x86.h 2017-01-20 18:18:28.000000000 +0000 @@ -208,8 +208,8 @@ uint32_t eax, edx; /* xgetbv - instruction byte code */ __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0" - : "=a" (eax), "=d" (edx) - : "c" (index)); + : "=a" (eax), "=d" (edx) + : "c" (index)); return ((((uint64_t)edx)<<32) | (uint64_t)eax); } @@ -229,13 +229,13 @@ * are passed by value. */ __cpuid_count(desc->leaf, desc->subleaf, - r[EAX], r[EBX], r[ECX], r[EDX]); + r[EAX], r[EBX], r[ECX], r[EDX]); return ((r[desc->reg] & desc->flag) == desc->flag); } return (B_FALSE); } -#define CPUID_FEATURE_CHECK(name, id) \ +#define CPUID_FEATURE_CHECK(name, id) \ static inline boolean_t \ __cpuid_has_ ## name(void) \ { \ diff -Nru zfs-linux-0.7.0-rc2/include/linux/vfs_compat.h zfs-linux-0.7.0-rc3/include/linux/vfs_compat.h --- zfs-linux-0.7.0-rc2/include/linux/vfs_compat.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/linux/vfs_compat.h 2017-01-20 18:18:28.000000000 +0000 @@ -205,17 +205,9 @@ #include #if defined(HAVE_POSIX_ACL_RELEASE) && !defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) - #define zpl_posix_acl_release(arg) posix_acl_release(arg) -#define zpl_set_cached_acl(ip, ty, n) set_cached_acl(ip, ty, n) -#define zpl_forget_cached_acl(ip, ty) forget_cached_acl(ip, ty) - #else - -static inline void -zpl_posix_acl_free(void *arg) { - kfree(arg); -} +void zpl_posix_acl_release_impl(struct posix_acl *); static inline void zpl_posix_acl_release(struct posix_acl *acl) @@ -223,12 +215,15 @@ if ((acl == NULL) || (acl == ACL_NOT_CACHED)) return; - if (atomic_dec_and_test(&acl->a_refcount)) { - taskq_dispatch_delay(system_taskq, zpl_posix_acl_free, acl, - TQ_SLEEP, ddi_get_lbolt() + 60*HZ); - } + if (atomic_dec_and_test(&acl->a_refcount)) + zpl_posix_acl_release_impl(acl); } +#endif /* HAVE_POSIX_ACL_RELEASE */ +#ifdef HAVE_SET_CACHED_ACL_USABLE +#define zpl_set_cached_acl(ip, ty, n) set_cached_acl(ip, ty, n) +#define zpl_forget_cached_acl(ip, ty) forget_cached_acl(ip, ty) +#else static inline void zpl_set_cached_acl(struct inode *ip, int type, struct posix_acl *newer) { struct posix_acl *older = NULL; @@ -258,7 +253,7 @@ zpl_forget_cached_acl(struct inode *ip, int type) { zpl_set_cached_acl(ip, type, (struct posix_acl *)ACL_NOT_CACHED); } -#endif /* HAVE_POSIX_ACL_RELEASE */ +#endif /* HAVE_SET_CACHED_ACL_USABLE */ #ifndef HAVE___POSIX_ACL_CHMOD #ifdef HAVE_POSIX_ACL_CHMOD diff -Nru zfs-linux-0.7.0-rc2/include/sys/abd.h zfs-linux-0.7.0-rc3/include/sys/abd.h --- zfs-linux-0.7.0-rc2/include/sys/abd.h 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/abd.h 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _ABD_H +#define _ABD_H + +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum abd_flags { + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ + ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */ + ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ + ABD_FLAG_MULTI_CHUNK = 1 << 4 /* pages split over multiple chunks */ +} abd_flags_t; + +typedef struct abd { + abd_flags_t abd_flags; + uint_t abd_size; /* excludes scattered abd_offset */ + struct abd *abd_parent; + refcount_t abd_children; + union { + struct abd_scatter { + uint_t abd_offset; + uint_t abd_nents; + struct scatterlist *abd_sgl; + } abd_scatter; + struct abd_linear { + void *abd_buf; + } abd_linear; + } abd_u; +} abd_t; + +typedef int abd_iter_func_t(void *buf, size_t len, void *private); +typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private); + +extern int zfs_abd_scatter_enabled; + +static inline boolean_t +abd_is_linear(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0); +} + +/* + * Allocations and deallocations + */ + +abd_t *abd_alloc(size_t, boolean_t); +abd_t *abd_alloc_linear(size_t, boolean_t); +abd_t *abd_alloc_for_io(size_t, boolean_t); +abd_t *abd_alloc_sametype(abd_t *, size_t); +void abd_free(abd_t *); +abd_t *abd_get_offset(abd_t *, size_t); +abd_t *abd_get_offset_size(abd_t *, size_t, size_t); +abd_t *abd_get_from_buf(void *, size_t); +void abd_put(abd_t *); + +/* + * Conversion to and from a normal buffer + */ + +void *abd_to_buf(abd_t *); +void *abd_borrow_buf(abd_t *, size_t); +void *abd_borrow_buf_copy(abd_t *, size_t); +void abd_return_buf(abd_t *, void *, size_t); +void abd_return_buf_copy(abd_t *, void *, size_t); +void abd_take_ownership_of_buf(abd_t *, boolean_t); +void abd_release_ownership_of_buf(abd_t *); + +/* + * ABD operations + */ + +int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); +int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, + abd_iter_func2_t *, void *); +void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); +void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); +void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); +int abd_cmp(abd_t *, abd_t *); +int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); +void abd_zero_off(abd_t *, size_t, size_t); + +#if defined(_KERNEL) && defined(HAVE_SPL) +unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, + size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +#endif + +void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)); +void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t 
tsize, void **c, + const unsigned *mul), + const unsigned *mul); + +/* + * Wrappers for calls with offsets of 0 + */ + +static inline void +abd_copy(abd_t *dabd, abd_t *sabd, size_t size) +{ + abd_copy_off(dabd, sabd, 0, 0, size); +} + +static inline void +abd_copy_from_buf(abd_t *abd, void *buf, size_t size) +{ + abd_copy_from_buf_off(abd, buf, 0, size); +} + +static inline void +abd_copy_to_buf(void* buf, abd_t *abd, size_t size) +{ + abd_copy_to_buf_off(buf, abd, 0, size); +} + +static inline int +abd_cmp_buf(abd_t *abd, void *buf, size_t size) +{ + return (abd_cmp_buf_off(abd, buf, 0, size)); +} + +static inline void +abd_zero(abd_t *abd, size_t size) +{ + abd_zero_off(abd, 0, size); +} + +/* + * Module lifecycle + */ + +void abd_init(void); +void abd_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff -Nru zfs-linux-0.7.0-rc2/include/sys/arc_impl.h zfs-linux-0.7.0-rc3/include/sys/arc_impl.h --- zfs-linux-0.7.0-rc2/include/sys/arc_impl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/arc_impl.h 2017-01-20 18:18:28.000000000 +0000 @@ -54,7 +54,7 @@ * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru - * that cannot be freed. Generally, they will aquire a DVA + * that cannot be freed. Generally, they will acquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second @@ -166,7 +166,7 @@ refcount_t b_refcnt; arc_callback_t *b_acb; - void *b_pdata; + abd_t *b_pabd; } l1arc_buf_hdr_t; typedef struct l2arc_dev { diff -Nru zfs-linux-0.7.0-rc2/include/sys/ddt.h zfs-linux-0.7.0-rc3/include/sys/ddt.h --- zfs-linux-0.7.0-rc2/include/sys/ddt.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/ddt.h 2017-01-20 18:18:28.000000000 +0000 @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #ifndef _SYS_DDT_H @@ -35,6 +36,8 @@ extern "C" { #endif +struct abd; + /* * On-disk DDT formats, in the desired search order (newest version first). 
*/ @@ -108,7 +111,7 @@ ddt_key_t dde_key; ddt_phys_t dde_phys[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - void *dde_repair_data; + struct abd *dde_repair_abd; enum ddt_type dde_type; enum ddt_class dde_class; uint8_t dde_loading; diff -Nru zfs-linux-0.7.0-rc2/include/sys/dmu.h zfs-linux-0.7.0-rc3/include/sys/dmu.h --- zfs-linux-0.7.0-rc2/include/sys/dmu.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/dmu.h 2017-01-20 18:18:28.000000000 +0000 @@ -674,10 +674,17 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); +void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); +void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, + const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); @@ -727,8 +734,12 @@ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); +int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, + uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); +void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL diff -Nru zfs-linux-0.7.0-rc2/include/sys/dmu_objset.h zfs-linux-0.7.0-rc3/include/sys/dmu_objset.h --- zfs-linux-0.7.0-rc2/include/sys/dmu_objset.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/dmu_objset.h 2017-01-20 18:18:28.000000000 +0000 @@ -184,17 +184,10 @@ int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); boolean_t dmu_objset_userobjused_enabled(objset_t *os); +boolean_t dmu_objset_userobjspace_upgradable(objset_t *os); void dmu_objset_userobjspace_upgrade(objset_t *os); boolean_t dmu_objset_userobjspace_present(objset_t *os); -static inline boolean_t dmu_objset_userobjspace_upgradable(objset_t *os) -{ - return (dmu_objset_type(os) == DMU_OST_ZFS && - !dmu_objset_is_snapshot(os) && - dmu_objset_userobjused_enabled(os) && - !dmu_objset_userobjspace_present(os)); -} - int dmu_fsname(const char *snapname, char *buf); void dmu_objset_evict_done(objset_t *os); diff -Nru zfs-linux-0.7.0-rc2/include/sys/dmu_tx.h zfs-linux-0.7.0-rc3/include/sys/dmu_tx.h --- zfs-linux-0.7.0-rc2/include/sys/dmu_tx.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/dmu_tx.h 2017-01-20 18:18:28.000000000 +0000 @@ -171,7 +171,7 @@ dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); int dmu_tx_is_syncing(dmu_tx_t *tx); int dmu_tx_private_ok(dmu_tx_t *tx); -void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); +void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, dnode_t *dn); void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); void dmu_tx_dirty_buf(dmu_tx_t *tx, struct 
dmu_buf_impl *db); int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); diff -Nru zfs-linux-0.7.0-rc2/include/sys/dnode.h zfs-linux-0.7.0-rc3/include/sys/dnode.h --- zfs-linux-0.7.0-rc2/include/sys/dnode.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/dnode.h 2017-01-20 18:18:28.000000000 +0000 @@ -266,7 +266,7 @@ * duplicate entries, we order the dbufs by an arbitrary value - * their address in memory. This means that dn_dbufs cannot be used to * directly look up a dbuf. Instead, callers must use avl_walk, have - * a reference to the dbuf, or look up a non-existant node with + * a reference to the dbuf, or look up a non-existent node with * db_state = DB_SEARCH (see dbuf_free_range for an example). */ avl_tree_t dn_dbufs; diff -Nru zfs-linux-0.7.0-rc2/include/sys/dsl_dataset.h zfs-linux-0.7.0-rc3/include/sys/dsl_dataset.h --- zfs-linux-0.7.0-rc2/include/sys/dsl_dataset.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/dsl_dataset.h 2017-01-20 18:18:28.000000000 +0000 @@ -86,13 +86,6 @@ /* * This field is present (with value=0) if this dataset may contain large - * blocks (>128KB). If it is present, then this dataset - * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature. - */ -#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks" - -/* - * This field is present (with value=0) if this dataset may contain large * dnodes (>512B). If it is present, then this dataset is counted in the * refcount of the SPA_FEATURE_LARGE_DNODE feature. */ @@ -278,7 +271,6 @@ minor_t cleanup_minor, const char *htag); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); -void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); diff -Nru zfs-linux-0.7.0-rc2/include/sys/efi_partition.h zfs-linux-0.7.0-rc3/include/sys/efi_partition.h --- zfs-linux-0.7.0-rc2/include/sys/efi_partition.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/efi_partition.h 2017-01-20 18:18:28.000000000 +0000 @@ -272,7 +272,7 @@ #define EFI_FREEDESKTOP_BOOT { 0xbc13c2ff, 0x59e6, 0x4262, 0xa3, 0x52, \ { 0xb2, 0x75, 0xfd, 0x6f, 0x71, 0x72 } } -/* minimum # of bytes for partition table entires, per EFI spec */ +/* minimum # of bytes for partition table entries, per EFI spec */ #define EFI_MIN_ARRAY_SIZE (16 * 1024) #define EFI_PART_NAME_LEN 36 diff -Nru zfs-linux-0.7.0-rc2/include/sys/fs/zfs.h zfs-linux-0.7.0-rc3/include/sys/fs/zfs.h --- zfs-linux-0.7.0-rc2/include/sys/fs/zfs.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/fs/zfs.h 2017-01-20 18:18:28.000000000 +0000 @@ -82,7 +82,8 @@ * the property table in module/zcommon/zfs_prop.c. */ typedef enum { - ZFS_PROP_TYPE, + ZFS_PROP_BAD = -1, + ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, @@ -894,7 +895,7 @@ * is passed between kernel and userland as an nvlist uint64 array. 
*/ typedef struct ddt_object { - uint64_t ddo_count; /* number of elments in ddt */ + uint64_t ddo_count; /* number of elements in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; @@ -917,6 +918,7 @@ #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" +#define ZFS_SHARETAB "/etc/dfs/sharetab" /* general zvol path */ #define ZVOL_DIR "/dev" diff -Nru zfs-linux-0.7.0-rc2/include/sys/Makefile.am zfs-linux-0.7.0-rc3/include/sys/Makefile.am --- zfs-linux-0.7.0-rc2/include/sys/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -1,6 +1,7 @@ SUBDIRS = fm fs crypto sysevent COMMON_H = \ + $(top_srcdir)/include/sys/abd.h \ $(top_srcdir)/include/sys/arc.h \ $(top_srcdir)/include/sys/arc_impl.h \ $(top_srcdir)/include/sys/avl.h \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/metaslab.h zfs-linux-0.7.0-rc3/include/sys/metaslab.h --- zfs-linux-0.7.0-rc2/include/sys/metaslab.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/metaslab.h 2017-01-20 18:18:28.000000000 +0000 @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -36,10 +36,12 @@ extern "C" { #endif + typedef struct metaslab_ops { - uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size); + uint64_t (*msop_alloc)(metaslab_t *, uint64_t); } metaslab_ops_t; + extern metaslab_ops_t *zfs_metaslab_ops; int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, @@ -64,13 +66,18 @@ #define METASLAB_FASTWRITE 0x20 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *); + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); +void metaslab_alloc_trace_init(void); +void metaslab_alloc_trace_fini(void); +void metaslab_trace_init(zio_alloc_list_t *); +void metaslab_trace_fini(zio_alloc_list_t *); + metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); void metaslab_class_destroy(metaslab_class_t *); int metaslab_class_validate(metaslab_class_t *); diff -Nru zfs-linux-0.7.0-rc2/include/sys/metaslab_impl.h zfs-linux-0.7.0-rc3/include/sys/metaslab_impl.h --- zfs-linux-0.7.0-rc2/include/sys/metaslab_impl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/metaslab_impl.h 2017-01-20 18:18:28.000000000 +0000 @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -42,6 +42,94 @@ #endif /* + * Metaslab allocation tracing record. + */ +typedef struct metaslab_alloc_trace { + list_node_t mat_list_node; + metaslab_group_t *mat_mg; + metaslab_t *mat_msp; + uint64_t mat_size; + uint64_t mat_weight; + uint32_t mat_dva_id; + uint64_t mat_offset; +} metaslab_alloc_trace_t; + +/* + * Used by the metaslab allocation tracing facility to indicate + * error conditions. 
These errors are stored to the offset member + * of the metaslab_alloc_trace_t record and displayed by mdb. + */ +typedef enum trace_alloc_type { + TRACE_ALLOC_FAILURE = -1ULL, + TRACE_TOO_SMALL = -2ULL, + TRACE_FORCE_GANG = -3ULL, + TRACE_NOT_ALLOCATABLE = -4ULL, + TRACE_GROUP_FAILURE = -5ULL, + TRACE_ENOSPC = -6ULL, + TRACE_CONDENSING = -7ULL, + TRACE_VDEV_ERROR = -8ULL +} trace_alloc_type_t; + +#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) +#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) +#define METASLAB_WEIGHT_TYPE (1ULL << 61) +#define METASLAB_ACTIVE_MASK \ + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) + +/* + * The metaslab weight is used to encode the amount of free space in a + * metaslab, such that the "best" metaslab appears first when sorting the + * metaslabs by weight. The weight (and therefore the "best" metaslab) can + * be determined in two different ways: by computing a weighted sum of all + * the free space in the metaslab (a space based weight) or by counting only + * the free segments of the largest size (a segment based weight). We prefer + * the segment based weight because it reflects how the free space is + * comprised, but we cannot always use it -- legacy pools do not have the + * space map histogram information necessary to determine the largest + * contiguous regions. Pools that have the space map histogram determine + * the segment weight by looking at each bucket in the histogram and + * determining the free space whose size in bytes is in the range: + * [2^i, 2^(i+1)) + * We then encode the largest index, i, that contains regions into the + * segment-weighted value. + * + * Space-based weight: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PS1| weighted-free space | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * space - the fragmentation-weighted space + * + * Segment-based weight: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PS0| idx| count of segments in region | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * idx - index for the highest bucket in the histogram + * count - number of segments in the specified bucket + */ +#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2) +#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x) + +#define WEIGHT_IS_SPACEBASED(weight) \ + ((weight) == 0 || BF64_GET((weight), 61, 1)) +#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1) + +/* + * These macros are only applicable to segment-based weighting. + */ +#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6) +#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x) +#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55) +#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x) + +/* * A metaslab class encompasses a category of allocatable top-level vdevs. * Each top-level vdev is associated with a metaslab group which defines * the allocatable region for that vdev. Examples of these categories include @@ -104,7 +192,7 @@ /* * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) - * of a top-level vdev. They are linked togther to form a circular linked + * of a top-level vdev. They are linked together to form a circular linked * list and can belong to only one metaslab class. 
Metaslab groups may become * ineligible for allocations for a number of reasons such as limited free * space, fragmentation, or going offline. When this happens the allocator will @@ -220,7 +308,6 @@ kmutex_t ms_lock; kcondvar_t ms_load_cv; space_map_t *ms_sm; - metaslab_ops_t *ms_ops; uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; @@ -233,12 +320,27 @@ boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; + + /* + * We must hold both ms_lock and ms_group->mg_lock in order to + * modify ms_loaded. + */ boolean_t ms_loaded; boolean_t ms_loading; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ - uint64_t ms_access_txg; + uint64_t ms_activation_weight; /* activation weight */ + + /* + * Track of whenever a metaslab is selected for loading or allocation. + * We use this value to determine how long the metaslab should + * stay cached. + */ + uint64_t ms_selected_txg; + + uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ + uint64_t ms_max_size; /* maximum allocatable size */ /* * The metaslab block allocators can optionally use a size-ordered diff -Nru zfs-linux-0.7.0-rc2/include/sys/sa.h zfs-linux-0.7.0-rc3/include/sys/sa.h --- zfs-linux-0.7.0-rc2/include/sys/sa.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/sa.h 2017-01-20 18:18:28.000000000 +0000 @@ -134,8 +134,6 @@ int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); int sa_size(sa_handle_t *, sa_attr_type_t, int *); -int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, - uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); void sa_object_info(sa_handle_t *, dmu_object_info_t *); void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); void *sa_get_userdata(sa_handle_t *); diff -Nru zfs-linux-0.7.0-rc2/include/sys/spa.h zfs-linux-0.7.0-rc3/include/sys/spa.h --- zfs-linux-0.7.0-rc2/include/sys/spa.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/spa.h 2017-01-20 18:18:28.000000000 +0000 @@ -416,15 +416,17 @@ #define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) +#define BP_IS_METADATA(bp) \ + (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[2])) -#define BP_GET_UCSIZE(bp) \ - ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ - BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) +#define BP_GET_UCSIZE(bp) \ + (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ @@ -569,8 +571,7 @@ } #define BP_GET_BUFC_TYPE(bp) \ - (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA) + (BP_IS_METADATA(bp) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, @@ -585,7 +586,6 @@ size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, nvlist_t *zplprops); -extern int spa_import_rootpool(char *devpath, char *devid); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); @@ -725,6 +725,13 @@ TXG_STATE_COMMITTED = 5, } txg_state_t; +typedef struct txg_stat { + vdev_stat_t vs1; + vdev_stat_t vs2; + uint64_t txg; + uint64_t ndirty; +} txg_stat_t; + extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, @@ -732,8 +739,9 @@ extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); extern int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time); -extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, - uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty); +extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t, + struct dsl_pool *); +extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *); extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); /* Pool configuration locks */ diff -Nru zfs-linux-0.7.0-rc2/include/sys/spa_impl.h zfs-linux-0.7.0-rc3/include/sys/spa_impl.h --- zfs-linux-0.7.0-rc2/include/sys/spa_impl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/spa_impl.h 2017-01-20 18:18:28.000000000 +0000 @@ -120,7 +120,8 @@ typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ - AVZ_ACTION_REBUILD /* Populate the new AVZ, see spa_avz_rebuild */ + AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */ + AVZ_ACTION_INITIALIZE } spa_avz_action_t; struct spa { @@ -239,6 +240,7 @@ uint64_t spa_autoexpand; /* lun expansion on/off */ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ uint64_t spa_ddt_stat_object; /* DDT statistics */ + uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_ditto; /* dedup ditto threshold */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ @@ -269,7 +271,7 @@ uint64_t spa_errata; /* errata issues detected */ spa_stats_t spa_stats; /* assorted spa statistics */ hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ - taskq_t *spa_zvol_taskq; /* Taskq for minor managment */ + taskq_t *spa_zvol_taskq; /* Taskq for minor management */ /* * spa_refcount & spa_config_lock must be the last elements diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_acl.h zfs-linux-0.7.0-rc3/include/sys/trace_acl.h --- zfs-linux-0.7.0-rc2/include/sys/trace_acl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_acl.h 2017-01-20 18:18:28.000000000 +0000 @@ -42,7 +42,7 @@ * zfs_ace_hdr_t *, ..., * uint32_t, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_ace_class, TP_PROTO(znode_t *zn, zfs_ace_hdr_t *ace, uint32_t mask_matched), TP_ARGS(zn, ace, mask_matched), @@ -136,6 +136,7 @@ __entry->z_type, __entry->z_flags, __entry->z_access_mask, __entry->mask_matched) ); +/* END CSTYLED */ #define DEFINE_ACE_EVENT(name) \ DEFINE_EVENT(zfs_ace_class, name, \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_arc.h zfs-linux-0.7.0-rc3/include/sys/trace_arc.h --- zfs-linux-0.7.0-rc2/include/sys/trace_arc.h 
2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_arc.h 2017-01-20 18:18:28.000000000 +0000 @@ -42,7 +42,7 @@ * DTRACE_PROBE1(..., * arc_buf_hdr_t *, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, TP_PROTO(arc_buf_hdr_t *ab), TP_ARGS(ab), @@ -95,6 +95,7 @@ __entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits, __entry->hdr_refcount) ); +/* END CSTYLED */ #define DEFINE_ARC_BUF_HDR_EVENT(name) \ DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \ @@ -117,7 +118,7 @@ * vdev_t *, ..., * zio_t *, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_l2arc_rw_class, TP_PROTO(vdev_t *vd, zio_t *zio), TP_ARGS(vd, zio), @@ -137,6 +138,7 @@ ZIO_TP_PRINTK_FMT, __entry->vdev_id, __entry->vdev_guid, __entry->vdev_state, ZIO_TP_PRINTK_ARGS) ); +/* END CSTYLED */ #define DEFINE_L2ARC_RW_EVENT(name) \ DEFINE_EVENT(zfs_l2arc_rw_class, name, \ @@ -153,7 +155,7 @@ * zio_t *, ..., * l2arc_write_callback_t *, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_l2arc_iodone_class, TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), TP_ARGS(zio, cb), @@ -161,6 +163,7 @@ TP_fast_assign(ZIO_TP_FAST_ASSIGN), TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS) ); +/* END CSTYLED */ #define DEFINE_L2ARC_IODONE_EVENT(name) \ DEFINE_EVENT(zfs_l2arc_iodone_class, name, \ @@ -178,7 +181,7 @@ * uint64_t, * const zbookmark_phys_t *); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_arc_miss_class, TP_PROTO(arc_buf_hdr_t *hdr, const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), @@ -272,6 +275,7 @@ __entry->bp_lsize, __entry->zb_objset, __entry->zb_object, __entry->zb_level, __entry->zb_blkid) ); +/* END CSTYLED */ #define DEFINE_ARC_MISS_EVENT(name) \ DEFINE_EVENT(zfs_arc_miss_class, name, \ @@ -289,7 +293,7 @@ * uint64_t, ..., * boolean_t, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_l2arc_evict_class, TP_PROTO(l2arc_dev_t *dev, list_t *buflist, uint64_t taddr, boolean_t all), @@ -330,6 +334,7 @@ __entry->l2ad_end, __entry->l2ad_first, __entry->l2ad_writing, __entry->taddr, __entry->all) ); +/* END CSTYLED */ #define DEFINE_L2ARC_EVICT_EVENT(name) \ DEFINE_EVENT(zfs_l2arc_evict_class, name, \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_dbgmsg.h zfs-linux-0.7.0-rc3/include/sys/trace_dbgmsg.h --- zfs-linux-0.7.0-rc2/include/sys/trace_dbgmsg.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_dbgmsg.h 2017-01-20 18:18:28.000000000 +0000 @@ -45,7 +45,7 @@ * int, ..., * const char *, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_dprintf_class, TP_PROTO(const char *file, const char *function, int line, const char *msg), @@ -66,6 +66,7 @@ TP_printk("%s:%d:%s(): %s", __get_str(file), __entry->line, __get_str(function), __get_str(msg)) ); +/* END CSTYLED */ #define DEFINE_DPRINTF_EVENT(name) \ DEFINE_EVENT(zfs_dprintf_class, name, \ @@ -83,7 +84,7 @@ * int, ..., * uintptr_t, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_set_error_class, TP_PROTO(const char *file, const char *function, int line, uintptr_t error), @@ -104,6 +105,7 @@ TP_printk("%s:%d:%s(): error 0x%lx", __get_str(file), __entry->line, __get_str(function), __entry->error) ); +/* END CSTYLED */ #ifdef TP_CONDITION #define DEFINE_SET_ERROR_EVENT(name) \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_dmu.h zfs-linux-0.7.0-rc3/include/sys/trace_dmu.h --- zfs-linux-0.7.0-rc2/include/sys/trace_dmu.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_dmu.h 2017-01-20 18:18:28.000000000 +0000 @@ -41,7 +41,7 @@ * uint64_t, ..., * 
uint64_t, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_delay_mintime_class, TP_PROTO(dmu_tx_t *tx, uint64_t dirty, uint64_t min_tx_time), TP_ARGS(tx, dirty, min_tx_time), @@ -102,6 +102,7 @@ #endif __entry->dirty, __entry->min_tx_time) ); +/* END CSTYLED */ #define DEFINE_DELAY_MINTIME_EVENT(name) \ DEFINE_EVENT(zfs_delay_mintime_class, name, \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_dnode.h zfs-linux-0.7.0-rc3/include/sys/trace_dnode.h --- zfs-linux-0.7.0-rc2/include/sys/trace_dnode.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_dnode.h 2017-01-20 18:18:28.000000000 +0000 @@ -41,7 +41,7 @@ * int64_t, ..., * uint32_t, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_dnode_move_class, TP_PROTO(dnode_t *dn, int64_t refcount, uint32_t dbufs), TP_ARGS(dn, refcount, dbufs), @@ -102,6 +102,7 @@ __entry->dn_maxblkid, __entry->dn_tx_holds, __entry->dn_holds, __entry->dn_have_spill, __entry->refcount, __entry->dbufs) ); +/* END CSTYLED */ #define DEFINE_DNODE_MOVE_EVENT(name) \ DEFINE_EVENT(zfs_dnode_move_class, name, \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_multilist.h zfs-linux-0.7.0-rc3/include/sys/trace_multilist.h --- zfs-linux-0.7.0-rc2/include/sys/trace_multilist.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_multilist.h 2017-01-20 18:18:28.000000000 +0000 @@ -41,7 +41,7 @@ * unsigned int, ..., * void *, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class, TP_PROTO(multilist_t *ml, unsigned sublist_idx, void *obj), TP_ARGS(ml, sublist_idx, obj), @@ -60,6 +60,7 @@ TP_printk("ml { offset %ld numsublists %llu sublistidx %u } ", __entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx) ); +/* END CSTYLED */ #define DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \ DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_txg.h zfs-linux-0.7.0-rc3/include/sys/trace_txg.h --- zfs-linux-0.7.0-rc2/include/sys/trace_txg.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_txg.h 2017-01-20 18:18:28.000000000 +0000 @@ -40,7 +40,7 @@ * dsl_pool_t *, ..., * uint64_t, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_txg_class, TP_PROTO(dsl_pool_t *dp, uint64_t txg), TP_ARGS(dp, txg), @@ -52,6 +52,7 @@ ), TP_printk("txg %llu", __entry->txg) ); +/* END CSTYLED */ #define DEFINE_TXG_EVENT(name) \ DEFINE_EVENT(zfs_txg_class, name, \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_zil.h zfs-linux-0.7.0-rc3/include/sys/trace_zil.h --- zfs-linux-0.7.0-rc2/include/sys/trace_zil.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_zil.h 2017-01-20 18:18:28.000000000 +0000 @@ -39,7 +39,7 @@ * DTRACE_PROBE1(..., * zilog_t *, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_zil_class, TP_PROTO(zilog_t *zilog), TP_ARGS(zilog), @@ -111,6 +111,7 @@ __entry->zl_itx_list_sz, __entry->zl_cur_used, __entry->zl_replay_time, __entry->zl_replay_blks) ); +/* END CSTYLED */ #define DEFINE_ZIL_EVENT(name) \ DEFINE_EVENT(zfs_zil_class, name, \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_zio.h zfs-linux-0.7.0-rc3/include/sys/trace_zio.h --- zfs-linux-0.7.0-rc2/include/sys/trace_zio.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_zio.h 2017-01-20 18:18:28.000000000 +0000 @@ -36,6 +36,7 @@ #include #include /* For ZIO macros */ +/* BEGIN CSTYLED */ TRACE_EVENT(zfs_zio__delay__miss, TP_PROTO(zio_t *zio, hrtime_t now), TP_ARGS(zio, now), @@ -75,6 
+76,7 @@ TP_fast_assign(ZIO_TP_FAST_ASSIGN), TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS) ); +/* END CSTYLED */ #endif /* _TRACE_ZIO_H */ diff -Nru zfs-linux-0.7.0-rc2/include/sys/trace_zrlock.h zfs-linux-0.7.0-rc3/include/sys/trace_zrlock.h --- zfs-linux-0.7.0-rc2/include/sys/trace_zrlock.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/trace_zrlock.h 2017-01-20 18:18:28.000000000 +0000 @@ -40,7 +40,7 @@ * zrlock_t *, ..., * uint32_t, ...); */ - +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_zrlock_class, TP_PROTO(zrlock_t *zrl, uint32_t n), TP_ARGS(zrl, n), @@ -69,6 +69,7 @@ __entry->refcount, __entry->n) #endif ); +/* END_CSTYLED */ #define DEFINE_ZRLOCK_EVENT(name) \ DEFINE_EVENT(zfs_zrlock_class, name, \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/txg_impl.h zfs-linux-0.7.0-rc3/include/sys/txg_impl.h --- zfs-linux-0.7.0-rc2/include/sys/txg_impl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/txg_impl.h 2017-01-20 18:18:28.000000000 +0000 @@ -65,7 +65,7 @@ * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. * The tc_open_lock is held until the transaction is assigned into the * transaction group. Typically, this is a short operation but if throttling - * is occuring it may be held for longer periods of time. + * is occurring it may be held for longer periods of time. */ struct tx_cpu { kmutex_t tc_open_lock; /* protects tx_open_txg */ diff -Nru zfs-linux-0.7.0-rc2/include/sys/vdev_file.h zfs-linux-0.7.0-rc3/include/sys/vdev_file.h --- zfs-linux-0.7.0-rc2/include/sys/vdev_file.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/vdev_file.h 2017-01-20 18:18:28.000000000 +0000 @@ -37,6 +37,9 @@ vnode_t *vf_vnode; } vdev_file_t; +extern void vdev_file_init(void); +extern void vdev_file_fini(void); + #ifdef __cplusplus } #endif diff -Nru zfs-linux-0.7.0-rc2/include/sys/vdev_impl.h zfs-linux-0.7.0-rc3/include/sys/vdev_impl.h --- zfs-linux-0.7.0-rc2/include/sys/vdev_impl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/vdev_impl.h 2017-01-20 18:18:28.000000000 +0000 @@ -53,6 +53,7 @@ typedef struct vdev_queue vdev_queue_t; typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache_entry vdev_cache_entry_t; +struct abd; extern int zfs_vdev_queue_depth_pct; extern uint32_t zfs_vdev_async_write_max_active; @@ -87,7 +88,7 @@ * Virtual device properties */ struct vdev_cache_entry { - char *ve_data; + struct abd *ve_abd; uint64_t ve_offset; clock_t ve_lastused; avl_node_t ve_offset_node; diff -Nru zfs-linux-0.7.0-rc2/include/sys/vdev_raidz.h zfs-linux-0.7.0-rc3/include/sys/vdev_raidz.h --- zfs-linux-0.7.0-rc2/include/sys/vdev_raidz.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/vdev_raidz.h 2017-01-20 18:18:28.000000000 +0000 @@ -40,22 +40,22 @@ /* * vdev_raidz interface */ -struct raidz_map * vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, +struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); -void vdev_raidz_map_free(struct raidz_map *); -void vdev_raidz_generate_parity(struct raidz_map *); -int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); +void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_generate_parity(struct raidz_map *); +int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); /* * vdev_raidz_math interface */ -void vdev_raidz_math_init(void); -void vdev_raidz_math_fini(void); -struct raidz_impl_ops * vdev_raidz_math_get_ops(void); -int 
vdev_raidz_math_generate(struct raidz_map *); -int vdev_raidz_math_reconstruct(struct raidz_map *, - const int *, const int *, const int); -int vdev_raidz_impl_set(const char *); +void vdev_raidz_math_init(void); +void vdev_raidz_math_fini(void); +struct raidz_impl_ops *vdev_raidz_math_get_ops(void); +int vdev_raidz_math_generate(struct raidz_map *); +int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *, + const int); +int vdev_raidz_impl_set(const char *); #ifdef __cplusplus } diff -Nru zfs-linux-0.7.0-rc2/include/sys/vdev_raidz_impl.h zfs-linux-0.7.0-rc3/include/sys/vdev_raidz_impl.h --- zfs-linux-0.7.0-rc2/include/sys/vdev_raidz_impl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/vdev_raidz_impl.h 2017-01-20 18:18:28.000000000 +0000 @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -104,7 +105,7 @@ size_t rc_devidx; /* child device index for I/O */ size_t rc_offset; /* device offset */ size_t rc_size; /* I/O size */ - void *rc_data; /* I/O data */ + abd_t *rc_abd; /* I/O data */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ unsigned int rc_tried; /* Did we attempt this I/O column? */ @@ -121,7 +122,7 @@ size_t rm_firstdatacol; /* First data column/parity count */ size_t rm_nskip; /* Skipped sectors for padding */ size_t rm_skipstart; /* Column index of padding start */ - void *rm_datacopy; /* rm_asize-buffer of copied data */ + abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ size_t rm_reports; /* # of referencing checksum reports */ unsigned int rm_freed; /* map no longer has referencing ZIO */ unsigned int rm_ecksuminjected; /* checksum error was injected */ @@ -141,6 +142,12 @@ #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */ extern const raidz_impl_ops_t vdev_raidz_avx2_impl; #endif +#if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */ +extern const raidz_impl_ops_t vdev_raidz_avx512f_impl; +#endif +#if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */ +extern const raidz_impl_ops_t vdev_raidz_avx512bw_impl; +#endif #if defined(__aarch64__) extern const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl; extern const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl; @@ -171,12 +178,12 @@ * @code parity the function produce * @impl name of the implementation */ -#define _RAIDZ_GEN_WRAP(code, impl) \ +#define _RAIDZ_GEN_WRAP(code, impl) \ static void \ impl ## _gen_ ## code(void *rmp) \ { \ - raidz_map_t *rm = (raidz_map_t *) rmp; \ - raidz_generate_## code ## _impl(rm); \ + raidz_map_t *rm = (raidz_map_t *)rmp; \ + raidz_generate_## code ## _impl(rm); \ } /* @@ -185,11 +192,11 @@ * @code parity the function produce * @impl name of the implementation */ -#define _RAIDZ_REC_WRAP(code, impl) \ -static int \ +#define _RAIDZ_REC_WRAP(code, impl) \ +static int \ impl ## _rec_ ## code(void *rmp, const int *tgtidx) \ { \ - raidz_map_t *rm = (raidz_map_t *) rmp; \ + raidz_map_t *rm = (raidz_map_t *)rmp; \ return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \ } @@ -288,7 +295,7 @@ if (a == 0) return (0); - return (vdev_raidz_pow2[(exp + (unsigned) vdev_raidz_log2[a]) % 255]); + return (vdev_raidz_pow2[(exp + (unsigned)vdev_raidz_log2[a]) % 255]); } /* @@ -311,9 +318,9 @@ if (a == 0 || b == 0) return (0); - logsum = (gf_log_t) vdev_raidz_log2[a] + (gf_log_t) vdev_raidz_log2[b]; + logsum = (gf_log_t)vdev_raidz_log2[a] + (gf_log_t)vdev_raidz_log2[b]; - return ((gf_t) 
vdev_raidz_pow2[logsum % 255]); + return ((gf_t)vdev_raidz_pow2[logsum % 255]); } static inline gf_t @@ -325,10 +332,10 @@ if (a == 0) return (0); - logsum = (gf_log_t) 255 + (gf_log_t) vdev_raidz_log2[a] - - (gf_log_t) vdev_raidz_log2[b]; + logsum = (gf_log_t)255 + (gf_log_t)vdev_raidz_log2[a] - + (gf_log_t)vdev_raidz_log2[b]; - return ((gf_t) vdev_raidz_pow2[logsum % 255]); + return ((gf_t)vdev_raidz_pow2[logsum % 255]); } static inline gf_t @@ -338,9 +345,9 @@ ASSERT3U(a, >, 0); - logsum = (gf_log_t) 255 - (gf_log_t) vdev_raidz_log2[a]; + logsum = (gf_log_t)255 - (gf_log_t)vdev_raidz_log2[a]; - return ((gf_t) vdev_raidz_pow2[logsum]); + return ((gf_t)vdev_raidz_pow2[logsum]); } static inline gf_t @@ -353,7 +360,7 @@ gf_exp4(gf_log_t exp) { ASSERT3U(exp, <=, 255); - return ((gf_t) vdev_raidz_pow2[(2 * exp) % 255]); + return ((gf_t)vdev_raidz_pow2[(2 * exp) % 255]); } #ifdef __cplusplus diff -Nru zfs-linux-0.7.0-rc2/include/sys/xvattr.h zfs-linux-0.7.0-rc3/include/sys/xvattr.h --- zfs-linux-0.7.0-rc2/include/sys/xvattr.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/xvattr.h 2017-01-20 18:18:28.000000000 +0000 @@ -73,7 +73,7 @@ * - a 32 bit quantity (xva_mapsize) that specifies the size of the * attribute bitmaps in 32 bit words. * - A pointer to the returned attribute bitmap (needed because the - * previous element, the requested attribute bitmap) is variable lenth. + * previous element, the requested attribute bitmap) is variable length. * - The requested attribute bitmap, which is an array of 32 bit words. * Callers use the XVA_SET_REQ() macro to set the bits corresponding to * the attributes that are being requested. @@ -97,7 +97,7 @@ * attributes to be requested/returned. File systems may or may not support * optional attributes. They do so at their own discretion but if they do * support optional attributes, they must register the VFSFT_XVATTR feature - * so that the optional attributes can be set/retrived. + * so that the optional attributes can be set/retrieved. * * The fields of the xvattr structure are: * @@ -225,7 +225,7 @@ * of requested attributes (xva_reqattrmap[]). */ #define XVA_SET_REQ(xvap, attr) \ - ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ + ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \ ASSERT((xvap)->xva_magic == XVA_MAGIC); \ (xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr) /* @@ -233,7 +233,7 @@ * of requested attributes (xva_reqattrmap[]). */ #define XVA_CLR_REQ(xvap, attr) \ - ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ + ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \ ASSERT((xvap)->xva_magic == XVA_MAGIC); \ (xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr) @@ -242,7 +242,7 @@ * of returned attributes (xva_rtnattrmap[]). */ #define XVA_SET_RTN(xvap, attr) \ - ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ + ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \ ASSERT((xvap)->xva_magic == XVA_MAGIC); \ (XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr) @@ -251,7 +251,7 @@ * to see of the corresponding attribute bit is set. If so, returns non-zero. */ #define XVA_ISSET_REQ(xvap, attr) \ - ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \ + ((((xvap)->xva_vattr.va_mask & AT_XVATTR) && \ ((xvap)->xva_magic == XVA_MAGIC) && \ ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \ ((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) @@ -261,7 +261,7 @@ * to see of the corresponding attribute bit is set. If so, returns non-zero. 
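The operator change in the XVA_* macros above is a behavioral bug fix, not a style cleanup: `va_mask | AT_XVATTR' is nonzero for every possible mask, so the old ASSERTs could never fire and the first clause of XVA_ISSET_REQ() always passed. A minimal sketch of the intended membership test (hypothetical helper, not from the patch):

	static int
	example_has_xvattr(uint32_t va_mask)
	{
		/* hypothetical; nonzero only when the AT_XVATTR bit is actually set */
		return ((va_mask & AT_XVATTR) != 0);
	}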
*/ #define XVA_ISSET_RTN(xvap, attr) \ - ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \ + ((((xvap)->xva_vattr.va_mask & AT_XVATTR) && \ ((xvap)->xva_magic == XVA_MAGIC) && \ ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \ ((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) diff -Nru zfs-linux-0.7.0-rc2/include/sys/zap.h zfs-linux-0.7.0-rc3/include/sys/zap.h --- zfs-linux-0.7.0-rc2/include/sys/zap.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zap.h 2017-01-20 18:18:28.000000000 +0000 @@ -255,6 +255,9 @@ int zap_add(objset_t *ds, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_by_dnode(dnode_t *dn, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); @@ -294,6 +297,7 @@ int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx); +int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); diff -Nru zfs-linux-0.7.0-rc2/include/sys/zfs_context.h zfs-linux-0.7.0-rc3/include/sys/zfs_context.h --- zfs-linux-0.7.0-rc2/include/sys/zfs_context.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zfs_context.h 2017-01-20 18:18:28.000000000 +0000 @@ -394,6 +394,7 @@ */ extern kstat_t *kstat_create(const char *, int, const char *, const char *, uchar_t, ulong_t, uchar_t); +extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); extern void kstat_waitq_enter(kstat_io_t *); @@ -495,7 +496,10 @@ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_FRONT 0x08 /* Queue in front */ +#define TASKQID_INVALID ((taskqid_t)0) + extern taskq_t *system_taskq; +extern taskq_t *system_delay_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); #define taskq_create_proc(a, b, c, d, e, p, f) \ diff -Nru zfs-linux-0.7.0-rc2/include/sys/zfs_debug.h zfs-linux-0.7.0-rc3/include/sys/zfs_debug.h --- zfs-linux-0.7.0-rc2/include/sys/zfs_debug.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zfs_debug.h 2017-01-20 18:18:28.000000000 +0000 @@ -42,14 +42,15 @@ extern int zfs_recover; extern int zfs_free_leak_on_eio; -#define ZFS_DEBUG_DPRINTF (1<<0) -#define ZFS_DEBUG_DBUF_VERIFY (1<<1) -#define ZFS_DEBUG_DNODE_VERIFY (1<<2) -#define ZFS_DEBUG_SNAPNAMES (1<<3) -#define ZFS_DEBUG_MODIFY (1<<4) -#define ZFS_DEBUG_SPA (1<<5) -#define ZFS_DEBUG_ZIO_FREE (1<<6) -#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7) +#define ZFS_DEBUG_DPRINTF (1 << 0) +#define ZFS_DEBUG_DBUF_VERIFY (1 << 1) +#define ZFS_DEBUG_DNODE_VERIFY (1 << 2) +#define ZFS_DEBUG_SNAPNAMES (1 << 3) +#define ZFS_DEBUG_MODIFY (1 << 4) +#define ZFS_DEBUG_SPA (1 << 5) +#define ZFS_DEBUG_ZIO_FREE (1 << 6) +#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) +#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) extern void __dprintf(const char *file, const char *func, int line, const char *fmt, ...); diff -Nru zfs-linux-0.7.0-rc2/include/sys/zfs_dir.h zfs-linux-0.7.0-rc3/include/sys/zfs_dir.h --- zfs-linux-0.7.0-rc2/include/sys/zfs_dir.h 2016-10-26 17:36:33.000000000 +0000 +++ 
zfs-linux-0.7.0-rc3/include/sys/zfs_dir.h 2017-01-20 18:18:28.000000000 +0000 @@ -47,6 +47,7 @@ /* mknode flags */ #define IS_ROOT_NODE 0x01 /* create a root node */ #define IS_XATTR 0x02 /* create an extended attribute node */ +#define IS_TMPFILE 0x04 /* create a tmpfile */ extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, int, int *, pathname_t *); diff -Nru zfs-linux-0.7.0-rc2/include/sys/zfs_vnops.h zfs-linux-0.7.0-rc3/include/sys/zfs_vnops.h --- zfs-linux-0.7.0-rc2/include/sys/zfs_vnops.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zfs_vnops.h 2017-01-20 18:18:28.000000000 +0000 @@ -47,6 +47,8 @@ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); extern int zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp); extern int zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags); extern int zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, cred_t *cr, int flags, vsecattr_t *vsecp); diff -Nru zfs-linux-0.7.0-rc2/include/sys/zfs_znode.h zfs-linux-0.7.0-rc3/include/sys/zfs_znode.h --- zfs-linux-0.7.0-rc2/include/sys/zfs_znode.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zfs_znode.h 2017-01-20 18:18:28.000000000 +0000 @@ -194,6 +194,7 @@ zfs_acl_t *z_acl_cached; /* cached acl */ krwlock_t z_xattr_lock; /* xattr data lock */ nvlist_t *z_xattr_cached; /* cached xattrs */ + uint64_t z_xattr_parent; /* parent obj for this xattr */ list_node_t z_link_node; /* all znodes in fs link */ sa_handle_t *z_sa_hdl; /* handle to sa data */ boolean_t z_is_sa; /* are we native sa? */ diff -Nru zfs-linux-0.7.0-rc2/include/sys/zio_checksum.h zfs-linux-0.7.0-rc3/include/sys/zio_checksum.h --- zfs-linux-0.7.0-rc2/include/sys/zio_checksum.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zio_checksum.h 2017-01-20 18:18:28.000000000 +0000 @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. * Copyright Saso Kiselkov 2013, All rights reserved. */ @@ -34,12 +34,12 @@ extern "C" { #endif +struct abd; + /* * Signature for checksum functions. */ -typedef void zio_checksum_func_t(const void *, uint64_t, const void *, - zio_cksum_t *); -typedef void zio_checksum_t(const void *data, uint64_t size, +typedef void zio_checksum_t(struct abd *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp); typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); typedef void zio_checksum_tmpl_free_t(void *ctx_template); @@ -83,28 +83,28 @@ /* * Checksum routines. 
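A minimal sketch of the ABD-based zio_checksum_t signature introduced above (the wrapper is hypothetical; abd_checksum_SHA256() is declared just below, and passing a NULL context template assumes the algorithm takes none):

	/* hypothetical caller, for illustration only */
	static void
	example_checksum_abd(struct abd *abd, uint64_t size, zio_cksum_t *zcp)
	{
		/* SHA-256 over a possibly scattered ABD; no template required */
		abd_checksum_SHA256(abd, size, NULL, zcp);
	}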
*/ -extern zio_checksum_t zio_checksum_SHA256; -extern zio_checksum_t zio_checksum_SHA512_native; -extern zio_checksum_t zio_checksum_SHA512_byteswap; +extern zio_checksum_t abd_checksum_SHA256; +extern zio_checksum_t abd_checksum_SHA512_native; +extern zio_checksum_t abd_checksum_SHA512_byteswap; /* Skein */ -extern zio_checksum_t zio_checksum_skein_native; -extern zio_checksum_t zio_checksum_skein_byteswap; -extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init; -extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free; +extern zio_checksum_t abd_checksum_skein_native; +extern zio_checksum_t abd_checksum_skein_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free; /* Edon-R */ -extern zio_checksum_t zio_checksum_edonr_native; -extern zio_checksum_t zio_checksum_edonr_byteswap; -extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init; -extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free; +extern zio_checksum_t abd_checksum_edonr_native; +extern zio_checksum_t abd_checksum_edonr_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, void *, uint64_t, uint64_t, zio_bad_cksum_t *); -extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size); +extern void zio_checksum_compute(zio_t *, enum zio_checksum, + struct abd *, uint64_t); extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, - void *, uint64_t, uint64_t, zio_bad_cksum_t *); + struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa); diff -Nru zfs-linux-0.7.0-rc2/include/sys/zio_compress.h zfs-linux-0.7.0-rc3/include/sys/zio_compress.h --- zfs-linux-0.7.0-rc2/include/sys/zio_compress.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zio_compress.h 2017-01-20 18:18:28.000000000 +0000 @@ -22,12 +22,14 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H +#include + #ifdef __cplusplus extern "C" { #endif @@ -60,13 +62,20 @@ size_t s_len, size_t d_len, int); /* + * Common signature for all zio decompress functions using an ABD as input. + * This is helpful if you have both compressed ARC and scatter ABDs enabled, + * but is not a requirement for all compression algorithms. + */ +typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, + size_t s_len, size_t d_len, int); +/* * Information about each compression function. 
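For illustration, a hypothetical wrapper over the reworked zio_decompress_data() prototype declared later in this header, showing a decompression whose compressed source lives in an ABD (ZIO_COMPRESS_LZ4 is one of the existing enum values):

	/* hypothetical helper, for illustration only */
	static int
	example_decompress(abd_t *src, void *dst, size_t c_len, size_t lsize)
	{
		/* 0 on success, an error if the compressed stream is corrupt */
		return (zio_decompress_data(ZIO_COMPRESS_LZ4, src, dst, c_len, lsize));
	}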
*/ typedef const struct zio_compress_info { - zio_compress_func_t *ci_compress; /* compression function */ - zio_decompress_func_t *ci_decompress; /* decompression function */ - int ci_level; /* level parameter */ - char *ci_name; /* algorithm name */ + char *ci_name; + int ci_level; + zio_compress_func_t *ci_compress; + zio_decompress_func_t *ci_decompress; } zio_compress_info_t; extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; @@ -96,13 +105,16 @@ int level); extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, int level); - +extern int lz4_decompress_abd(abd_t *src, void *dst, size_t s_len, size_t d_len, + int level); /* * Compress and decompress data if necessary. */ -extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, +extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len); -extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, +extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len); +extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len); #ifdef __cplusplus diff -Nru zfs-linux-0.7.0-rc2/include/sys/zio.h zfs-linux-0.7.0-rc3/include/sys/zio.h --- zfs-linux-0.7.0-rc2/include/sys/zio.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zio.h 2017-01-20 18:18:28.000000000 +0000 @@ -301,6 +301,7 @@ struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; +struct abd; struct zio_cksum_report { struct zio_cksum_report *zcr_next; @@ -333,12 +334,12 @@ } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, - zio_gang_node_t *gn, void *data); + zio_gang_node_t *gn, struct abd *data, uint64_t offset); -typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); +typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); typedef struct zio_transform { - void *zt_orig_data; + struct abd *zt_orig_abd; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; @@ -357,6 +358,11 @@ #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 +typedef struct zio_alloc_list { + list_t zal_list; + uint64_t zal_size; +} zio_alloc_list_t; + typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; @@ -396,8 +402,8 @@ uint64_t io_lsize; /* Data represented by this I/O */ - void *io_data; - void *io_orig_data; + struct abd *io_abd; + struct abd *io_orig_abd; uint64_t io_size; uint64_t io_orig_size; @@ -416,6 +422,7 @@ avl_node_t io_queue_node; avl_node_t io_offset_node; avl_node_t io_alloc_node; + zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ enum zio_flag io_flags; @@ -455,19 +462,19 @@ extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); -extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, - uint64_t lsize, zio_done_func_t *done, void *private, +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, + struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t 
*done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + struct abd *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, @@ -483,12 +490,12 @@ zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); @@ -517,21 +524,20 @@ extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); -extern void *zio_buf_alloc_flags(size_t size, int flags); -extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, +extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, - uint64_t offset, void *data, uint64_t size, int type, + uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); diff -Nru zfs-linux-0.7.0-rc2/include/sys/zio_impl.h zfs-linux-0.7.0-rc3/include/sys/zio_impl.h --- zfs-linux-0.7.0-rc2/include/sys/zio_impl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zio_impl.h 2017-01-20 18:18:28.000000000 +0000 @@ -30,9 +30,6 @@ #ifndef _ZIO_IMPL_H #define _ZIO_IMPL_H -#include -#include - #ifdef __cplusplus extern "C" { #endif diff -Nru zfs-linux-0.7.0-rc2/include/sys/zpl.h zfs-linux-0.7.0-rc3/include/sys/zpl.h --- zfs-linux-0.7.0-rc2/include/sys/zpl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zpl.h 2017-01-20 18:18:28.000000000 +0000 @@ -76,7 +76,7 @@ extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr); #if defined(CONFIG_FS_POSIX_ACL) -extern int zpl_set_acl(struct inode *ip, int type, struct posix_acl *acl); +extern int zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type); extern struct posix_acl *zpl_get_acl(struct inode *ip, int type); #if !defined(HAVE_GET_ACL) #if defined(HAVE_CHECK_ACL_WITH_FLAGS) @@ -148,8 +148,7 @@ dir_emit(struct dir_context *ctx, const char *name, int namelen, uint64_t ino, unsigned type) { - return (ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type) - == 0); + return (!ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type)); } static inline bool diff -Nru 
zfs-linux-0.7.0-rc2/include/sys/zrlock.h zfs-linux-0.7.0-rc3/include/sys/zrlock.h --- zfs-linux-0.7.0-rc2/include/sys/zrlock.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zrlock.h 2017-01-20 18:18:28.000000000 +0000 @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. */ #ifndef _SYS_ZRLOCK_H @@ -44,12 +45,8 @@ extern void zrl_init(zrlock_t *); extern void zrl_destroy(zrlock_t *); -#ifdef ZFS_DEBUG -#define zrl_add(_z) zrl_add_debug((_z), __func__) -extern void zrl_add_debug(zrlock_t *, const char *); -#else -extern void zrl_add(zrlock_t *); -#endif +#define zrl_add(_z) zrl_add_impl((_z), __func__) +extern void zrl_add_impl(zrlock_t *, const char *); extern void zrl_remove(zrlock_t *); extern int zrl_tryenter(zrlock_t *); extern void zrl_exit(zrlock_t *); diff -Nru zfs-linux-0.7.0-rc2/include/sys/zvol.h zfs-linux-0.7.0-rc3/include/sys/zvol.h --- zfs-linux-0.7.0-rc2/include/sys/zvol.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/sys/zvol.h 2017-01-20 18:18:28.000000000 +0000 @@ -32,7 +32,8 @@ #define ZVOL_OBJ 1ULL #define ZVOL_ZAP_OBJ 2ULL -extern void *zvol_tag; +#define SPEC_MAXOFFSET_T ((1LL << ((NBBY * sizeof (daddr32_t)) + \ + DEV_BSHIFT - 1)) - 1) extern void zvol_create_minors(spa_t *spa, const char *name, boolean_t async); extern void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async); @@ -40,6 +41,8 @@ const char *newname, boolean_t async); #ifdef _KERNEL +typedef struct zvol_state zvol_state_t; + extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); @@ -48,6 +51,9 @@ extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_set_volblocksize(const char *, uint64_t); extern int zvol_set_snapdev(const char *, zprop_source_t, uint64_t); +extern zvol_state_t *zvol_suspend(const char *); +extern int zvol_resume(zvol_state_t *); +extern void *zvol_tag(zvol_state_t *); extern int zvol_init(void); extern void zvol_fini(void); diff -Nru zfs-linux-0.7.0-rc2/include/zfs_fletcher.h zfs-linux-0.7.0-rc3/include/zfs_fletcher.h --- zfs-linux-0.7.0-rc2/include/zfs_fletcher.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/zfs_fletcher.h 2017-01-20 18:18:28.000000000 +0000 @@ -48,15 +48,16 @@ * checksum method is added. This method will ignore last (size % 4) bytes of * the data buffer. 
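A minimal sketch of the reworked incremental interface declared just below (the helper is hypothetical): the incremental entry points now return int and take an opaque third argument, which lets them double as abd_iterate_func() callbacks, with the zio_cksum_t threaded through as the context.

	/* hypothetical two-buffer checksum, for illustration only */
	static void
	example_incremental_fletcher4(void *part1, size_t len1, void *part2,
	    size_t len2, zio_cksum_t *zcp)
	{
		fletcher_init(zcp);
		(void) fletcher_4_incremental_native(part1, len1, zcp);
		(void) fletcher_4_incremental_native(part2, len2, zcp);
	}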
*/ +void fletcher_init(zio_cksum_t *); void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *); void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *); +int fletcher_2_incremental_native(void *, size_t, void *); +int fletcher_2_incremental_byteswap(void *, size_t, void *); void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *); void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); -void fletcher_4_incremental_native(const void *, uint64_t, - zio_cksum_t *); -void fletcher_4_incremental_byteswap(const void *, uint64_t, - zio_cksum_t *); +int fletcher_4_incremental_native(void *, size_t, void *); +int fletcher_4_incremental_byteswap(void *, size_t, void *); int fletcher_4_impl_set(const char *selector); void fletcher_4_init(void); void fletcher_4_fini(void); @@ -65,6 +66,10 @@ /* Internal fletcher ctx */ +typedef struct zfs_fletcher_superscalar { + uint64_t v[4]; +} zfs_fletcher_superscalar_t; + typedef struct zfs_fletcher_sse { uint64_t v[2] __attribute__((aligned(16))); } zfs_fletcher_sse_t; @@ -84,6 +89,7 @@ typedef union fletcher_4_ctx { zio_cksum_t scalar; + zfs_fletcher_superscalar_t superscalar[4]; #if defined(HAVE_SSE2) || (defined(HAVE_SSE2) && defined(HAVE_SSSE3)) zfs_fletcher_sse_t sse[4]; @@ -118,6 +124,8 @@ const char *name; } fletcher_4_ops_t; +extern const fletcher_4_ops_t fletcher_4_superscalar_ops; +extern const fletcher_4_ops_t fletcher_4_superscalar4_ops; #if defined(HAVE_SSE2) extern const fletcher_4_ops_t fletcher_4_sse2_ops; diff -Nru zfs-linux-0.7.0-rc2/include/zpios-ctl.h zfs-linux-0.7.0-rc3/include/zpios-ctl.h --- zfs-linux-0.7.0-rc2/include/zpios-ctl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/zpios-ctl.h 2017-01-20 18:18:28.000000000 +0000 @@ -1,7 +1,7 @@ /* * ZPIOS is a heavily modified version of the original PIOS test code. * It is designed to have the test code running in the Linux kernel - * against ZFS while still being flexibly controled from user space. + * against ZFS while still being flexibly controlled from user space. * * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -146,7 +146,7 @@ nsec -= NSEC_PER_SEC; sec++; } - while (nsec < 0) { + while (((int32_t)nsec) < 0) { nsec += NSEC_PER_SEC; sec--; } diff -Nru zfs-linux-0.7.0-rc2/include/zpios-internal.h zfs-linux-0.7.0-rc3/include/zpios-internal.h --- zfs-linux-0.7.0-rc2/include/zpios-internal.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/include/zpios-internal.h 2017-01-20 18:18:28.000000000 +0000 @@ -1,7 +1,7 @@ /* * ZPIOS is a heavily modified version of the original PIOS test code. * It is designed to have the test code running in the Linux kernel - * against ZFS while still being flexibly controled from user space. + * against ZFS while still being flexibly controlled from user space. * * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
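The cast added in the zpios timespec normalization above deserves a note: if nsec has an unsigned type, `nsec < 0' is always false and the borrow from sec is silently skipped; viewing the value through a signed 32-bit type makes the underflow visible again. A hypothetical restatement (NSEC_PER_SEC assumed defined, as in the original):

	/* hypothetical helper, for illustration only */
	static void
	example_normalize_time(int64_t *sec, uint32_t *nsec)
	{
		/* an unsigned *nsec that wrapped below zero reads as a huge value */
		while (((int32_t)(*nsec)) < 0) {
			*nsec += NSEC_PER_SEC;
			(*sec)--;
		}
	}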
diff -Nru zfs-linux-0.7.0-rc2/lib/libshare/libshare.c zfs-linux-0.7.0-rc3/lib/libshare/libshare.c --- zfs-linux-0.7.0-rc2/lib/libshare/libshare.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libshare/libshare.c 2017-01-20 18:18:28.000000000 +0000 @@ -113,7 +113,7 @@ char line[512]; char *eol, *pathname, *resource, *fstype, *options, *description; - fp = fopen("/etc/dfs/sharetab", "r"); + fp = fopen(ZFS_SHARETAB, "r"); if (fp == NULL) return; @@ -170,7 +170,7 @@ sa_share_impl_t impl_share; int temp_fd; FILE *temp_fp; - char tempfile[] = "/etc/dfs/sharetab.XXXXXX"; + char tempfile[] = ZFS_SHARETAB".XXXXXX"; sa_fstype_t *fstype; const char *resource; @@ -215,7 +215,7 @@ fsync(temp_fd); fclose(temp_fp); - (void) rename(tempfile, "/etc/dfs/sharetab"); + (void) rename(tempfile, ZFS_SHARETAB); } typedef struct update_cookie_s { @@ -498,7 +498,7 @@ #ifdef DEBUG fprintf(stderr, "sa_enable_share: share->sharepath=%s, protocol=%s\n", - impl_share->sharepath, protocol); + impl_share->sharepath, protocol); #endif assert(impl_share->handle != NULL); @@ -539,7 +539,7 @@ #ifdef DEBUG fprintf(stderr, "sa_disable_share: share->sharepath=%s, protocol=%s\n", - impl_share->sharepath, protocol); + impl_share->sharepath, protocol); #endif ret = SA_OK; @@ -697,7 +697,7 @@ #ifdef DEBUG fprintf(stderr, "sa_parse_legacy_options: options=%s, proto=%s\n", - options, proto); + options, proto); #endif fstype = fstypes; diff -Nru zfs-linux-0.7.0-rc2/lib/libshare/nfs.c zfs-linux-0.7.0-rc3/lib/libshare/nfs.c --- zfs-linux-0.7.0-rc2/lib/libshare/nfs.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libshare/nfs.c 2017-01-20 18:18:28.000000000 +0000 @@ -688,7 +688,8 @@ } if (pid > 0) { - while ((rc = waitpid(pid, &status, 0)) <= 0 && errno == EINTR); + while ((rc = waitpid(pid, &status, 0)) <= 0 && + errno == EINTR) { } if (rc <= 0) { (void) close(nfs_exportfs_temp_fd); @@ -722,7 +723,7 @@ } /* - * Provides a convenient wrapper for determing nfs availability + * Provides a convenient wrapper for determining nfs availability */ static boolean_t nfs_available(void) diff -Nru zfs-linux-0.7.0-rc2/lib/libshare/smb.c zfs-linux-0.7.0-rc3/lib/libshare/smb.c --- zfs-linux-0.7.0-rc2/lib/libshare/smb.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libshare/smb.c 2017-01-20 18:18:28.000000000 +0000 @@ -154,7 +154,7 @@ continue; /* Incomplete share definition */ else { shares = (smb_share_t *) - malloc(sizeof (smb_share_t)); + malloc(sizeof (smb_share_t)); if (shares == NULL) { rc = SA_NO_MEMORY; goto out; @@ -354,17 +354,19 @@ static boolean_t smb_is_share_active(sa_share_impl_t impl_share) { + smb_share_t *iter = smb_shares; + if (!smb_available()) return (B_FALSE); /* Retrieve the list of (possible) active shares */ smb_retrieve_shares(); - while (smb_shares != NULL) { - if (strcmp(impl_share->sharepath, smb_shares->path) == 0) + while (iter != NULL) { + if (strcmp(impl_share->sharepath, iter->path) == 0) return (B_TRUE); - smb_shares = smb_shares->next; + iter = iter->next; } return (B_FALSE); @@ -393,7 +395,7 @@ old_shareopts = FSINFO(impl_share, smb_fstype)->shareopts; if (FSINFO(impl_share, smb_fstype)->active && old_shareopts != NULL && - strcmp(old_shareopts, shareopts) != 0) { + strcmp(old_shareopts, shareopts) != 0) { needs_reshare = B_TRUE; smb_disable_share(impl_share); } diff -Nru zfs-linux-0.7.0-rc2/lib/libspl/asm-generic/atomic.c zfs-linux-0.7.0-rc3/lib/libspl/asm-generic/atomic.c --- zfs-linux-0.7.0-rc2/lib/libspl/asm-generic/atomic.c 2016-10-26 17:36:33.000000000 
+0000 +++ zfs-linux-0.7.0-rc3/lib/libspl/asm-generic/atomic.c 2017-01-20 18:18:28.000000000 +0000 @@ -39,7 +39,7 @@ /* * Theses are the void returning variants */ - +/* BEGIN CSTYLED */ #define ATOMIC_INC(name, type) \ void atomic_inc_##name(volatile type *target) \ { \ @@ -381,6 +381,7 @@ ATOMIC_SWAP(uint, uint_t) ATOMIC_SWAP(ulong, ulong_t) ATOMIC_SWAP(64, uint64_t) +/* END CSTYLED */ void * atomic_swap_ptr(volatile void *target, void *bits) diff -Nru zfs-linux-0.7.0-rc2/lib/libspl/getmntany.c zfs-linux-0.7.0-rc3/lib/libspl/getmntany.c --- zfs-linux-0.7.0-rc2/lib/libspl/getmntany.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libspl/getmntany.c 2017-01-20 18:18:28.000000000 +0000 @@ -53,7 +53,7 @@ while ( ((ret = _sol_getmntent(fp, mgetp)) == 0) && ( DIFF(mnt_special) || DIFF(mnt_mountp) || - DIFF(mnt_fstype) || DIFF(mnt_mntopts))); + DIFF(mnt_fstype) || DIFF(mnt_mntopts))) { } return (ret); } @@ -86,7 +86,7 @@ int ret; struct stat64 st; - ret = _sol_getmntent(fp, (struct mnttab *) mp); + ret = _sol_getmntent(fp, (struct mnttab *)mp); if (ret == 0) { if (stat64(mp->mnt_mountp, &st) != 0) { mp->mnt_major = 0; diff -Nru zfs-linux-0.7.0-rc2/lib/libspl/include/atomic.h zfs-linux-0.7.0-rc3/lib/libspl/include/atomic.h --- zfs-linux-0.7.0-rc2/lib/libspl/include/atomic.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libspl/include/atomic.h 2017-01-20 18:18:28.000000000 +0000 @@ -247,7 +247,7 @@ /* * Perform an exclusive atomic bit set/clear on a target. - * Returns 0 if bit was sucessfully set/cleared, or -1 + * Returns 0 if bit was successfully set/cleared, or -1 * if the bit was already set/cleared. */ extern int atomic_set_long_excl(volatile ulong_t *, uint_t); diff -Nru zfs-linux-0.7.0-rc2/lib/libspl/include/synch.h zfs-linux-0.7.0-rc3/lib/libspl/include/synch.h --- zfs-linux-0.7.0-rc2/lib/libspl/include/synch.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libspl/include/synch.h 2017-01-20 18:18:28.000000000 +0000 @@ -53,11 +53,11 @@ switch (type) { case USYNC_THREAD: VERIFY0(pthread_rwlockattr_setpshared(&attr, - PTHREAD_PROCESS_PRIVATE)); + PTHREAD_PROCESS_PRIVATE)); break; case USYNC_PROCESS: VERIFY0(pthread_rwlockattr_setpshared(&attr, - PTHREAD_PROCESS_SHARED)); + PTHREAD_PROCESS_SHARED)); break; default: VERIFY0(1); diff -Nru zfs-linux-0.7.0-rc2/lib/libspl/include/sys/dkio.h zfs-linux-0.7.0-rc3/lib/libspl/include/sys/dkio.h --- zfs-linux-0.7.0-rc2/lib/libspl/include/sys/dkio.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libspl/include/sys/dkio.h 2017-01-20 18:18:28.000000000 +0000 @@ -158,7 +158,7 @@ /* * The following ioctls are generic in nature and need to be - * suported as appropriate by all disk drivers + * supported as appropriate by all disk drivers */ #define DKIOCGGEOM (DKIOC|1) /* Get geometry */ #define DKIOCINFO (DKIOC|3) /* Get info */ diff -Nru zfs-linux-0.7.0-rc2/lib/libspl/include/sys/dklabel.h zfs-linux-0.7.0-rc3/lib/libspl/include/sys/dklabel.h --- zfs-linux-0.7.0-rc2/lib/libspl/include/sys/dklabel.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libspl/include/sys/dklabel.h 2017-01-20 18:18:28.000000000 +0000 @@ -107,7 +107,7 @@ struct dkl_partition { uint16_t p_tag; /* ID tag of partition */ - uint16_t p_flag; /* permision flags */ + uint16_t p_flag; /* permission flags */ daddr32_t p_start; /* start sector no of partition */ int32_t p_size; /* # of blocks in partition */ }; diff -Nru zfs-linux-0.7.0-rc2/lib/libspl/include/sys/param.h 
zfs-linux-0.7.0-rc3/lib/libspl/include/sys/param.h --- zfs-linux-0.7.0-rc2/lib/libspl/include/sys/param.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libspl/include/sys/param.h 2017-01-20 18:18:28.000000000 +0000 @@ -57,8 +57,11 @@ #define MAXUID UINT32_MAX /* max user id */ #define MAXPROJID MAXUID /* max project id */ -#ifndef PAGESIZE -#define PAGESIZE (sysconf(_SC_PAGESIZE)) +#ifdef PAGESIZE +#undef PAGESIZE #endif /* PAGESIZE */ +extern size_t spl_pagesize(void); +#define PAGESIZE (spl_pagesize()) + #endif diff -Nru zfs-linux-0.7.0-rc2/lib/libspl/Makefile.am zfs-linux-0.7.0-rc3/lib/libspl/Makefile.am --- zfs-linux-0.7.0-rc2/lib/libspl/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libspl/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -24,6 +24,7 @@ getmntany.c \ list.c \ mkdirp.c \ + page.c \ strlcat.c \ strlcpy.c \ strnlen.c \ diff -Nru zfs-linux-0.7.0-rc2/lib/libspl/page.c zfs-linux-0.7.0-rc3/lib/libspl/page.c --- zfs-linux-0.7.0-rc2/lib/libspl/page.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libspl/page.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include + +size_t pagesize = 0; + +size_t +spl_pagesize(void) +{ + if (pagesize == 0) + pagesize = sysconf(_SC_PAGESIZE); + + return (pagesize); +} diff -Nru zfs-linux-0.7.0-rc2/lib/libuutil/uu_dprintf.c zfs-linux-0.7.0-rc3/lib/libuutil/uu_dprintf.c --- zfs-linux-0.7.0-rc2/lib/libuutil/uu_dprintf.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libuutil/uu_dprintf.c 2017-01-20 18:18:28.000000000 +0000 @@ -68,7 +68,8 @@ { uu_dprintf_t *D; - if (uu_check_name(name, UU_NAME_DOMAIN) == -1) { + if (name != NULL && + uu_check_name(name, UU_NAME_DOMAIN) == -1) { uu_set_error(UU_ERROR_INVALID_ARGUMENT); return (NULL); } diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_dataset.c zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_dataset.c --- zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_dataset.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_dataset.c 2017-01-20 18:18:28.000000000 +0000 @@ -1938,9 +1938,9 @@ * the property is valid for the snapshot's head dataset type. 
*/ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT && - !zfs_prop_valid_for_type(prop, zhp->zfs_head_type, B_TRUE)) { - *val = zfs_prop_default_numeric(prop); - return (-1); + !zfs_prop_valid_for_type(prop, zhp->zfs_head_type, B_TRUE)) { + *val = zfs_prop_default_numeric(prop); + return (-1); } switch (prop) { @@ -2343,7 +2343,7 @@ strftime(propbuf, proplen, "%a %b %e %k:%M %Y", &t) == 0) (void) snprintf(propbuf, proplen, "%llu", - (u_longlong_t) val); + (u_longlong_t)val); } break; @@ -2744,11 +2744,11 @@ *typep = type; isuser = (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_USERUSED || - type == ZFS_PROP_USEROBJQUOTA || - type == ZFS_PROP_USEROBJUSED); + type == ZFS_PROP_USEROBJQUOTA || + type == ZFS_PROP_USEROBJUSED); isgroup = (type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_GROUPUSED || - type == ZFS_PROP_GROUPOBJQUOTA || - type == ZFS_PROP_GROUPOBJUSED); + type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_GROUPOBJUSED); cp = strchr(propname, '@') + 1; diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_diff.c zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_diff.c --- zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_diff.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_diff.c 2017-01-20 18:18:28.000000000 +0000 @@ -555,11 +555,13 @@ /* * Can accept - * dataset@snap1 - * dataset@snap1 dataset@snap2 - * dataset@snap1 @snap2 - * dataset@snap1 dataset - * @snap1 dataset@snap2 + * fdslen fsnlen tdslen tsnlen + * dataset@snap1 + * 0. dataset@snap1 dataset@snap2 >0 >1 >0 >1 + * 1. dataset@snap1 @snap2 >0 >1 ==0 >1 + * 2. dataset@snap1 dataset >0 >1 >0 ==0 + * 3. @snap1 dataset@snap2 ==0 >1 >0 >1 + * 4. @snap1 dataset ==0 >1 >0 ==0 */ if (tosnap == NULL) { /* only a from snapshot given, must be valid */ @@ -596,8 +598,7 @@ fsnlen = strlen(fromsnap) - fdslen; /* includes @ sign */ tsnlen = strlen(tosnap) - tdslen; /* includes @ sign */ - if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0) || - (fsnlen == 0 && tsnlen == 0)) { + if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0)) { return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); } else if ((fdslen > 0 && tdslen > 0) && ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) { diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_import.c zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_import.c --- zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_import.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_import.c 2017-01-20 18:18:28.000000000 +0000 @@ -200,7 +200,7 @@ udev_is_mpath(struct udev_device *dev) { return udev_device_get_property_value(dev, "DM_UUID") && - udev_device_get_property_value(dev, "MPATH_SBIN_PATH"); + udev_device_get_property_value(dev, "MPATH_SBIN_PATH"); } /* @@ -1313,6 +1313,7 @@ vdev_label_t *label; nvlist_t *expected_config = NULL; uint64_t expected_guid = 0, size; + int error; *config = NULL; @@ -1320,7 +1321,8 @@ return (0); size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); - if ((label = malloc(sizeof (vdev_label_t))) == NULL) + error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label)); + if (error) return (-1); for (l = 0; l < VDEV_LABELS; l++) { @@ -1378,6 +1380,7 @@ char *rn_name; /* Full path to device */ int rn_order; /* Preferred order (low to high) */ int rn_num_labels; /* Number of valid labels */ + uint64_t rn_vdev_guid; /* Expected vdev guid when set */ libzfs_handle_t *rn_hdl; nvlist_t *rn_config; /* Label config */ avl_tree_t *rn_avl; @@ -1386,39 +1389,29 @@ boolean_t rn_labelpaths; } rdsk_node_t; +/* + * Sorted by vdev guid and full 
path to allow for multiple entries with + * the same full path name. This is required because it's possible to + * have multiple block devices with labels that refer to the same + * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both + * entries need to be added to the cache. Scenarios where this can occur + * include overwritten pool labels, devices which are visible from multiple + * hosts and multipath devices. + */ static int slice_cache_compare(const void *arg1, const void *arg2) { const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; - char *nm1slice, *nm2slice; + uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid; + uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid; int rv; - /* - * partitions one and three (slices zero and two) are the most - * likely to provide results, so put those first - */ - nm1slice = strstr(nm1, "part1"); - nm2slice = strstr(nm2, "part1"); - if (nm1slice && !nm2slice) { - return (-1); - } - if (!nm1slice && nm2slice) { - return (1); - } - nm1slice = strstr(nm1, "part3"); - nm2slice = strstr(nm2, "part3"); - if (nm1slice && !nm2slice) { - return (-1); - } - if (!nm1slice && nm2slice) { - return (1); - } + rv = AVL_CMP(guid1, guid2); + if (rv) + return (rv); - rv = strcmp(nm1, nm2); - if (rv == 0) - return (0); - return (rv > 0 ? 1 : -1); + return (AVL_ISIGN(strcmp(nm1, nm2))); } static boolean_t @@ -1506,6 +1499,7 @@ struct stat64 statbuf; nvlist_t *config; char *bname, *dupname; + uint64_t vdev_guid = 0; int error; int num_labels; int fd; @@ -1531,19 +1525,28 @@ (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode))) return; - if ((fd = open(rn->rn_name, O_RDONLY)) < 0) + /* + * Preferentially open using O_DIRECT to bypass the block device + * cache which may be stale for multipath devices. An EINVAL errno + * indicates O_DIRECT is unsupported so fallback to just O_RDONLY. + */ + fd = open(rn->rn_name, O_RDONLY | O_DIRECT); + if ((fd < 0) && (errno == EINVAL)) + fd = open(rn->rn_name, O_RDONLY); + + if (fd < 0) return; /* * This file is too small to hold a zpool */ - if (S_ISREG(statbuf.st_mode) && - statbuf.st_size < SPA_MINDEVSIZE) { + if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) { (void) close(fd); return; } - if ((zpool_read_label(fd, &config, &num_labels)) != 0) { + error = zpool_read_label(fd, &config, &num_labels); + if (error != 0) { (void) close(fd); return; } @@ -1554,6 +1557,18 @@ return; } + /* + * Check that the vdev is for the expected guid. Additional entries + * are speculatively added based on the paths stored in the labels. + * Entries with valid paths but incorrect guids must be removed. 
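The open() logic above is worth calling out: the scanner now prefers O_DIRECT so label reads bypass the block device cache, which can be stale for multipath devices, and it falls back to a plain open when the target reports EINVAL for O_DIRECT. A minimal sketch of that idiom; the helper name is invented, and on glibc O_DIRECT requires _GNU_SOURCE:

    #define _GNU_SOURCE     /* O_DIRECT */
    #include <errno.h>
    #include <fcntl.h>

    /*
     * Open read-only, bypassing the page cache when possible.
     * EINVAL means O_DIRECT is unsupported here, so retry without it.
     */
    static int
    open_maybe_direct(const char *path)
    {
            int fd = open(path, O_RDONLY | O_DIRECT);

            if (fd < 0 && errno == EINVAL)
                    fd = open(path, O_RDONLY);

            return (fd);
    }

Note that O_DIRECT also imposes alignment requirements on the reads that follow, which is why zpool_read_label() above switched from malloc() to a PAGESIZE-aligned posix_memalign().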
+ */ + error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { + (void) close(fd); + nvlist_free(config); + return; + } + (void) close(fd); rn->rn_config = config; @@ -1580,9 +1595,10 @@ if (path != NULL) { slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); slice->rn_name = zfs_strdup(hdl, path); + slice->rn_vdev_guid = vdev_guid; slice->rn_avl = rn->rn_avl; slice->rn_hdl = hdl; - slice->rn_order = 1; + slice->rn_order = IMPORT_ORDER_PREFERRED_1; slice->rn_labelpaths = B_FALSE; mutex_enter(rn->rn_lock); if (avl_find(rn->rn_avl, slice, &where)) { @@ -1605,9 +1621,10 @@ return; } + slice->rn_vdev_guid = vdev_guid; slice->rn_avl = rn->rn_avl; slice->rn_hdl = hdl; - slice->rn_order = 2; + slice->rn_order = IMPORT_ORDER_PREFERRED_2; slice->rn_labelpaths = B_FALSE; mutex_enter(rn->rn_lock); if (avl_find(rn->rn_avl, slice, &where)) { @@ -1709,10 +1726,11 @@ free(slice); continue; } + slice->rn_vdev_guid = 0; slice->rn_lock = lock; slice->rn_avl = cache; slice->rn_hdl = hdl; - slice->rn_order = i+1; + slice->rn_order = i + IMPORT_ORDER_SCAN_OFFSET; slice->rn_labelpaths = B_FALSE; mutex_enter(lock); avl_add(cache, slice); @@ -1747,6 +1765,7 @@ blkid_cache cache; blkid_dev_iterate iter; blkid_dev dev; + avl_index_t where; int error; *slice_cache = NULL; @@ -1781,13 +1800,25 @@ while (blkid_dev_next(iter, &dev) == 0) { slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev)); + slice->rn_vdev_guid = 0; slice->rn_lock = lock; slice->rn_avl = *slice_cache; slice->rn_hdl = hdl; - slice->rn_order = 100; slice->rn_labelpaths = B_TRUE; + + error = zfs_path_order(slice->rn_name, &slice->rn_order); + if (error == 0) + slice->rn_order += IMPORT_ORDER_SCAN_OFFSET; + else + slice->rn_order = IMPORT_ORDER_DEFAULT; + mutex_enter(lock); - avl_add(*slice_cache, slice); + if (avl_find(*slice_cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(*slice_cache, slice, where); + } mutex_exit(lock); } @@ -1881,6 +1912,7 @@ if (slice->rn_config != NULL) { nvlist_t *config = slice->rn_config; boolean_t matched = B_TRUE; + int fd; if (iarg->poolname != NULL) { char *pname; @@ -1898,9 +1930,21 @@ if (!matched) { nvlist_free(config); } else { - add_config(hdl, &pools, - slice->rn_name, slice->rn_order, - slice->rn_num_labels, config); + /* + * Verify all remaining entries can be opened + * exclusively. This will prune all underlying + * multipath devices which otherwise could + * result in the vdev appearing as UNAVAIL. + */ + fd = open(slice->rn_name, O_RDONLY | O_EXCL); + if (fd >= 0) { + close(fd); + add_config(hdl, &pools, + slice->rn_name, slice->rn_order, + slice->rn_num_labels, config); + } else { + nvlist_free(config); + } } } free(slice->rn_name); diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_iter.c zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_iter.c --- zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_iter.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_iter.c 2017-01-20 18:18:28.000000000 +0000 @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 
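The exclusive-open filter above leans on a Linux block layer behavior: opening a block device with O_EXCL fails while another subsystem (dm-multipath, a mounted filesystem) holds it, so the underlying sd* paths of a multipath device drop out and only the mpath node survives as an import candidate. A hedged sketch of the check in isolation; the helper name is invented, and while EBUSY is the usual failure, any failure is treated as "claimed":

    #include <fcntl.h>
    #include <unistd.h>

    /*
     * Return 1 if the block device is not currently claimed by
     * another holder, 0 otherwise.
     */
    static int
    dev_is_unclaimed(const char *path)
    {
            int fd = open(path, O_RDONLY | O_EXCL);

            if (fd < 0)
                    return (0);
            (void) close(fd);
            return (1);
    }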
*/ @@ -309,7 +309,8 @@ } snapspec_arg_t; static int -snapspec_cb(zfs_handle_t *zhp, void *arg) { +snapspec_cb(zfs_handle_t *zhp, void *arg) +{ snapspec_arg_t *ssa = arg; char *shortsnapname; int err = 0; diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_mount.c zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_mount.c --- zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_mount.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_mount.c 2017-01-20 18:18:28.000000000 +0000 @@ -456,7 +456,7 @@ */ if (!(flags & MS_OVERLAY)) { if (zfs_prop_get(zhp, ZFS_PROP_OVERLAY, overlay, - sizeof (overlay), NULL, NULL, 0, B_FALSE) == 0) { + sizeof (overlay), NULL, NULL, 0, B_FALSE) == 0) { if (strcmp(overlay, "on") == 0) { flags |= MS_OVERLAY; } @@ -988,6 +988,20 @@ return (zfs_unshare_proto(zhp, mountpoint, share_all_proto)); } +int +zfs_unshareall_bytype(zfs_handle_t *zhp, const char *mountpoint, + const char *proto) +{ + if (proto == NULL) + return (zfs_unshare_proto(zhp, mountpoint, share_all_proto)); + if (strcmp(proto, "nfs") == 0) + return (zfs_unshare_proto(zhp, mountpoint, nfs_only)); + else if (strcmp(proto, "smb") == 0) + return (zfs_unshare_proto(zhp, mountpoint, smb_only)); + else + return (1); +} + /* * Remove the mountpoint associated with the current dataset, if necessary. * We only remove the underlying directory if: diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_pool.c zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_pool.c --- zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_pool.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_pool.c 2017-01-20 18:18:28.000000000 +0000 @@ -41,9 +41,6 @@ #include #include #include -#if HAVE_LIBDEVMAPPER -#include -#endif #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -311,7 +308,7 @@ case ZPOOL_PROP_ASHIFT: if (literal) (void) snprintf(buf, len, "%llu", - (u_longlong_t)intval); + (u_longlong_t)intval); else (void) zfs_nicenum(intval, buf, len); break; @@ -905,7 +902,7 @@ /* * Convert from feature name to feature guid. This conversion is - * unecessary for unsupported@... properties because they already + * unnecessary for unsupported@... properties because they already * use guids. */ if (supported) { @@ -3419,12 +3416,12 @@ d = part + 1; } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && tmp[1] == 'd') { - for (d = &tmp[2]; isalpha(*d); part = ++d); + for (d = &tmp[2]; isalpha(*d); part = ++d) { } } else if (strncmp("xvd", tmp, 3) == 0) { - for (d = &tmp[3]; isalpha(*d); part = ++d); + for (d = &tmp[3]; isalpha(*d); part = ++d) { } } if (part && d && *d != '\0') { - for (; isdigit(*d); d++); + for (; isdigit(*d); d++) { } if (*d == '\0') *part = '\0'; } @@ -3432,6 +3429,43 @@ return (tmp); } +/* + * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname + * + * path: /dev/sda1 + * returns: /dev/sda + * + * Returned string must be freed. 
+ */ +char * +zfs_strip_partition_path(char *path) +{ + char *newpath = strdup(path); + char *sd_offset; + char *new_sd; + + if (!newpath) + return (NULL); + + /* Point to "sda1" part of "/dev/sda1" */ + sd_offset = strrchr(newpath, '/') + 1; + + /* Get our new name "sda" */ + new_sd = zfs_strip_partition(sd_offset); + if (!new_sd) { + free(newpath); + return (NULL); + } + + /* Paste the "sda" where "sda1" was */ + strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); + + /* Free temporary "sda" */ + free(new_sd); + + return (newpath); +} + #define PATH_BUF_LEN 64 /* @@ -4176,7 +4210,7 @@ if (id == 0) id = (((uint64_t)rand()) << 32) | (uint64_t)rand(); - snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t) id); + snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t)id); } /* @@ -4318,112 +4352,69 @@ return (0); } -#if HAVE_LIBDEVMAPPER -static void libdevmapper_dummy_log(int level, const char *file, int line, - int dm_errno_or_class, const char *f, ...) {} - -/* Disable libdevmapper error logging */ -static void disable_libdevmapper_errors(void) { - dm_log_with_errno_init(libdevmapper_dummy_log); -} -/* Enable libdevmapper error logging */ -static void enable_libdevmapper_errors(void) { - dm_log_with_errno_init(NULL); -} -#endif - /* * Allocate and return the underlying device name for a device mapper device. * If a device mapper device maps to multiple devices, return the first device. * - * For example, dm_name = "/dev/dm-0" could return "/dev/sda" - * - * dm_name should include the "/dev[/mapper]" prefix. + * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a + * DM device (like /dev/disk/by-vdev/A0) are also allowed. * * Returns device name, or NULL on error or no match. If dm_name is not a DM * device then return NULL. * * NOTE: The returned name string must be *freed*. */ -static char * dm_get_underlying_path(char *dm_name) +char * +dm_get_underlying_path(char *dm_name) { - char *name = NULL; -#if HAVE_LIBDEVMAPPER - char *tmp; - struct dm_task *dmt = NULL; - struct dm_tree *dt = NULL; - struct dm_tree_node *root, *child; - void *handle = NULL; - struct dm_info info; - const struct dm_info *child_info; + DIR *dp = NULL; + struct dirent *ep; + char *realp; + char *tmp = NULL; + char *path = NULL; + char *dev_str; + int size; - /* - * Disable libdevmapper errors. It's entirely possible user is not - * running devmapper, or that dm_name is not a devmapper device. - * That's totally ok, we will just harmlessly and silently return NULL. - */ - disable_libdevmapper_errors(); + if (dm_name == NULL) + return (NULL); + + /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ + realp = realpath(dm_name, NULL); + if (realp == NULL) + return (NULL); /* - * libdevmapper tutorial - * - * libdevmapper is basically a fancy wrapper for its ioctls. You - * create a "task", fill in the needed info to the task (fill in the - * ioctl fields), then run the task (call the ioctl). - * - * First we need the major/minor number for our DM device. + * If they preface 'dev' with a path (like "/dev") then strip it off. + * We just want the 'dm-N' part. */ - if (!(dmt = dm_task_create(DM_DEVICE_INFO))) - goto end; - - /* Lookup the name in libdevmapper */ - if (!dm_task_set_name(dmt, dm_name)) { - enable_libdevmapper_errors(); - goto end; - } - - if (!dm_task_run(dmt)) - goto end; - - /* Get DM device's major/minor */ - if (!dm_task_get_info(dmt, &info)) - goto end; - - /* We have major/minor number. 
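A short usage sketch for the new helper above, assuming it is exported alongside zfs_strip_partition() in the libzfs headers (check libzfs.h in-tree before relying on this):

    char devpath[] = "/dev/sda1";
    char *disk = zfs_strip_partition_path(devpath);

    if (disk != NULL) {
            /* disk now holds "/dev/sda"; the caller must free it. */
            printf("%s -> %s\n", devpath, disk);
            free(disk);
    }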
Lookup the dm device's children */ - if (!(dt = dm_tree_create())) - goto end; - - /* We add the device into the tree and its children get populated */ - if (!dm_tree_add_dev(dt, info.major, info.minor)) - goto end; - - if (!(root = dm_tree_find_node(dt, 0, 0))) - goto end; - - if (!(child = dm_tree_next_child(&handle, root, 1))) - goto end; + tmp = strrchr(realp, '/'); + if (tmp != NULL) + dev_str = tmp + 1; /* +1 since we want the chr after '/' */ + else + dev_str = tmp; - /* Get child's major/minor numbers */ - if (!(child_info = dm_tree_node_get_info(child))) + size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str); + if (size == -1 || !tmp) goto end; - if ((asprintf(&tmp, "/dev/block/%d:%d", child_info->major, - child_info->minor) == -1) || tmp == NULL) + dp = opendir(tmp); + if (dp == NULL) goto end; - /* Further translate /dev/block/ name into the normal name */ - name = realpath(tmp, NULL); - free(tmp); + /* Return first sd* entry in /sys/block/dm-N/slaves/ */ + while ((ep = readdir(dp))) { + if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ + size = asprintf(&path, "/dev/%s", ep->d_name); + break; + } + } end: - if (dmt) - dm_task_destroy(dmt); - if (dt) - dm_tree_free(dt); - enable_libdevmapper_errors(); -#endif /* HAVE_LIBDEVMAPPER */ - - return (name); + if (dp != NULL) + closedir(dp); + free(tmp); + free(realp); + return (path); } /* @@ -4436,7 +4427,7 @@ char *tmp; tmp = dm_get_underlying_path(dev_name); - if (!tmp) + if (tmp == NULL) return (0); free(tmp); @@ -4489,17 +4480,17 @@ char *name = NULL; char *tmp; - if (!dev_name) + if (dev_name == NULL) return (NULL); tmp = dm_get_underlying_path(dev_name); /* dev_name not a DM device, so just un-symlinkize it */ - if (!tmp) + if (tmp == NULL) tmp = realpath(dev_name, NULL); - if (tmp) { - name = zfs_strip_partition(tmp); + if (tmp != NULL) { + name = zfs_strip_partition_path(tmp); free(tmp); } @@ -4532,12 +4523,12 @@ size_t size; int tmpsize; - if (!dev_name) + if (dev_name == NULL) return (NULL); /* If they preface 'dev' with a path (like "/dev") then strip it off */ tmp1 = strrchr(dev_name, '/'); - if (tmp1) + if (tmp1 != NULL) dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */ tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name); @@ -4558,7 +4549,7 @@ */ while ((ep = readdir(dp))) { /* Ignore everything that's not our enclosure_device link */ - if (!strstr(ep->d_name, "enclosure_device")) + if (strstr(ep->d_name, "enclosure_device") == NULL) continue; if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1 || @@ -4605,7 +4596,7 @@ free(tmp2); free(tmp1); - if (dp) + if (dp != NULL) closedir(dp); return (path); diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_sendrecv.c zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_sendrecv.c --- zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_sendrecv.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_sendrecv.c 2017-01-20 18:18:28.000000000 +0000 @@ -259,7 +259,7 @@ * a power of 2 if necessary. 
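The replacement for the libdevmapper walk deleted above is plain sysfs: the kernel already names a DM device's component devices in /sys/block/dm-N/slaves/. A self-contained sketch of that lookup, mirroring dm_get_underlying_path(); error handling is trimmed, and asprintf() plus d_type need _GNU_SOURCE:

    #define _GNU_SOURCE     /* asprintf(), DT_* */
    #include <dirent.h>
    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Given "dm-0", return "/dev/<first slave>" (e.g. "/dev/sdc"),
     * or NULL. Caller frees the result.
     */
    static char *
    first_slave(const char *dm)
    {
            char *dir = NULL, *path = NULL;
            struct dirent *ep;
            DIR *dp;

            if (asprintf(&dir, "/sys/block/%s/slaves/", dm) == -1)
                    return (NULL);
            if ((dp = opendir(dir)) != NULL) {
                    while ((ep = readdir(dp)) != NULL) {
                            /* slaves are symlinks; skip "." and ".." */
                            if (ep->d_type != DT_DIR) {
                                    if (asprintf(&path, "/dev/%s",
                                        ep->d_name) == -1)
                                            path = NULL;
                                    break;
                            }
                    }
                    (void) closedir(dp);
            }
            free(dir);
            return (path);
    }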
*/ if (!ISP2(numbuckets)) - numbuckets = 1 << high_order_bit(numbuckets); + numbuckets = 1ULL << high_order_bit(numbuckets); ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *)); ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0, @@ -366,11 +366,12 @@ if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, zero_cksum) || !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) { - SHA256_CTX ctx; + SHA2_CTX ctx; zio_cksum_t tmpsha256; - zio_checksum_SHA256(buf, - payload_size, &ctx, &tmpsha256); + SHA2Init(SHA256, &ctx); + SHA2Update(&ctx, buf, payload_size); + SHA2Final(&tmpsha256, &ctx); drrw->drr_key.ddk_cksum.zc_word[0] = BE_64(tmpsha256.zc_word[0]); @@ -2614,7 +2615,7 @@ else progress = B_TRUE; sprintf(guidname, "%llu", - (u_longlong_t) parent_fromsnap_guid); + (u_longlong_t)parent_fromsnap_guid); nvlist_add_boolean(deleted, guidname); continue; } @@ -2648,7 +2649,7 @@ parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) { sprintf(guidname, "%llu", - (u_longlong_t) parent_fromsnap_guid); + (u_longlong_t)parent_fromsnap_guid); if (nvlist_exists(deleted, guidname)) { progress = B_TRUE; needagain = B_TRUE; diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_util.c zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_util.c --- zfs-linux-0.7.0-rc2/lib/libzfs/libzfs_util.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs/libzfs_util.c 2017-01-20 18:18:28.000000000 +0000 @@ -617,7 +617,7 @@ double val; if (format == ZFS_NICENUM_RAW) { - snprintf(buf, buflen, "%llu", (u_longlong_t) num); + snprintf(buf, buflen, "%llu", (u_longlong_t)num); return; } @@ -633,12 +633,12 @@ if ((format == ZFS_NICENUM_TIME) && (num == 0)) { (void) snprintf(buf, buflen, "-"); } else if ((index == 0) || ((num % - (uint64_t) powl(k_unit[format], index)) == 0)) { + (uint64_t)powl(k_unit[format], index)) == 0)) { /* * If this is an even multiple of the base, always display * without any decimal precision. */ - (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t) n, u); + (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t)n, u); } else { /* @@ -652,8 +652,8 @@ */ int i; for (i = 2; i >= 0; i--) { - val = (double) num / - (uint64_t) powl(k_unit[format], index); + val = (double)num / + (uint64_t)powl(k_unit[format], index); /* * Don't print floating point values for time. Note, @@ -752,7 +752,7 @@ int status; while ((error = waitpid(pid, &status, 0)) == -1 && - errno == EINTR); + errno == EINTR) { } if (error < 0 || !WIFEXITED(status)) return (-1); @@ -863,12 +863,13 @@ return (NULL); } - hdl->libzfs_sharetab = fopen("/etc/dfs/sharetab", "r"); + hdl->libzfs_sharetab = fopen(ZFS_SHARETAB, "r"); if (libzfs_core_init() != 0) { (void) close(hdl->libzfs_fd); (void) fclose(hdl->libzfs_mnttab); - (void) fclose(hdl->libzfs_sharetab); + if (hdl->libzfs_sharetab) + (void) fclose(hdl->libzfs_sharetab); free(hdl); return (NULL); } @@ -1136,6 +1137,45 @@ } /* + * Given a full path to a device determine if that device appears in the + * import search path. If it does return the first match and store the + * index in the passed 'order' variable, otherwise return an error. 
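One of the quieter fixes in this range is the libzfs_sendrecv.c change from "1 <<" to "1ULL <<" above: the integer constant 1 has type int, so the shift is evaluated in 32 bits, and a shift count of 31 or more is undefined behavior that could yield a bogus dedup-table bucket count before being widened. A two-line illustration:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            unsigned int n = 32;
            uint64_t bad = 1 << n;          /* int shift: undefined for n >= 31 */
            uint64_t good = 1ULL << n;      /* 64-bit shift, as in the patch */

            printf("%llu vs %llu\n", (unsigned long long)bad,
                (unsigned long long)good);
            return (0);
    }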
+ */ +int +zfs_path_order(char *name, int *order) +{ + int i = 0, error = ENOENT; + char *dir, *env, *envdup; + + env = getenv("ZPOOL_IMPORT_PATH"); + if (env) { + envdup = strdup(env); + dir = strtok(envdup, ":"); + while (dir) { + if (strncmp(name, dir, strlen(dir)) == 0) { + *order = i; + error = 0; + break; + } + dir = strtok(NULL, ":"); + i++; + } + free(envdup); + } else { + for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) { + if (strncmp(name, zpool_default_import_path[i], + strlen(zpool_default_import_path[i])) == 0) { + *order = i; + error = 0; + break; + } + } + } + + return (error); +} + +/* * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from * an ioctl(). */ diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs/Makefile.am zfs-linux-0.7.0-rc3/lib/libzfs/Makefile.am --- zfs-linux-0.7.0-rc2/lib/libzfs/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -35,7 +35,7 @@ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libzpool/libzpool.la -libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) $(LIBDEVMAPPER) +libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) libzfs_la_LDFLAGS = -version-info 2:0:0 EXTRA_DIST = $(libzfs_pc_DATA) $(USER_C) diff -Nru zfs-linux-0.7.0-rc2/lib/libzfs_core/libzfs_core.c zfs-linux-0.7.0-rc3/lib/libzfs_core/libzfs_core.c --- zfs-linux-0.7.0-rc2/lib/libzfs_core/libzfs_core.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzfs_core/libzfs_core.c 2017-01-20 18:18:28.000000000 +0000 @@ -617,7 +617,7 @@ fnvlist_add_string(innvl, "origin", origin); fnvlist_add_byte_array(innvl, "begin_record", - (uchar_t *) &drr, sizeof (drr)); + (uchar_t *)&drr, sizeof (drr)); fnvlist_add_int32(innvl, "input_fd", input_fd); diff -Nru zfs-linux-0.7.0-rc2/lib/libzpool/kernel.c zfs-linux-0.7.0-rc3/lib/libzpool/kernel.c --- zfs-linux-0.7.0-rc2/lib/libzpool/kernel.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzpool/kernel.c 2017-01-20 18:18:28.000000000 +0000 @@ -127,7 +127,7 @@ void * zk_thread_helper(void *arg) { - kthread_t *kt = (kthread_t *) arg; + kthread_t *kt = (kthread_t *)arg; VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0); @@ -137,7 +137,7 @@ (void) setpriority(PRIO_PROCESS, 0, kt->t_pri); kt->t_tid = pthread_self(); - ((thread_func_arg_t) kt->t_func)(kt->t_arg); + ((thread_func_arg_t)kt->t_func)(kt->t_arg); /* Unreachable, thread must exit with thread_exit() */ abort(); @@ -737,7 +737,7 @@ /*ARGSUSED*/ int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, - int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) + int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) { ssize_t rc, done = 0, split; @@ -916,7 +916,7 @@ if (dprintf_find_string("pid")) (void) printf("%d ", getpid()); if (dprintf_find_string("tid")) - (void) printf("%u ", (uint_t) pthread_self()); + (void) printf("%u ", (uint_t)pthread_self()); if (dprintf_find_string("cpu")) (void) printf("%u ", getcpuid()); if (dprintf_find_string("time")) @@ -1490,7 +1490,7 @@ fstrans_cookie_t spl_fstrans_mark(void) { - return ((fstrans_cookie_t) 0); + return ((fstrans_cookie_t)0); } void diff -Nru zfs-linux-0.7.0-rc2/lib/libzpool/Makefile.am zfs-linux-0.7.0-rc3/lib/libzpool/Makefile.am --- zfs-linux-0.7.0-rc2/lib/libzpool/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzpool/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -26,11 +26,14 @@ zfs_fletcher_sse.c \ zfs_fletcher_avx512.c \ zfs_fletcher_aarch64_neon.c \ + 
zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ zfs_uio.c \ zpool_prop.c \ zprop_common.c \ + abd.c \ arc.c \ blkptr.c \ bplist.c \ @@ -102,6 +105,8 @@ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ vdev_raidz_math_avx2.c \ + vdev_raidz_math_avx512f.c \ + vdev_raidz_math_avx512bw.c \ vdev_raidz_math_aarch64_neon.c \ vdev_raidz_math_aarch64_neonx2.c \ vdev_root.c \ diff -Nru zfs-linux-0.7.0-rc2/lib/libzpool/taskq.c zfs-linux-0.7.0-rc3/lib/libzpool/taskq.c --- zfs-linux-0.7.0-rc2/lib/libzpool/taskq.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/lib/libzpool/taskq.c 2017-01-20 18:18:28.000000000 +0000 @@ -32,6 +32,7 @@ int taskq_now; taskq_t *system_taskq; +taskq_t *system_delay_taskq; #define TASKQ_ACTIVE 0x00010000 @@ -353,6 +354,8 @@ { system_taskq = taskq_create("system_taskq", 64, maxclsyspri, 4, 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); + system_delay_taskq = taskq_create("delay_taskq", 4, maxclsyspri, 4, + 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); } void @@ -360,4 +363,6 @@ { taskq_destroy(system_taskq); system_taskq = NULL; /* defensive */ + taskq_destroy(system_delay_taskq); + system_delay_taskq = NULL; } diff -Nru zfs-linux-0.7.0-rc2/Makefile.am zfs-linux-0.7.0-rc3/Makefile.am --- zfs-linux-0.7.0-rc2/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -43,7 +43,7 @@ cstyle: @find ${top_srcdir} -name '*.[hc]' ! -name 'zfs_config.*' \ - ! -name '*.mod.c' -type f -exec scripts/cstyle.pl {} \+ + ! -name '*.mod.c' -type f -exec scripts/cstyle.pl -cpP {} \+ shellcheck: @if type shellcheck > /dev/null 2>&1; then \ @@ -55,13 +55,18 @@ done; \ fi -lint: cppcheck +lint: cppcheck paxcheck cppcheck: @if type cppcheck > /dev/null 2>&1; then \ cppcheck --quiet --force --error-exitcode=2 ${top_srcdir}; \ fi +paxcheck: + @if type scanelf > /dev/null 2>&1; then \ + scripts/paxcheck.sh ${top_srcdir}; \ + fi + flake8: @if type flake8 > /dev/null 2>&1; then \ flake8 ${top_srcdir}; \ diff -Nru zfs-linux-0.7.0-rc2/man/man5/zfs-module-parameters.5 zfs-linux-0.7.0-rc3/man/man5/zfs-module-parameters.5 --- zfs-linux-0.7.0-rc2/man/man5/zfs-module-parameters.5 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/man/man5/zfs-module-parameters.5 2017-01-20 18:18:28.000000000 +0000 @@ -182,6 +182,30 @@ .sp .ne 2 .na +\fBzfs_metaslab_segment_weight_enabled\fR (int) +.ad +.RS 12n +Enable/disable segment-based metaslab selection. +.sp +Use \fB1\fR for yes (default) and \fB0\fR for no. +.RE + +.sp +.ne 2 +.na +\fBzfs_metaslab_switch_threshold\fR (int) +.ad +.RS 12n +When using segment-based metaslab selection, continue allocating +from the active metaslab until \fBlzfs_metaslab_switch_threshold\fR +worth of buckets have been exhausted. +.sp +Default value: \fB2\fR. +.RE + +.sp +.ne 2 +.na \fBmetaslab_debug_load\fR (int) .ad .RS 12n @@ -421,7 +445,7 @@ .ad .RS 12n Percentage of ARC dnodes to try to scan in response to demand for non-metadata -when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fB. +when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fR. .sp Default value: \fB10% of the number of dnodes in the ARC\fR. @@ -606,7 +630,7 @@ the level of these "sub-lists". This parameters controls the number of sub-lists per ARC state. 
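Stepping back to the libzpool taskq.c hunk above: system_delay_taskq gives the userspace consumers (ztest, zdb) a queue for deferred dispatch, matching what the kernel side already provides. A hedged sketch of how such a queue is driven through the SPL-style taskq API; this builds only in-tree, and the deadline argument to taskq_dispatch_delay() is an absolute lbolt value:

    /* In-tree sketch; zfs_context.h supplies the taskq API. */
    static void
    my_task(void *arg)
    {
            /* deferred work goes here */
    }

    static void
    schedule_example(void)
    {
            taskqid_t id;

            /* Run my_task roughly one second from now. */
            id = taskq_dispatch_delay(system_delay_taskq, my_task, NULL,
                TQ_SLEEP, ddi_get_lbolt() + MSEC_TO_TICK(1000));
            if (id == TASKQID_INVALID) {
                    /* dispatch failed; fall back to running inline */
            }
    }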
.sp -Default value: \fR1\fB or the number of online CPUs, whichever is greater +Default value: \fB1\fR or the number of online CPUs, whichever is greater .RE .sp @@ -1236,7 +1260,7 @@ \fBzfs_free_min_time_ms\fR (int) .ad .RS 12n -During a \fRzfs destroy\fB operation using \fRfeature@async_destroy\fB a minimum +During a \fBzfs destroy\fR operation using \fBfeature@async_destroy\fR a minimum of this much time will be spent working on freeing blocks per txg. .sp Default value: \fB1,000\fR. @@ -1249,7 +1273,7 @@ .ad .RS 12n Largest data block to write to zil. Larger blocks will be treated as if the -dataset being written to had the property setting \fRlogbias=throughput\fB. +dataset being written to had the property setting \fBlogbias=throughput\fR. .sp Default value: \fB32,768\fR. .RE @@ -1391,7 +1415,7 @@ .ad .RS 12n The number of bytes which should be prefetched during a pool traversal -(eg: \fRzfs send\fB or other data crawling operations) +(eg: \fBzfs send\fR or other data crawling operations) .sp Default value: \fB52,428,800\fR. .RE @@ -1775,6 +1799,8 @@ sse2 - implementation using SSE2 instruction set (64bit x86 only) ssse3 - implementation using SSSE3 instruction set (64bit x86 only) avx2 - implementation using AVX2 instruction set (64bit x86 only) + avx512f - implementation using AVX512F instruction set (64bit x86 only) + avx512bw - implementation using AVX512F & AVX512BW instruction sets (64bit x86 only) aarch64_neon - implementation using NEON (Aarch64/64 bit ARMv8 only) aarch64_neonx2 - implementation using NEON with more unrolling (Aarch64/64 bit ARMv8 only) .sp @@ -1861,7 +1887,7 @@ Throttle block allocations in the ZIO pipeline. This allows for dynamic allocation distribution when devices are imbalanced. .sp -Default value: \fB0\fR. +Default value: \fB1\fR. .RE .sp diff -Nru zfs-linux-0.7.0-rc2/man/man8/zfs.8 zfs-linux-0.7.0-rc3/man/man8/zfs.8 --- zfs-linux-0.7.0-rc2/man/man8/zfs.8 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/man/man8/zfs.8 2017-01-20 18:18:28.000000000 +0000 @@ -22,7 +22,7 @@ .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow -.\" Copyright (c) 2011, 2015 by Delphix. All rights reserved. +.\" Copyright (c) 2011, 2016 by Delphix. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. @@ -107,7 +107,7 @@ .LP .nf -+\fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR... \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR... +\fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR... \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR... .fi .LP @@ -160,12 +160,12 @@ .LP .nf -\fBzfs\fR \fBshare\fR \fB-a\fR | \fIfilesystem\fR +\fBzfs\fR \fBshare\fR [\fBnfs\fR|\fBsmb\fR] \fB-a\fR | \fIfilesystem\fR .fi .LP .nf -\fBzfs\fR \fBunshare\fR \fB-a\fR \fIfilesystem\fR|\fImountpoint\fR +\fBzfs\fR \fBunshare\fR [\fBnfs\fR|\fBsmb\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR .fi .LP @@ -603,9 +603,9 @@ .ad .sp .6 .RS 4n -The amount of space consumed by this dataset and all its descendents. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendent datasets. 
The amount of space that a dataset consumes from its parent, as well as the amount of space that are freed if this dataset is recursively destroyed, is the greater of its space used and its reservation. +The amount of space consumed by this dataset and all its descendents. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendent datasets. The amount of space that a dataset consumes from its parent, as well as the amount of space that is freed if this dataset is recursively destroyed, is the greater of its space used and its reservation. .sp -When snapshots (see the "Snapshots" section) are created, their space is initially shared between the snapshot and the file system, and possibly with previous snapshots. As the file system changes, space that was previously shared becomes unique to the snapshot, and counted in the snapshot's space used. Additionally, deleting snapshots can increase the amount of space unique to (and used by) other snapshots. +The used space of a snapshot (see the "Snapshots" section) is space that is referenced exclusively by this snapshot. If this snapshot is destroyed, the amount of \fBused\fR space will be freed. Space that is shared by multiple snapshots isn't accounted for in this metric. When a snapshot is destroyed, space that was previously shared with this snapshot can become unique to snapshots adjacent to it, thus changing the used space of those snapshots. The used space of the latest snapshot can also be affected by changes in the file system. Note that the \fBused\fR space of a snapshot is a subset of the \fBwritten\fR space of the snapshot. .sp The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using \fBfsync\fR(2) or \fBO_SYNC\fR (see \fBopen\fR(2)) does not necessarily guarantee that the space usage information is updated immediately. .RE @@ -756,8 +756,8 @@ .ad .sp .6 .RS 4n -The amount of \fBreferenced\fR space written to this dataset since the -previous snapshot. +The amount of space \fBreferenced\fR by this dataset, that was written since the previous snapshot +(i.e. that is not referenced by the previous snapshot). .RE .sp @@ -1620,7 +1620,7 @@ When making programmatic use of user properties, it is strongly suggested to use a reversed \fBDNS\fR domain name for the \fImodule\fR component of property names to reduce the chance that two independently-developed packages use the same property name for different purposes. For example, property names beginning with \fBcom.sun\fR. are reserved for definition by Oracle Corporation (which acquired Sun Microsystems). .sp .LP -The values of user properties are arbitrary strings, are always inherited, and are never validated. All of the commands that operate on properties (\fBzfs list\fR, \fBzfs get\fR, \fBzfs set\fR, and so forth) can be used to manipulate both native properties and user properties. Use the \fBzfs inherit\fR command to clear a user property. If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 1024 characters. +The values of user properties are arbitrary strings, are always inherited, and are never validated. 
All of the commands that operate on properties (\fBzfs list\fR, \fBzfs get\fR, \fBzfs set\fR, and so forth) can be used to manipulate both native properties and user properties. Use the \fBzfs inherit\fR command to clear a user property. If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 8192 bytes. .SS "ZFS Volumes as Swap" .LP \fBZFS\fR volumes may be used as Linux swap devices. After creating the volume @@ -2646,7 +2646,7 @@ .sp .ne 2 .na -\fB\fBzfs share\fR \fB-a\fR | \fIfilesystem\fR\fR +\fB\fBzfs share\fR [\fBnfs\fR|\fBsmb\fR] \fB-a\fR | \fIfilesystem\fR\fR .ad .sp .6 .RS 4n @@ -2658,7 +2658,7 @@ .ad .sp .6 .RS 4n -Share all available \fBZFS\fR file systems. Invoked automatically as part of the boot process. +Share all available \fBZFS\fR file systems. Invoked automatically as part of the boot process. Additionally if one of \fBnfs\fR|\fBsmb\fR protocols is specified only share file systems whose \fBsharenfs\fR|\fBsharesmb\fR is set. .RE .sp @@ -2676,11 +2676,11 @@ .sp .ne 2 .na -\fB\fBzfs unshare\fR \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR +\fB\fBzfs unshare\fR [\fBnfs\fR|\fBsmb\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR .ad .sp .6 .RS 4n -Unshares currently shared \fBZFS\fR file systems. This is invoked automatically as part of the shutdown process. +Unshares currently shared \fBZFS\fR file systems. This is invoked automatically as part of the shutdown process. Additionally if one of \fBnfs\fR|\fBsmb\fR is specified unshare only file systems currently shared by that protocol. .sp .ne 2 .na diff -Nru zfs-linux-0.7.0-rc2/man/man8/zinject.8 zfs-linux-0.7.0-rc3/man/man8/zinject.8 --- zfs-linux-0.7.0-rc2/man/man8/zinject.8 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/man/man8/zinject.8 2017-01-20 18:18:28.000000000 +0000 @@ -100,7 +100,7 @@ .TP .BI "\-b" " objset:object:level:start:end" Force an error into the pool at this bookmark tuple. Each number is -in hexidecimal, and only one block can be specified. +in hexadecimal, and only one block can be specified. .TP .BI "\-d" " vdev" A vdev specified by path or GUID. diff -Nru zfs-linux-0.7.0-rc2/man/man8/zpool.8 zfs-linux-0.7.0-rc3/man/man8/zpool.8 --- zfs-linux-0.7.0-rc2/man/man8/zpool.8 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/man/man8/zpool.8 2017-01-20 18:18:28.000000000 +0000 @@ -96,7 +96,7 @@ .LP .nf -\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [\fB-lq\fR]|[\fB-r\fR|-\fBw\fR]] +\fB\fBzpool iostat\fR [\fB-c\fR \fBCMD\fR] [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [\fB-lq\fR]|[\fB-r\fR|-\fBw\fR]] [[\fIpool\fR ...]|[\fIpool vdev\fR ...]|[\fIvdev\fR ...]] [\fIinterval\fR[\fIcount\fR]]\fR .fi @@ -159,7 +159,7 @@ .LP .nf -\fBzpool status\fR [\fB-gLPvxD\fR] [\fB-T\fR d | u] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]] +\fBzpool status\fR [\fB-c\fR \fBCMD\fR] [\fB-gLPvxD\fR] [\fB-T\fR d | u] [\fIpool\fR] ... 
[\fIinterval\fR [\fIcount\fR]] .fi .LP @@ -1523,7 +1523,7 @@ .sp .ne 2 .na -\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [[\fB-lq\fR]|[\fB-r\fR|\fB-w\fR]] [[\fIpool\fR ...]|[\fIpool vdev\fR ...]|[\fIvdev\fR ...]] [\fIinterval\fR[\fIcount\fR]]\fR +\fB\fBzpool iostat\fR [\fB-c\fR \fBCMD\fR] [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-ghHLpPvy\fR] [[\fB-lq\fR]|[\fB-r\fR|\fB-w\fR]] [[\fIpool\fR ...]|[\fIpool vdev\fR ...]|[\fIvdev\fR ...]] [\fIinterval\fR[\fIcount\fR]]\fR .ad .sp .6 @@ -1542,6 +1542,23 @@ .sp .ne 2 .na +\fB\fB-c\fR \fBCMD\fR +.ad +.RS 12n +Run a command on each vdev and include first line of output +.sp +The \fB-c\fR option allows you to run an arbitrary command on each vdev and +display the first line of output in zpool iostat. The environment vars +\fBVDEV_PATH\fR and \fBVDEV_UPATH\fR are set to the vdev's path and "underlying +path" before running the command. For device mapper, multipath, or partitioned +vdevs, \fBVDEV_UPATH\fR is the actual underlying /dev/sd* disk. This can be +useful if the command you're running requires a /dev/sd* device. Commands run +in parallel for each vdev for performance. +.RE + +.sp +.ne 2 +.na \fB\fB-T\fR \fBu\fR | \fBd\fR\fR .ad .RS 12n @@ -1764,6 +1781,9 @@ in the queues. If you specify an interval, the measurements will be sampled from the end of the interval. .RE + +.RE + .sp .ne 2 .na @@ -2079,7 +2099,7 @@ .sp .ne 2 .na -\fBzpool status\fR [\fB-gLPvxD\fR] [\fB-T\fR d | u] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]] +\fBzpool status\fR [\fB-c\fR \fBCMD\fR] [\fB-gLPvxD\fR] [\fB-T\fR d | u] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]] .ad .sp .6 .RS 4n @@ -2090,6 +2110,23 @@ .sp .ne 2 .na +\fB\fB-c\fR \fBCMD\fR +.ad +.RS 12n +Run a command on each vdev and include first line of output +.sp +The \fB-c\fR option allows you to run an arbitrary command on each vdev and +display the first line of output in zpool status. The environment vars +\fBVDEV_PATH\fR and \fBVDEV_UPATH\fR are set to the vdev's path and "underlying +path" before running the command. For device mapper, multipath, or partitioned +vdevs, \fBVDEV_UPATH\fR is the actual underlying /dev/sd* disk. This can be +useful if the command you're running requires a /dev/sd* device. Commands run +in parallel for each vdev for performance. 
+.RE + +.sp +.ne 2 +.na \fB\fB-g\fR\fR .ad .RS 12n @@ -2516,6 +2553,40 @@ c1t3d0 - - - - - .fi .in -2 +.sp + +.LP +\fBExample 16 \fRRunning commands in zpool status and zpool iostat with -c +.sp +.LP +Some examples of using the command (-c) option with zpool status and zpool +iostat: +.sp +.in +2 +.nf +# \fBzpool status -c \[aq]echo I am $VDEV_PATH, $VDEV_UPATH\[aq]\fR +NAME STATE READ WRITE CKSUM +mypool ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + mpatha ONLINE 0 0 0 I am /dev/mapper/mpatha, /dev/sdc + sdb ONLINE 0 0 0 I am /dev/sdb1, /dev/sdb +.fi +.in -2 + +.sp +.in +2 +.nf +# \fBzpool iostat -v -c \[aq]smartctl -a $VDEV_UPATH | grep "Current Drive Temperature"\[aq]\fR +mypool 997M 7.25T 0 0 105K 106K + mirror 997M 7.25T 0 0 105K 106K + B0 - - 0 0 17.4K 15.2K Current Drive Temperature: 25 C + B1 - - 0 0 17.4K 15.2K Current Drive Temperature: 24 C + B2 - - 0 0 17.5K 15.2K Current Drive Temperature: 24 C + B3 - - 0 0 0 15.1K Current Drive Temperature: 24 C +logs - - - - - - + B8 0 7.25T 0 0 1.14K 20.2K Current Drive Temperature: 23 C +.fi +.in -2 .SH EXIT STATUS .sp diff -Nru zfs-linux-0.7.0-rc2/META zfs-linux-0.7.0-rc3/META --- zfs-linux-0.7.0-rc2/META 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/META 2017-01-20 18:18:28.000000000 +0000 @@ -2,7 +2,7 @@ Name: zfs Branch: 1.0 Version: 0.7.0 -Release: rc2 +Release: rc3 Release-Tags: relext License: CDDL Author: OpenZFS on Linux diff -Nru zfs-linux-0.7.0-rc2/module/icp/algs/aes/aes_impl.c zfs-linux-0.7.0-rc3/module/icp/algs/aes/aes_impl.c --- zfs-linux-0.7.0-rc2/module/icp/algs/aes/aes_impl.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/algs/aes/aes_impl.c 2017-01-20 18:18:28.000000000 +0000 @@ -1593,18 +1593,17 @@ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a"(func), "c"(subfunc)); - if (memcmp((char *) (&ebx), "Genu", 4) == 0 && - memcmp((char *) (&edx), "ineI", 4) == 0 && - memcmp((char *) (&ecx), "ntel", 4) == 0) { - + if (memcmp((char *)(&ebx), "Genu", 4) == 0 && + memcmp((char *)(&edx), "ineI", 4) == 0 && + memcmp((char *)(&ecx), "ntel", 4) == 0) { func = 1; subfunc = 0; /* check for aes-ni instruction set */ __asm__ __volatile__( - "cpuid" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) - : "a"(func), "c"(subfunc)); + "cpuid" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a"(func), "c"(subfunc)); cached_result = !!(ecx & INTEL_AESNI_FLAG); } else { diff -Nru zfs-linux-0.7.0-rc2/module/icp/algs/modes/gcm.c zfs-linux-0.7.0-rc3/module/icp/algs/modes/gcm.c --- zfs-linux-0.7.0-rc2/module/icp/algs/modes/gcm.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/algs/modes/gcm.c 2017-01-20 18:18:28.000000000 +0000 @@ -723,18 +723,17 @@ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a"(func), "c"(subfunc)); - if (memcmp((char *) (&ebx), "Genu", 4) == 0 && - memcmp((char *) (&edx), "ineI", 4) == 0 && - memcmp((char *) (&ecx), "ntel", 4) == 0) { - + if (memcmp((char *)(&ebx), "Genu", 4) == 0 && + memcmp((char *)(&edx), "ineI", 4) == 0 && + memcmp((char *)(&ecx), "ntel", 4) == 0) { func = 1; subfunc = 0; /* check for aes-ni instruction set */ __asm__ __volatile__( - "cpuid" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) - : "a"(func), "c"(subfunc)); + "cpuid" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a"(func), "c"(subfunc)); cached_result = !!(ecx & INTEL_PCLMULQDQ_FLAG); } else { diff -Nru zfs-linux-0.7.0-rc2/module/icp/asm-x86_64/sha2/sha256_impl.S zfs-linux-0.7.0-rc3/module/icp/asm-x86_64/sha2/sha256_impl.S --- 
zfs-linux-0.7.0-rc2/module/icp/asm-x86_64/sha2/sha256_impl.S 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/asm-x86_64/sha2/sha256_impl.S 2017-01-20 18:18:28.000000000 +0000 @@ -33,7 +33,7 @@ * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect - * perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], + * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates diff -Nru zfs-linux-0.7.0-rc2/module/icp/asm-x86_64/sha2/sha512_impl.S zfs-linux-0.7.0-rc3/module/icp/asm-x86_64/sha2/sha512_impl.S --- zfs-linux-0.7.0-rc2/module/icp/asm-x86_64/sha2/sha512_impl.S 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/asm-x86_64/sha2/sha512_impl.S 2017-01-20 18:18:28.000000000 +0000 @@ -33,7 +33,7 @@ * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect - * perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], + * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates diff -Nru zfs-linux-0.7.0-rc2/module/icp/core/kcf_callprov.c zfs-linux-0.7.0-rc3/module/icp/core/kcf_callprov.c --- zfs-linux-0.7.0-rc2/module/icp/core/kcf_callprov.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/core/kcf_callprov.c 2017-01-20 18:18:28.000000000 +0000 @@ -282,7 +282,7 @@ prov_chain = me->me_hw_prov_chain; /* - * We check for the threshhold for using a hardware provider for + * We check for the threshold for using a hardware provider for * this amount of data. If there is no software provider available * for the mechanism, then the threshold is ignored. */ diff -Nru zfs-linux-0.7.0-rc2/module/icp/core/kcf_mech_tabs.c zfs-linux-0.7.0-rc3/module/icp/core/kcf_mech_tabs.c --- zfs-linux-0.7.0-rc2/module/icp/core/kcf_mech_tabs.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/core/kcf_mech_tabs.c 2017-01-20 18:18:28.000000000 +0000 @@ -100,7 +100,7 @@ }; /* - * Per-algorithm internal threasholds for the minimum input size of before + * Per-algorithm internal thresholds for the minimum input size of before * offloading to hardware provider. * Dispatching a crypto operation to a hardware provider entails paying the * cost of an additional context switch. 
Measurments with Sun Accelerator 4000 diff -Nru zfs-linux-0.7.0-rc2/module/icp/core/kcf_prov_tabs.c zfs-linux-0.7.0-rc3/module/icp/core/kcf_prov_tabs.c --- zfs-linux-0.7.0-rc2/module/icp/core/kcf_prov_tabs.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/core/kcf_prov_tabs.c 2017-01-20 18:18:28.000000000 +0000 @@ -67,8 +67,9 @@ void kcf_prov_tab_destroy(void) { - if (prov_tab) kmem_free(prov_tab, prov_tab_max * - sizeof (kcf_provider_desc_t *)); + if (prov_tab) + kmem_free(prov_tab, prov_tab_max * + sizeof (kcf_provider_desc_t *)); } /* diff -Nru zfs-linux-0.7.0-rc2/module/icp/core/kcf_sched.c zfs-linux-0.7.0-rc3/module/icp/core/kcf_sched.c --- zfs-linux-0.7.0-rc2/module/icp/core/kcf_sched.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/core/kcf_sched.c 2017-01-20 18:18:28.000000000 +0000 @@ -561,7 +561,7 @@ taskq_t *taskq = new_pd->pd_sched_info.ks_taskq; if (taskq_dispatch(taskq, process_req_hwp, areq, TQ_NOSLEEP) == - (taskqid_t)0) { + TASKQID_INVALID) { error = CRYPTO_HOST_MEMORY; } else { error = CRYPTO_QUEUED; @@ -782,7 +782,7 @@ if (taskq_dispatch(taskq, process_req_hwp, areq, TQ_NOSLEEP) == - (taskqid_t)0) { + TASKQID_INVALID) { error = CRYPTO_HOST_MEMORY; if (!(crq->cr_flag & CRYPTO_SKIP_REQID)) kcf_reqid_delete(areq); @@ -1062,7 +1062,7 @@ for (i = 0; i < REQID_TABLES; i++) { if (kcf_reqid_table[i]) kmem_free(kcf_reqid_table[i], - sizeof (kcf_reqid_table_t)); + sizeof (kcf_reqid_table_t)); } if (gswq) diff -Nru zfs-linux-0.7.0-rc2/module/icp/include/sha1/sha1.h zfs-linux-0.7.0-rc3/module/icp/include/sha1/sha1.h --- zfs-linux-0.7.0-rc2/module/icp/include/sha1/sha1.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/include/sha1/sha1.h 2017-01-20 18:18:28.000000000 +0000 @@ -35,7 +35,7 @@ /* * NOTE: n2rng (Niagara2 RNG driver) accesses the state field of * SHA1_CTX directly. NEVER change this structure without verifying - * compatiblity with n2rng. The important thing is that the state + * compatibility with n2rng. The important thing is that the state * must be in a field declared as uint32_t state[5]. */ /* SHA-1 context. 
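The threshold logic described in the kcf_callprov.c and kcf_mech_tabs.c comments above boils down to a simple dispatch rule: below a per-algorithm input size, the context switch into a hardware provider costs more than it saves, unless no software provider exists at all. An illustrative reduction of that rule; the names and the threshold constant are invented, not the actual KCF data structures:

    #define HW_OFFLOAD_THRESHOLD    256     /* bytes, per-algorithm tunable */

    static int
    use_hw_provider(size_t len, int have_sw_provider)
    {
            if (!have_sw_provider)
                    return (1);     /* threshold ignored without software */
            return (len >= HW_OFFLOAD_THRESHOLD);
    }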
*/ diff -Nru zfs-linux-0.7.0-rc2/module/icp/os/modconf.c zfs-linux-0.7.0-rc3/module/icp/os/modconf.c --- zfs-linux-0.7.0-rc2/module/icp/os/modconf.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/os/modconf.c 2017-01-20 18:18:28.000000000 +0000 @@ -71,7 +71,7 @@ if (modlp->ml_rev != MODREV_1) { cmn_err(CE_WARN, "mod_install: " - "modlinkage structure is not MODREV_1\n"); + "modlinkage structure is not MODREV_1\n"); return (EINVAL); } linkpp = (struct modlmisc **)&modlp->ml_linkage[0]; @@ -168,4 +168,4 @@ if (retval == 0) return (1); return (0); -} \ No newline at end of file +} diff -Nru zfs-linux-0.7.0-rc2/module/icp/spi/kcf_spi.c zfs-linux-0.7.0-rc3/module/icp/spi/kcf_spi.c --- zfs-linux-0.7.0-rc2/module/icp/spi/kcf_spi.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/icp/spi/kcf_spi.c 2017-01-20 18:18:28.000000000 +0000 @@ -701,16 +701,13 @@ ks_data = ksp->ks_data; - ks_data->ps_ops_total.value.ui64 = - pd->pd_sched_info.ks_ndispatches; - ks_data->ps_ops_failed.value.ui64 = - pd->pd_sched_info.ks_nfails; - ks_data->ps_ops_busy_rval.value.ui64 = - pd->pd_sched_info.ks_nbusy_rval; + ks_data->ps_ops_total.value.ui64 = pd->pd_sched_info.ks_ndispatches; + ks_data->ps_ops_failed.value.ui64 = pd->pd_sched_info.ks_nfails; + ks_data->ps_ops_busy_rval.value.ui64 = pd->pd_sched_info.ks_nbusy_rval; ks_data->ps_ops_passed.value.ui64 = - pd->pd_sched_info.ks_ndispatches - - pd->pd_sched_info.ks_nfails - - pd->pd_sched_info.ks_nbusy_rval; + pd->pd_sched_info.ks_ndispatches - + pd->pd_sched_info.ks_nfails - + pd->pd_sched_info.ks_nbusy_rval; return (0); } diff -Nru zfs-linux-0.7.0-rc2/module/Makefile.in zfs-linux-0.7.0-rc3/module/Makefile.in --- zfs-linux-0.7.0-rc2/module/Makefile.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/Makefile.in 2017-01-20 18:18:28.000000000 +0000 @@ -21,9 +21,9 @@ @# installed devel headers, or they may be in the module @# subdirectory when building against the spl source tree. @if [ -f @SPL_OBJ@/@SPL_SYMBOLS@ ]; then \ - /bin/cp @SPL_OBJ@/@SPL_SYMBOLS@ .; \ + cp @SPL_OBJ@/@SPL_SYMBOLS@ .; \ elif [ -f @SPL_OBJ@/module/@SPL_SYMBOLS@ ]; then \ - /bin/cp @SPL_OBJ@/module/@SPL_SYMBOLS@ .; \ + cp @SPL_OBJ@/module/@SPL_SYMBOLS@ .; \ else \ echo -e "\n" \ "*** Missing spl symbols ensure you have built the spl:\n" \ @@ -71,7 +71,7 @@ distdir: list='$(subdir-m)'; for subdir in $$list; do \ (cd @top_srcdir@/module && find $$subdir -name '*.c' -o -name '*.h' -o -name '*.S' |\ - xargs /bin/cp --parents -t $$distdir); \ + xargs cp --parents -t $$distdir); \ done distclean maintainer-clean: clean diff -Nru zfs-linux-0.7.0-rc2/module/nvpair/nvpair_alloc_fixed.c zfs-linux-0.7.0-rc3/module/nvpair/nvpair_alloc_fixed.c --- zfs-linux-0.7.0-rc2/module/nvpair/nvpair_alloc_fixed.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/nvpair/nvpair_alloc_fixed.c 2017-01-20 18:18:28.000000000 +0000 @@ -42,7 +42,7 @@ * - it uses a pre-allocated buffer for memory allocations. * - it does _not_ free memory in the pre-allocated buffer. * - * The reason for the selected implemention is simplicity. + * The reason for the selected implementation is simplicity. * This allocator is designed for the usage in interrupt context when * the caller may not wait for free memory. 
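In other words, nvpair_alloc_fixed is a bump allocator: allocations advance a cursor through the caller-supplied buffer and frees are no-ops, which is exactly why it is safe where the caller cannot sleep. A minimal sketch of the pattern, not the actual nv_alloc ops vector:

    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
            uintptr_t cur;  /* next free byte */
            uintptr_t end;  /* one past the end of the buffer */
    } fixed_arena_t;

    static void *
    fixed_alloc(fixed_arena_t *a, size_t size)
    {
            /* Keep allocations 8-byte aligned. */
            uintptr_t p = (a->cur + 7) & ~(uintptr_t)7;

            if (p > a->end || size > a->end - p)
                    return (NULL);  /* never blocks, simply fails */
            a->cur = p + size;
            return ((void *)p);
    }

    /*
     * There is deliberately no fixed_free(): memory comes back only
     * by discarding or resetting the whole arena.
     */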
*/ diff -Nru zfs-linux-0.7.0-rc2/module/unicode/u8_textprep.c zfs-linux-0.7.0-rc3/module/unicode/u8_textprep.c --- zfs-linux-0.7.0-rc2/module/unicode/u8_textprep.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/unicode/u8_textprep.c 2017-01-20 18:18:28.000000000 +0000 @@ -842,7 +842,7 @@ } /* - * At this point, this rountine does not know what it would get. + * At this point, this routine does not know what it would get. * The caller should sort it out if the state isn't a Hangul one. */ *state = U8_STATE_START; diff -Nru zfs-linux-0.7.0-rc2/module/zcommon/Makefile.in zfs-linux-0.7.0-rc3/module/zcommon/Makefile.in --- zfs-linux-0.7.0-rc2/module/zcommon/Makefile.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zcommon/Makefile.in 2017-01-20 18:18:28.000000000 +0000 @@ -15,6 +15,8 @@ $(MODULE)-objs += zfs_fletcher.o $(MODULE)-objs += zfs_uio.o $(MODULE)-objs += zpool_prop.o +$(MODULE)-objs += zfs_fletcher_superscalar.o +$(MODULE)-objs += zfs_fletcher_superscalar4.o $(MODULE)-$(CONFIG_X86) += zfs_fletcher_intel.o $(MODULE)-$(CONFIG_X86) += zfs_fletcher_sse.o diff -Nru zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher_aarch64_neon.c zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher_aarch64_neon.c --- zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher_aarch64_neon.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher_aarch64_neon.c 2017-01-20 18:18:28.000000000 +0000 @@ -2,7 +2,7 @@ * Implement fast Fletcher4 with NEON instructions. (aarch64) * * Use the 128-bit NEON SIMD instructions and registers to compute - * Fletcher4 in four incremental 64-bit parallel accumulator streams, + * Fletcher4 in two incremental 64-bit parallel accumulator streams, * and then combine the streams to form the final four checksum words. * This implementation is a derivative of the AVX SIMD implementation by * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). diff -Nru zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher.c zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher.c --- zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher.c 2017-01-20 18:18:28.000000000 +0000 @@ -28,6 +28,10 @@ */ /* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ + +/* * Fletcher Checksums * ------------------ * @@ -164,6 +168,8 @@ static const fletcher_4_ops_t *fletcher_4_impls[] = { &fletcher_4_scalar_ops, + &fletcher_4_superscalar_ops, + &fletcher_4_superscalar4_ops, #if defined(HAVE_SSE2) &fletcher_4_sse2_ops, #endif @@ -217,14 +223,26 @@ /*ARGSUSED*/ void -fletcher_2_native(const void *buf, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) +fletcher_init(zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +int +fletcher_2_incremental_native(void *buf, size_t size, void *data) { + zio_cksum_t *zcp = data; + const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; @@ -232,18 +250,33 @@ } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } /*ARGSUSED*/ void -fletcher_2_byteswap(const void *buf, uint64_t size, +fletcher_2_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + fletcher_init(zcp); + (void) fletcher_2_incremental_native((void *) buf, size, zcp); +} + +int +fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) +{ + zio_cksum_t *zcp = data; + const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; @@ -251,6 +284,16 @@ } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); +} + +/*ARGSUSED*/ +void +fletcher_2_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); } static void @@ -521,25 +564,28 @@ } } -void -fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +int +fletcher_4_incremental_native(void *buf, size_t size, void *data) { + zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); + return (0); } -void -fletcher_4_incremental_byteswap(const void *buf, uint64_t size, - zio_cksum_t *zcp) +int +fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) { + zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); + return (0); } @@ -562,7 +608,7 @@ { struct fletcher_4_kstat *fastest_stat = &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; - struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *) data; + struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data; ssize_t off = 0; if (curr_stat == fastest_stat) { @@ -577,9 +623,9 @@ off += snprintf(buf + off, size - off, "%-17s", fletcher_4_supp_impls[id]->name); off += snprintf(buf + off, size - off, "%-15llu", - (u_longlong_t) curr_stat->native); + (u_longlong_t)curr_stat->native); off += snprintf(buf + off, size - off, "%-15llu\n", - (u_longlong_t) curr_stat->byteswap); + (u_longlong_t)curr_stat->byteswap); } return (0); 
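The retyping of the incremental entry points above is not cosmetic: fletcher_init() plus the fletcher_2/4_incremental_*() functions now match the (void *buf, size_t size, void *priv) callback contract that abd_iterate_func() uses elsewhere in this patch (see the raidz_test init_rand() change), so a checksum can be accumulated across a scattered ABD without flattening it first. A sketch of that pairing, assuming an in-tree build:

    /* In-tree sketch: checksum an abd_t chunk by chunk. */
    static void
    checksum_abd(abd_t *abd, uint64_t size, zio_cksum_t *zcp)
    {
            fletcher_init(zcp);
            (void) abd_iterate_func(abd, 0, size,
                fletcher_4_incremental_native, zcp);
            /* *zcp now equals fletcher_4_native() over the same bytes. */
    }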
@@ -605,6 +651,9 @@ #define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ +typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, + zio_cksum_t *); + static void fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) { @@ -616,8 +665,9 @@ zio_cksum_t zc; uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); - zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native : - fletcher_4_byteswap; + + fletcher_checksum_func_t *fletcher_4_test = native ? + fletcher_4_native : fletcher_4_byteswap; for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; @@ -673,7 +723,7 @@ /* move supported impl into fletcher_4_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { - curr_impl = (fletcher_4_ops_t *) fletcher_4_impls[i]; + curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; if (curr_impl->valid && curr_impl->valid()) fletcher_4_supp_impls[c++] = curr_impl; @@ -704,7 +754,7 @@ /* install kstats for all implementations */ fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (fletcher_4_kstat != NULL) { fletcher_4_kstat->ks_data = NULL; fletcher_4_kstat->ks_ndata = UINT32_MAX; @@ -767,6 +817,9 @@ fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); +EXPORT_SYMBOL(fletcher_init); +EXPORT_SYMBOL(fletcher_2_incremental_native); +EXPORT_SYMBOL(fletcher_2_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_init); EXPORT_SYMBOL(fletcher_4_fini); EXPORT_SYMBOL(fletcher_2_native); diff -Nru zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher_sse.c zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher_sse.c --- zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher_sse.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher_sse.c 2017-01-20 18:18:28.000000000 +0000 @@ -2,7 +2,7 @@ * Implement fast Fletcher4 with SSE2,SSSE3 instructions. (x86) * * Use the 128-bit SSE2/SSSE3 SIMD instructions and registers to compute - * Fletcher4 in four incremental 64-bit parallel accumulator streams, + * Fletcher4 in two incremental 64-bit parallel accumulator streams, * and then combine the streams to form the final four checksum words. * This implementation is a derivative of the AVX SIMD implementation by * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). diff -Nru zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher_superscalar4.c zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher_superscalar4.c --- zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher_superscalar4.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher_superscalar4.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,228 @@ +/* + * Implement fast Fletcher4 using superscalar pipelines. + * + * Use regular C code to compute + * Fletcher4 in four incremental 64-bit parallel accumulator streams, + * and then combine the streams to form the final four checksum words. + * This implementation is a derivative of the AVX SIMD implementation by + * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). + * + * Copyright (C) 2016 Romain Dolbeau. + * + * Authors: + * Romain Dolbeau + * + * This software is available to you under a choice of one of two + * licenses. 
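For orientation before the superscalar code below: Fletcher-4 is the coupled running-sum recurrence over 32-bit words shown here, and the superscalar variants simply run two or four interleaved lanes of it; the constant coefficients in fletcher_4_superscalar4_fini() below are what fall out of expanding this recurrence for a stride-4 interleave. A plain scalar reference:

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Plain Fletcher-4: a is the running sum of the input words and
     * b, c, d are successive prefix sums of the accumulator below them.
     */
    static void
    fletcher_4_ref(const void *buf, size_t size, uint64_t ck[4])
    {
            const uint32_t *ip = buf;
            const uint32_t *ipend = ip + (size / sizeof (uint32_t));
            uint64_t a = 0, b = 0, c = 0, d = 0;

            for (; ip < ipend; ip++) {
                    a += ip[0];
                    b += a;
                    c += b;
                    d += c;
            }
            ck[0] = a; ck[1] = b; ck[2] = c; ck[3] = d;
    }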
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +static void +fletcher_4_superscalar4_init(fletcher_4_ctx_t *ctx) +{ + bzero(ctx->superscalar, 4 * sizeof (zfs_fletcher_superscalar_t)); +} + +static void +fletcher_4_superscalar4_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + uint64_t A, B, C, D; + + A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1] + + ctx->superscalar[0].v[2] + ctx->superscalar[0].v[3]; + B = 0 - ctx->superscalar[0].v[1] - 2 * ctx->superscalar[0].v[2] - + 3 * ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] + + 4 * ctx->superscalar[1].v[1] + 4 * ctx->superscalar[1].v[2] + + 4 * ctx->superscalar[1].v[3]; + + C = ctx->superscalar[0].v[2] + 3 * ctx->superscalar[0].v[3] - + 6 * ctx->superscalar[1].v[0] - 10 * ctx->superscalar[1].v[1] - + 14 * ctx->superscalar[1].v[2] - 18 * ctx->superscalar[1].v[3] + + 16 * ctx->superscalar[2].v[0] + 16 * ctx->superscalar[2].v[1] + + 16 * ctx->superscalar[2].v[2] + 16 * ctx->superscalar[2].v[3]; + + D = 0 - ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] + + 10 * ctx->superscalar[1].v[1] + 20 * ctx->superscalar[1].v[2] + + 34 * ctx->superscalar[1].v[3] - 48 * ctx->superscalar[2].v[0] - + 64 * ctx->superscalar[2].v[1] - 80 * ctx->superscalar[2].v[2] - + 96 * ctx->superscalar[2].v[3] + 64 * ctx->superscalar[3].v[0] + + 64 * ctx->superscalar[3].v[1] + 64 * ctx->superscalar[3].v[2] + + 64 * ctx->superscalar[3].v[3]; + + ZIO_SET_CHECKSUM(zcp, A, B, C, D); +} + +static void +fletcher_4_superscalar4_native(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + uint64_t a2, b2, c2, d2; + uint64_t a3, b3, c3, d3; + uint64_t a4, b4, c4, d4; + + a = ctx->superscalar[0].v[0]; + b = ctx->superscalar[1].v[0]; + c = ctx->superscalar[2].v[0]; + d = ctx->superscalar[3].v[0]; + a2 = ctx->superscalar[0].v[1]; + b2 = ctx->superscalar[1].v[1]; + c2 = ctx->superscalar[2].v[1]; + d2 = ctx->superscalar[3].v[1]; + a3 = ctx->superscalar[0].v[2]; + b3 = ctx->superscalar[1].v[2]; + c3 = ctx->superscalar[2].v[2]; + d3 = ctx->superscalar[3].v[2]; + a4 = ctx->superscalar[0].v[3]; + b4 = ctx->superscalar[1].v[3]; + c4 = ctx->superscalar[2].v[3]; + d4 = ctx->superscalar[3].v[3]; + + for (; ip < ipend; ip += 4) { + 
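/*
+		 * Each pass consumes four 32-bit words, one per stream;
+		 * the four accumulator chains carry no cross-dependencies,
+		 * so a superscalar core can execute them in parallel.
+		 */
+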
a += ip[0]; + a2 += ip[1]; + a3 += ip[2]; + a4 += ip[3]; + b += a; + b2 += a2; + b3 += a3; + b4 += a4; + c += b; + c2 += b2; + c3 += b3; + c4 += b4; + d += c; + d2 += c2; + d3 += c3; + d4 += c4; + } + + ctx->superscalar[0].v[0] = a; + ctx->superscalar[1].v[0] = b; + ctx->superscalar[2].v[0] = c; + ctx->superscalar[3].v[0] = d; + ctx->superscalar[0].v[1] = a2; + ctx->superscalar[1].v[1] = b2; + ctx->superscalar[2].v[1] = c2; + ctx->superscalar[3].v[1] = d2; + ctx->superscalar[0].v[2] = a3; + ctx->superscalar[1].v[2] = b3; + ctx->superscalar[2].v[2] = c3; + ctx->superscalar[3].v[2] = d3; + ctx->superscalar[0].v[3] = a4; + ctx->superscalar[1].v[3] = b4; + ctx->superscalar[2].v[3] = c4; + ctx->superscalar[3].v[3] = d4; +} + +static void +fletcher_4_superscalar4_byteswap(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + uint64_t a2, b2, c2, d2; + uint64_t a3, b3, c3, d3; + uint64_t a4, b4, c4, d4; + + a = ctx->superscalar[0].v[0]; + b = ctx->superscalar[1].v[0]; + c = ctx->superscalar[2].v[0]; + d = ctx->superscalar[3].v[0]; + a2 = ctx->superscalar[0].v[1]; + b2 = ctx->superscalar[1].v[1]; + c2 = ctx->superscalar[2].v[1]; + d2 = ctx->superscalar[3].v[1]; + a3 = ctx->superscalar[0].v[2]; + b3 = ctx->superscalar[1].v[2]; + c3 = ctx->superscalar[2].v[2]; + d3 = ctx->superscalar[3].v[2]; + a4 = ctx->superscalar[0].v[3]; + b4 = ctx->superscalar[1].v[3]; + c4 = ctx->superscalar[2].v[3]; + d4 = ctx->superscalar[3].v[3]; + + for (; ip < ipend; ip += 4) { + a += BSWAP_32(ip[0]); + a2 += BSWAP_32(ip[1]); + a3 += BSWAP_32(ip[2]); + a4 += BSWAP_32(ip[3]); + b += a; + b2 += a2; + b3 += a3; + b4 += a4; + c += b; + c2 += b2; + c3 += b3; + c4 += b4; + d += c; + d2 += c2; + d3 += c3; + d4 += c4; + } + + ctx->superscalar[0].v[0] = a; + ctx->superscalar[1].v[0] = b; + ctx->superscalar[2].v[0] = c; + ctx->superscalar[3].v[0] = d; + ctx->superscalar[0].v[1] = a2; + ctx->superscalar[1].v[1] = b2; + ctx->superscalar[2].v[1] = c2; + ctx->superscalar[3].v[1] = d2; + ctx->superscalar[0].v[2] = a3; + ctx->superscalar[1].v[2] = b3; + ctx->superscalar[2].v[2] = c3; + ctx->superscalar[3].v[2] = d3; + ctx->superscalar[0].v[3] = a4; + ctx->superscalar[1].v[3] = b4; + ctx->superscalar[2].v[3] = c4; + ctx->superscalar[3].v[3] = d4; +} + +static boolean_t fletcher_4_superscalar4_valid(void) +{ + return (B_TRUE); +} + +const fletcher_4_ops_t fletcher_4_superscalar4_ops = { + .init_native = fletcher_4_superscalar4_init, + .compute_native = fletcher_4_superscalar4_native, + .fini_native = fletcher_4_superscalar4_fini, + .init_byteswap = fletcher_4_superscalar4_init, + .compute_byteswap = fletcher_4_superscalar4_byteswap, + .fini_byteswap = fletcher_4_superscalar4_fini, + .valid = fletcher_4_superscalar4_valid, + .name = "superscalar4" +}; diff -Nru zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher_superscalar.c zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher_superscalar.c --- zfs-linux-0.7.0-rc2/module/zcommon/zfs_fletcher_superscalar.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zcommon/zfs_fletcher_superscalar.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,162 @@ +/* + * Implement fast Fletcher4 using superscalar pipelines. + * + * Use regular C code to compute + * Fletcher4 in two incremental 64-bit parallel accumulator streams, + * and then combine the streams to form the final four checksum words. 
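+ * Roughly: the even and odd 32-bit words feed two separate (a, b, c, d)
+ * chains, and the final four checksum words are fixed integer linear
+ * combinations of the per-stream accumulators; the coefficients are
+ * spelled out in fletcher_4_superscalar_fini() below.
+ *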
+ * This implementation is a derivative of the AVX SIMD implementation by + * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). + * + * Copyright (C) 2016 Romain Dolbeau. + * + * Authors: + * Romain Dolbeau + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +static void +fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx) +{ + bzero(ctx->superscalar, 4 * sizeof (zfs_fletcher_superscalar_t)); +} + +static void +fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + uint64_t A, B, C, D; + A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1]; + B = 2 * ctx->superscalar[1].v[0] + 2 * ctx->superscalar[1].v[1] - + ctx->superscalar[0].v[1]; + C = 4 * ctx->superscalar[2].v[0] - ctx->superscalar[1].v[0] + + 4 * ctx->superscalar[2].v[1] - 3 * ctx->superscalar[1].v[1]; + D = 8 * ctx->superscalar[3].v[0] - 4 * ctx->superscalar[2].v[0] + + 8 * ctx->superscalar[3].v[1] - 8 * ctx->superscalar[2].v[1] + + ctx->superscalar[1].v[1]; + ZIO_SET_CHECKSUM(zcp, A, B, C, D); +} + +static void +fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + uint64_t a2, b2, c2, d2; + + a = ctx->superscalar[0].v[0]; + b = ctx->superscalar[1].v[0]; + c = ctx->superscalar[2].v[0]; + d = ctx->superscalar[3].v[0]; + a2 = ctx->superscalar[0].v[1]; + b2 = ctx->superscalar[1].v[1]; + c2 = ctx->superscalar[2].v[1]; + d2 = ctx->superscalar[3].v[1]; + + for (; ip < ipend; ip += 2) { + a += ip[0]; + a2 += ip[1]; + b += a; + b2 += a2; + c += b; + c2 += b2; + d += c; + d2 += c2; + } + + ctx->superscalar[0].v[0] = a; + ctx->superscalar[1].v[0] = b; + ctx->superscalar[2].v[0] = c; + ctx->superscalar[3].v[0] = d; + ctx->superscalar[0].v[1] = a2; + ctx->superscalar[1].v[1] = b2; + ctx->superscalar[2].v[1] = c2; + ctx->superscalar[3].v[1] = d2; +} + +static void +fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + uint64_t a2, b2, c2, d2; + + a = ctx->superscalar[0].v[0]; + b = ctx->superscalar[1].v[0]; 
+ c = ctx->superscalar[2].v[0]; + d = ctx->superscalar[3].v[0]; + a2 = ctx->superscalar[0].v[1]; + b2 = ctx->superscalar[1].v[1]; + c2 = ctx->superscalar[2].v[1]; + d2 = ctx->superscalar[3].v[1]; + + for (; ip < ipend; ip += 2) { + a += BSWAP_32(ip[0]); + a2 += BSWAP_32(ip[1]); + b += a; + b2 += a2; + c += b; + c2 += b2; + d += c; + d2 += c2; + } + + ctx->superscalar[0].v[0] = a; + ctx->superscalar[1].v[0] = b; + ctx->superscalar[2].v[0] = c; + ctx->superscalar[3].v[0] = d; + ctx->superscalar[0].v[1] = a2; + ctx->superscalar[1].v[1] = b2; + ctx->superscalar[2].v[1] = c2; + ctx->superscalar[3].v[1] = d2; +} + +static boolean_t fletcher_4_superscalar_valid(void) +{ + return (B_TRUE); +} + +const fletcher_4_ops_t fletcher_4_superscalar_ops = { + .init_native = fletcher_4_superscalar_init, + .compute_native = fletcher_4_superscalar_native, + .fini_native = fletcher_4_superscalar_fini, + .init_byteswap = fletcher_4_superscalar_init, + .compute_byteswap = fletcher_4_superscalar_byteswap, + .fini_byteswap = fletcher_4_superscalar_fini, + .valid = fletcher_4_superscalar_valid, + .name = "superscalar" +}; diff -Nru zfs-linux-0.7.0-rc2/module/zcommon/zfs_prop.c zfs-linux-0.7.0-rc3/module/zcommon/zfs_prop.c --- zfs-linux-0.7.0-rc2/module/zcommon/zfs_prop.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zcommon/zfs_prop.c 2017-01-20 18:18:28.000000000 +0000 @@ -143,7 +143,7 @@ { "noallow", ZFS_ACL_NOALLOW }, { "restricted", ZFS_ACL_RESTRICTED }, { "passthrough", ZFS_ACL_PASSTHROUGH }, - { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */ + { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatibility */ { "passthrough-x", ZFS_ACL_PASSTHROUGH_X }, { NULL } }; diff -Nru zfs-linux-0.7.0-rc2/module/zcommon/zfs_uio.c zfs-linux-0.7.0-rc3/module/zcommon/zfs_uio.c --- zfs-linux-0.7.0-rc2/module/zcommon/zfs_uio.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zcommon/zfs_uio.c 2017-01-20 18:18:28.000000000 +0000 @@ -193,7 +193,7 @@ */ p = iov->iov_base + skip; while (cnt) { - if (fuword8((uint8_t *) p, &tmp)) + if (fuword8((uint8_t *)p, &tmp)) return; incr = MIN(cnt, PAGESIZE); p += incr; @@ -203,7 +203,7 @@ * touch the last byte in case it straddles a page. */ p--; - if (fuword8((uint8_t *) p, &tmp)) + if (fuword8((uint8_t *)p, &tmp)) return; } } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/abd.c zfs-linux-0.7.0-rc3/module/zfs/abd.c --- zfs-linux-0.7.0-rc2/module/zfs/abd.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/abd.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,1544 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... | +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... + * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * Linear buffers act exactly like normal buffers and are always mapped into the + * kernel's virtual memory space, while scattered ABD data chunks are allocated + * as physical pages and then mapped in only while they are actually being + * accessed through one of the abd_* library functions. Using scattered ABDs + * provides several benefits: + * + * (1) They avoid use of kmem_*, preventing performance problems where running + * kmem_reap on very large memory systems never finishes and causes + * constant TLB shootdowns. + * + * (2) Fragmentation is less of an issue since when we are at the limit of + * allocatable space, we won't have to search around for a long free + * hole in the VA space for large ARC allocations. Each chunk is mapped in + * individually, so even if we weren't using segkpm (see next point) we + * wouldn't need to worry about finding a contiguous address range. + * + * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs + * on each ABD access. (If segkpm isn't available then we use all linear + * ABDs to avoid this penalty.) See seg_kpm.c for more details. + * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not + * available, which is the case on all 32-bit systems and any 64-bit systems + * where kpm_enable is turned off. + * + * In addition to directly allocating a linear or scattered ABD, it is also + * possible to create an ABD by requesting the "sub-ABD" starting at an offset + * within an existing ABD. In linear buffers this is simple (set abd_buf of + * the new ABD to the starting point within the original raw buffer), but + * scattered ABDs are a little more complex. The new ABD makes a copy of the + * relevant abd_chunks pointers (but not the underlying data). However, to + * provide arbitrary rather than only chunk-aligned starting offsets, it also + * tracks an abd_offset field which represents the starting point of the data + * within the first chunk in abd_chunks. For both linear and scattered ABDs, + * creating an offset ABD marks the original ABD as the offset's parent, and the + * original ABD's abd_children refcount is incremented. 
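+ * A minimal sketch (sizes arbitrary):
+ *
+ *	abd_t *abd = abd_alloc(16384, B_FALSE);
+ *	abd_t *tail = abd_get_offset(abd, 8192);  /* shares abd's chunks */
+ *	...
+ *	abd_put(tail);		/* drop the child before abd_free(abd) */
+ *
+ * abd_get_offset() records abd as tail's parent and charges tail's size
+ * to abd's abd_children refcount.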
This data allows us to + * ensure the root ABD isn't deleted before its children. + * + * Most consumers should never need to know what type of ABD they're using -- + * the ABD public API ensures that it's possible to transparently switch from + * using a linear ABD to a scattered one when doing so would be beneficial. + * + * If you need to use the data within an ABD directly, if you know it's linear + * (because you allocated it) you can use abd_to_buf() to access the underlying + * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions + * which will allocate a raw buffer if necessary. Use the abd_return_buf* + * functions to return any raw buffers that are no longer necessary when you're + * done using them. + * + * There are a variety of ABD APIs that implement basic buffer operations: + * compare, copy, read, write, and fill with zeroes. If you need a custom + * function which progressively accesses the whole ABD, use the abd_iterate_* + * functions. + */ + +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#else +#define MAX_ORDER 1 +#endif + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_scatter_orders[MAX_ORDER]; + kstat_named_t abdstat_scatter_page_multi_chunk; + kstat_named_t abdstat_scatter_page_multi_zone; + kstat_named_t abdstat_scatter_page_alloc_retry; + kstat_named_t abdstat_scatter_sg_table_retry; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of compound allocations of a given order. These + * allocations are spread over all currently allocated ABDs, and + * act as a measure of memory fragmentation. + */ + { { "scatter_order_N", KSTAT_DATA_UINT64 } }, + /* + * The number of scatter ABDs which contain multiple chunks. + * ABDs are preferentially allocated from the minimum number of + * contiguous multi-page chunks, a single chunk is optimal. + */ + { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are split across memory zones. + * ABDs are preferentially allocated using pages from a single zone. 
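+ * (In abd_alloc_pages() a zone split is counted when page_to_nid()
+ * changes between successively allocated chunks.)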
+ */ + { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the pages to populate the scatter ABD. + */ + { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the sg table for an ABD. + */ + { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, +}; + +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) +#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf) +#define abd_for_each_sg(abd, sg, n, i) \ + for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) + +/* see block comment above for description */ +int zfs_abd_scatter_enabled = B_TRUE; +unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; + +static kmem_cache_t *abd_cache = NULL; +static kstat_t *abd_ksp; + +static inline size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); +} + +#ifdef _KERNEL +#ifndef CONFIG_HIGHMEM + +#ifndef __GFP_RECLAIM +#define __GFP_RECLAIM __GFP_WAIT +#endif + +static unsigned long +abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order) +{ + struct page *page; + + page = alloc_pages_node(nid, gfp, order); + if (!page) + return (0); + + return ((unsigned long) page_address(page)); +} + +/* + * The goal is to minimize fragmentation by preferentially populating ABDs + * with higher order compound pages from a single zone. Allocation size is + * progressively decreased until it can be satisfied without performing + * reclaim or compaction. When necessary this function will degenerate to + * allocating individual pages and allowing reclaim to satisfy allocations. + */ +static void +abd_alloc_pages(abd_t *abd, size_t size) +{ + struct list_head pages; + struct sg_table table; + struct scatterlist *sg; + struct page *page, *tmp_page; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; + int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); + int nr_pages = abd_chunkcnt_for_bytes(size); + int chunks = 0, zones = 0; + size_t remaining_size; + int nid = NUMA_NO_NODE; + int alloc_pages = 0; + int order; + + INIT_LIST_HEAD(&pages); + + while (alloc_pages < nr_pages) { + unsigned long paddr; + unsigned chunk_pages; + + order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); + chunk_pages = (1U << order); + + paddr = abd_alloc_chunk(nid, order ? 
gfp_comp : gfp, order); + if (paddr == 0) { + if (order == 0) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } else { + max_order = MAX(0, order - 1); + } + continue; + } + + page = virt_to_page(paddr); + list_add_tail(&page->lru, &pages); + + if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) + zones++; + + nid = page_to_nid(page); + ABDSTAT_BUMP(abdstat_scatter_orders[order]); + chunks++; + alloc_pages += chunk_pages; + } + + ASSERT3S(alloc_pages, ==, nr_pages); + + while (sg_alloc_table(&table, chunks, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + sg = table.sgl; + remaining_size = size; + list_for_each_entry_safe(page, tmp_page, &pages, lru) { + size_t sg_size = MIN(PAGESIZE << compound_order(page), + remaining_size); + sg_set_page(sg, page, sg_size, 0); + remaining_size -= sg_size; + + sg = sg_next(sg); + list_del(&page->lru); + } + + if (chunks > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + if (zones) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); + abd->abd_flags |= ABD_FLAG_MULTI_ZONE; + } + } + + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; +} +#else +/* + * Allocate N individual pages to construct a scatter ABD. This function + * makes no attempt to request contiguous pages and requires the minimal + * number of kernel interfaces. It's designed for maximum compatibility. + */ +static void +abd_alloc_pages(abd_t *abd, size_t size) +{ + struct scatterlist *sg; + struct sg_table table; + struct page *page; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + int nr_pages = abd_chunkcnt_for_bytes(size); + int i; + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + ASSERT3U(table.nents, ==, nr_pages); + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = nr_pages; + + abd_for_each_sg(abd, sg, nr_pages, i) { + while ((page = __page_cache_alloc(gfp)) == NULL) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } + + ABDSTAT_BUMP(abdstat_scatter_orders[0]); + sg_set_page(sg, page, PAGESIZE, 0); + } + + if (nr_pages > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + } +} +#endif /* !CONFIG_HIGHMEM */ + +static void +abd_free_pages(abd_t *abd) +{ + struct scatterlist *sg; + struct sg_table table; + struct page *page; + int nr_pages = ABD_SCATTER(abd).abd_nents; + int order, i, j; + + if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); + + if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_for_each_sg(abd, sg, nr_pages, i) { + for (j = 0; j < sg->length; ) { + page = nth_page(sg_page(sg), j >> PAGE_SHIFT); + order = compound_order(page); + __free_pages(page, order); + j += (PAGESIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } + } + + table.sgl = ABD_SCATTER(abd).abd_sgl; + table.nents = table.orig_nents = nr_pages; + sg_free_table(&table); +} + +#else /* _KERNEL */ + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT (highbit64(PAGESIZE)-1) +#endif + +struct page; + +#define kpm_enable 1 +#define abd_alloc_chunk(o) \ + ((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP)) +#define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o)) +#define zfs_kmap_atomic(chunk, km) ((void *)chunk) 
+#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) +#define local_irq_save(flags) do { (void)(flags); } while (0) +#define local_irq_restore(flags) do { (void)(flags); } while (0) +#define nth_page(pg, i) \ + ((struct page *)((void *)(pg) + (i) * PAGESIZE)) + +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) { + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +#define for_each_sg(sgl, sg, nr, i) \ + for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) +{ + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) +{ + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + + return (sg + 1); +} + +static void +abd_alloc_pages(abd_t *abd, size_t size) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(size); + struct scatterlist *sg; + int i; + + ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); + + abd_for_each_sg(abd, sg, nr_pages, i) { + struct page *p = abd_alloc_chunk(0); + sg_set_page(sg, p, PAGESIZE, 0); + } + ABD_SCATTER(abd).abd_nents = nr_pages; +} + +static void +abd_free_pages(abd_t *abd) +{ + int i, n = ABD_SCATTER(abd).abd_nents; + struct scatterlist *sg; + int j; + + abd_for_each_sg(abd, sg, n, i) { + for (j = 0; j < sg->length; j += PAGESIZE) { + struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT); + abd_free_chunk(p, 0); + } + } + + vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); +} + +#endif /* _KERNEL */ + +void +abd_init(void) +{ + int i; + + abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + + for (i = 0; i < MAX_ORDER; i++) { + snprintf(abd_stats.abdstat_scatter_orders[i].name, + KSTAT_STRLEN, "scatter_order_%d", i); + abd_stats.abdstat_scatter_orders[i].data_type = + KSTAT_DATA_UINT64; + } + } +} + +void +abd_fini(void) +{ + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + if (abd_cache) { + kmem_cache_destroy(abd_cache); + abd_cache = NULL; + } +} + +static inline void +abd_verify(abd_t *abd) +{ + ASSERT3U(abd->abd_size, >, 0); + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | + ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | + ABD_FLAG_MULTI_CHUNK)); + IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); + IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) { + ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + } else { + size_t n; + int i; + struct scatterlist *sg; + + ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_sgl->length); + n = ABD_SCATTER(abd).abd_nents; + abd_for_each_sg(abd, sg, n, i) { + ASSERT3P(sg_page(sg), !=, NULL); + } + } +} + +static inline abd_t * +abd_alloc_struct(void) +{ + abd_t *abd = 
kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); + + return (abd); +} + +static inline void +abd_free_struct(abd_t *abd) +{ + kmem_cache_free(abd_cache, abd); + ABDSTAT_INCR(abdstat_struct_size, -sizeof (abd_t)); +} + +/* + * Allocate an ABD, along with its own underlying data buffers. Use this if you + * don't care whether the ABD is linear or not. + */ +abd_t * +abd_alloc(size_t size, boolean_t is_metadata) +{ + abd_t *abd; + + if (!zfs_abd_scatter_enabled || size <= PAGESIZE) + return (abd_alloc_linear(size, is_metadata)); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd = abd_alloc_struct(); + abd->abd_flags = ABD_FLAG_OWNER; + abd_alloc_pages(abd, size); + + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_scatter.abd_offset = 0; + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + P2ROUNDUP(size, PAGESIZE) - size); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd) +{ + abd_free_pages(abd); + + refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - P2ROUNDUP(abd->abd_size, PAGESIZE)); + + abd_free_struct(abd); +} + +/* + * Allocate an ABD that must be linear, along with its own underlying data + * buffer. Only use this when it would be very annoying to write your ABD + * consumer with a scattered ABD. + */ +abd_t * +abd_alloc_linear(size_t size, boolean_t is_metadata) +{ + abd_t *abd = abd_alloc_struct(); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + if (is_metadata) { + abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + } else { + abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, size); + + return (abd); +} + +static void +abd_free_linear(abd_t *abd) +{ + if (abd->abd_flags & ABD_FLAG_META) { + zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } else { + zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } + + refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + + abd_free_struct(abd); +} + +/* + * Free an ABD. Only use this on ABDs allocated with abd_alloc() or + * abd_alloc_linear(). + */ +void +abd_free(abd_t *abd) +{ + abd_verify(abd); + ASSERT3P(abd->abd_parent, ==, NULL); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) + abd_free_linear(abd); + else + abd_free_scatter(abd); +} + +/* + * Allocate an ABD of the same format (same metadata flag, same scatterize + * setting) as another ABD. 
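+ * This lets a consumer stage a copy of src without caring which variant
+ * it was handed, e.g. (sketch): abd_t *dst = abd_alloc_sametype(src, size);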
+ */ +abd_t * +abd_alloc_sametype(abd_t *sabd, size_t size) +{ + boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; + if (abd_is_linear(sabd)) { + return (abd_alloc_linear(size, is_metadata)); + } else { + return (abd_alloc(size, is_metadata)); + } +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + * + * On Linux the optimal thing to do would be to use abd_get_offset() and + * construct a new ABD which shares the original pages thereby eliminating + * the copy. But for the moment a new linear ABD is allocated until this + * performance optimization can be implemented. + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc_linear(size, is_metadata)); +} + +/* + * Allocate a new ABD to point to offset off of sabd. It shares the underlying + * buffer data with sabd. Use abd_put() to free. sabd must not be freed while + * any derived ABDs exist. + */ +static inline abd_t * +abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) +{ + abd_t *abd; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + if (abd_is_linear(sabd)) { + abd = abd_alloc_struct(); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + + abd->abd_u.abd_linear.abd_buf = + (char *)sabd->abd_u.abd_linear.abd_buf + off; + } else { + int i; + struct scatterlist *sg; + size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; + + abd = abd_alloc_struct(); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { + if (new_offset < sg->length) + break; + new_offset -= sg->length; + } + + ABD_SCATTER(abd).abd_sgl = sg; + ABD_SCATTER(abd).abd_offset = new_offset; + ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + } + + abd->abd_size = size; + abd->abd_parent = sabd; + refcount_create(&abd->abd_children); + (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd); + + return (abd); +} + +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; + + VERIFY3U(size, >, 0); + + return (abd_get_offset_impl(sabd, off, size)); +} + +abd_t * +abd_get_offset_size(abd_t *sabd, size_t off, size_t size) +{ + ASSERT3U(off + size, <=, sabd->abd_size); + + return (abd_get_offset_impl(sabd, off, size)); +} + +/* + * Allocate a linear ABD structure for buf. You must free this with abd_put() + * since the resulting ABD doesn't own its own buffer. + */ +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd = abd_alloc_struct(); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + /* + * Even if this buf is filesystem metadata, we only track that if we + * own the underlying data buffer, which is not true in this case. 
+ * Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_linear.abd_buf = buf; + + return (abd); +} + +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. + */ +void +abd_put(abd_t *abd) +{ + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +/* + * Get the raw buffer associated with a linear ABD. + */ +void * +abd_to_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + abd_verify(abd); + return (abd->abd_u.abd_linear.abd_buf); +} + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + (void) refcount_add_many(&abd->abd_children, n, buf); + + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } + (void) refcount_remove_many(&abd->abd_children, n, buf); +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + +/* + * Give this ABD ownership of the buffer that it's storing. Can only be used on + * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated + * with abd_alloc_linear() which subsequently released ownership of their buf + * with abd_release_ownership_of_buf(). 
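+ * One possible round trip (a sketch; buf must come from
+ * zio_data_buf_alloc() for the final abd_free() to return it correctly):
+ *
+ *	void *buf = zio_data_buf_alloc(size);
+ *	abd_t *abd = abd_get_from_buf(buf, size);	/* abd borrows buf */
+ *	abd_take_ownership_of_buf(abd, B_FALSE);	/* abd now owns buf */
+ *	...
+ *	abd_free(abd);		/* frees buf via zio_data_buf_free() */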
+ */ +void +abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + abd_verify(abd); + + abd->abd_flags |= ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); +} + +void +abd_release_ownership_of_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + abd_verify(abd); + + abd->abd_flags &= ~ABD_FLAG_OWNER; + /* Disable this flag since we no longer own the data buffer */ + abd->abd_flags &= ~ABD_FLAG_META; + + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); +} + +#ifndef HAVE_1ARG_KMAP_ATOMIC +#define NR_KM_TYPE (6) +#ifdef _KERNEL +int km_table[NR_KM_TYPE] = { + KM_USER0, + KM_USER1, + KM_BIO_SRC_IRQ, + KM_BIO_DST_IRQ, + KM_PTE0, + KM_PTE1, +}; +#endif +#endif + +struct abd_iter { + /* public interface */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ + + /* private */ + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; + size_t iter_offset; /* offset in current sg/abd_buf, */ + /* abd_offset included */ + struct scatterlist *iter_sg; /* current sg */ +#ifndef HAVE_1ARG_KMAP_ATOMIC + int iter_km; /* KM_* for kmap_atomic */ +#endif +}; + +/* + * Initialize the abd_iter. + */ +static void +abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) +{ + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; + aiter->iter_pos = 0; + if (abd_is_linear(abd)) { + aiter->iter_offset = 0; + aiter->iter_sg = NULL; + } else { + aiter->iter_offset = ABD_SCATTER(abd).abd_offset; + aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; + } +#ifndef HAVE_1ARG_KMAP_ATOMIC + ASSERT3U(km_type, <, NR_KM_TYPE); + aiter->iter_km = km_type; +#endif +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +static void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + aiter->iter_pos += amount; + aiter->iter_offset += amount; + if (!abd_is_linear(aiter->iter_abd)) { + while (aiter->iter_offset >= aiter->iter_sg->length) { + aiter->iter_offset -= aiter->iter_sg->length; + aiter->iter_sg = sg_next(aiter->iter_sg); + if (aiter->iter_sg == NULL) { + ASSERT0(aiter->iter_offset); + break; + } + } + } +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
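+ * Callers observe a strict map/use/unmap/advance cycle per chunk, as in
+ * abd_iterate_func() below:
+ *
+ *	abd_iter_map(&aiter);
+ *	len = MIN(aiter.iter_mapsize, size);
+ *	... use aiter.iter_mapaddr ...
+ *	abd_iter_unmap(&aiter);
+ *	abd_iter_advance(&aiter, len);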
+ */ +static void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to iterate over, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + offset = aiter->iter_offset; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; + } else { + offset = aiter->iter_offset; + aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + + paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), + km_table[aiter->iter_km]); + } + + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +static void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + if (!abd_is_linear(aiter->iter_abd)) { + /* LINTED E_FUNC_SET_NOT_USED */ + zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, + km_table[aiter->iter_km]); + } + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +int +abd_iterate_func(abd_t *abd, size_t off, size_t size, + abd_iter_func_t *func, void *private) +{ + int ret = 0; + struct abd_iter aiter; + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_iter_init(&aiter, abd, 0); + abd_iter_advance(&aiter, off); + + while (size > 0) { + size_t len; + abd_iter_map(&aiter); + + len = MIN(aiter.iter_mapsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_mapaddr, len, private); + + abd_iter_unmap(&aiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&aiter, len); + } + + return (ret); +} + +struct buf_arg { + void *arg_buf; +}; + +static int +abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(ba_ptr->arg_buf, buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy abd to buf. (off is the offset in abd.) + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, + &ba_ptr); +} + +static int +abd_cmp_buf_off_cb(void *buf, size_t size, void *private) +{ + int ret; + struct buf_arg *ba_ptr = private; + + ret = memcmp(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (ret); +} + +/* + * Compare the contents of abd to buf. (off is the offset in abd.) + */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); +} + +static int +abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy from buf to abd. (off is the offset in abd.) 
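+ * E.g. (sketch, names hypothetical) staging a 512-byte header at the
+ * front of an ABD: abd_copy_from_buf_off(abd, hdr_buf, 0, 512);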
+ */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, + &ba_ptr); +} + +/*ARGSUSED*/ +static int +abd_zero_off_cb(void *buf, size_t size, void *private) +{ + (void) memset(buf, 0, size); + return (0); +} + +/* + * Zero out the abd from a particular offset to the end. + */ +void +abd_zero_off(abd_t *abd, size_t off, size_t size) +{ + (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); +} + +/* + * Iterate over two ABDs and call func incrementally on the two ABDs' data in + * equal-sized chunks (passed to func as raw buffers). func could be called many + * times during this iteration. + */ +int +abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, + size_t size, abd_iter_func2_t *func, void *private) +{ + int ret = 0; + struct abd_iter daiter, saiter; + + abd_verify(dabd); + abd_verify(sabd); + + ASSERT3U(doff + size, <=, dabd->abd_size); + ASSERT3U(soff + size, <=, sabd->abd_size); + + abd_iter_init(&daiter, dabd, 0); + abd_iter_init(&saiter, sabd, 1); + abd_iter_advance(&daiter, doff); + abd_iter_advance(&saiter, soff); + + while (size > 0) { + size_t dlen, slen, len; + abd_iter_map(&daiter); + abd_iter_map(&saiter); + + dlen = MIN(daiter.iter_mapsize, size); + slen = MIN(saiter.iter_mapsize, size); + len = MIN(dlen, slen); + ASSERT(dlen > 0 || slen > 0); + + ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, + private); + + abd_iter_unmap(&saiter); + abd_iter_unmap(&daiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&daiter, len); + abd_iter_advance(&saiter, len); + } + + return (ret); +} + +/*ARGSUSED*/ +static int +abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) +{ + (void) memcpy(dbuf, sbuf, size); + return (0); +} + +/* + * Copy from sabd to dabd starting from soff and doff. + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) +{ + (void) abd_iterate_func2(dabd, sabd, doff, soff, size, + abd_copy_off_cb, NULL); +} + +/*ARGSUSED*/ +static int +abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) +{ + return (memcmp(bufa, bufb, size)); +} + +/* + * Compares the contents of two ABDs. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd) +{ + ASSERT3U(dabd->abd_size, ==, sabd->abd_size); + return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, + abd_cmp_cb, NULL)); +} + +/* + * Iterate over code ABDs and a data ABD and call @func_raidz_gen. + * + * @cabds parity ABDs, must have equal size + * @dabd data ABD. 
Can be NULL (in this case @dsize = 0) + * @func_raidz_gen should be implemented so that it behaves the same + * whether it is given linear or scattered buffers + */ +void +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)) +{ + int i; + ssize_t len, dlen; + struct abd_iter caiters[3]; + struct abd_iter daiter = {0}; + void *caddrs[3]; + unsigned long flags; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) + abd_iter_init(&caiters[i], cabds[i], i); + + if (dabd) + abd_iter_init(&daiter, dabd, i); + + ASSERT3S(dsize, >=, 0); + + local_irq_save(flags); + while (csize > 0) { + len = csize; + + if (dabd && dsize > 0) + abd_iter_map(&daiter); + + for (i = 0; i < parity; i++) { + abd_iter_map(&caiters[i]); + caddrs[i] = caiters[i].iter_mapaddr; + } + + switch (parity) { + case 3: + len = MIN(caiters[2].iter_mapsize, len); + case 2: + len = MIN(caiters[1].iter_mapsize, len); + case 1: + len = MIN(caiters[0].iter_mapsize, len); + } + + /* must be progressive */ + ASSERT3S(len, >, 0); + + if (dabd && dsize > 0) { + /* this needs precise iter.length */ + len = MIN(daiter.iter_mapsize, len); + dlen = len; + } else + dlen = 0; + + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not a multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&caiters[i]); + abd_iter_advance(&caiters[i], len); + } + + if (dabd && dsize > 0) { + abd_iter_unmap(&daiter); + abd_iter_advance(&daiter, dlen); + dsize -= dlen; + } + + csize -= len; + + ASSERT3S(dsize, >=, 0); + ASSERT3S(csize, >=, 0); + } + local_irq_restore(flags); +} + +/* + * Iterate over code ABDs and data reconstruction target ABDs and call + * @func_raidz_rec. Function maps at most 6 pages atomically. + * + * @cabds parity ABDs, must have equal size + * @tabds rec target ABDs, at most 3 + * @tsize size of data target columns + * @func_raidz_rec expects syndrome data in target columns. Function + * reconstructs data and overwrites target columns. + */ +void +abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul) +{ + int i; + ssize_t len; + struct abd_iter citers[3]; + struct abd_iter xiters[3]; + void *caddrs[3], *xaddrs[3]; + unsigned long flags; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) { + abd_iter_init(&citers[i], cabds[i], 2*i); + abd_iter_init(&xiters[i], tabds[i], 2*i+1); + } + + local_irq_save(flags); + while (tsize > 0) { + + for (i = 0; i < parity; i++) { + abd_iter_map(&citers[i]); + abd_iter_map(&xiters[i]); + caddrs[i] = citers[i].iter_mapaddr; + xaddrs[i] = xiters[i].iter_mapaddr; + } + + len = tsize; + switch (parity) { + case 3: + len = MIN(xiters[2].iter_mapsize, len); + len = MIN(citers[2].iter_mapsize, len); + case 2: + len = MIN(xiters[1].iter_mapsize, len); + len = MIN(citers[1].iter_mapsize, len); + case 1: + len = MIN(xiters[0].iter_mapsize, len); + len = MIN(citers[0].iter_mapsize, len); + } + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not a multiple of 512 (raidz). 
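+ * In practice scatter chunks are page-sized and raidz column sizes are
+ * sector-aligned, so each mapped segment here satisfies the assertion below.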
+ */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_rec(xaddrs, len, caddrs, mul); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&xiters[i]); + abd_iter_unmap(&citers[i]); + abd_iter_advance(&xiters[i], len); + abd_iter_advance(&citers[i], len); + } + + tsize -= len; + ASSERT3S(tsize, >=, 0); + } + local_irq_restore(flags); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +/* + * bio_nr_pages for ABD. + * @off is the offset in @abd + */ +unsigned long +abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) +{ + unsigned long pos; + + if (abd_is_linear(abd)) + pos = (unsigned long)abd_to_buf(abd) + off; + else + pos = abd->abd_u.abd_scatter.abd_offset + off; + + return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT); +} + +/* + * bio_map for scatter ABD. + * @off is the offset in @abd + * Remaining IO size is returned + */ +unsigned int +abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + int i; + struct abd_iter aiter; + + ASSERT(!abd_is_linear(abd)); + ASSERT3U(io_size, <=, abd->abd_size - off); + + abd_iter_init(&aiter, abd, 0); + abd_iter_advance(&aiter, off); + + for (i = 0; i < bio->bi_max_vecs; i++) { + struct page *pg; + size_t len, sgoff, pgoff; + struct scatterlist *sg; + + if (io_size <= 0) + break; + + sg = aiter.iter_sg; + sgoff = aiter.iter_offset; + pgoff = sgoff & (PAGESIZE - 1); + len = MIN(io_size, PAGESIZE - pgoff); + ASSERT(len > 0); + + pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); + if (bio_add_page(bio, pg, len, pgoff) != len) + break; + + io_size -= len; + abd_iter_advance(&aiter, len); + } + + return (io_size); +} + +/* Tunable Parameters */ +module_param(zfs_abd_scatter_enabled, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_enabled, + "Toggle whether ABD allocations must be linear."); +/* CSTYLED */ +module_param(zfs_abd_scatter_max_order, uint, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_max_order, + "Maximum order allocation used for a scatter ABD."); +#endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/arc.c zfs-linux-0.7.0-rc3/module/zfs/arc.c --- zfs-linux-0.7.0-rc2/module/zfs/arc.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/arc.c 2017-01-20 18:18:28.000000000 +0000 @@ -136,14 +136,14 @@ * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and - * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the - * ability to store the physical data (b_pdata) associated with the DVA of the - * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block, + * ability to store the physical data (b_pabd) associated with the DVA of the + * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the - * compressed ARC functionality is disabled, the b_pdata will point to an + * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. 
Each @@ -182,7 +182,7 @@ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t - * | b_pdata +-+ |b_next +---->+-----------+ + * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | @@ -199,8 +199,8 @@ * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an - * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a - * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the + * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a + * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be @@ -224,7 +224,7 @@ * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ - * | b_pdata +-+ |---------| |b_next +-->NULL + * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ @@ -238,19 +238,19 @@ * | +------+ | * +---------------------------------+ * - * Writing to the ARC requires that the ARC first discard the hdr's b_pdata + * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will bcopy the transformed on-disk block into - * a newly allocated b_pdata. Writes are always done into buffers which have + * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * - * When the L2ARC is in use, it will also take advantage of the b_pdata. The - * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * When the L2ARC is in use, it will also take advantage of the b_pabd. The + * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the @@ -271,7 +271,9 @@ #include #include #include +#include #include +#include #ifdef _KERNEL #include #include @@ -315,7 +317,7 @@ /* number of seconds before growing cache again */ static int arc_grow_retry = 5; -/* shift of arc_c for calculating overflow limit in arc_get_data_buf */ +/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ @@ -455,13 +457,13 @@ kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. 
+ * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* - * Uncompressed size of the data stored in b_pdata. If compressed + * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. */ @@ -919,6 +921,12 @@ #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ +/* + * We can feed L2ARC from two states of ARC buffers, mru and mfu, + * and each of the states has two types: data and metadata. + */ +#define L2ARC_FEED_TYPES 4 + #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -954,7 +962,7 @@ typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ - void *l2df_data; + abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; @@ -964,15 +972,20 @@ static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); -static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); -static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); +static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); +static void arc_hdr_free_pabd(arc_buf_hdr_t *); +static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); static void arc_tuning_update(void); static void arc_prune_async(int64_t); +static uint64_t arc_all_memory(void); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -1259,7 +1272,7 @@ * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ - while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) + while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; @@ -1329,7 +1342,9 @@ arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && - buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + buf->b_hdr->b_l1hdr.b_pabd != NULL && + abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && + buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); @@ -1369,8 +1384,6 @@ return; if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - hdr->b_l1hdr.b_bufcnt > 1); return; } @@ -1417,7 +1430,8 @@ cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); lsize = HDR_GET_LSIZE(hdr); - csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); if (csize < HDR_GET_PSIZE(hdr)) { /* @@ -1452,7 +1466,7 @@ * logical I/O size and not just a gang fragment.
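The buf_hash_table sizing loop shown earlier in this hunk sequence keeps doubling hsize until one hash bucket exists per zfs_arc_average_blocksize bytes of memory, now measured with arc_all_memory(). A sketch of the doubling, where the 8 KiB average block size, the 16 GiB memory figure, and the 1ULL << 12 starting size are assumed values for illustration:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t avg_blocksize = 8192;		/* assumed average block size */
	uint64_t all_memory = 16ULL << 30;	/* stand-in for arc_all_memory() */
	uint64_t hsize = 1ULL << 12;

	while (hsize * avg_blocksize < all_memory)
		hsize <<= 1;

	/* 2^34 bytes / 2^13 bytes per bucket = 2^21 buckets */
	printf("hsize = %llu\n", (unsigned long long)hsize);
	return (0);
}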
*/ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, - BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); zio_pop_transforms(zio); return (valid_cksum); @@ -1476,18 +1490,9 @@ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { - ASSERT(!ARC_BUF_COMPRESSED(buf) || hdr->b_l1hdr.b_bufcnt > 1); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } else if (ARC_BUF_COMPRESSED(buf)) { - /* - * Since the checksum doesn't apply to compressed buffers, we - * only keep a checksum if there are uncompressed buffers. - * Therefore there must be another buffer, which is - * uncompressed. - */ - IMPLY(hdr->b_l1hdr.b_freeze_cksum != NULL, - hdr->b_l1hdr.b_bufcnt > 1); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } @@ -1505,7 +1510,7 @@ void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { - panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr); + panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif @@ -1582,8 +1587,6 @@ * allocate b_thawed. */ if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - hdr->b_l1hdr.b_bufcnt > 1); return; } @@ -1602,8 +1605,6 @@ return; if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - hdr->b_l1hdr.b_bufcnt > 1); return; } @@ -1733,7 +1734,7 @@ if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { - bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, + abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { @@ -1785,7 +1786,7 @@ return (0); } else { int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pdata, buf->b_data, + hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); /* @@ -1822,7 +1823,7 @@ } /* - * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. + * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 
*/ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) @@ -1855,14 +1856,14 @@ if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } @@ -1890,14 +1891,14 @@ if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } @@ -2044,7 +2045,7 @@ old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); } else { old_state = arc_l2c_only; refcnt = 0; @@ -2113,7 +2114,7 @@ */ (void) refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); } else { arc_buf_t *buf; uint32_t buffers = 0; @@ -2143,7 +2144,7 @@ } ASSERT3U(bufcnt, ==, buffers); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&new_state->arcs_size, arc_hdr_size(hdr), hdr); } else { @@ -2156,7 +2157,7 @@ ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); /* * When moving a header off of a ghost state, @@ -2197,7 +2198,7 @@ buf); } ASSERT3U(bufcnt, ==, buffers); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); (void) refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } @@ -2295,7 +2296,7 @@ /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer - * with the hdr's b_pdata. + * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) @@ -2390,17 +2391,20 @@ * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing its b_pabd with the arc_buf_t. Otherwise, we * allocate a new buffer to store the buf's data. + * - * There is one additional restriction here because we're sharing - * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively - * involved in an L2ARC write, because if this buf is used by an - * arc_write() then the hdr's data buffer will be released when the + * There are two additional restrictions here because we're sharing + * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be + * actively involved in an L2ARC write, because if this buf is used by + * an arc_write() then the hdr's data buffer will be released when the + * write completes, even though the L2ARC write might still be using it. + * Second, the hdr's ABD must be linear so that the buf's user doesn't + * need to be ABD-aware.
*/ - can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr); + can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && + abd_is_linear(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { - buf->b_data = hdr->b_l1hdr.b_pdata; + buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { @@ -2485,11 +2489,11 @@ } static void -l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) +l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - df->l2df_data = data; + df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); @@ -2514,7 +2518,7 @@ } (void) refcount_remove_many(&state->arcs_size, size, hdr); - l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); + l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } /* @@ -2526,7 +2530,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2535,7 +2539,9 @@ * the refcount whenever an arc_buf_t is shared. */ refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); - hdr->b_l1hdr.b_pdata = buf->b_data; + hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); + abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, + HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; @@ -2553,7 +2559,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2562,7 +2568,9 @@ */ refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - hdr->b_l1hdr.b_pdata = NULL; + abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); + abd_put(hdr->b_l1hdr.b_pabd); + hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* @@ -2658,7 +2666,7 @@ if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the - * hdr, then reassign the hdr's b_pdata to share it with the new + * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. 
* @@ -2673,8 +2681,8 @@ /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); - arc_hdr_free_pdata(hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + arc_hdr_free_pabd(hdr); /* * We must setup a new shared block between the @@ -2707,26 +2715,26 @@ } static void -arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) +arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) { ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void -arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +arc_hdr_free_pabd(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the hdr is currently being written to the l2arc then @@ -2738,10 +2746,10 @@ arc_hdr_free_on_write(hdr); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { - arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); } - hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_pabd = NULL; hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); @@ -2777,7 +2785,7 @@ * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. */ - arc_hdr_alloc_pdata(hdr); + arc_hdr_alloc_pabd(hdr); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -2817,7 +2825,7 @@ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); @@ -2835,11 +2843,11 @@ /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } @@ -2924,6 +2932,18 @@ arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + if (!arc_buf_is_shared(buf)) { + /* + * To ensure that the hdr has the correct data in it if we call + * arc_decompress() on this buf before it's been written to + * disk, it's easiest if we just set up sharing between the + * buf and the hdr. + */ + ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); + arc_hdr_free_pabd(hdr); + arc_share_buf(hdr, buf); + } + return (buf); } @@ -2992,9 +3012,8 @@ while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); - if (hdr->b_l1hdr.b_pdata != NULL) { - arc_hdr_free_pdata(hdr); - } + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_pabd(hdr); } ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -3061,7 +3080,7 @@ /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_pdata field) during its write phase. + * (i.e. 
its b_pabd field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. @@ -3077,7 +3096,7 @@ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_pdata == NULL); + ASSERT(hdr->b_l1hdr.b_pabd == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. @@ -3142,9 +3161,9 @@ * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly - * in arc_free_data_buf(). + * in arc_free_data_impl(). */ - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); @@ -3242,7 +3261,7 @@ * thread. If we used cv_broadcast, we could * wake up "too many" threads causing arc_size * to significantly overflow arc_c; since - * arc_get_data_buf() doesn't check for overflow + * arc_get_data_impl() doesn't check for overflow * when it's woken up (it doesn't because it's * possible for the ARC to be overflowing while * full of un-evictable buffers, and the @@ -3473,7 +3492,7 @@ refcount_add(&ap->p_refcnt, ap->p_pfunc); ap->p_adjust = adjust; if (taskq_dispatch(arc_prune_taskq, arc_prune_task, - ap, TQ_SLEEP) == 0) { + ap, TQ_SLEEP) == TASKQID_INVALID) { refcount_remove(&ap->p_refcnt, ap->p_pfunc); continue; } @@ -3922,6 +3941,21 @@ (void) arc_adjust(); } +/* + * Return maximum amount of memory that we could possibly use. Reduced + * to half of all memory in user space, which is primarily used for testing. + */ +static uint64_t +arc_all_memory(void) +{ +#ifdef _KERNEL + return (MIN(ptob(physmem), + vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC))); +#else + return (ptob(physmem) / 2); +#endif +} + typedef enum free_memory_reason_t { FMR_UNKNOWN, FMR_NEEDFREE, @@ -3958,6 +3992,7 @@ int64_t lowest = INT64_MAX; free_memory_reason_t r = FMR_UNKNOWN; #ifdef _KERNEL + uint64_t available_memory = ptob(freemem); int64_t n; #ifdef __linux__ pgcnt_t needfree = btop(arc_need_free); @@ -3965,6 +4000,11 @@ pgcnt_t desfree = 0; #endif +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif + if (needfree > 0) { n = PAGESIZE * (-needfree); if (n < lowest) { @@ -3980,7 +4020,7 @@ * number of needed free pages. We add extra pages here to make sure * the scanner doesn't start up while we're freeing memory. */ - n = PAGESIZE * (freemem - lotsfree - needfree - desfree); + n = PAGESIZE * (btop(available_memory) - lotsfree - needfree - desfree); if (n < lowest) { lowest = n; r = FMR_LOTSFREE; @@ -4047,8 +4087,9 @@ * fragmentation issues. */ if (zio_arena != NULL) { - n = vmem_size(zio_arena, VMEM_FREE) - (vmem_size(zio_arena, - VMEM_ALLOC) >> arc_zio_arena_free_shift); + n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - + (vmem_size(zio_arena, VMEM_ALLOC) >> + arc_zio_arena_free_shift); if (n < lowest) { lowest = n; r = FMR_ZIO_ARENA; @@ -4125,13 +4166,13 @@ } /* - * Threads can block in arc_get_data_buf() waiting for this thread to evict + * Threads can block in arc_get_data_impl() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in - * arc_get_data_buf() are sleeping while holding the hash lock for their + * arc_get_data_impl() are sleeping while holding the hash lock for their * particular arc header. Thus, we must be careful to never sleep on a * hash lock in this thread.
This is to prevent the following deadlock: * - * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", + * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", * waiting for the reclaim thread to signal it. * * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, @@ -4397,7 +4438,7 @@ /* * Adapt arc info given the number of bytes we are trying to add and - * the state that we are comming from. This function is only called + * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void @@ -4480,18 +4521,45 @@ return (arc_size >= arc_c + overflow); } +static abd_t * +arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_get_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + return (abd_alloc(size, B_TRUE)); + } else { + ASSERT(type == ARC_BUFC_DATA); + return (abd_alloc(size, B_FALSE)); + } +} + +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_get_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + return (zio_buf_alloc(size)); + } else { + ASSERT(type == ARC_BUFC_DATA); + return (zio_data_buf_alloc(size)); + } +} + /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ -static void * -arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +static void +arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - void *datap = NULL; - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -4533,11 +4601,8 @@ VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { - ASSERT(type == ARC_BUFC_DATA); - datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -4573,14 +4638,34 @@ refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } - return (datap); +} + +static void +arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) +{ + arc_free_data_impl(hdr, size, tag); + abd_free(abd); +} + +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_free_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(buf, size); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(buf, size); + } } /* * Free the arc data buffer. 
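The refactor above is a plain wrapper/impl split: arc_get_data_impl() keeps the shared accounting and overflow logic, while the thin arc_get_data_abd()/arc_get_data_buf() wrappers differ only in which allocator they call. A toy version of the pattern; the _x names and the malloc/calloc stand-ins are assumptions, not the ZFS allocators:

#include <stdio.h>
#include <stdlib.h>

typedef enum { BUFC_METADATA_X, BUFC_DATA_X } bufc_type_x;

static unsigned long consumed_x;	/* stand-in for arc_space_consume() */

static void
get_data_impl_x(size_t size)
{
	consumed_x += size;		/* bookkeeping shared by all wrappers */
}

static void *
get_data_buf_x(bufc_type_x type, size_t size)
{
	get_data_impl_x(size);
	/* only the allocator choice lives in the wrapper */
	if (type == BUFC_METADATA_X)
		return (calloc(1, size));
	return (malloc(size));
}

int
main(void)
{
	void *p = get_data_buf_x(BUFC_DATA_X, 4096);

	printf("consumed %lu bytes\n", consumed_x);
	free(p);
	return (0);
}

Keeping the accounting in one _impl routine is what lets the ABD and linear allocation paths stay in sync without duplicating the overflow checks.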
*/ static void -arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); @@ -4597,11 +4682,9 @@ VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - zio_buf_free(data, size); arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(data, size); arc_space_return(size, ARC_SPACE_DATA); } } @@ -4883,7 +4966,7 @@ if (callback_cnt == 0) { ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || @@ -4980,7 +5063,7 @@ hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; @@ -5132,7 +5215,7 @@ hdr_full_cache); } - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -5150,9 +5233,9 @@ * avoid hitting an assert in remove_reference(). */ arc_access(hdr, hash_lock); - arc_hdr_alloc_pdata(hdr); + arc_hdr_alloc_pabd(hdr); } - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); /* @@ -5256,7 +5339,7 @@ ASSERT3U(HDR_GET_COMPRESS(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, - size, hdr->b_l1hdr.b_pdata, + size, hdr->b_l1hdr.b_pabd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | @@ -5296,7 +5379,7 @@ } } - rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) { @@ -5528,16 +5611,17 @@ arc_unshare_buf(hdr, buf); /* - * Now we need to recreate the hdr's b_pdata. Since we + * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if - * we can't then we allocate a new b_pdata and copy the + * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_pdata(hdr); - bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize); + arc_hdr_alloc_pabd(hdr); + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, + buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { @@ -5553,7 +5637,7 @@ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, @@ -5572,7 +5656,7 @@ mutex_exit(hash_lock); /* - * Allocate a new hdr. The new hdr will contain a b_pdata + * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); @@ -5648,6 +5732,7 @@ arc_buf_hdr_t *hdr = buf->b_hdr; uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 
0 : BP_GET_PSIZE(zio->io_bp); enum zio_compress compress; + fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); @@ -5661,15 +5746,15 @@ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); } } } - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); @@ -5691,33 +5776,47 @@ arc_hdr_set_compress(hdr, compress); /* - * If the hdr is compressed, then copy the compressed - * zio contents into arc_buf_hdr_t. Otherwise, copy the original - * data buf into the hdr. Ideally, we would like to always copy the - * io_data into b_pdata but the user may have disabled compressed - * arc thus the on-disk block may or may not match what we maintain - * in the hdr's b_pdata field. - */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - !ARC_BUF_COMPRESSED(buf)) { - ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF); - ASSERT3U(psize, >, 0); - arc_hdr_alloc_pdata(hdr); - bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + * Fill the hdr with data. If the hdr is compressed, the data we want + * is available from the zio, otherwise we can take it from the buf. + * + * We might be able to share the buf's data with the hdr here. However, + * doing so would cause the ARC to be full of linear ABDs if we write a + * lot of shareable data. As a compromise, we check whether scattered + * ABDs are allowed, and assume that if they are then the user wants + * the ARC to be primarily filled with them regardless of the data being + * written. Therefore, if they're allowed then we allocate one and copy + * the data into it; otherwise, we share the data directly if we can. + */ + if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { + arc_hdr_alloc_pabd(hdr); + + /* + * Ideally, we would always copy the io_abd into b_pabd, but the + * user may have disabled compressed ARC, thus we must check the + * hdr's compression setting rather than the io_bp's. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, + ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + + abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); + } else { + ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); + + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, + arc_buf_size(buf)); + } } else { - ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - /* - * This hdr is not compressed so we're able to share - * the arc_buf_t data buffer with the hdr. - */ arc_share_buf(hdr, buf); - ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, - HDR_GET_LSIZE(hdr))); } + arc_hdr_verify(hdr, zio->io_bp); + spl_fstrans_unmark(cookie); } static void @@ -5821,6 +5920,7 @@ ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); + abd_put(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } @@ -5857,10 +5957,10 @@ callback->awcb_buf = buf; /* - * The hdr's b_pdata is now stale, free it now. A new data block + * The hdr's b_pabd is now stale, free it now. 
A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. @@ -5870,15 +5970,16 @@ if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); } VERIFY3P(buf->b_data, !=, NULL); arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } ASSERT(!arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - zio = zio_write(pio, spa, txg, bp, buf->b_data, + zio = zio_write(pio, spa, txg, bp, + abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, @@ -5899,7 +6000,12 @@ pgcnt_t minfree = btop(arc_sys_free / 4); #endif - if (freemem > physmem * arc_lotsfree_percent / 100) +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif + + if (available_memory > arc_all_memory() * arc_lotsfree_percent / 100) return (0); if (txg > last_txg) { @@ -6086,10 +6192,11 @@ static void arc_tuning_update(void) { - uint64_t percent; + uint64_t percent, allmem = arc_all_memory(); + /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && - (zfs_arc_max > 64 << 20) && (zfs_arc_max < ptob(physmem)) && + (zfs_arc_max > 64 << 20) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = arc_c_max; @@ -6155,7 +6262,7 @@ /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) - arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), ptob(physmem)); + arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem); } @@ -6282,15 +6389,7 @@ void arc_init(void) { - /* - * allmem is "all memory that we could possibly use". - */ -#ifdef _KERNEL - uint64_t allmem = ptob(physmem); -#else - uint64_t allmem = (physmem * PAGESIZE) / 2; -#endif - uint64_t percent; + uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); @@ -6308,7 +6407,7 @@ spl_register_shrinker(&arc_shrinker); /* Set to 1/64 of all memory or a minimum of 512K */ - arc_sys_free = MAX(ptob(physmem / 64), (512 * 1024)); + arc_sys_free = MAX(allmem / 64, (512 * 1024)); arc_need_free = 0; #endif @@ -6392,11 +6491,11 @@ * zfs_dirty_data_max_max (default 25% of physical memory). 
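The arc_init() change above derives the dirty-data limits from allmem rather than from physmem * PAGESIZE directly. The arithmetic itself is percentage-of-memory with a cap; a sketch assuming 16 GiB of memory and 10%/25% settings (the percentages are quoted here as assumptions, not taken from the patch):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t allmem = 16ULL << 30;			/* assumed total memory */
	uint64_t dirty_max_max = allmem * 25 / 100;	/* hard ceiling */
	uint64_t dirty_max = allmem * 10 / 100;		/* default target */

	if (dirty_max > dirty_max_max)
		dirty_max = dirty_max_max;

	printf("dirty_max = %llu bytes\n", (unsigned long long)dirty_max);
	return (0);
}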
*/ if (zfs_dirty_data_max_max == 0) - zfs_dirty_data_max_max = (uint64_t)physmem * PAGESIZE * + zfs_dirty_data_max_max = allmem * zfs_dirty_data_max_max_percent / 100; if (zfs_dirty_data_max == 0) { - zfs_dirty_data_max = (uint64_t)physmem * PAGESIZE * + zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); @@ -6741,13 +6840,8 @@ for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT3P(df->l2df_data, !=, NULL); - if (df->l2df_type == ARC_BUFC_METADATA) { - zio_buf_free(df->l2df_data, df->l2df_size); - } else { - ASSERT(df->l2df_type == ARC_BUFC_DATA); - zio_data_buf_free(df->l2df_data, df->l2df_size); - } + ASSERT3P(df->l2df_abd, !=, NULL); + abd_free(df->l2df_abd); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -6901,12 +6995,12 @@ mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(zio->io_data, !=, NULL); + ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ - ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ @@ -6940,7 +7034,7 @@ ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, - hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } @@ -6965,7 +7059,7 @@ multilist_t *ml = NULL; unsigned int idx; - ASSERT(list_num >= 0 && list_num <= 3); + ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: @@ -6980,6 +7074,8 @@ case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; + default: + return (NULL); } /* @@ -7138,10 +7234,12 @@ /* * Copy buffers for L2ARC writing. */ - for (try = 0; try <= 3; try++) { + for (try = 0; try < L2ARC_FEED_TYPES; try++) { multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; + VERIFY3P(mls, !=, NULL); + /* * L2ARC fast warmup. * @@ -7160,7 +7258,7 @@ for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; uint64_t asize, size; - void *to_write; + abd_t *to_write; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -7233,7 +7331,7 @@ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3U(arc_hdr_size(hdr), >, 0); size = arc_hdr_size(hdr); @@ -7249,18 +7347,13 @@ * add it to the l2arc_free_on_write queue. 
*/ if (!HDR_SHARED_DATA(hdr)) { - to_write = hdr->b_l1hdr.b_pdata; + to_write = hdr->b_l1hdr.b_pabd; } else { - arc_buf_contents_t type = arc_buf_type(hdr); - if (type == ARC_BUFC_METADATA) { - to_write = zio_buf_alloc(size); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - to_write = zio_data_buf_alloc(size); - } - - bcopy(hdr->b_l1hdr.b_pdata, to_write, size); - l2arc_free_data_on_write(to_write, size, type); + to_write = abd_alloc_for_io(size, + HDR_ISTYPE_METADATA(hdr)); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + l2arc_free_abd_on_write(to_write, size, + arc_buf_type(hdr)); } wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, size, to_write, @@ -7595,6 +7688,7 @@ EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); +/* BEGIN CSTYLED */ module_param(zfs_arc_min, ulong, 0644); MODULE_PARM_DESC(zfs_arc_min, "Min arc size"); @@ -7693,5 +7787,5 @@ module_param(zfs_arc_dnode_reduce_percent, ulong, 0644); MODULE_PARM_DESC(zfs_arc_dnode_reduce_percent, "Percentage of excess dnodes to try to unpin"); - +/* END CSTYLED */ #endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/blkptr.c zfs-linux-0.7.0-rc3/module/zfs/blkptr.c --- zfs-linux-0.7.0-rc2/module/zfs/blkptr.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/blkptr.c 2017-01-20 18:18:28.000000000 +0000 @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include diff -Nru zfs-linux-0.7.0-rc2/module/zfs/bptree.c zfs-linux-0.7.0-rc3/module/zfs/bptree.c --- zfs-linux-0.7.0-rc2/module/zfs/bptree.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/bptree.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #include @@ -223,7 +223,8 @@ flags |= TRAVERSE_HARD; zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " "bookmark %lld/%lld/%lld/%lld", - i, (longlong_t)bte.be_birth_txg, + (longlong_t)i, + (longlong_t)bte.be_birth_txg, (longlong_t)bte.be_zb.zb_objset, (longlong_t)bte.be_zb.zb_object, (longlong_t)bte.be_zb.zb_level, diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dbuf.c zfs-linux-0.7.0-rc3/module/zfs/dbuf.c --- zfs-linux-0.7.0-rc2/module/zfs/dbuf.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dbuf.c 2017-01-20 18:18:28.000000000 +0000 @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -46,6 +46,7 @@ #include #include #include +#include struct dbuf_hold_impl_data { /* Function arguments */ @@ -149,7 +150,7 @@ * cache size). Once the eviction thread is woken up and eviction is required, * it will continue evicting buffers until it's able to reduce the cache size * to the low water mark. If the cache size continues to grow and hits the high - * water mark, then callers adding elments to the cache will begin to evict + * water mark, then callers adding elements to the cache will begin to evict * directly from the cache until the cache is no longer above the high water * mark. 
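The dbuf cache comment above describes simple hysteresis: the evict thread runs the cache down to a low water mark, and only once the high water mark is crossed do callers evict synchronously. A toy model of the two thresholds; the 10% margins and byte counts are made-up values, not the dbuf_cache_hiwater_pct/dbuf_cache_lowater_pct tunables themselves:

#include <stdbool.h>
#include <stdio.h>

static unsigned long cache_size_x;
static const unsigned long cache_max_x = 100;

static bool
above_hiwater_x(void)	/* callers must evict directly */
{
	return (cache_size_x > cache_max_x + cache_max_x / 10);
}

static bool
above_lowater_x(void)	/* evict thread keeps working until false */
{
	return (cache_size_x > cache_max_x - cache_max_x / 10);
}

int
main(void)
{
	cache_size_x = 115;
	printf("direct evict: %d\n", above_hiwater_x());	/* 1 */
	cache_size_x = 95;
	printf("thread evicting: %d\n", above_lowater_x());	/* 1, until 90 */
	return (0);
}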
*/ @@ -319,7 +320,7 @@ idx = hv & h->hash_table_mask; /* - * We musn't hold db_mtx to maintain lock ordering: + * We mustn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(refcount_is_zero(&db->db_holds)); @@ -789,7 +790,7 @@ } else { /* db is pointed to by an indirect block */ ASSERTV(int epb = db->db_parent->db.db_size >> - SPA_BLKPTRSHIFT); + SPA_BLKPTRSHIFT); ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); @@ -1019,7 +1020,7 @@ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = zio_buf_alloc(max_bonuslen); + db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) bzero(db->db.db_data, max_bonuslen); @@ -1131,10 +1132,9 @@ */ ASSERT(dr->dr_txg >= txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ dnode_t *dn = DB_DNODE(db); int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); + dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); arc_space_consume(bonuslen, ARC_SPACE_BONUS); bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { @@ -1207,7 +1207,8 @@ } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; - if (zio == NULL) + if (zio == NULL && + db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); err = dbuf_read_impl(db, zio, flags); @@ -1221,7 +1222,7 @@ rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); - if (!err && !havepzio) + if (!err && !havepzio && zio != NULL) err = zio_wait(zio); } else { /* @@ -2156,7 +2157,7 @@ int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); ASSERT(db->db.db_data != NULL); - zio_buf_free(db->db.db_data, bonuslen); + kmem_free(db->db.db_data, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; } @@ -2685,8 +2686,7 @@ ASSERT3P(dh->dh_parent, ==, NULL); dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_fail_sparse, &dh->dh_parent, - &dh->dh_bp, dh); + dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh); if (dh->dh_fail_sparse) { if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) @@ -2700,7 +2700,7 @@ if (dh->dh_err && dh->dh_err != ENOENT) return (dh->dh_err); dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_parent, dh->dh_bp); + dh->dh_parent, dh->dh_bp); } if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { @@ -2774,7 +2774,7 @@ dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) * DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, - fail_uncached, tag, dbp, 0); + fail_uncached, tag, dbp, 0); error = __dbuf_hold_impl(dh); @@ -3303,7 +3303,7 @@ if (*datap != db->db.db_data) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - zio_buf_free(*datap, bonuslen); + kmem_free(*datap, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); } db->db_data_pending = NULL; @@ -3709,6 +3709,9 @@ mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); + + if (zio->io_abd != NULL) + abd_put(zio->io_abd); } /* Issue I/O to commit a dirty buffer to disk. */ @@ -3801,7 +3804,8 @@ * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). 
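The dbuf changes in this region follow one pattern throughout: wrap an existing buffer with abd_get_from_buf() so the I/O pipeline sees an ABD, then drop only the wrapper with abd_put() when the zio completes, leaving the underlying buffer to its real owner. A toy version of that borrow/release lifecycle; abd_view_x and its functions are illustrative stand-ins, not the real ABD API:

#include <stdio.h>
#include <stdlib.h>

typedef struct {
	void *view_buf;		/* borrowed, never freed here */
	size_t view_size;
} abd_view_x;

static abd_view_x *
view_get_x(void *buf, size_t size)	/* models abd_get_from_buf() */
{
	abd_view_x *v = malloc(sizeof (*v));

	v->view_buf = buf;
	v->view_size = size;
	return (v);
}

static void
view_put_x(abd_view_x *v)		/* models abd_put() */
{
	free(v);	/* only the wrapper dies; the buffer lives on */
}

int
main(void)
{
	char data[512] = "payload";
	abd_view_x *v = view_get_x(data, sizeof (data));

	printf("io sees \"%s\" (%zu bytes)\n", (char *)v->view_buf,
	    v->view_size);
	view_put_x(v);
	printf("owner still has \"%s\"\n", data);
	return (0);
}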
*/ - void *contents = (data != NULL) ? data->b_data : NULL; + abd_t *contents = (data != NULL) ? + abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, @@ -3879,23 +3883,23 @@ EXPORT_SYMBOL(dmu_buf_freeable); EXPORT_SYMBOL(dmu_buf_get_blkptr); - +/* BEGIN CSTYLED */ module_param(dbuf_cache_max_bytes, ulong, 0644); MODULE_PARM_DESC(dbuf_cache_max_bytes, - "Maximum size in bytes of the dbuf cache."); + "Maximum size in bytes of the dbuf cache."); module_param(dbuf_cache_hiwater_pct, uint, 0644); MODULE_PARM_DESC(dbuf_cache_hiwater_pct, - "Percentage over dbuf_cache_max_bytes when dbufs \ - much be evicted directly."); + "Percentage over dbuf_cache_max_bytes when dbufs must be evicted " + "directly."); module_param(dbuf_cache_lowater_pct, uint, 0644); MODULE_PARM_DESC(dbuf_cache_lowater_pct, - "Percentage below dbuf_cache_max_bytes \ - when the evict thread stop evicting dbufs."); + "Percentage below dbuf_cache_max_bytes when the evict thread stops " + "evicting dbufs."); module_param(dbuf_cache_max_shift, int, 0644); MODULE_PARM_DESC(dbuf_cache_max_shift, - "Cap the size of the dbuf cache to log2 fraction of arc size."); - + "Cap the size of the dbuf cache to a log2 fraction of arc size."); +/* END CSTYLED */ #endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/ddt.c zfs-linux-0.7.0-rc3/module/zfs/ddt.c --- zfs-linux-0.7.0-rc2/module/zfs/ddt.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/ddt.c 2017-01-20 18:18:28.000000000 +0000 @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
*/ #include @@ -36,6 +36,7 @@ #include #include #include +#include static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_entry_cache; @@ -528,10 +529,17 @@ uint64_t ddt_get_dedup_dspace(spa_t *spa) { - ddt_stat_t dds_total = { 0 }; + ddt_stat_t dds_total; + + if (spa->spa_dedup_dspace != ~0ULL) + return (spa->spa_dedup_dspace); + + bzero(&dds_total, sizeof (ddt_stat_t)); + /* Calculate and cache the stats */ ddt_get_dedup_stats(spa, &dds_total); - return (dds_total.dds_ref_dsize - dds_total.dds_dsize); + spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize; + return (spa->spa_dedup_dspace); } uint64_t @@ -706,9 +714,8 @@ for (p = 0; p < DDT_PHYS_TYPES; p++) ASSERT(dde->dde_lead_zio[p] == NULL); - if (dde->dde_repair_data != NULL) - zio_buf_free(dde->dde_repair_data, - DDK_GET_PSIZE(&dde->dde_key)); + if (dde->dde_repair_abd != NULL) + abd_free(dde->dde_repair_abd); cv_destroy(&dde->dde_cv); kmem_cache_free(ddt_entry_cache, dde); @@ -915,6 +922,7 @@ */ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, sizeof (ddt->ddt_histogram)); + spa->spa_dedup_dspace = ~0ULL; } return (0); @@ -1002,7 +1010,7 @@ ddt_enter(ddt); - if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else @@ -1040,7 +1048,7 @@ continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, - rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); } @@ -1182,6 +1190,7 @@ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, sizeof (ddt->ddt_histogram)); + spa->spa_dedup_dspace = ~0ULL; } void diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dmu.c zfs-linux-0.7.0-rc3/module/zfs/dmu.c --- zfs-linux-0.7.0-rc2/module/zfs/dmu.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dmu.c 2017-01-20 18:18:28.000000000 +0000 @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -369,12 +370,17 @@ if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); - ASSERT(db != NULL); + if (db == NULL) { + *dbp = NULL; + return (SET_ERROR(EIO)); + } err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; - else + else { dbuf_rele(db, tag); + *dbp = NULL; + } return (err); } @@ -816,17 +822,12 @@ return (0); } -int -dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, +static int +dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { - dnode_t *dn; dmu_buf_t **dbp; - int numbufs, err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); + int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first @@ -871,22 +872,37 @@ } dmu_buf_rele_array(dbp, numbufs, FTAG); } - dnode_rele(dn, FTAG); return (err); } -void -dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags) { - dmu_buf_t **dbp; - int numbufs, i; + dnode_t *dn; + int err; - if (size == 0) - return; + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) + return (err); - VERIFY0(dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + err = dmu_read_impl(dn, offset, size, buf, 
flags); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, + uint32_t flags) +{ + return (dmu_read_impl(dn, offset, size, buf, flags)); +} + +static void +dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + int i; for (i = 0; i < numbufs; i++) { uint64_t tocpy; @@ -914,6 +930,37 @@ size -= tocpy; buf = (char *)buf + tocpy; } +} + +void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if (size == 0) + return; + + VERIFY0(dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if (size == 0) + return; + + VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } @@ -1096,13 +1143,13 @@ } void -xuio_stat_wbuf_copied() +xuio_stat_wbuf_copied(void) { XUIOSTAT_BUMP(xuiostat_wbuf_copied); } void -xuio_stat_wbuf_nocopy() +xuio_stat_wbuf_nocopy(void) { XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); } @@ -1508,6 +1555,7 @@ dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + abd_put(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } @@ -1532,11 +1580,11 @@ dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; - zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), - zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, - zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, - NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL, zb)); + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), + zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } @@ -2057,6 +2105,7 @@ void dmu_init(void) { + abd_init(); zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); @@ -2082,6 +2131,7 @@ xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); + abd_fini(); } #if defined(_KERNEL) && defined(HAVE_SPL) @@ -2093,7 +2143,9 @@ EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); +EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_write); +EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dmu_object.c zfs-linux-0.7.0-rc3/module/zfs/dmu_object.c --- zfs-linux-0.7.0-rc2/module/zfs/dmu_object.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dmu_object.c 2017-01-20 18:18:28.000000000 +0000 @@ -129,11 +129,11 @@ } dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); - dnode_rele(dn, FTAG); - mutex_exit(&os->os_obj_lock); - dmu_tx_add_new_object(tx, os, object); + dmu_tx_add_new_object(tx, os, dn); + dnode_rele(dn, FTAG); + return (object); } @@ -168,9 +168,10 @@ return (err); dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); + dmu_tx_add_new_object(tx, os, dn); + dnode_rele(dn, FTAG); - dmu_tx_add_new_object(tx, os, object); 
return (0); } @@ -236,28 +237,39 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { uint64_t offset; - dmu_object_info_t doi; + uint64_t start_obj; struct dsl_dataset *ds = os->os_dsl_dataset; - int dnodesize; int error; - /* - * Avoid expensive dnode hold if this dataset doesn't use large dnodes. - */ - if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { - error = dmu_object_info(os, *objectp, &doi); - if (error && !(error == EINVAL && *objectp == 0)) - return (SET_ERROR(error)); - else - dnodesize = doi.doi_dnodesize; + if (*objectp == 0) { + start_obj = 1; + } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { + /* + * For large_dnode datasets, scan from the beginning of the + * dnode block to find the starting offset. This is needed + * because objectp could be part of a large dnode so we can't + * assume it's a hole even if dmu_object_info() returns ENOENT. + */ + int epb = DNODE_BLOCK_SIZE >> DNODE_SHIFT; + int skip; + uint64_t i; + + for (i = *objectp & ~(epb - 1); i <= *objectp; i += skip) { + dmu_object_info_t doi; + + error = dmu_object_info(os, i, &doi); + if (error) + skip = 1; + else + skip = doi.doi_dnodesize >> DNODE_SHIFT; + } + + start_obj = i; } else { - dnodesize = DNODE_MIN_SIZE; + start_obj = *objectp + 1; } - if (*objectp == 0) - offset = 1 << DNODE_SHIFT; - else - offset = (*objectp << DNODE_SHIFT) + dnodesize; + offset = start_obj << DNODE_SHIFT; error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dmu_objset.c zfs-linux-0.7.0-rc3/module/zfs/dmu_objset.c --- zfs-linux-0.7.0-rc2/module/zfs/dmu_objset.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dmu_objset.c 2017-01-20 18:18:28.000000000 +0000 @@ -63,7 +63,7 @@ krwlock_t os_lock; /* - * Tunable to overwrite the maximum number of threads for the parallization + * Tunable to overwrite the maximum number of threads for the parallelization * of dmu_objset_find_dp, needed to speed up the import of pools with many * datasets. * Default is 4 times the number of leaf vdevs. 
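The dmu_object_next() rewrite above walks a dnode block slot by slot, advancing by each object's slot count so that an object number sitting inside a multi-slot dnode is never misread as a hole. A compact sketch of that scan; the slot table and EPB_X value are fabricated for illustration, with slot count 0 standing in for the error case where the kernel code falls back to skip = 1:

#include <stdio.h>

#define	EPB_X	32	/* dnodes per block, illustrative */

/* slots consumed by the object starting at each index; 0 = no object */
static int slots_x[EPB_X] = { 1, 2, 0, 1, 4 };

int
main(void)
{
	unsigned long objectp = 4;	/* may sit inside a multi-slot dnode */
	unsigned long i;
	int skip;

	for (i = objectp & ~(EPB_X - 1UL); i <= objectp; i += skip)
		skip = (slots_x[i] != 0) ? slots_x[i] : 1;

	printf("scan resumes at object %lu\n", i);
	return (0);
}

Here object 4 is a 4-slot dnode, so the scan resumes at object 8 rather than 5, matching the start_obj computation in the patch.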
@@ -1118,7 +1118,7 @@ os->os_upgrade_id = taskq_dispatch( os->os_spa->spa_upgrade_taskq, dmu_objset_upgrade_task_cb, os, TQ_SLEEP); - if (os->os_upgrade_id == 0) + if (os->os_upgrade_id == TASKQID_INVALID) os->os_upgrade_status = ENOMEM; } mutex_exit(&os->os_upgrade_lock); @@ -1771,6 +1771,15 @@ dmu_objset_upgrade(os, dmu_objset_userobjspace_upgrade_cb); } +boolean_t +dmu_objset_userobjspace_upgradable(objset_t *os) +{ + return (dmu_objset_type(os) == DMU_OST_ZFS && + !dmu_objset_is_snapshot(os) && + dmu_objset_userobjused_enabled(os) && + !dmu_objset_userobjspace_present(os)); +} + void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) @@ -2334,5 +2343,6 @@ EXPORT_SYMBOL(dmu_objset_userspace_present); EXPORT_SYMBOL(dmu_objset_userobjused_enabled); EXPORT_SYMBOL(dmu_objset_userobjspace_upgrade); +EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable); EXPORT_SYMBOL(dmu_objset_userobjspace_present); #endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dmu_send.c zfs-linux-0.7.0-rc3/module/zfs/dmu_send.c --- zfs-linux-0.7.0-rc2/module/zfs/dmu_send.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dmu_send.c 2017-01-20 18:18:28.000000000 +0000 @@ -166,7 +166,7 @@ { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - fletcher_4_incremental_native(dsp->dsa_drr, + (void) fletcher_4_incremental_native(dsp->dsa_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dsp->dsa_zc); if (dsp->dsa_drr->drr_type == DRR_BEGIN) { @@ -179,13 +179,13 @@ if (dsp->dsa_drr->drr_type == DRR_END) { dsp->dsa_sent_end = B_TRUE; } - fletcher_4_incremental_native(&dsp->dsa_drr-> + (void) fletcher_4_incremental_native(&dsp->dsa_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dsp->dsa_zc); if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { - fletcher_4_incremental_native(payload, payload_len, + (void) fletcher_4_incremental_native(payload, payload_len, &dsp->dsa_zc); if (dump_bytes(dsp, payload, payload_len) != 0) return (SET_ERROR(EINTR)); @@ -613,6 +613,7 @@ data->eos_marker = B_TRUE; bqueue_enqueue(&st_arg->q, data, 1); spl_fstrans_unmark(cookie); + thread_exit(); } /* @@ -1591,7 +1592,7 @@ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, - 8, 1, &one, tx)); + 8, 1, &one, tx)); } if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA) { @@ -1786,11 +1787,11 @@ if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; - fletcher_4_incremental_byteswap(drr_begin, + (void) fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); byteswap_record(drr_begin); } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { - fletcher_4_incremental_native(drr_begin, + (void) fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { return (SET_ERROR(EINVAL)); @@ -2470,9 +2471,9 @@ receive_cksum(struct receive_arg *ra, int len, void *buf) { if (ra->byteswap) { - fletcher_4_incremental_byteswap(buf, len, &ra->cksum); + (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); } else { - fletcher_4_incremental_native(buf, len, &ra->cksum); + (void) fletcher_4_incremental_native(buf, len, &ra->cksum); } } @@ -2874,6 +2875,7 @@ cv_signal(&rwa->cv); 
mutex_exit(&rwa->mutex); spl_fstrans_unmark(cookie); + thread_exit(); } static int diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dmu_traverse.c zfs-linux-0.7.0-rc3/module/zfs/dmu_traverse.c --- zfs-linux-0.7.0-rc2/module/zfs/dmu_traverse.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dmu_traverse.c 2017-01-20 18:18:28.000000000 +0000 @@ -615,8 +615,8 @@ } if (!(flags & TRAVERSE_PREFETCH_DATA) || - 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, - td, TQ_NOQUEUE)) + taskq_dispatch(system_taskq, traverse_prefetch_thread, + td, TQ_NOQUEUE) == TASKQID_INVALID) pd->pd_exited = B_TRUE; err = traverse_visitbp(td, NULL, rootbp, czb); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dmu_tx.c zfs-linux-0.7.0-rc3/module/zfs/dmu_tx.c --- zfs-linux-0.7.0-rc2/module/zfs/dmu_tx.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dmu_tx.c 2017-01-20 18:18:28.000000000 +0000 @@ -113,21 +113,14 @@ } static dmu_tx_hold_t * -dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, - enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) +dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, + uint64_t arg1, uint64_t arg2) { dmu_tx_hold_t *txh; - dnode_t *dn = NULL; - int err; - - if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os, object, tx, &dn); - if (err) { - tx->tx_err = err; - return (NULL); - } - if (err == 0 && tx->tx_txg != 0) { + if (dn != NULL) { + refcount_add(&dn->dn_holds, tx); + if (tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a @@ -154,17 +147,36 @@ return (txh); } +static dmu_tx_hold_t * +dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, + enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) +{ + dnode_t *dn = NULL; + dmu_tx_hold_t *txh; + int err; + + if (object != DMU_NEW_OBJECT) { + err = dnode_hold(os, object, FTAG, &dn); + if (err) { + tx->tx_err = err; + return (NULL); + } + } + txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); + if (dn != NULL) + dnode_rele(dn, FTAG); + return (txh); +} + void -dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) +dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, dnode_t *dn) { /* * If we're syncing, they can manipulate any object anyhow, and * the hold on the dnode_t can cause problems. 
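The new *_by_dnode hold variants introduced above exist so a caller that already holds a dnode can attach tx holds without repeating the object-number lookup; dmu_tx_hold_object_impl() is reduced to resolve-then-delegate. A toy model of the id-based versus handle-based split; handle_x and these functions are illustrative, not the DMU API:

#include <stdio.h>

typedef struct {
	unsigned long obj_id;
	int nops;
} handle_x;

static handle_x table_x[8];

static handle_x *
hold_x(unsigned long id)	/* models a dnode_hold()-style lookup */
{
	table_x[id].obj_id = id;
	return (&table_x[id]);
}

static void
op_by_handle_x(handle_x *h)	/* models a *_by_dnode() operation */
{
	h->nops++;
}

int
main(void)
{
	handle_x *h = hold_x(3);	/* one lookup... */

	op_by_handle_x(h);		/* ...then many cheap operations */
	op_by_handle_x(h);
	printf("obj %lu: %d ops\n", h->obj_id, h->nops);
	return (0);
}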
*/ - if (!dmu_tx_is_syncing(tx)) { - (void) dmu_tx_hold_object_impl(tx, os, - object, THT_NEWOBJECT, 0, 0); - } + if (!dmu_tx_is_syncing(tx)) + (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); } static int @@ -441,6 +453,23 @@ dmu_tx_count_dnode(txh); } +void +dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + ASSERT(len <= DMU_MAX_ACCESS); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); + if (txh == NULL) + return; + + dmu_tx_count_write(txh, off, len); + dmu_tx_count_dnode(txh); +} + static void dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { @@ -636,20 +665,17 @@ txh->txh_space_tofree = txh->txh_space_tounref = 1024 * 1024 * 1024; } -void -dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) +static void +dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { - dmu_tx_hold_t *txh; + dmu_tx_t *tx; dnode_t *dn; int err; zio_t *zio; + tx = txh->txh_tx; ASSERT(tx->tx_txg == 0); - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_FREE, off, len); - if (txh == NULL) - return; dn = txh->txh_dnode; dmu_tx_count_dnode(txh); @@ -731,9 +757,32 @@ } void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) +dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) +{ + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_FREE, off, len); + if (txh == NULL) + return; + (void) dmu_tx_hold_free_impl(txh, off, len); +} + +void +dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); + if (txh == NULL) + return; + (void) dmu_tx_hold_free_impl(txh, off, len); +} + +static void +dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, int add, const char *name) +{ + dmu_tx_t *tx = txh->txh_tx; dnode_t *dn; dsl_dataset_phys_t *ds_phys; uint64_t nblocks; @@ -741,10 +790,6 @@ ASSERT(tx->tx_txg == 0); - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_ZAP, add, (uintptr_t)name); - if (txh == NULL) - return; dn = txh->txh_dnode; dmu_tx_count_dnode(txh); @@ -755,7 +800,7 @@ * block. So there will be at most 2 blocks total, * including the header block. 
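The new dmu_tx_hold_write_by_dnode() (and the other *_by_dnode holds added below) lets a caller that already holds a dnode_t skip the object-number lookup performed by dmu_tx_hold_object_impl(). A hedged usage sketch, assuming os, dn, off and len come from the surrounding caller:

	dmu_tx_t *tx = dmu_tx_create(os);

	/* dn was obtained earlier via dnode_hold(); no second lookup occurs */
	dmu_tx_hold_write_by_dnode(tx, dn, off, len);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0)
		dmu_tx_abort(tx);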
*/ - dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); + dmu_tx_count_write(txh, 0, 2ULL << fzap_default_block_shift); return; } @@ -818,6 +863,34 @@ } void +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_ZAP, add, (uintptr_t)name); + if (txh == NULL) + return; + dmu_tx_hold_zap_impl(txh, add, name); +} + +void +dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + ASSERT(dn != NULL); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); + if (txh == NULL) + return; + dmu_tx_hold_zap_impl(txh, add, name); +} + +void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; @@ -831,6 +904,18 @@ } void +dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); + if (txh) + dmu_tx_count_dnode(txh); +} + +void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; @@ -1249,7 +1334,8 @@ * Walk the transaction's hold list, removing the hold on the * associated dnode, and notifying waiters if the refcount drops to 0. */ - for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; + for (txh = list_head(&tx->tx_holds); + txh && txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -1703,9 +1789,13 @@ #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); +EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_free); +EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_zap); +EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_bonus); +EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode); EXPORT_SYMBOL(dmu_tx_abort); EXPORT_SYMBOL(dmu_tx_assign); EXPORT_SYMBOL(dmu_tx_wait); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dmu_zfetch.c zfs-linux-0.7.0-rc3/module/zfs/dmu_zfetch.c --- zfs-linux-0.7.0-rc2/module/zfs/dmu_zfetch.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dmu_zfetch.c 2017-01-20 18:18:28.000000000 +0000 @@ -336,6 +336,7 @@ } #if defined(_KERNEL) && defined(HAVE_SPL) +/* BEGIN CSTYLED */ module_param(zfs_prefetch_disable, int, 0644); MODULE_PARM_DESC(zfs_prefetch_disable, "Disable all ZFS prefetching"); @@ -351,4 +352,5 @@ module_param(zfetch_array_rd_sz, ulong, 0644); MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read"); +/* END CSTYLED */ #endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dnode.c zfs-linux-0.7.0-rc3/module/zfs/dnode.c --- zfs-linux-0.7.0-rc2/module/zfs/dnode.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dnode.c 2017-01-20 18:18:28.000000000 +0000 @@ -632,7 +632,7 @@ (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS; @@ -1184,6 +1184,7 @@ * errors: * EINVAL - invalid object number. * ENOSPC - hole too small to fulfill "slots" request + * ENOENT - the requested dnode is not allocated * EIO - i/o error. * succeeds even for free dnodes. 
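The 2ULL widening above (and the 1ULL << epbs fix in dnode_sync.c below) matters because the left operand of << is otherwise a 32-bit int, which overflows once the shift count reaches 31. A standalone illustration, not taken from the patch:

	int shift = 36;
	uint64_t bad = 1 << shift;	/* int arithmetic: undefined behavior */
	uint64_t good = 1ULL << shift;	/* 64-bit arithmetic: 64GB as intended */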
*/ diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dnode_sync.c zfs-linux-0.7.0-rc3/module/zfs/dnode_sync.c --- zfs-linux-0.7.0-rc2/module/zfs/dnode_sync.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dnode_sync.c 2017-01-20 18:18:28.000000000 +0000 @@ -292,7 +292,7 @@ } /* If this whole block is free, free ourself too. */ - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { + for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } @@ -539,7 +539,7 @@ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); /* * Now that we've released our hold, the dnode may - * be evicted, so we musn't access it. + * be evicted, so we mustn't access it. */ } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dsl_dataset.c zfs-linux-0.7.0-rc3/module/zfs/dsl_dataset.c --- zfs-linux-0.7.0-rc2/module/zfs/dsl_dataset.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dsl_dataset.c 2017-01-20 18:18:28.000000000 +0000 @@ -83,7 +83,7 @@ extern int spa_asize_inflation; /* - * Figure out how much of this delta should be propogated to the dsl_dir + * Figure out how much of this delta should be propagated to the dsl_dir * layer. If there's a refreservation, that space has already been * partially accounted for in our ancestors. */ @@ -705,7 +705,11 @@ int len; VERIFY0(dsl_dataset_get_snapname(ds)); mutex_enter(&ds->ds_lock); - len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname); + len = strlen(ds->ds_snapname); + /* add '@' if ds is a snap */ + if (len > 0) + len++; + len += dsl_dir_namelen(ds->ds_dir); mutex_exit(&ds->ds_lock); return (len); } @@ -1022,19 +1026,6 @@ return (&dsl_dataset_phys(ds)->ds_bp); } -void -dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - /* If it's the meta-objset, set dp_meta_rootbp */ - if (ds == NULL) { - tx->tx_pool->dp_meta_rootbp = *bp; - } else { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_bp = *bp; - } -} - spa_t * dsl_dataset_get_spa(dsl_dataset_t *ds) { @@ -2133,8 +2124,7 @@ * only one long hold on the dataset. We're not allowed to change anything here * so we don't permanently release the long hold or regular hold here. We want * to do this only when syncing to avoid the dataset unexpectedly going away - * when we release the long hold. Allow a long hold to exist for volumes, this - * may occur when asynchronously registering the minor with the kernel. + * when we release the long hold. 
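The dsl_dataset_namelen() hunk above now counts the '@' separator only when the dataset actually has a snapshot component; the old code unconditionally added 1. Worked out for two cases:

	/*
	 * "pool/fs@snap": strlen("snap") = 4 > 0, so +1 for '@'
	 *     dsl_dir_namelen("pool/fs") + 1 + 4 = 7 + 1 + 4 = 12
	 * "pool/fs":      strlen("") = 0, so no '@' is counted
	 *     dsl_dir_namelen("pool/fs") + 0 = 7
	 */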
*/ static int dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) @@ -2149,7 +2139,7 @@ dsl_dataset_long_rele(ds, owner); } - held = (dsl_dataset_long_held(ds) && (ds->ds_owner != zvol_tag)); + held = dsl_dataset_long_held(ds); if (owner != NULL) dsl_dataset_long_hold(ds, owner); @@ -3546,7 +3536,7 @@ */ boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, - uint64_t earlier_txg) + uint64_t earlier_txg) { dsl_pool_t *dp = later->ds_dir->dd_pool; int error; @@ -3630,7 +3620,6 @@ EXPORT_SYMBOL(dsl_dataset_user_release); EXPORT_SYMBOL(dsl_dataset_get_holds); EXPORT_SYMBOL(dsl_dataset_get_blkptr); -EXPORT_SYMBOL(dsl_dataset_set_blkptr); EXPORT_SYMBOL(dsl_dataset_get_spa); EXPORT_SYMBOL(dsl_dataset_modified_since_snap); EXPORT_SYMBOL(dsl_dataset_space_written); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dsl_pool.c zfs-linux-0.7.0-rc3/module/zfs/dsl_pool.c --- zfs-linux-0.7.0-rc2/module/zfs/dsl_pool.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dsl_pool.c 2017-01-20 18:18:28.000000000 +0000 @@ -1087,6 +1087,7 @@ EXPORT_SYMBOL(dsl_pool_config_enter); EXPORT_SYMBOL(dsl_pool_config_exit); +/* BEGIN CSTYLED */ /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ module_param(zfs_dirty_data_max_percent, int, 0444); MODULE_PARM_DESC(zfs_dirty_data_max_percent, "percent of ram can be dirty"); @@ -1112,4 +1113,5 @@ module_param(zfs_delay_scale, ulong, 0644); MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity"); +/* END CSTYLED */ #endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dsl_prop.c zfs-linux-0.7.0-rc3/module/zfs/dsl_prop.c --- zfs-linux-0.7.0-rc2/module/zfs/dsl_prop.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dsl_prop.c 2017-01-20 18:18:28.000000000 +0000 @@ -892,11 +892,15 @@ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { nvpair_t *pair = elem; + const char *name = nvpair_name(pair); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* - * dsl_prop_get_all_impl() returns properties in this - * format. + * This usually happens when we reuse the nvlist_t data + * returned by the counterpart dsl_prop_get_all_impl(). + * For instance we do this to restore the original + * received properties when an error occurs in the + * zfs_ioc_recv() codepath. */ nvlist_t *attrs = fnvpair_value_nvlist(pair); pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); @@ -904,14 +908,14 @@ if (nvpair_type(pair) == DATA_TYPE_STRING) { const char *value = fnvpair_value_string(pair); - dsl_prop_set_sync_impl(ds, nvpair_name(pair), + dsl_prop_set_sync_impl(ds, name, source, 1, strlen(value) + 1, value, tx); } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { uint64_t intval = fnvpair_value_uint64(pair); - dsl_prop_set_sync_impl(ds, nvpair_name(pair), + dsl_prop_set_sync_impl(ds, name, source, sizeof (intval), 1, &intval, tx); } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { - dsl_prop_set_sync_impl(ds, nvpair_name(pair), + dsl_prop_set_sync_impl(ds, name, source, 0, 0, NULL, tx); } else { panic("invalid nvpair type"); @@ -1127,6 +1131,10 @@ break; } out: + if (err) { + nvlist_free(*nvp); + *nvp = NULL; + } return (err); } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/dsl_scan.c zfs-linux-0.7.0-rc3/module/zfs/dsl_scan.c --- zfs-linux-0.7.0-rc2/module/zfs/dsl_scan.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/dsl_scan.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2016 Gary Mills */ @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef _KERNEL #include #endif @@ -72,7 +73,7 @@ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ /* max number of blocks to free in a single TXG */ -ulong zfs_free_max_blocks = 100000; +unsigned long zfs_free_max_blocks = 100000; #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ @@ -833,7 +834,7 @@ goto out; /* - * If dsl_scan_ddt() has aready visited this block, it will have + * If dsl_scan_ddt() has already visited this block, it will have * already done any translations or scrubbing, so don't call the * callback again. */ @@ -1820,7 +1821,7 @@ { spa_t *spa = zio->io_spa; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1904,7 +1905,6 @@ if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; - void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) @@ -1919,9 +1919,9 @@ if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) delay(scan_delay); - zio_nowait(zio_read(NULL, spa, bp, data, size, - dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, - zio_flags, zb)); + zio_nowait(zio_read(NULL, spa, bp, + abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, + NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* do not relocate this block */ @@ -1985,6 +1985,7 @@ module_param(zfs_no_scrub_prefetch, int, 0644); MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching"); +/* CSTYLED */ module_param(zfs_free_max_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_free_max_blocks, "Max number of blocks freed in one txg"); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/edonr_zfs.c zfs-linux-0.7.0-rc3/module/zfs/edonr_zfs.c --- zfs-linux-0.7.0-rc2/module/zfs/edonr_zfs.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/edonr_zfs.c 2017-01-20 18:18:28.000000000 +0000 @@ -22,20 +22,32 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ #include #include #include #include /* For CTASSERT() */ +#include #define EDONR_MODE 512 #define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE +static int +edonr_incremental(void *buf, size_t size, void *arg) +{ + EdonRState *ctx = arg; + EdonRUpdate(ctx, buf, size * 8); + return (0); +} + /* * Native zio_checksum interface for the Edon-R hash function. */ /*ARGSUSED*/ void -zio_checksum_edonr_native(const void *buf, uint64_t size, +abd_checksum_edonr_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { uint8_t digest[EDONR_MODE / 8]; @@ -43,7 +55,7 @@ ASSERT(ctx_template != NULL); bcopy(ctx_template, &ctx, sizeof (ctx)); - EdonRUpdate(&ctx, buf, size * 8); + (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx); EdonRFinal(&ctx, digest); bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); } @@ -52,12 +64,12 @@ * Byteswapped zio_checksum interface for the Edon-R hash function. 
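edonr_incremental() above is the canonical shape of an abd_iterate_func() callback: it is invoked once per linear chunk of the scatter/gather buffer and returns 0 to keep iterating. The same pattern for a deliberately trivial XOR checksum; xor_chunk is illustrative only, not part of ZFS:

	static int
	xor_chunk(void *buf, size_t size, void *arg)
	{
		uint8_t *acc = arg;
		uint8_t *p = buf;
		size_t i;

		for (i = 0; i < size; i++)
			*acc ^= p[i];	/* fold this chunk into the accumulator */
		return (0);		/* nonzero would abort the walk */
	}

	/* called as: (void) abd_iterate_func(abd, 0, size, xor_chunk, &acc); */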
*/ void -zio_checksum_edonr_byteswap(const void *buf, uint64_t size, +abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - zio_checksum_edonr_native(buf, size, ctx_template, &tmp); + abd_checksum_edonr_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]); zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]); zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]); @@ -65,7 +77,7 @@ } void * -zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) +abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) { EdonRState *ctx; uint8_t salt_block[EDONR_BLOCK_SIZE]; @@ -94,7 +106,7 @@ } void -zio_checksum_edonr_tmpl_free(void *ctx_template) +abd_checksum_edonr_tmpl_free(void *ctx_template) { EdonRState *ctx = ctx_template; diff -Nru zfs-linux-0.7.0-rc2/module/zfs/fm.c zfs-linux-0.7.0-rc3/module/zfs/fm.c --- zfs-linux-0.7.0-rc2/module/zfs/fm.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/fm.c 2017-01-20 18:18:28.000000000 +0000 @@ -431,7 +431,7 @@ ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP); list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t), - offsetof(zfs_zevent_t, ze_node)); + offsetof(zfs_zevent_t, ze_node)); list_link_init(&ev->ev_node); return (ev); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/gzip.c zfs-linux-0.7.0-rc3/module/zfs/gzip.c --- zfs-linux-0.7.0-rc2/module/zfs/gzip.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/gzip.c 2017-01-20 18:18:28.000000000 +0000 @@ -64,7 +64,7 @@ return (s_len); } - return ((size_t) dstlen); + return ((size_t)dstlen); } /*ARGSUSED*/ diff -Nru zfs-linux-0.7.0-rc2/module/zfs/lz4.c zfs-linux-0.7.0-rc3/module/zfs/lz4.c --- zfs-linux-0.7.0-rc2/module/zfs/lz4.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/lz4.c 2017-01-20 18:18:28.000000000 +0000 @@ -63,7 +63,7 @@ return (s_len); /* - * Encode the compresed buffer size at the start. We'll need this in + * Encode the compressed buffer size at the start. We'll need this in * decompression to counter the effects of padding which might be * added to the compressed buffer and which, if unhandled, would * confuse the hell out of our decompression function. @@ -205,7 +205,7 @@ /* * Little Endian or Big Endian? - * Note: overwrite the below #define if you know your architecture endianess. + * Note: overwrite the below #define if you know your architecture endianness. 
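As the lz4.c comment above explains, the compressor prefixes its output with the compressed payload length so the decompressor can ignore trailing padding. A sketch of the resulting buffer layout (the 4-byte width of the length word is this implementation's convention, stated here as an assumption):

	/*
	 * +---------------------+------------------------------+
	 * | 4-byte payload size | LZ4-compressed data ...      |
	 * +---------------------+------------------------------+
	 *
	 * decompression reads the stored size and ignores anything
	 * past (4 + size), which is where padding may live
	 */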
*/ #if defined(_BIG_ENDIAN) #define LZ4_BIG_ENDIAN 1 @@ -1006,7 +1006,7 @@ lz4_init(void) { lz4_cache = kmem_cache_create("lz4_cache", - sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0); + sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0); } void diff -Nru zfs-linux-0.7.0-rc2/module/zfs/Makefile.in zfs-linux-0.7.0-rc3/module/zfs/Makefile.in --- zfs-linux-0.7.0-rc2/module/zfs/Makefile.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/Makefile.in 2017-01-20 18:18:28.000000000 +0000 @@ -7,6 +7,7 @@ obj-$(CONFIG_ZFS) := $(MODULE).o +$(MODULE)-objs += abd.o $(MODULE)-objs += arc.o $(MODULE)-objs += blkptr.o $(MODULE)-objs += bplist.o @@ -117,6 +118,8 @@ $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o $(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o +$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o +$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o $(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o $(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o diff -Nru zfs-linux-0.7.0-rc2/module/zfs/metaslab.c zfs-linux-0.7.0-rc3/module/zfs/metaslab.c --- zfs-linux-0.7.0-rc2/module/zfs/metaslab.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/metaslab.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ @@ -39,11 +39,6 @@ #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) -#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) -#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) -#define METASLAB_ACTIVE_MASK \ - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) - /* * Metaslab granularity, in bytes. This is roughly similar to what would be * referred to as the "stripe size" in traditional RAID arrays. In normal @@ -57,7 +52,7 @@ /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core - * space_map representation must be before we compact it on-disk. + * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ int zfs_condense_pct = 200; @@ -125,12 +120,12 @@ * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ -uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; +uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. - * Once the space_map's free space drops below this level we dynamically + * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ int metaslab_df_free_pct = 4; @@ -172,7 +167,45 @@ */ int metaslab_bias_enabled = B_TRUE; -static uint64_t metaslab_fragmentation(metaslab_t *); + +/* + * Enable/disable segment-based metaslab selection. + */ +int zfs_metaslab_segment_weight_enabled = B_TRUE; + +/* + * When using segment-based metaslab selection, we will continue + * allocating from the active metaslab until we have exhausted + * zfs_metaslab_switch_threshold of its buckets. 
+ */
+int zfs_metaslab_switch_threshold = 2;
+
+/*
+ * Internal switch to enable/disable the metaslab allocation tracing
+ * facility.
+ */
+#ifdef _METASLAB_TRACING
+boolean_t metaslab_trace_enabled = B_TRUE;
+#endif
+
+/*
+ * Maximum entries that the metaslab allocation tracing facility will keep
+ * in a given list when running in non-debug mode. We limit the number
+ * of entries in non-debug mode to prevent us from using up too much memory.
+ * The limit should be sufficiently large that we don't expect any allocation
+ * to ever exceed this value. In debug mode, the system will panic if this
+ * limit is ever reached allowing for further investigation.
+ */
+#ifdef _METASLAB_TRACING
+uint64_t metaslab_trace_max_entries = 5000;
+#endif
+
+static uint64_t metaslab_weight(metaslab_t *);
+static void metaslab_set_fragmentation(metaslab_t *);
+
+#ifdef _METASLAB_TRACING
+kmem_cache_t *metaslab_alloc_trace_cache;
+#endif

 /*
  * ==========================================================================
@@ -386,11 +419,6 @@
 	return (space);
 }

-/*
- * ==========================================================================
- * Metaslab groups
- * ==========================================================================
- */
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
@@ -407,6 +435,58 @@
 }

 /*
+ * Verify that the space accounting on disk matches the in-core range_trees.
+ */
+void
+metaslab_verify_space(metaslab_t *msp, uint64_t txg)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	uint64_t allocated = 0;
+	uint64_t freed = 0;
+	uint64_t sm_free_space, msp_free_space;
+	int t;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+		return;
+
+	/*
+	 * We can only verify the metaslab space when we're called
+	 * from syncing context with a loaded metaslab that has an allocated
+	 * space map. Calling this in non-syncing context does not
+	 * provide a consistent view of the metaslab since we're performing
+	 * allocations in the future.
+	 */
+	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
+	    !msp->ms_loaded)
+		return;
+
+	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
+	    space_map_alloc_delta(msp->ms_sm);
+
+	/*
+	 * Account for future allocations since we would have already
+	 * deducted that space from the ms_freetree.
+	 */
+	for (t = 0; t < TXG_CONCURRENT_STATES; t++) {
+		allocated +=
+		    range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
+	}
+	freed = range_tree_space(msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]);
+
+	msp_free_space = range_tree_space(msp->ms_tree) + allocated +
+	    msp->ms_deferspace + freed;
+
+	VERIFY3U(sm_free_space, ==, msp_free_space);
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+/*
 * Update the allocatable flag and the metaslab group's capacity.
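metaslab_verify_space() above asserts a conservation identity between the on-disk space map and the in-core trees; written out, with every term taken from the function:

	/*
	 * sm_free_space  = ms_size - space_map_allocated(ms_sm)
	 *                          - space_map_alloc_delta(ms_sm)
	 * msp_free_space = range_tree_space(ms_tree)
	 *                + ms_alloctree[(txg+t) & TXG_MASK] summed over
	 *                  t in [0, TXG_CONCURRENT_STATES)
	 *                + ms_deferspace
	 *                + range_tree_space(ms_freetree[TXG_CLEAN(txg)])
	 *
	 * VERIFY3U(sm_free_space, ==, msp_free_space);
	 */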
* The allocatable flag is set to true if the capacity is below * the zfs_mg_noalloc_threshold or has a fragmentation value that is @@ -974,7 +1054,7 @@ /* * ========================================================================== - * Metaslab block operations + * Common allocator routines * ========================================================================== */ @@ -993,31 +1073,22 @@ return (rs->rs_end - rs->rs_start); } -uint64_t -metaslab_block_alloc(metaslab_t *msp, uint64_t size) +static range_seg_t * +metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) { - uint64_t start; - range_tree_t *rt = msp->ms_tree; - - VERIFY(!msp->ms_condensing); + range_seg_t *rs, rsearch; + avl_index_t where; - start = msp->ms_ops->msop_alloc(msp, size); - if (start != -1ULL) { - vdev_t *vd = msp->ms_group->mg_vd; + rsearch.rs_start = start; + rsearch.rs_end = start + size; - VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); - range_tree_remove(rt, start, size); + rs = avl_find(t, &rsearch, &where); + if (rs == NULL) { + rs = avl_nearest(t, where, AVL_AFTER); } - return (start); -} -/* - * ========================================================================== - * Common allocator routines - * ========================================================================== - */ + return (rs); +} #if defined(WITH_FF_BLOCK_ALLOCATOR) || \ defined(WITH_DF_BLOCK_ALLOCATOR) || \ @@ -1031,15 +1102,7 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, uint64_t align) { - range_seg_t *rs, rsearch; - avl_index_t where; - - rsearch.rs_start = *cursor; - rsearch.rs_end = *cursor + size; - - rs = avl_find(t, &rsearch, &where); - if (rs == NULL) - rs = avl_nearest(t, where, AVL_AFTER); + range_seg_t *rs = metaslab_block_find(t, *cursor, size); while (rs != NULL) { uint64_t offset = P2ROUNDUP(rs->rs_start, align); @@ -1281,6 +1344,7 @@ { int error = 0; int t; + boolean_t success = B_FALSE; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!msp->ms_loaded); @@ -1298,14 +1362,18 @@ else range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); - msp->ms_loaded = (error == 0); + success = (error == 0); msp->ms_loading = B_FALSE; - if (msp->ms_loaded) { + if (success) { + ASSERT3P(msp->ms_group, !=, NULL); + msp->ms_loaded = B_TRUE; + for (t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defertree[t], range_tree_remove, msp->ms_tree); } + msp->ms_max_size = metaslab_block_maxsize(msp); } cv_broadcast(&msp->ms_load_cv); return (error); @@ -1318,6 +1386,7 @@ range_tree_vacate(msp->ms_tree, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; + msp->ms_max_size = 0; } int @@ -1362,21 +1431,23 @@ ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); metaslab_group_add(mg, ms); - ms->ms_fragmentation = metaslab_fragmentation(ms); - ms->ms_ops = mg->mg_class->mc_ops; + metaslab_set_fragmentation(ms); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. + * The metaslab's weight will also be initialized when we sync + * out this txg. This ensures that we don't attempt to allocate + * from it before we have initialized it completely. 
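metaslab_block_find() above uses the standard AVL idiom for "first segment at or after this offset": try an exact avl_find(), then fall back to the in-order successor with avl_nearest(..., AVL_AFTER). The idiom in isolation; t, cursor and size are assumed to come from the caller:

	range_seg_t rsearch, *rs;
	avl_index_t where;

	rsearch.rs_start = cursor;
	rsearch.rs_end = cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL)		/* no segment starts exactly at cursor ... */
		rs = avl_nearest(t, where, AVL_AFTER);	/* ... take the next one */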
 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(ms, 0);

	/*
	 * If metaslab_debug_load is set and we're initializing a metaslab
-	 * that has an allocated space_map object then load the its space
+	 * that has an allocated space map object then load its space
	 * map so that we can verify frees.
	 */
	if (metaslab_debug_load && ms->ms_sm != NULL) {
@@ -1405,7 +1476,6 @@
 	metaslab_group_remove(mg, msp);
 	mutex_enter(&msp->ms_lock);
-	VERIFY(msp->ms_group == NULL);
 	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
 	    0, -msp->ms_size);
@@ -1478,8 +1548,8 @@
  * not support this metric. Otherwise, the return value should be in the
  * range [0, 100].
  */
-static uint64_t
-metaslab_fragmentation(metaslab_t *msp)
+static void
+metaslab_set_fragmentation(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	uint64_t fragmentation = 0;
@@ -1488,18 +1558,22 @@
 	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
 	int i;
-	if (!feature_enabled)
-		return (ZFS_FRAG_INVALID);
+	if (!feature_enabled) {
+		msp->ms_fragmentation = ZFS_FRAG_INVALID;
+		return;
+	}
 	/*
 	 * A null space map means that the entire metaslab is free
 	 * and thus is not fragmented.
 	 */
-	if (msp->ms_sm == NULL)
-		return (0);
+	if (msp->ms_sm == NULL) {
+		msp->ms_fragmentation = 0;
+		return;
+	}
 	/*
-	 * If this metaslab's space_map has not been upgraded, flag it
+	 * If this metaslab's space map has not been upgraded, flag it
 	 * so that we upgrade next time we encounter it.
 	 */
 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
@@ -1513,12 +1587,14 @@
 			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
 			    "msp %p, vd %p", txg, msp, vd);
 		}
-		return (ZFS_FRAG_INVALID);
+		msp->ms_fragmentation = ZFS_FRAG_INVALID;
+		return;
 	}
 	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		uint64_t space = 0;
 		uint8_t shift = msp->ms_sm->sm_shift;
+
 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
 		    FRAGMENTATION_TABLE_SIZE - 1);
@@ -1535,7 +1611,8 @@
 	if (total > 0)
 		fragmentation /= total;
 	ASSERT3U(fragmentation, <=, 100);
-	return (fragmentation);
+
+	msp->ms_fragmentation = fragmentation;
 }
 /*
@@ -1544,30 +1621,20 @@
  * the LBA range, and whether the metaslab is loaded.
  */
 static uint64_t
-metaslab_weight(metaslab_t *msp)
+metaslab_space_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-
-	/*
-	 * This vdev is in the process of being removed so there is nothing
-	 * for us to do here.
-	 */
-	if (vd->vdev_removing) {
-		ASSERT0(space_map_allocated(msp->ms_sm));
-		ASSERT0(vd->vdev_ms_shift);
-		return (0);
-	}
+	ASSERT(!vd->vdev_removing);
 	/*
 	 * The baseline weight is the metaslab's free space.
 	 */
 	space = msp->ms_size - space_map_allocated(msp->ms_sm);
-	msp->ms_fragmentation = metaslab_fragmentation(msp);
 	if (metaslab_fragmentation_factor_enabled &&
 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
 		/*
@@ -1616,6 +1683,210 @@
 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 	}
+	WEIGHT_SET_SPACEBASED(weight);
+	return (weight);
+}
+
+/*
+ * Return the weight of the specified metaslab, according to the segment-based
+ * weighting algorithm. The metaslab must be loaded. This function can
+ * be called within a sync pass since it relies only on the metaslab's
+ * range tree which is always accurate when the metaslab is loaded.
+ */
+static uint64_t
+metaslab_weight_from_range_tree(metaslab_t *msp)
+{
+	uint64_t weight = 0;
+	uint32_t segments = 0;
+	int i;
+
+	ASSERT(msp->ms_loaded);
+
+	for (i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; i--) {
+		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
+		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+		segments <<= 1;
+		segments += msp->ms_tree->rt_histogram[i];
+
+		/*
+		 * The range tree provides more precision than the space map
+		 * and must be downgraded so that all values fit within the
+		 * space map's histogram. This allows us to compare loaded
+		 * vs. unloaded metaslabs to determine which metaslab is
+		 * considered "best".
+		 */
+		if (i > max_idx)
+			continue;
+
+		if (segments != 0) {
+			WEIGHT_SET_COUNT(weight, segments);
+			WEIGHT_SET_INDEX(weight, i);
+			WEIGHT_SET_ACTIVE(weight, 0);
+			break;
+		}
+	}
+	return (weight);
+}
+
+/*
+ * Calculate the weight based on the on-disk histogram. This should only
+ * be called after a sync pass has completely finished since the on-disk
+ * information is updated in metaslab_sync().
+ */
+static uint64_t
+metaslab_weight_from_spacemap(metaslab_t *msp)
+{
+	uint64_t weight = 0;
+	int i;
+
+	for (i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
+		if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
+			WEIGHT_SET_COUNT(weight,
+			    msp->ms_sm->sm_phys->smp_histogram[i]);
+			WEIGHT_SET_INDEX(weight, i +
+			    msp->ms_sm->sm_shift);
+			WEIGHT_SET_ACTIVE(weight, 0);
+			break;
+		}
+	}
+	return (weight);
+}
+
+/*
+ * Compute a segment-based weight for the specified metaslab. The weight
+ * is determined by the highest bucket in the histogram. The information
+ * for the highest bucket is encoded into the weight value.
+ */
+static uint64_t
+metaslab_segment_weight(metaslab_t *msp)
+{
+	metaslab_group_t *mg = msp->ms_group;
+	uint64_t weight = 0;
+	uint8_t shift = mg->mg_vd->vdev_ashift;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * The metaslab is completely free.
+	 */
+	if (space_map_allocated(msp->ms_sm) == 0) {
+		int idx = highbit64(msp->ms_size) - 1;
+		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+		if (idx < max_idx) {
+			WEIGHT_SET_COUNT(weight, 1ULL);
+			WEIGHT_SET_INDEX(weight, idx);
+		} else {
+			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
+			WEIGHT_SET_INDEX(weight, max_idx);
+		}
+		WEIGHT_SET_ACTIVE(weight, 0);
+		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
+
+		return (weight);
+	}
+
+	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+	/*
+	 * If the metaslab is fully allocated then just make the weight 0.
+	 */
+	if (space_map_allocated(msp->ms_sm) == msp->ms_size)
+		return (0);
+	/*
+	 * If the metaslab is already loaded, then use the range tree to
+	 * determine the weight. Otherwise, we rely on the space map information
+	 * to generate the weight.
+	 */
+	if (msp->ms_loaded) {
+		weight = metaslab_weight_from_range_tree(msp);
+	} else {
+		weight = metaslab_weight_from_spacemap(msp);
+	}
+
+	/*
+	 * If the metaslab was active the last time we calculated its weight
+	 * then keep it active. We want to consume the entire region that
+	 * is associated with this weight.
+	 */
+	if (msp->ms_activation_weight != 0 && weight != 0)
+		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
+	return (weight);
+}
+
+/*
+ * Determine if we should attempt to allocate from this metaslab. If the
+ * metaslab has a maximum size then we can quickly determine if the desired
+ * allocation size can be satisfied. Otherwise, if we're using segment-based
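The WEIGHT_SET_* macros used above pack the segment weight into a single uint64_t: an index naming the power-of-two bucket of the largest free segments, a count of segments in that bucket, and the active bits. A hedged sketch of building such a weight for 100 free segments of 128K-256K (bucket index 17):

	uint64_t w = 0;

	WEIGHT_SET_COUNT(w, 100);	/* 100 segments ...            */
	WEIGHT_SET_INDEX(w, 17);	/* ... each in [2^17, 2^18)    */
	WEIGHT_SET_ACTIVE(w, 0);	/* not currently activated     */
	ASSERT(!WEIGHT_IS_SPACEBASED(w));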
+ * weighting then we can determine the maximum allocation that this metaslab
+ * can accommodate based on the index encoded in the weight. If we're using
+ * space-based weights then we rely on the entire weight (excluding the weight
+ * type bit).
+ */
+boolean_t
+metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
+{
+	boolean_t should_allocate;
+
+	if (msp->ms_max_size != 0)
+		return (msp->ms_max_size >= asize);
+
+	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+		/*
+		 * The metaslab segment weight indicates segments in the
+		 * range [2^i, 2^(i+1)), where i is the index in the weight.
+		 * Since the asize might be in the middle of the range, we
+		 * should attempt the allocation if asize < 2^(i+1).
+		 */
+		should_allocate = (asize <
+		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
+	} else {
+		should_allocate = (asize <=
+		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
+	}
+	return (should_allocate);
+}
+static uint64_t
+metaslab_weight(metaslab_t *msp)
+{
+	vdev_t *vd = msp->ms_group->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+	uint64_t weight;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * This vdev is in the process of being removed so there is nothing
+	 * for us to do here.
+	 */
+	if (vd->vdev_removing) {
+		ASSERT0(space_map_allocated(msp->ms_sm));
+		ASSERT0(vd->vdev_ms_shift);
+		return (0);
+	}
+
+	metaslab_set_fragmentation(msp);
+
+	/*
+	 * Update the maximum size if the metaslab is loaded. This will
+	 * ensure that we get an accurate maximum size if newly freed space
+	 * has been added back into the free tree.
+	 */
+	if (msp->ms_loaded)
+		msp->ms_max_size = metaslab_block_maxsize(msp);
+
+	/*
+	 * Segment-based weighting requires space map histogram support.
+	 */
+	if (zfs_metaslab_segment_weight_enabled &&
+	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
+	    sizeof (space_map_phys_t))) {
+		weight = metaslab_segment_weight(msp);
+	} else {
+		weight = metaslab_space_weight(msp);
+	}
 	return (weight);
 }
@@ -1634,6 +1905,7 @@
 		}
 	}
+	msp->ms_activation_weight = msp->ms_weight;
 	metaslab_group_sort(msp->ms_group, msp, msp->ms_weight |
 	    activation_weight);
 }
@@ -1644,18 +1916,58 @@
 }
 static void
-metaslab_passivate(metaslab_t *msp, uint64_t size)
+metaslab_passivate(metaslab_t *msp, uint64_t weight)
 {
+	ASSERTV(uint64_t size = weight & ~METASLAB_WEIGHT_TYPE);
+
 	/*
 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
 	 * this metaslab again. In that case, it had better be empty,
 	 * or we would be leaving space on the table.
 	 */
-	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
-	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
+	ASSERT(size >= SPA_MINBLOCKSIZE ||
+	    range_tree_space(msp->ms_tree) == 0);
+	ASSERT0(weight & METASLAB_ACTIVE_MASK);
+
+	msp->ms_activation_weight = 0;
+	metaslab_group_sort(msp->ms_group, msp, weight);
 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
+/*
+ * Segment-based metaslabs are activated once and remain active until
+ * we either fail an allocation attempt (similar to space-based metaslabs)
+ * or have exhausted the free space in zfs_metaslab_switch_threshold
+ * buckets since the metaslab was activated. This function checks to see
+ * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
+ * metaslab and passivates it proactively. This will allow us to select a
+ * metaslab with a larger contiguous region, if any, remaining within this
+ * metaslab group.
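To make the metaslab_should_allocate() rule concrete: a segment weight with index i promises free segments somewhere in [2^i, 2^(i+1)), so any request smaller than 2^(i+1) is worth attempting. For example:

	/*
	 * WEIGHT_GET_INDEX(ms_weight) == 17  ->  segments in [128K, 256K)
	 *
	 * asize = 192K: 192K < 256K (2^18)  -> attempt the allocation
	 * asize = 512K: 512K >= 256K        -> skip this metaslab
	 */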
If we're in sync pass > 1, then we continue using this
+ * metaslab so that we don't dirty more blocks and cause more sync passes.
+ */
+void
+metaslab_segment_may_passivate(metaslab_t *msp)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	uint64_t weight;
+	int activation_idx, current_idx;
+
+	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
+		return;
+
+	/*
+	 * Since we are in the middle of a sync pass, the most accurate
+	 * information that is accessible to us is the in-core range tree
+	 * histogram; calculate the new weight based on that information.
+	 */
+	weight = metaslab_weight_from_range_tree(msp);
+	activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
+	current_idx = WEIGHT_GET_INDEX(weight);
+
+	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
+		metaslab_passivate(msp, weight);
+}
+
 static void
 metaslab_preload(void *arg)
 {
@@ -1669,11 +1981,7 @@
 	metaslab_load_wait(msp);
 	if (!msp->ms_loaded)
 		(void) metaslab_load(msp);
-
-	/*
-	 * Set the ms_access_txg value so that we don't unload it right away.
-	 */
-	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
+	msp->ms_selected_txg = spa_syncing_txg(spa);
 	mutex_exit(&msp->ms_lock);
 	spl_fstrans_unmark(cookie);
 }
@@ -1695,10 +2003,7 @@
 	/*
 	 * Load the next potential metaslabs
 	 */
-	msp = avl_first(t);
-	while (msp != NULL) {
-		metaslab_t *msp_next = AVL_NEXT(t, msp);
-
+	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
 		/*
 		 * We preload only the maximum number of metaslabs specified
 		 * by metaslab_preload_limit. If a metaslab is being forced
 		 * that force condensing happens in the next txg.
 		 */
 		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
-			msp = msp_next;
 			continue;
 		}
-		/*
-		 * We must drop the metaslab group lock here to preserve
-		 * lock ordering with the ms_lock (when grabbing both
-		 * the mg_lock and the ms_lock, the ms_lock must be taken
-		 * first). As a result, it is possible that the ordering
-		 * of the metaslabs within the avl tree may change before
-		 * we reacquire the lock. The metaslab cannot be removed from
-		 * the tree while we're in syncing context so it is safe to
-		 * drop the mg_lock here. If the metaslabs are reordered
-		 * nothing will break -- we just may end up loading a
-		 * less than optimal one.
-		 */
-		mutex_exit(&mg->mg_lock);
 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
-		    msp, TQ_SLEEP) != 0);
-		mutex_enter(&mg->mg_lock);
-		msp = msp_next;
+		    msp, TQ_SLEEP) != TASKQID_INVALID);
 	}
 	mutex_exit(&mg->mg_lock);
 }
@@ -1769,7 +2058,7 @@
 	range_seg_t *rs;
 	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
 	dmu_object_info_t doi;
-	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
+	uint64_t vdev_blocksize = 1ULL << msp->ms_group->mg_vd->vdev_ashift;
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
@@ -1876,7 +2165,7 @@
 	mutex_enter(&msp->ms_lock);
 	/*
-	 * While we would ideally like to create a space_map representation
+	 * While we would ideally like to create a space map representation
 	 * that consists only of allocation records, doing so can be
 	 * prohibitively expensive because the in-core free tree can be
 	 * large, and therefore computationally expensive to subtract
@@ -1939,7 +2228,7 @@
 	 * metaslab_sync() is the metaslab's ms_tree. No other thread can
 	 * be modifying this txg's alloctree, freetree, freed_tree, or
	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
-	 * space_map ASSERTs. We drop it whenever we call into the DMU,
+	 * space map ASSERTs. We drop it whenever we call into the DMU,
 	 * because the DMU can call down to us (e.g. via zio_free()) at
 	 * any time.
 	 */
@@ -1961,8 +2250,8 @@
 	mutex_enter(&msp->ms_lock);
 	/*
-	 * Note: metaslab_condense() clears the space_map's histogram.
-	 * Therefore we muse verify and remove this histogram before
+	 * Note: metaslab_condense() clears the space map's histogram.
+	 * Therefore we must verify and remove this histogram before
 	 * condensing.
 	 */
 	metaslab_group_histogram_verify(mg);
@@ -1978,6 +2267,8 @@
 	}
 	if (msp->ms_loaded) {
+		int t;
+
 		/*
		 * When the space map is loaded, we have an accurate
 		 * histogram in the range tree. This gives us an opportunity
 		 */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
-	} else {
+
+		/*
+		 * Since we've cleared the histogram we need to add back
+		 * any free space that has already been processed, plus
+		 * any deferred space. This allows the on-disk histogram
+		 * to accurately reflect all free space even if some space
+		 * is not yet available for allocation (i.e. deferred).
+		 */
+		space_map_histogram_add(msp->ms_sm, *freed_tree, tx);
+
 		/*
-		 * Since the space map is not loaded we simply update the
-		 * exisiting histogram with what was freed in this txg. This
-		 * means that the on-disk histogram may not have an accurate
-		 * view of the free space but it's close enough to allow
-		 * us to make allocation decisions.
+		 * Add back any deferred free space that has not been
+		 * added back into the in-core free tree yet. This will
+		 * ensure that we don't end up with a space map histogram
+		 * that is completely empty unless the metaslab is fully
+		 * allocated.
 		 */
-		space_map_histogram_add(msp->ms_sm, *freetree, tx);
+		for (t = 0; t < TXG_DEFER_SIZE; t++) {
+			space_map_histogram_add(msp->ms_sm,
+			    msp->ms_defertree[t], tx);
+		}
 	}
+
+	/*
+	 * Always add the free space from this sync pass to the space
+	 * map histogram. We want to make sure that the on-disk histogram
+	 * accounts for all free space. If the space map is not loaded,
+	 * then we will lose some accuracy but will correct it the next
+	 * time we load the space map.
+ */ + space_map_histogram_add(msp->ms_sm, *freetree, tx); + metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); @@ -2014,6 +2327,7 @@ range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); mutex_exit(&msp->ms_lock); @@ -2035,9 +2349,12 @@ { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; range_tree_t **freed_tree; range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; + uint64_t free_space; + boolean_t defer_allowed = B_TRUE; int t; ASSERT(!vd->vdev_ishole); @@ -2073,9 +2390,20 @@ freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; + free_space = metaslab_class_get_space(spa_normal_class(spa)) - + metaslab_class_get_alloc(spa_normal_class(spa)); + if (free_space <= spa_get_slop_space(spa)) { + defer_allowed = B_FALSE; + } + + defer_delta = 0; alloc_delta = space_map_alloc_delta(msp->ms_sm); - defer_delta = range_tree_space(*freed_tree) - - range_tree_space(*defer_tree); + if (defer_allowed) { + defer_delta = range_tree_space(*freed_tree) - + range_tree_space(*defer_tree); + } else { + defer_delta -= range_tree_space(*defer_tree); + } vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); @@ -2096,7 +2424,12 @@ */ range_tree_vacate(*defer_tree, msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); - range_tree_swap(freed_tree, defer_tree); + if (defer_allowed) { + range_tree_swap(freed_tree, defer_tree); + } else { + range_tree_vacate(*freed_tree, + msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + } space_map_update(msp->ms_sm); @@ -2111,7 +2444,19 @@ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } - if (msp->ms_loaded && msp->ms_access_txg < txg) { + /* + * Calculate the new weights before unloading any metaslabs. + * This will give us the most accurate weighting. + */ + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + + /* + * If the metaslab is loaded and we've not tried to load or allocate + * from it in 'metaslab_unload_delay' txgs, then unload it. 
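The defer_allowed logic above stops deferring frees once the pool's unallocated space drops to the slop reserve, so freed blocks become allocatable immediately instead of after TXG_DEFER_SIZE txgs. In outline:

	/*
	 * free_space = class space - class alloc
	 *
	 * free_space >  slop: freed_tree and defer_tree swap; frees
	 *                     mature for TXG_DEFER_SIZE txgs as usual
	 * free_space <= slop: freed_tree is vacated straight back into
	 *                     ms_tree; defer_delta only shrinks
	 */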
+ */
+	if (msp->ms_loaded &&
+	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
+
 		for (t = 1; t < TXG_CONCURRENT_STATES; t++) {
 			VERIFY0(range_tree_space(
 			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
 		}
@@ -2121,7 +2466,6 @@
 			metaslab_unload(msp);
 	}
-	metaslab_group_sort(mg, msp, metaslab_weight(msp));
 	mutex_exit(&msp->ms_lock);
 }
@@ -2156,6 +2500,140 @@
 /*
  * ==========================================================================
+ * Metaslab allocation tracing facility
+ * ==========================================================================
+ */
+#ifdef _METASLAB_TRACING
+kstat_t *metaslab_trace_ksp;
+kstat_named_t metaslab_trace_over_limit;
+
+void
+metaslab_alloc_trace_init(void)
+{
+	ASSERT(metaslab_alloc_trace_cache == NULL);
+	metaslab_alloc_trace_cache = kmem_cache_create(
+	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
+	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
+	if (metaslab_trace_ksp != NULL) {
+		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
+		kstat_named_init(&metaslab_trace_over_limit,
+		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
+		kstat_install(metaslab_trace_ksp);
+	}
+}
+
+void
+metaslab_alloc_trace_fini(void)
+{
+	if (metaslab_trace_ksp != NULL) {
+		kstat_delete(metaslab_trace_ksp);
+		metaslab_trace_ksp = NULL;
+	}
+	kmem_cache_destroy(metaslab_alloc_trace_cache);
+	metaslab_alloc_trace_cache = NULL;
+}
+
+/*
+ * Add an allocation trace element to the allocation tracing list.
+ */
+static void
+metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
+    metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
+{
+	metaslab_alloc_trace_t *mat;
+
+	if (!metaslab_trace_enabled)
+		return;
+
+	/*
+	 * When the tracing list reaches its maximum we remove
+	 * the second element in the list before adding a new one.
+	 * By removing the second element we preserve the original
+	 * entry as a clue to what allocation steps have already been
+	 * performed.
+	 */
+	if (zal->zal_size == metaslab_trace_max_entries) {
+		metaslab_alloc_trace_t *mat_next;
+#ifdef DEBUG
+		panic("too many entries in allocation list");
+#endif
+		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
+		zal->zal_size--;
+		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
+		list_remove(&zal->zal_list, mat_next);
+		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
+	}
+
+	mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
+	list_link_init(&mat->mat_list_node);
+	mat->mat_mg = mg;
+	mat->mat_msp = msp;
+	mat->mat_size = psize;
+	mat->mat_dva_id = dva_id;
+	mat->mat_offset = offset;
+	mat->mat_weight = 0;
+
+	if (msp != NULL)
+		mat->mat_weight = msp->ms_weight;
+
+	/*
+	 * The list is part of the zio so locking is not required. Only
+	 * a single thread will perform allocations for a given zio.
+ */ + list_insert_tail(&zal->zal_list, mat); + zal->zal_size++; + + ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); +} + +void +metaslab_trace_init(zio_alloc_list_t *zal) +{ + list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), + offsetof(metaslab_alloc_trace_t, mat_list_node)); + zal->zal_size = 0; +} + +void +metaslab_trace_fini(zio_alloc_list_t *zal) +{ + metaslab_alloc_trace_t *mat; + + while ((mat = list_remove_head(&zal->zal_list)) != NULL) + kmem_cache_free(metaslab_alloc_trace_cache, mat); + list_destroy(&zal->zal_list); + zal->zal_size = 0; +} +#else + +#define metaslab_trace_add(zal, mg, msp, psize, id, off) + +void +metaslab_alloc_trace_init(void) +{ +} + +void +metaslab_alloc_trace_fini(void) +{ +} + +void +metaslab_trace_init(zio_alloc_list_t *zal) +{ +} + +void +metaslab_trace_fini(zio_alloc_list_t *zal) +{ +} + +#endif /* _METASLAB_TRACING */ + +/* + * ========================================================================== * Metaslab block operations * ========================================================================== */ @@ -2209,13 +2687,49 @@ } static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, - uint64_t txg, uint64_t min_distance, dva_t *dva, int d) +metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) +{ + uint64_t start; + range_tree_t *rt = msp->ms_tree; + metaslab_class_t *mc = msp->ms_group->mg_class; + + VERIFY(!msp->ms_condensing); + + start = mc->mc_ops->msop_alloc(msp, size); + if (start != -1ULL) { + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + + VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); + range_tree_remove(rt, start, size); + + if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + + range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size); + + /* Track the last successful allocation */ + msp->ms_alloc_txg = txg; + metaslab_verify_space(msp, txg); + } + + /* + * Now that we've attempted the allocation we need to update the + * metaslab's maximum block size since it may have changed. + */ + msp->ms_max_size = metaslab_block_maxsize(msp); + return (start); +} + +static uint64_t +metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) { - spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp = NULL; + metaslab_t *search; uint64_t offset = -1ULL; - avl_tree_t *t = &mg->mg_metaslab_tree; uint64_t activation_weight; uint64_t target_distance; int i; @@ -2228,20 +2742,39 @@ } } + search = kmem_alloc(sizeof (*search), KM_SLEEP); + search->ms_weight = UINT64_MAX; + search->ms_start = 0; for (;;) { boolean_t was_active; + avl_tree_t *t = &mg->mg_metaslab_tree; + avl_index_t idx; mutex_enter(&mg->mg_lock); - for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { - if (msp->ms_weight < asize) { - spa_dbgmsg(spa, "%s: failed to meet weight " - "requirement: vdev %llu, txg %llu, mg %p, " - "msp %p, asize %llu, " - "weight %llu", spa_name(spa), - mg->mg_vd->vdev_id, txg, - mg, msp, asize, msp->ms_weight); - mutex_exit(&mg->mg_lock); - return (-1ULL); + + /* + * Find the metaslab with the highest weight that is less + * than what we've already tried. In the common case, this + * means that we will examine each metaslab at most once. 
+	 * Note that concurrent callers could reorder metaslabs
+	 * by activation/passivation once we have dropped the mg_lock.
+	 * If a metaslab is activated by another thread, and we fail
+	 * to allocate from the metaslab we have selected, we may
+	 * not try the newly-activated metaslab, and instead activate
+	 * another metaslab. This is not optimal, but generally
+	 * does not cause any problems (a possible exception being
+	 * if every metaslab is completely full except for
+	 * the newly-activated metaslab which we fail to examine).
+	 */
+	msp = avl_find(t, search, &idx);
+	if (msp == NULL)
+		msp = avl_nearest(t, idx, AVL_AFTER);
+	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+
+		if (!metaslab_should_allocate(msp, asize)) {
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_TOO_SMALL);
+			continue;
 		}
 		/*
@@ -2258,16 +2791,21 @@
 			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
 			    min_distance >> 1);
-			for (i = 0; i < d; i++)
+			for (i = 0; i < d; i++) {
 				if (metaslab_distance(msp, &dva[i]) <
 				    target_distance)
 					break;
+			}
 			if (i == d)
 				break;
 		}
 		mutex_exit(&mg->mg_lock);
-		if (msp == NULL)
+		if (msp == NULL) {
+			kmem_free(search, sizeof (*search));
 			return (-1ULL);
+		}
+		search->ms_weight = msp->ms_weight;
+		search->ms_start = msp->ms_start + 1;
 		mutex_enter(&msp->ms_lock);
@@ -2275,11 +2813,11 @@
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
 		 * another thread may have changed the weight while we
-		 * were blocked on the metaslab lock.
+		 * were blocked on the metaslab lock. We check the
+		 * active status first to see if we need to reselect
+		 * a new metaslab.
 		 */
-		if (msp->ms_weight < asize || (was_active &&
-		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
-		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
+		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
@@ -2296,6 +2834,22 @@
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
+		msp->ms_selected_txg = txg;
+
+		/*
+		 * Now that we have the lock, recheck to see if we should
+		 * continue to use this metaslab for this allocation. The
+		 * metaslab is now loaded so metaslab_should_allocate() can
+		 * accurately determine if the allocation attempt should
+		 * proceed.
+		 */
+		if (!metaslab_should_allocate(msp, asize)) {
+			/* Passivate this metaslab and select a new one. */
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_TOO_SMALL);
+			goto next;
+		}
+
 		/*
 		 * If this metaslab is currently condensing then pick again as
 		 * we can't manipulate this metaslab until it's committed
 		 * to disk.
 		 */
 		if (msp->ms_condensing) {
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_CONDENSING);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
-		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
+		offset = metaslab_block_alloc(msp, asize, txg);
+		metaslab_trace_add(zal, mg, msp, asize, d, offset);
+
+		if (offset != -1ULL) {
+			/* Proactively passivate the metaslab, if needed */
+			metaslab_segment_may_passivate(msp);
 			break;
+		}
+next:
+		ASSERT(msp->ms_loaded);
-		metaslab_passivate(msp, metaslab_block_maxsize(msp));
+		/*
+		 * We were unable to allocate from this metaslab so determine
+		 * a new weight for this metaslab. Now that we have loaded
+		 * the metaslab we can provide a better hint to the metaslab
+		 * selector.
+		 *
+		 * For space-based metaslabs, we use the maximum block size.
+		 * This information is only available when the metaslab
+		 * is loaded and is more accurate than the generic free
+		 * space weight that was calculated by metaslab_weight().
+ * This information allows us to quickly compare the maximum + * available allocation in the metaslab to the allocation + * size being requested. + * + * For segment-based metaslabs, determine the new weight + * based on the highest bucket in the range tree. We + * explicitly use the loaded segment weight (i.e. the range + * tree histogram) since it contains the space that is + * currently available for allocation and is accurate + * even within a sync pass. + */ + if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { + uint64_t weight = metaslab_block_maxsize(msp); + WEIGHT_SET_SPACEBASED(weight); + metaslab_passivate(msp, weight); + } else { + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + } + + /* + * We have just failed an allocation attempt, check + * that metaslab_should_allocate() agrees. Otherwise, + * we may end up in an infinite loop retrying the same + * metaslab. + */ + ASSERT(!metaslab_should_allocate(msp, asize)); mutex_exit(&msp->ms_lock); } + mutex_exit(&msp->ms_lock); + kmem_free(search, sizeof (*search)); + return (offset); +} - if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); +static uint64_t +metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) +{ + uint64_t offset; + ASSERT(mg->mg_initialized); - range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); - msp->ms_access_txg = txg + metaslab_unload_delay; + offset = metaslab_group_alloc_normal(mg, zal, asize, txg, + min_distance, dva, d); - mutex_exit(&msp->ms_lock); + mutex_enter(&mg->mg_lock); + if (offset == -1ULL) { + mg->mg_failed_allocations++; + metaslab_trace_add(zal, mg, NULL, asize, d, + TRACE_GROUP_FAILURE); + if (asize == SPA_GANGBLOCKSIZE) { + /* + * This metaslab group was unable to allocate + * the minimum gang block size so it must be out of + * space. We must notify the allocation throttle + * to start skipping allocation attempts to this + * metaslab group until more space becomes available. + * Note: this failure cannot be caused by the + * allocation throttle since the allocation throttle + * is only responsible for skipping devices and + * not failing block allocations. + */ + mg->mg_no_free_space = B_TRUE; + } + } + mg->mg_allocations++; + mutex_exit(&mg->mg_lock); return (offset); } /* + * If we have to write a ditto block (i.e. more than one DVA for a given BP) + * on the same vdev as an existing DVA of this BP, then try to allocate it + * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the + * existing DVAs. + */ +int ditto_same_vdev_distance_shift = 3; + +/* * Allocate a block for the specified i/o. */ static int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, + zio_alloc_list_t *zal) { metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; - int dshift = 3; - int all_zero; - int zio_lock = B_FALSE; - boolean_t allocatable; - uint64_t asize; - uint64_t distance; + boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. 
*/ - if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) + if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) { + metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); return (SET_ERROR(ENOSPC)); + } /* * Start at the rotor and loop through all mgs until we find something. @@ -2412,9 +3047,10 @@ rotor = mg; top: - all_zero = B_TRUE; do { + boolean_t allocatable; uint64_t offset; + uint64_t distance, asize; ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; @@ -2422,7 +3058,7 @@ /* * Don't allocate from faulted devices. */ - if (zio_lock) { + if (try_hard) { spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); spa_config_exit(spa, SCL_ZIO, FTAG); @@ -2437,61 +3073,53 @@ * inadvertently return ENOSPC and suspend the pool * even though space is still available. */ - if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) { + if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, psize); } - if (!allocatable) + if (!allocatable) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_NOT_ALLOCATABLE); goto next; + } ASSERT(mg->mg_initialized); /* - * Avoid writing single-copy data to a failing vdev. + * Avoid writing single-copy data to a failing, + * non-redundant vdev, unless we've already tried all + * other vdevs. */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && - d == 0 && dshift == 3 && vd->vdev_children == 0) { - all_zero = B_FALSE; + d == 0 && !try_hard && vd->vdev_children == 0) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_VDEV_ERROR); goto next; } ASSERT(mg->mg_class == mc); - distance = vd->vdev_asize >> dshift; - if (distance <= (1ULL << vd->vdev_ms_shift)) - distance = 0; - else - all_zero = B_FALSE; + /* + * If we don't need to try hard, then require that the + * block be 1/8th of the device away from any other DVAs + * in this BP. If we are trying hard, allow any offset + * to be used (distance=0). + */ + distance = 0; + if (!try_hard) { + distance = vd->vdev_asize >> + ditto_same_vdev_distance_shift; + if (distance <= (1ULL << vd->vdev_ms_shift)) + distance = 0; + } asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); - - mutex_enter(&mg->mg_lock); - if (offset == -1ULL) { - mg->mg_failed_allocations++; - if (asize == SPA_GANGBLOCKSIZE) { - /* - * This metaslab group was unable to allocate - * the minimum gang block size so it must be - * out of space. We must notify the allocation - * throttle to start skipping allocation - * attempts to this metaslab group until more - * space becomes available. - * - * Note: this failure cannot be caused by the - * allocation throttle since the allocation - * throttle is only responsible for skipping - * devices and not failing block allocations. - */ - mg->mg_no_free_space = B_TRUE; - } - } - mg->mg_allocations++; - mutex_exit(&mg->mg_lock); + offset = metaslab_group_alloc(mg, zal, asize, txg, distance, + dva, d); if (offset != -1ULL) { /* @@ -2565,20 +3193,17 @@ mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); - if (!all_zero) { - dshift++; - ASSERT(dshift < 64); - goto top; - } - - if (!allocatable && !zio_lock) { - dshift = 3; - zio_lock = B_TRUE; + /* + * If we haven't tried hard, do so now. 
+ */ + if (!try_hard) { + try_hard = B_TRUE; goto top; } bzero(&dva[d], sizeof (dva_t)); + metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC); return (SET_ERROR(ENOSPC)); } @@ -2625,6 +3250,7 @@ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); range_tree_add(msp->ms_tree, offset, size); + msp->ms_max_size = metaslab_block_maxsize(msp); } else { if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); @@ -2747,7 +3373,8 @@ int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio) + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, + zio_alloc_list_t *zal, zio_t *zio) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -2766,10 +3393,11 @@ ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); + ASSERT3P(zal, !=, NULL); for (d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags); + txg, flags, zal); if (error != 0) { for (d--; d >= 0; d--) { metaslab_free_dva(spa, &dva[d], txg, B_TRUE); @@ -2924,37 +3552,52 @@ } #if defined(_KERNEL) && defined(HAVE_SPL) +/* CSTYLED */ module_param(metaslab_aliquot, ulong, 0644); -module_param(metaslab_debug_load, int, 0644); -module_param(metaslab_debug_unload, int, 0644); -module_param(metaslab_preload_enabled, int, 0644); -module_param(zfs_mg_noalloc_threshold, int, 0644); -module_param(zfs_mg_fragmentation_threshold, int, 0644); -module_param(zfs_metaslab_fragmentation_threshold, int, 0644); -module_param(metaslab_fragmentation_factor_enabled, int, 0644); -module_param(metaslab_lba_weighting_enabled, int, 0644); -module_param(metaslab_bias_enabled, int, 0644); - MODULE_PARM_DESC(metaslab_aliquot, "allocation granularity (a.k.a. 
stripe size)"); + + module_param(metaslab_debug_load, int, 0644); MODULE_PARM_DESC(metaslab_debug_load, "load all metaslabs when pool is first opened"); + + module_param(metaslab_debug_unload, int, 0644); MODULE_PARM_DESC(metaslab_debug_unload, "prevent metaslabs from being unloaded"); + + module_param(metaslab_preload_enabled, int, 0644); MODULE_PARM_DESC(metaslab_preload_enabled, "preload potential metaslabs during reassessment"); + module_param(zfs_mg_noalloc_threshold, int, 0644); MODULE_PARM_DESC(zfs_mg_noalloc_threshold, "percentage of free space for metaslab group to allow allocation"); + + module_param(zfs_mg_fragmentation_threshold, int, 0644); MODULE_PARM_DESC(zfs_mg_fragmentation_threshold, "fragmentation for metaslab group to allow allocation"); + module_param(zfs_metaslab_fragmentation_threshold, int, 0644); MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold, "fragmentation for metaslab to allow allocation"); + + module_param(metaslab_fragmentation_factor_enabled, int, 0644); MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled, "use the fragmentation metric to prefer less fragmented metaslabs"); + + module_param(metaslab_lba_weighting_enabled, int, 0644); MODULE_PARM_DESC(metaslab_lba_weighting_enabled, "prefer metaslabs with lower LBAs"); + + module_param(metaslab_bias_enabled, int, 0644); MODULE_PARM_DESC(metaslab_bias_enabled, "enable metaslab group biasing"); + + module_param(zfs_metaslab_segment_weight_enabled, int, 0644); +MODULE_PARM_DESC(zfs_metaslab_segment_weight_enabled, + "enable segment-based metaslab selection"); + +module_param(zfs_metaslab_switch_threshold, int, 0644); +MODULE_PARM_DESC(zfs_metaslab_switch_threshold, + "segment-based metaslab selection maximum buckets before switching"); #endif /* _KERNEL && HAVE_SPL */ diff -Nru zfs-linux-0.7.0-rc2/module/zfs/policy.c zfs-linux-0.7.0-rc3/module/zfs/policy.c --- zfs-linux-0.7.0-rc2/module/zfs/policy.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/policy.c 2017-01-20 18:18:28.000000000 +0000 @@ -36,7 +36,7 @@ /* * The passed credentials cannot be directly verified because Linux only - * provides and interface to check the *current* proces credentials. In + * provides an interface to check the *current* process credentials. In * order to handle this the capable() test is only run when the passed * credentials match the current process credentials or the kcred. In * all other cases this function must fail and return the passed err. diff -Nru zfs-linux-0.7.0-rc2/module/zfs/rrwlock.c zfs-linux-0.7.0-rc3/module/zfs/rrwlock.c --- zfs-linux-0.7.0-rc2/module/zfs/rrwlock.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/rrwlock.c 2017-01-20 18:18:28.000000000 +0000 @@ -313,8 +313,8 @@ * The idea is to split single busy lock into array of locks, so that * each reader can lock only one of them for read, depending on result * of simple hash function. That proportionally reduces lock congestion. - * Writer same time has to sequentially aquire write on all the locks. - * That makes write aquisition proportionally slower, but in places where + * Writer at the same time has to sequentially acquire write on all the locks. + * That makes write acquisition proportionally slower, but in places where * it is used (filesystem unmount) performance is not critical. * * All the functions below are direct wrappers around functions above. 
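The rrwlock.c comment above describes the "multilock" scheme: readers hash to a single member of a lock array, while a writer sweeps every member in order. A minimal sketch of that pattern follows; the names (rrm_sketch_t, RRM_NUM_LOCKS, the hash on curthread) are illustrative, and plain krwlock_t members stand in for the re-entrant rrwlock_t wrappers the real rrm_* functions use:

#include <sys/rwlock.h>

#define	RRM_NUM_LOCKS	17
#define	RRM_TD_LOCK()	(((uintptr_t)curthread >> 8) % RRM_NUM_LOCKS)

typedef struct rrm_sketch {
	krwlock_t	rrm_locks[RRM_NUM_LOCKS];
} rrm_sketch_t;

/* A reader takes only the one member lock its thread hashes to. */
static void
rrm_sketch_enter_read(rrm_sketch_t *rrm)
{
	rw_enter(&rrm->rrm_locks[RRM_TD_LOCK()], RW_READER);
}

/* The hash is a pure function of curthread, so exit finds the same lock. */
static void
rrm_sketch_exit_read(rrm_sketch_t *rrm)
{
	rw_exit(&rrm->rrm_locks[RRM_TD_LOCK()]);
}

/* A writer must sweep all members, which is slow but rare (e.g. unmount). */
static void
rrm_sketch_enter_write(rrm_sketch_t *rrm)
{
	int i;

	for (i = 0; i < RRM_NUM_LOCKS; i++)
		rw_enter(&rrm->rrm_locks[i], RW_WRITER);
}

Because every writer acquires the members in the same front-to-back order, concurrent writers cannot deadlock against each other, and a reader holds at most one member, so the ordering stays consistent with writers as well.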
diff -Nru zfs-linux-0.7.0-rc2/module/zfs/sa.c zfs-linux-0.7.0-rc3/module/zfs/sa.c --- zfs-linux-0.7.0-rc2/module/zfs/sa.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/sa.c 2017-01-20 18:18:28.000000000 +0000 @@ -202,7 +202,7 @@ */ sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; -static int sa_legacy_attr_count = 16; +static int sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs); static kmem_cache_t *sa_cache = NULL; /*ARGSUSED*/ @@ -1245,7 +1245,7 @@ sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); /* - * Determine number of variable lenghts in header + * Determine number of variable lengths in header * The standard 8 byte header has one for free and a * 16 byte header would have 4 + 1; */ @@ -1644,8 +1644,11 @@ } /* - * add/remove/replace a single attribute and then rewrite the entire set + * Add/remove a single attribute or replace a variable-sized attribute value + * with a value of a different size, and then rewrite the entire set * of attributes. + * Same-length attribute value replacement (including fixed-length attributes) + * is handled more efficiently by the upper layers. */ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, @@ -1662,7 +1665,7 @@ int spill_data_size = 0; int spill_attr_count = 0; int error; - uint16_t length; + uint16_t length, reg_length; int i, j, k, length_idx; sa_hdr_phys_t *hdr; sa_idx_tab_t *idx_tab; @@ -1690,7 +1693,7 @@ if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; - old_data[1] = zio_buf_alloc(spill_data_size); + old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP); bcopy(hdl->sa_spill->db_data, old_data[1], hdl->sa_spill->db_size); spill_attr_count = @@ -1731,20 +1734,36 @@ sa_attr_type_t attr; attr = idx_tab->sa_layout->lot_attrs[i]; - length = SA_REGISTERED_LEN(sa, attr); + reg_length = SA_REGISTERED_LEN(sa, attr); + if (reg_length == 0) { + length = hdr->sa_lengths[length_idx]; + length_idx++; + } else { + length = reg_length; + } if (attr == newattr) { - if (length == 0) - ++length_idx; + /* + * There is nothing to do for SA_REMOVE, + * so it is just skipped. + */ if (action == SA_REMOVE) continue; - ASSERT(length == 0); - ASSERT(action == SA_REPLACE); + + /* + * Duplicate attributes are not allowed, so the + * action can not be SA_ADD here. + */ + ASSERT3S(action, ==, SA_REPLACE); + + /* + * Only a variable-sized attribute can be + * replaced here, and its size must be changing. 
+ */ + ASSERT3U(reg_length, ==, 0); + ASSERT3U(length, !=, buflen); SA_ADD_BULK_ATTR(attr_desc, j, attr, locator, datastart, buflen); } else { - if (length == 0) - length = hdr->sa_lengths[length_idx++]; - SA_ADD_BULK_ATTR(attr_desc, j, attr, NULL, (void *) (TOC_OFF(idx_tab->sa_idx_tab[attr]) + @@ -1760,20 +1779,19 @@ } } if (action == SA_ADD) { - length = SA_REGISTERED_LEN(sa, newattr); - if (length == 0) { - length = buflen; - } + reg_length = SA_REGISTERED_LEN(sa, newattr); + IMPLY(reg_length != 0, reg_length == buflen); SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, - datastart, length); + datastart, buflen); } + ASSERT3U(j, ==, attr_count); error = sa_build_layouts(hdl, attr_desc, attr_count, tx); if (old_data[0]) kmem_free(old_data[0], bonus_data_size); if (old_data[1]) - zio_buf_free(old_data[1], spill_data_size); + vmem_free(old_data[1], spill_data_size); kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); return (error); @@ -1846,26 +1864,6 @@ return (error); } -int -sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, - uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) -{ - int error; - sa_bulk_attr_t bulk; - - VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN); - - bulk.sa_attr = attr; - bulk.sa_data = userdata; - bulk.sa_data_func = locator; - bulk.sa_length = buflen; - - mutex_enter(&hdl->sa_lock); - error = sa_bulk_update_impl(hdl, &bulk, 1, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - /* * Return size of an attribute */ @@ -2044,7 +2042,6 @@ EXPORT_SYMBOL(sa_bulk_lookup_locked); EXPORT_SYMBOL(sa_bulk_update); EXPORT_SYMBOL(sa_size); -EXPORT_SYMBOL(sa_update_from_cb); EXPORT_SYMBOL(sa_object_info); EXPORT_SYMBOL(sa_object_size); EXPORT_SYMBOL(sa_get_userdata); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/sha256.c zfs-linux-0.7.0-rc3/module/zfs/sha256.c --- zfs-linux-0.7.0-rc2/module/zfs/sha256.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/sha256.c 2017-01-20 18:18:28.000000000 +0000 @@ -24,30 +24,39 @@ */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #include #include -#include #include +#include + +static int +sha_incremental(void *buf, size_t size, void *arg) +{ + SHA2_CTX *ctx = arg; + SHA2Update(ctx, buf, size); + return (0); +} /*ARGSUSED*/ void -zio_checksum_SHA256(const void *buf, uint64_t size, +abd_checksum_SHA256(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { SHA2_CTX ctx; zio_cksum_t tmp; SHA2Init(SHA256, &ctx); - SHA2Update(&ctx, buf, size); + (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); SHA2Final(&tmp, &ctx); /* * A prior implementation of this function had a * private SHA256 implementation always wrote things out in * Big Endian and there wasn't a byteswap variant of it. - * To preseve on disk compatibility we need to force that - * behaviour. + * To preserve on disk compatibility we need to force that + * behavior. 
*/ zcp->zc_word[0] = BE_64(tmp.zc_word[0]); zcp->zc_word[1] = BE_64(tmp.zc_word[1]); @@ -57,24 +66,24 @@ /*ARGSUSED*/ void -zio_checksum_SHA512_native(const void *buf, uint64_t size, +abd_checksum_SHA512_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { SHA2_CTX ctx; SHA2Init(SHA512_256, &ctx); - SHA2Update(&ctx, buf, size); + (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); SHA2Final(zcp, &ctx); } /*ARGSUSED*/ void -zio_checksum_SHA512_byteswap(const void *buf, uint64_t size, +abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - zio_checksum_SHA512_native(buf, size, ctx_template, &tmp); + abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/skein_zfs.c zfs-linux-0.7.0-rc3/module/zfs/skein_zfs.c --- zfs-linux-0.7.0-rc2/module/zfs/skein_zfs.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/skein_zfs.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,42 +20,52 @@ */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #include #include #include +#include + +static int +skein_incremental(void *buf, size_t size, void *arg) +{ + Skein_512_Ctxt_t *ctx = arg; + (void) Skein_512_Update(ctx, buf, size); + return (0); +} /* * Computes a native 256-bit skein MAC checksum. Please note that this * function requires the presence of a ctx_template that should be allocated - * using zio_checksum_skein_tmpl_init. + * using abd_checksum_skein_tmpl_init. */ /*ARGSUSED*/ void -zio_checksum_skein_native(const void *buf, uint64_t size, +abd_checksum_skein_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { Skein_512_Ctxt_t ctx; ASSERT(ctx_template != NULL); bcopy(ctx_template, &ctx, sizeof (ctx)); - (void) Skein_512_Update(&ctx, buf, size); + (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx); (void) Skein_512_Final(&ctx, (uint8_t *)zcp); bzero(&ctx, sizeof (ctx)); } /* - * Byteswapped version of zio_checksum_skein_native. This just invokes + * Byteswapped version of abd_checksum_skein_native. This just invokes * the native checksum function and byteswaps the resulting checksum (since * skein is internally endian-insensitive). */ void -zio_checksum_skein_byteswap(const void *buf, uint64_t size, +abd_checksum_skein_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - zio_checksum_skein_native(buf, size, ctx_template, &tmp); + abd_checksum_skein_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); @@ -67,7 +77,7 @@ * computations and returns a pointer to it. */ void * -zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) +abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) { Skein_512_Ctxt_t *ctx; @@ -82,7 +92,7 @@ * zio_checksum_skein_tmpl_init. 
*/ void -zio_checksum_skein_tmpl_free(void *ctx_template) +abd_checksum_skein_tmpl_free(void *ctx_template) { Skein_512_Ctxt_t *ctx = ctx_template; diff -Nru zfs-linux-0.7.0-rc2/module/zfs/spa.c zfs-linux-0.7.0-rc3/module/zfs/spa.c --- zfs-linux-0.7.0-rc2/module/zfs/spa.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/spa.c 2017-01-20 18:18:28.000000000 +0000 @@ -555,8 +555,7 @@ case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); - if (!error && (intval < ZIO_FAILURE_MODE_WAIT || - intval > ZIO_FAILURE_MODE_PANIC)) + if (!error && intval > ZIO_FAILURE_MODE_PANIC) error = SET_ERROR(EINVAL); /* @@ -1209,7 +1208,7 @@ list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); - taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); for (t = 0; t < ZIO_TYPES; t++) { for (q = 0; q < ZIO_TASKQ_TYPES; q++) { @@ -1314,7 +1313,7 @@ static void spa_unload(spa_t *spa) { - int i; + int i, c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -1332,6 +1331,19 @@ } /* + * Even though vdev_free() also calls vdev_metaslab_fini, we need + * to call it earlier, before we wait for async i/o to complete. + * This ensures that there is no async metaslab prefetching, by + * calling taskq_wait(mg_taskq). + */ + if (spa->spa_root_vdev != NULL) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + for (c = 0; c < spa->spa_root_vdev->vdev_children; c++) + vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); + spa_config_exit(spa, SCL_ALL, FTAG); + } + + /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { @@ -1964,6 +1976,7 @@ int error = zio->io_error; spa_t *spa = zio->io_spa; + abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) @@ -1971,7 +1984,6 @@ else atomic_inc_64(&sle->sle_data_count); } - zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1994,7 +2006,6 @@ { zio_t *rio; size_t size; - void *data; if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); @@ -2005,12 +2016,11 @@ */ if (!spa_load_verify_metadata) return (0); - if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) + if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); rio = arg; size = BP_GET_PSIZE(bp); - data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) @@ -2018,7 +2028,7 @@ spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(rio, spa, bp, data, size, + zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); @@ -2756,10 +2766,14 @@ error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps); - if (error != ENOENT && error != 0) { + if (error == ENOENT) { + VERIFY(!nvlist_exists(mos_config, + ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); + spa->spa_avz_action = AVZ_ACTION_INITIALIZE; + ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); + } else if (error != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } else if (error == 0 && !nvlist_exists(mos_config, - ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { + } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have 
orphaned per-vdev ZAPs in the MOS. Defer their @@ -3680,7 +3694,7 @@ nvlist_t **newdevs; /* - * Generate new dev list by concatentating with the + * Generate new dev list by concatenating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, @@ -6125,6 +6139,7 @@ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || + spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { @@ -6277,7 +6292,7 @@ case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* - * The version is synced seperatly before other + * The version is synced separately before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); @@ -6307,7 +6322,7 @@ * We need to dirty the configuration on all the vdevs * so that their labels get updated. It's unnecessary * to do this for pool creation since the vdev's - * configuratoin has already been dirtied. + * configuration has already been dirtied. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); @@ -6518,8 +6533,8 @@ tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); - taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); - spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); @@ -6707,7 +6722,7 @@ } dmu_tx_commit(tx); - taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = 0; /* @@ -6999,6 +7014,7 @@ MODULE_PARM_DESC(spa_load_verify_data, "Set to traverse data on pool import"); +/* CSTYLED */ module_param(zio_taskq_batch_pct, uint, 0444); MODULE_PARM_DESC(zio_taskq_batch_pct, "Percentage of CPUs to run an IO worker thread"); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/space_map.c zfs-linux-0.7.0-rc3/module/zfs/space_map.c --- zfs-linux-0.7.0-rc2/module/zfs/space_map.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/space_map.c 2017-01-20 18:18:28.000000000 +0000 @@ -72,7 +72,7 @@ } bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); - entry_map = zio_buf_alloc(bufsize); + entry_map = vmem_alloc(bufsize, KM_SLEEP); mutex_exit(sm->sm_lock); if (end > bufsize) { @@ -128,7 +128,7 @@ else range_tree_vacate(rt, NULL, NULL); - zio_buf_free(entry_map, bufsize); + vmem_free(entry_map, bufsize); return (error); } @@ -173,7 +173,6 @@ dmu_buf_will_dirty(sm->sm_dbuf, tx); ASSERT(space_map_histogram_verify(sm, rt)); - /* * Transfer the content of the range tree histogram to the space * map histogram. 
The space map histogram contains 32 buckets ranging @@ -272,7 +271,7 @@ expected_entries = space_map_entries(sm, rt); - entry_map = zio_buf_alloc(sm->sm_blksz); + entry_map = vmem_alloc(sm->sm_blksz, KM_SLEEP); entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t)); entry = entry_map; @@ -335,7 +334,7 @@ VERIFY3U(range_tree_space(rt), ==, rt_space); VERIFY3U(range_tree_space(rt), ==, total); - zio_buf_free(entry_map, sm->sm_blksz); + vmem_free(entry_map, sm->sm_blksz); } static int diff -Nru zfs-linux-0.7.0-rc2/module/zfs/space_reftree.c zfs-linux-0.7.0-rc3/module/zfs/space_reftree.c --- zfs-linux-0.7.0-rc2/module/zfs/space_reftree.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/space_reftree.c 2017-01-20 18:18:28.000000000 +0000 @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include @@ -97,7 +97,7 @@ void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, - int64_t refcnt) + int64_t refcnt) { space_reftree_add_node(t, start, refcnt); space_reftree_add_node(t, end, -refcnt); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/spa_config.c zfs-linux-0.7.0-rc3/module/zfs/spa_config.c --- zfs-linux-0.7.0-rc2/module/zfs/spa_config.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/spa_config.c 2017-01-20 18:18:28.000000000 +0000 @@ -222,7 +222,7 @@ * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not - * contain the correct information to open the pool and an explicity import + * contain the correct information to open the pool and an explicit import * would be required. 
*/ void @@ -419,14 +419,14 @@ */ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { VERIFY0(nvlist_lookup_string(spa->spa_config, - ZPOOL_CONFIG_POOL_NAME, &pool_name)); + ZPOOL_CONFIG_POOL_NAME, &pool_name)); } else pool_name = spa_name(spa); config = fnvlist_alloc(); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); - fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/spa_misc.c zfs-linux-0.7.0-rc3/module/zfs/spa_misc.c --- zfs-linux-0.7.0-rc2/module/zfs/spa_misc.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/spa_misc.c 2017-01-20 18:18:28.000000000 +0000 @@ -530,7 +530,7 @@ if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev); - spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, + spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); } @@ -635,6 +635,9 @@ spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; + /* Reset cached value */ + spa->spa_dedup_dspace = ~0ULL; + /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature @@ -1830,12 +1833,14 @@ refcount_init(); unique_init(); range_tree_init(); + metaslab_alloc_trace_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); vdev_cache_stat_init(); vdev_raidz_math_init(); + vdev_file_init(); zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); @@ -1850,12 +1855,14 @@ spa_evict_all(); + vdev_file_fini(); vdev_cache_stat_fini(); vdev_raidz_math_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); + metaslab_alloc_trace_fini(); range_tree_fini(); unique_fini(); refcount_fini(); @@ -2090,9 +2097,9 @@ EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); - EXPORT_SYMBOL(spa_namespace_lock); +/* BEGIN CSTYLED */ module_param(zfs_flags, uint, 0644); MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags"); @@ -2115,4 +2122,5 @@ module_param(spa_slop_shift, int, 0644); MODULE_PARM_DESC(spa_slop_shift, "Reserved free space in pool"); +/* END CSTYLED */ #endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/spa_stats.c zfs-linux-0.7.0-rc3/module/zfs/spa_stats.c --- zfs-linux-0.7.0-rc2/module/zfs/spa_stats.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/spa_stats.c 2017-01-20 18:18:28.000000000 +0000 @@ -106,7 +106,7 @@ } /* - * When the kstat is written discard all spa_read_history_t entires. The + * When the kstat is written discard all spa_read_history_t entries. The * ssh->lock will be held until ksp->ks_ndata entries are processed. */ static int @@ -327,7 +327,7 @@ } /* - * When the kstat is written discard all spa_txg_history_t entires. The + * When the kstat is written discard all spa_txg_history_t entries. The * ssh->lock will be held until ksp->ks_ndata entries are processed. */ static int @@ -474,7 +474,7 @@ /* * Set txg IO stats. 
*/ -int +static int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) { @@ -503,6 +503,54 @@ return (error); } +txg_stat_t * +spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) +{ + txg_stat_t *ts; + + if (zfs_txg_history == 0) + return (NULL); + + ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + vdev_get_stats(spa->spa_root_vdev, &ts->vs1); + spa_config_exit(spa, SCL_ALL, FTAG); + + ts->txg = txg; + ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + + spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); + + return (ts); +} + +void +spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) +{ + if (ts == NULL) + return; + + if (zfs_txg_history == 0) { + kmem_free(ts, sizeof (txg_stat_t)); + return; + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + vdev_get_stats(spa->spa_root_vdev, &ts->vs2); + spa_config_exit(spa, SCL_ALL, FTAG); + + spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); + spa_txg_history_set_io(spa, ts->txg, + ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], + ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], + ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ], + ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], + ts->ndirty); + + kmem_free(ts, sizeof (txg_stat_t)); +} + /* * ========================================================================== * SPA TX Assign Histogram Routines diff -Nru zfs-linux-0.7.0-rc2/module/zfs/txg.c zfs-linux-0.7.0-rc3/module/zfs/txg.c --- zfs-linux-0.7.0-rc2/module/zfs/txg.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/txg.c 2017-01-20 18:18:28.000000000 +0000 @@ -481,22 +481,17 @@ spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; - vdev_stat_t *vs1, *vs2; clock_t start, delta; (void) spl_fstrans_mark(); txg_thread_enter(tx, &cpr); - vs1 = kmem_alloc(sizeof (vdev_stat_t), KM_SLEEP); - vs2 = kmem_alloc(sizeof (vdev_stat_t), KM_SLEEP); - start = delta = 0; for (;;) { - clock_t timer, timeout; + clock_t timeout = zfs_txg_timeout * hz; + clock_t timer; uint64_t txg; - uint64_t ndirty; - - timeout = zfs_txg_timeout * hz; + txg_stat_t *ts; /* * We sync when we're scanning, there's someone waiting @@ -527,15 +522,8 @@ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); } - if (tx->tx_exiting) { - kmem_free(vs2, sizeof (vdev_stat_t)); - kmem_free(vs1, sizeof (vdev_stat_t)); + if (tx->tx_exiting) txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); - } - - spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); - vdev_get_stats(spa->spa_root_vdev, vs1); - spa_config_exit(spa, SCL_ALL, FTAG); /* * Consume the quiesced txg which has been handed off to @@ -546,16 +534,13 @@ tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); + ts = spa_txg_history_init_io(spa, txg, dp); cv_broadcast(&tx->tx_quiesce_more_cv); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); - spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, - gethrtime()); - ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; - start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; @@ -564,23 +549,13 @@ tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); + spa_txg_history_fini_io(spa, ts); 
cv_broadcast(&tx->tx_sync_done_cv); /* * Dispatch commit callbacks to worker threads. */ txg_dispatch_callbacks(dp, txg); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); - vdev_get_stats(spa->spa_root_vdev, vs2); - spa_config_exit(spa, SCL_ALL, FTAG); - spa_txg_history_set_io(spa, txg, - vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ], - vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE], - vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ], - vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE], - ndirty); - spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime()); } } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev.c zfs-linux-0.7.0-rc3/module/zfs/vdev.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev.c 2017-01-20 18:18:28.000000000 +0000 @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -999,16 +1000,16 @@ vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, - zio->io_offset, zio->io_size, zio->io_data, + zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; @@ -1126,8 +1127,8 @@ for (l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad2)), - VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), + offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, + abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } @@ -1191,7 +1192,7 @@ for (c = 0; c < children; c++) VERIFY(taskq_dispatch(tq, vdev_open_child, - vd->vdev_child[c], TQ_SLEEP) != 0); + vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); taskq_destroy(tq); } @@ -2705,7 +2706,7 @@ !vdev_readable(vd) || !vdev_writeable(vd)) { /* - * When reopening in reponse to a clear event, it may be due to + * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ @@ -3373,6 +3374,17 @@ spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { + /* + * Since vdev_offline() code path is already in an offline + * state we can miss a statechange event to OFFLINE. Check + * the previous state to catch this condition. + */ + if (vd->vdev_ops->vdev_op_leaf && + (state == VDEV_STATE_OFFLINE) && + (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { + /* post an offline state change */ + zfs_post_state_change(spa, vd, vd->vdev_prevstate); + } vd->vdev_stat.vs_aux = aux; return; } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_cache.c zfs-linux-0.7.0-rc3/module/zfs/vdev_cache.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_cache.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_cache.c 2017-01-20 18:18:28.000000000 +0000 @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 
*/ #include @@ -31,6 +31,7 @@ #include #include #include +#include /* * Virtual device read-ahead caching. @@ -136,12 +137,12 @@ vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) { ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); - ASSERT(ve->ve_data != NULL); + ASSERT3P(ve->ve_fill_io, ==, NULL); + ASSERT3P(ve->ve_abd, !=, NULL); avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); - zio_buf_free(ve->ve_data, VCBS); + abd_free(ve->ve_abd); kmem_free(ve, sizeof (vdev_cache_entry_t)); } @@ -171,14 +172,14 @@ ve = avl_first(&vc->vc_lastused_tree); if (ve->ve_fill_io != NULL) return (NULL); - ASSERT(ve->ve_hits != 0); + ASSERT3U(ve->ve_hits, !=, 0); vdev_cache_evict(vc, ve); } ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; ve->ve_lastused = ddi_get_lbolt(); - ve->ve_data = zio_buf_alloc(VCBS); + ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); avl_add(&vc->vc_offset_tree, ve); avl_add(&vc->vc_lastused_tree, ve); @@ -192,7 +193,7 @@ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); + ASSERT3P(ve->ve_fill_io, ==, NULL); if (ve->ve_lastused != ddi_get_lbolt()) { avl_remove(&vc->vc_lastused_tree, ve); @@ -201,7 +202,7 @@ } ve->ve_hits++; - bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); + abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); } /* @@ -216,16 +217,16 @@ zio_t *pio; zio_link_t *zl; - ASSERT(fio->io_size == VCBS); + ASSERT3U(fio->io_size, ==, VCBS); /* * Add data to the cache. */ mutex_enter(&vc->vc_lock); - ASSERT(ve->ve_fill_io == fio); - ASSERT(ve->ve_offset == fio->io_offset); - ASSERT(ve->ve_data == fio->io_data); + ASSERT3P(ve->ve_fill_io, ==, fio); + ASSERT3U(ve->ve_offset, ==, fio->io_offset); + ASSERT3P(ve->ve_abd, ==, fio->io_abd); ve->ve_fill_io = NULL; @@ -256,7 +257,7 @@ zio_t *fio; ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS)); - ASSERT(zio->io_type == ZIO_TYPE_READ); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); if (zio->io_flags & ZIO_FLAG_DONT_CACHE) return (B_FALSE); @@ -270,7 +271,7 @@ if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) return (B_FALSE); - ASSERT(cache_phase + zio->io_size <= VCBS); + ASSERT3U(cache_phase + zio->io_size, <=, VCBS); mutex_enter(&vc->vc_lock); @@ -309,7 +310,7 @@ } fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, + ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; @@ -337,7 +338,7 @@ uint64_t max_offset = P2ROUNDUP(io_end, VCBS); avl_index_t where; - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); mutex_enter(&vc->vc_lock); @@ -354,8 +355,8 @@ if (ve->ve_fill_io != NULL) { ve->ve_missed_update = 1; } else { - bcopy((char *)zio->io_data + start - io_start, - ve->ve_data + start - ve->ve_offset, end - start); + abd_copy_off(ve->ve_abd, zio->io_abd, start - io_start, + start - ve->ve_offset, end - start); } ve = AVL_NEXT(&vc->vc_offset_tree, ve); } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_disk.c zfs-linux-0.7.0-rc3/module/zfs/vdev_disk.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_disk.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_disk.c 2017-01-20 18:18:28.000000000 +0000 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -411,6 +412,7 @@ ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); + 
zio_delay_interrupt(zio); } } @@ -434,17 +436,10 @@ #endif } - /* Drop reference aquired by __vdev_disk_physio */ + /* Drop reference acquired by __vdev_disk_physio */ rc = vdev_disk_dio_put(dr); } -static inline unsigned long -bio_nr_pages(void *bio_ptr, unsigned int bio_size) -{ - return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >> - PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT)); -} - static unsigned int bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) { @@ -484,10 +479,14 @@ return (bio_size); } -#ifndef bio_set_op_attrs -#define bio_set_op_attrs(bio, rw, flags) \ - do { (bio)->bi_rw |= (rw)|(flags); } while (0) -#endif +static unsigned int +bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) +{ + if (abd_is_linear(abd)) + return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); + + return (abd_scatter_bio_map_off(bio, abd, size, off)); +} static inline void vdev_submit_bio_impl(struct bio *bio) @@ -516,11 +515,11 @@ } static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, - size_t kbuf_size, uint64_t kbuf_offset, int rw, int flags) +__vdev_disk_physio(struct block_device *bdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; - caddr_t bio_ptr; + uint64_t abd_offset; uint64_t bio_offset; int bio_size, bio_count = 16; int i = 0, error = 0; @@ -528,7 +527,8 @@ struct blk_plug plug; #endif - ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size); + ASSERT(zio != NULL); + ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size); retry: dr = vdev_disk_dio_alloc(bio_count); @@ -547,9 +547,10 @@ * their volume block size to match the maximum request size and * the common case will be one bio per vdev IO request. 
*/ - bio_ptr = kbuf_ptr; - bio_offset = kbuf_offset; - bio_size = kbuf_size; + + abd_offset = 0; + bio_offset = io_offset; + bio_size = io_size; for (i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ @@ -569,7 +570,8 @@ /* bio_alloc() with __GFP_WAIT never returns NULL */ dr->dr_bio[i] = bio_alloc(GFP_NOIO, - MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES)); + MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), + BIO_MAX_PAGES)); if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); return (ENOMEM); @@ -585,10 +587,11 @@ bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ - bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size); + bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, + bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ - bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]); + abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } @@ -651,7 +654,7 @@ bio->bi_end_io = vdev_disk_io_flush_completion; bio->bi_private = zio; bio->bi_bdev = bdev; - bio_set_op_attrs(bio, 0, VDEV_WRITE_FLUSH_FUA); + bio_set_flush(bio); vdev_submit_bio(bio); invalidate_bdev(bdev); @@ -730,7 +733,7 @@ } zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, + error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, flags); if (error) { zio->io_error = error; diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_file.c zfs-linux-0.7.0-rc3/module/zfs/vdev_file.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_file.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_file.c 2017-01-20 18:18:28.000000000 +0000 @@ -31,11 +31,14 @@ #include #include #include +#include /* * Virtual device vector for files. */ +static taskq_t *vdev_file_taskq; + static void vdev_file_hold(vdev_t *vd) { @@ -150,11 +153,21 @@ vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; ssize_t resid; + void *buf; + + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, - zio->io_size, zio->io_offset, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid); + UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size, + zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); @@ -201,8 +214,9 @@ * the sync must be dispatched to a different context. 
*/ if (spl_fstrans_check()) { - VERIFY3U(taskq_dispatch(system_taskq, - vdev_file_io_fsync, zio, TQ_SLEEP), !=, 0); + VERIFY3U(taskq_dispatch(vdev_file_taskq, + vdev_file_io_fsync, zio, TQ_SLEEP), !=, + TASKQID_INVALID); return; } @@ -219,8 +233,8 @@ zio->io_target_timestamp = zio_handle_io_delay(zio); - VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio, - TQ_SLEEP), !=, 0); + VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, TASKQID_INVALID); } /* ARGSUSED */ @@ -242,6 +256,21 @@ B_TRUE /* leaf vdev */ }; +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), + minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); + + VERIFY(vdev_file_taskq); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + /* * From userland we access disks just like files. */ diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_label.c zfs-linux-0.7.0-rc3/module/zfs/vdev_label.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_label.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_label.c 2017-01-20 18:18:28.000000000 +0000 @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* @@ -145,6 +145,7 @@ #include #include #include +#include #include /* @@ -178,8 +179,8 @@ } static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) +vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -192,8 +193,8 @@ } static void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) +vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == @@ -587,6 +588,7 @@ spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; + abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; int error = 0; @@ -599,7 +601,8 @@ if (!vdev_readable(vd)) return (NULL); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp = abd_to_buf(vp_abd); retry: for (l = 0; l < VDEV_LABELS; l++) { @@ -607,7 +610,7 @@ zio = zio_root(spa, NULL, NULL, flags); - vdev_label_read(zio, vd, l, vp, + vdev_label_read(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -646,7 +649,7 @@ goto retry; } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); return (config); } @@ -782,8 +785,10 @@ spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; - char *pad2; + abd_t *vp_abd; + abd_t *pad2; uberblock_t *ub; + abd_t *ub_abd; zio_t *zio; char *buf; size_t buflen; @@ -867,8 +872,9 @@ /* * Initialize its label. */ - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); /* * Generate a label describing the pool and our top-level vdev. 
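The vdev_label.c hunks above and below all apply the same zio_buf-to-ABD conversion recipe: allocate a linear ABD, zero it, fill the structure through abd_to_buf(), pass the ABD (rather than a raw pointer) to the label I/O routine, and free the ABD afterwards. A condensed sketch of that recipe, using only the abd_* calls visible in this diff, with error handling and the real vdev_label_write() call elided:

#include <sys/vdev_impl.h>
#include <sys/abd.h>

static void
label_abd_sketch(void)
{
	abd_t *vp_abd;
	vdev_phys_t *vp;

	/* A linear ABD guarantees the abd_to_buf() below is legal. */
	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
	abd_zero(vp_abd, sizeof (vdev_phys_t));
	vp = abd_to_buf(vp_abd);

	/* ... fill *vp, e.g. nvlist_pack() into vp->vp_nvlist ... */
	(void) vp;

	/*
	 * The I/O path now takes the abd_t, so callers no longer care
	 * whether the buffer is linear or scattered:
	 * vdev_label_write(zio, vd, l, vp_abd, offset, size, ...);
	 */

	abd_free(vp_abd);
}

The uberblock ring and pad2 buffers in these hunks get the same treatment, differing only in whether the code needs a direct pointer into the buffer (abd_alloc_linear() plus abd_to_buf()) or can leave it opaque (abd_alloc_for_io()).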
@@ -928,7 +934,7 @@ error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); /* EFAULT means nvlist_pack ran out of room */ return (error == EFAULT ? ENAMETOOLONG : EINVAL); } @@ -936,14 +942,15 @@ /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); - bzero(ub, VDEV_UBERBLOCK_RING); - *ub = spa->spa_uberblock; + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_RING); + abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + ub = abd_to_buf(ub_abd); ub->ub_txg = 0; /* Initialize the 2nd padding area. */ - pad2 = zio_buf_alloc(VDEV_PAD_SIZE); - bzero(pad2, VDEV_PAD_SIZE); + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); /* * Write everything in parallel. @@ -953,7 +960,7 @@ for (l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -966,7 +973,7 @@ offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - vdev_label_write(zio, vd, l, ub, + vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } @@ -979,9 +986,9 @@ } nvlist_free(label); - zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_RING); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(pad2); + abd_free(ub_abd); + abd_free(vp_abd); /* * If this vdev hasn't been previously identified as a spare, then we @@ -1039,7 +1046,7 @@ vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; - uberblock_t *ub = zio->io_data; + uberblock_t *ub = abd_to_buf(zio->io_abd); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); @@ -1060,7 +1067,7 @@ mutex_exit(&rio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } static void @@ -1076,8 +1083,8 @@ for (l = 0; l < VDEV_LABELS; l++) { for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), - VDEV_UBERBLOCK_OFFSET(vd, n), + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), + B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); } @@ -1144,7 +1151,7 @@ static void vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { - uberblock_t *ubbuf; + abd_t *ub_abd; int c, l, n; for (c = 0; c < vd->vdev_children; c++) @@ -1158,17 +1165,18 @@ n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); - *ubbuf = *ub; + /* Copy the uberblock_t into the ABD */ + ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ubbuf, + vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, zio->io_private, flags | ZIO_FLAG_DONT_PROPAGATE); - zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + abd_free(ub_abd); } /* Sync the uberblocks to all vdevs in svd[] */ @@ -1245,6 +1253,7 @@ { nvlist_t *label; vdev_phys_t *vp; + abd_t *vp_abd; char *buf; size_t buflen; int c; @@ -1263,15 +1272,16 @@ */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - vp 
= zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, zio->io_private, @@ -1279,7 +1289,7 @@ } } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); nvlist_free(label); } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_mirror.c zfs-linux-0.7.0-rc3/module/zfs/vdev_mirror.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_mirror.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_mirror.c 2017-01-20 18:18:28.000000000 +0000 @@ -31,6 +31,7 @@ #include #include #include +#include #include /* @@ -152,7 +153,7 @@ /* * Apply half the seek increment to I/O's within seek offset - * of the last I/O queued to this vdev as they should incure less + * of the last I/O queued to this vdev as they should incur less * of a seek increment. */ if (ABS(lastoffset - zio_offset) < @@ -272,13 +273,13 @@ while ((pio = zio_walk_parents(zio, &zl)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); + abd_copy(pio->io_abd, zio->io_abd, pio->io_size); mutex_exit(&pio->io_lock); } mutex_exit(&zio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mc->mc_error = zio->io_error; mc->mc_tried = 1; @@ -433,7 +434,8 @@ mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio_buf_alloc(zio->io_size), zio->io_size, + abd_alloc_sametype(zio->io_abd, + zio->io_size), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } @@ -458,7 +460,7 @@ while (children--) { mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); c++; @@ -543,7 +545,7 @@ mc = &mm->mm_child[c]; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); return; @@ -584,7 +586,7 @@ zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio->io_data, zio->io_size, + zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_queue.c zfs-linux-0.7.0-rc3/module/zfs/vdev_queue.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_queue.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_queue.c 2017-01-20 18:18:28.000000000 +0000 @@ -37,6 +37,7 @@ #include #include #include +#include /* * ZFS I/O Scheduler @@ -370,11 +371,11 @@ avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); @@ -389,7 +390,7 @@ else compfn = vdev_queue_offset_compare; avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); + sizeof (zio_t), offsetof(struct zio, io_queue_node)); } vq->vq_lastoffset = 0; @@ -496,12 +497,12 @@ zio_t *pio; zio_link_t *zl = NULL; while ((pio = zio_walk_parents(aio, &zl)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); + abd_copy_off(pio->io_abd, aio->io_abd, + 0, pio->io_offset - aio->io_offset, pio->io_size); } } - zio_buf_free(aio->io_data, aio->io_size); + abd_free(aio->io_abd); } /* @@ -523,7 +524,7 @@ boolean_t stretch = B_FALSE; avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; - void *buf; + abd_t *abd; limit = MAX(MIN(zfs_vdev_aggregation_limit, spa_maxblocksize(vq->vq_vdev->vdev_spa)), 0); @@ -626,12 +627,12 @@ size = IO_SPAN(first, last); ASSERT3U(size, <=, limit); - buf = zio_buf_alloc_flags(size, KM_NOSLEEP); - if (buf == NULL) + abd = abd_alloc_for_io(size, B_TRUE); + if (abd == NULL) return (NULL); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - buf, size, first->io_type, zio->io_priority, + abd, size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; @@ -644,12 +645,11 @@ if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - bzero((char *)aio->io_data + (dio->io_offset - - aio->io_offset), dio->io_size); + abd_zero_off(aio->io_abd, + dio->io_offset - aio->io_offset, dio->io_size); } else if (dio->io_type == ZIO_TYPE_WRITE) { - bcopy(dio->io_data, (char *)aio->io_data + - (dio->io_offset - aio->io_offset), - dio->io_size); + abd_copy_off(aio->io_abd, dio->io_abd, + dio->io_offset - aio->io_offset, 0, dio->io_size); } zio_add_child(dio, aio); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz.c 2017-01-20 18:18:28.000000000 +0000 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -136,7 +137,7 @@ size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + abd_free(rm->rm_col[c].rc_abd); if (rm->rm_col[c].rc_gdata != NULL) 
zio_buf_free(rm->rm_col[c].rc_gdata, @@ -144,11 +145,13 @@ } size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + abd_put(rm->rm_col[c].rc_abd); size += rm->rm_col[c].rc_size; + } - if (rm->rm_datacopy != NULL) - zio_buf_free(rm->rm_datacopy, size); + if (rm->rm_abd_copy != NULL) + abd_free(rm->rm_abd_copy); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -185,7 +188,7 @@ size_t x; const char *good = NULL; - const char *bad = rm->rm_col[c].rc_data; + char *bad; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); @@ -199,8 +202,9 @@ * data never changes for a given logical ZIO) */ if (rm->rm_col[0].rc_gdata == NULL) { - char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; + int offset; /* * Set up the rm_col[]s to generate the parity for @@ -208,15 +212,20 @@ * replacing them with buffers to hold the result. */ for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_data; - rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + bad_parity[x] = rm->rm_col[x].rc_abd; + rm->rm_col[x].rc_gdata = zio_buf_alloc(rm->rm_col[x].rc_size); + rm->rm_col[x].rc_abd = + abd_get_from_buf(rm->rm_col[x].rc_gdata, + rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; for (; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_from_buf(buf, + rm->rm_col[x].rc_size); buf += rm->rm_col[x].rc_size; } @@ -226,13 +235,18 @@ vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) - rm->rm_col[x].rc_data = bad_parity[x]; + for (x = 0; x < rm->rm_firstdatacol; x++) { + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = bad_parity[x]; + } - buf = rm->rm_datacopy; + offset = 0; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; - buf += rm->rm_col[x].rc_size; + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_offset_size( + rm->rm_abd_copy, offset, + rm->rm_col[x].rc_size); + offset += rm->rm_col[x].rc_size; } } @@ -246,8 +260,10 @@ good += rm->rm_col[x].rc_size; } + bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); } /* @@ -260,7 +276,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - caddr_t buf; + size_t offset; raidz_map_t *rm = zio->io_vsd; size_t size; @@ -274,7 +290,7 @@ rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); - if (rm->rm_datacopy != NULL) + if (rm->rm_abd_copy != NULL) return; /* @@ -290,17 +306,21 @@ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; - buf = rm->rm_datacopy = zio_buf_alloc(size); + rm->rm_abd_copy = + abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; + abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, + col->rc_size); - bcopy(col->rc_data, buf, col->rc_size); - col->rc_data = buf; + abd_copy(tmp, col->rc_abd, col->rc_size); + abd_put(col->rc_abd); + col->rc_abd = tmp; - buf += col->rc_size; + offset += 
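/*
 * A minimal sketch of the owned-versus-borrowed distinction the free path
 * above depends on, assuming the semantics visible in this patch:
 * abd_alloc*() buffers own their memory and are released with abd_free(),
 * while abd_get_from_buf()/abd_get_offset_size() create views over memory
 * owned elsewhere and are released with abd_put(), which leaves the
 * underlying buffer intact. Names below are illustrative only.
 */
static void
view_vs_owner_sketch(char *raw, size_t size)
{
	abd_t *owner = abd_alloc_linear(size, B_FALSE);	/* owns its memory */
	abd_t *view = abd_get_from_buf(raw, size);	/* borrows 'raw' */

	abd_copy(owner, view, size);

	abd_put(view);		/* drops the view; 'raw' is untouched */
	abd_free(owner);	/* frees the allocation itself */
}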
col->rc_size; } - ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); + ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -329,6 +349,7 @@ /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t off = 0; /* * "Quotient": The number of data sectors for this stripe on all but @@ -373,7 +394,7 @@ rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; - rm->rm_datacopy = NULL; + rm->rm_abd_copy = NULL; rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; @@ -389,7 +410,7 @@ } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_abd = NULL; rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; @@ -412,13 +433,18 @@ ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); - rm->rm_col[c].rc_data = zio->io_data; - - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, + rm->rm_col[c].rc_size); + off = rm->rm_col[c].rc_size; + + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, + rm->rm_col[c].rc_size); + off += rm->rm_col[c].rc_size; + } /* * If all data stored spans all columns, there's a danger that parity @@ -464,29 +490,84 @@ return (rm); } +struct pqr_struct { + uint64_t *p; + uint64_t *q; + uint64_t *r; +}; + +static int +vdev_raidz_p_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && !pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++) + *pqr->p ^= *src; + + return (0); +} + +static int +vdev_raidz_pq_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + } + + return (0); +} + +static int +vdev_raidz_pqr_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + VDEV_RAIDZ_64MUL_4(*pqr->r, mask); + *pqr->r ^= *src; + } + + return (0); +} + static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { - uint64_t *p, *src, pcount, ccount, i; + uint64_t *p; int c; - - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + abd_t *src; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p = *src; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); } else { - 
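/*
 * A minimal sketch of how the map-allocation code above carves the single
 * zio->io_abd into per-column views with abd_get_offset_size(). No data
 * moves: each column is a zero-copy window at a running offset, which is
 * why the old pointer-bumping loop ((char *)prev->rc_data + prev->rc_size)
 * can go away. Function name is illustrative only.
 */
static void
carve_columns_sketch(raidz_map_t *rm, abd_t *io_abd)
{
	uint64_t c, off = 0;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset_size(io_abd, off,
		    rm->rm_col[c].rc_size);
		off += rm->rm_col[c].rc_size;
	}
}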
ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p ^= *src; - } + struct pqr_struct pqr = { p, NULL, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_p_func, &pqr); } } } @@ -494,50 +575,43 @@ static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p = *src; - *q = *src; - } - for (; i < pcnt; i++, src++, p++, q++) { - *p = 0; - *q = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p ^= *src; + struct pqr_struct pqr = { p, q, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + } - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; } + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
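/*
 * A self-contained sketch of the callback pattern introduced above:
 * abd_iterate_func() hands the callback one contiguous chunk at a time,
 * so the parity code keeps its cursor (pqr->p) in the private struct and
 * advances it across calls. Compilable stand-alone; the ABD plumbing is
 * replaced by a plain buffer for clarity.
 */
#include <stdint.h>
#include <stddef.h>

struct p_state {
	uint64_t *p;		/* parity cursor, persists across chunks */
};

static int
parity_p_chunk(void *buf, size_t size, void *private)
{
	struct p_state *st = private;
	const uint64_t *src = buf;
	size_t i, cnt = size / sizeof (src[0]);

	for (i = 0; i < cnt; i++)
		*st->p++ ^= *src++;	/* P ^= D, word by word */
	return (0);			/* nonzero would abort iteration */
}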
*/ - for (; i < pcnt; i++, q++) { - VDEV_RAIDZ_64MUL_2(*q, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); } } } @@ -546,59 +620,48 @@ static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { - uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, *r, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_R].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p = *src; - *q = *src; - *r = *src; - } - for (; i < pcnt; i++, src++, p++, q++, r++) { - *p = 0; - *q = 0; - *r = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); + (void) memcpy(r, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p ^= *src; - - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + struct pqr_struct pqr = { p, q, r }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + } - VDEV_RAIDZ_64MUL_4(*r, mask); - *r ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + r[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
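/*
 * A stand-alone sketch of the multiply-by-2 step applied to the short-
 * column tails above. VDEV_RAIDZ_64MUL_2 doubles eight GF(2^8) bytes
 * packed in one 64-bit word: shift every byte left by one, then XOR the
 * RAID-Z reduction polynomial 0x1d into exactly those bytes whose top bit
 * was set. Reconstructed here from the macro's definition as an
 * illustration, not the authoritative copy.
 */
#include <stdint.h>

static uint64_t
gf256_mul2_x8(uint64_t x)
{
	uint64_t mask = x & 0x8080808080808080ULL; /* top bit of each byte */

	mask = (mask << 1) - (mask >> 7);	/* 0x80 -> 0xff per byte */
	return (((x << 1) & 0xfefefefefefefefeULL) ^
	    (mask & 0x1d1d1d1d1d1d1d1dULL));
}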
*/ - for (; i < pcnt; i++, q++, r++) { - VDEV_RAIDZ_64MUL_2(*q, mask); - VDEV_RAIDZ_64MUL_4(*r, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); } } } @@ -630,40 +693,159 @@ } } +/* ARGSUSED */ +static int +vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + int cnt = size / sizeof (src[0]); + int i; + + for (i = 0; i < cnt; i++) { + dst[i] ^= src[i]; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, + void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + int i; + + for (i = 0; i < cnt; i++, dst++, src++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + *dst ^= *src; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) +{ + uint64_t *dst = buf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + int i; + + for (i = 0; i < cnt; i++, dst++) { + /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ + VDEV_RAIDZ_64MUL_2(*dst, mask); + } + + return (0); +} + +struct reconst_q_struct { + uint64_t *q; + int exp; +}; + +static int +vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) +{ + struct reconst_q_struct *rq = private; + uint64_t *dst = buf; + int cnt = size / sizeof (dst[0]); + int i; + + for (i = 0; i < cnt; i++, dst++, rq->q++) { + int j; + uint8_t *b; + + *dst ^= *rq->q; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, rq->exp); + } + } + + return (0); +} + +struct reconst_pq_struct { + uint8_t *p; + uint8_t *q; + uint8_t *pxy; + uint8_t *qxy; + int aexp; + int bexp; +}; + +static int +vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + uint8_t *yd = ybuf; + int i; + + for (i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + *yd = *rpq->p ^ *rpq->pxy ^ *xd; + } + + return (0); +} + +static int +vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + int i; + + for (i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { + /* same operation as vdev_raidz_reconst_pq_func() on xd */ + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + } + + return (0); +} + static int vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, i; int x = tgts[0]; int c; + abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(x >= rm->rm_firstdatacol); ASSERT(x < rm->rm_cols); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); - ASSERT(xcount > 0); - - src = rm->rm_col[VDEV_RAIDZ_P].rc_data; - dst = rm->rm_col[x].rc_data; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst = *src; - } + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); + ASSERT(rm->rm_col[x].rc_size > 0); + + src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + dst = rm->rm_col[x].rc_abd; + + abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = 
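/*
 * A stand-alone illustration of the single-erasure recovery performed by
 * vdev_raidz_reconstruct_p() above: since P = D0 ^ D1 ^ ... ^ Dn, the one
 * missing column equals P XORed with every surviving data column. Plain
 * word arrays stand in for the ABD iteration.
 */
#include <stdint.h>
#include <stddef.h>

static void
reconstruct_p_demo(uint64_t *missing, const uint64_t *p,
    const uint64_t *const *survivors, size_t ncols, size_t words)
{
	size_t c, i;

	for (i = 0; i < words; i++)
		missing[i] = p[i];		/* start from parity */
	for (c = 0; c < ncols; c++)
		for (i = 0; i < words; i++)
			missing[i] ^= survivors[c][i]; /* fold each column */
}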
rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; + uint64_t size = MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; if (c == x) continue; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - count = MIN(ccount, xcount); - - for (i = 0; i < count; i++, dst++, src++) { - *dst ^= *src; - } + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_p_func, NULL); } return (1 << VDEV_RAIDZ_P); @@ -672,57 +854,46 @@ static int vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, mask, i; - uint8_t *b; int x = tgts[0]; - int c, j, exp; + int c, exp; + abd_t *dst, *src; + struct reconst_q_struct rq; ASSERT(ntgts == 1); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; - - if (c == x) - ccount = 0; - else - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); - count = MIN(ccount, xcount); + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; if (c == rm->rm_firstdatacol) { - for (i = 0; i < count; i++, dst++, src++) { - *dst = *src; - } - for (; i < xcount; i++, dst++) { - *dst = 0; - } + abd_copy(dst, src, size); + if (rm->rm_col[x].rc_size > size) + abd_zero_off(dst, size, + rm->rm_col[x].rc_size - size); } else { - for (i = 0; i < count; i++, dst++, src++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - *dst ^= *src; - } - - for (; i < xcount; i++, dst++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - } + ASSERT3U(size, <=, rm->rm_col[x].rc_size); + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_q_pre_func, NULL); + (void) abd_iterate_func(dst, + size, rm->rm_col[x].rc_size - size, + vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - dst = rm->rm_col[x].rc_data; + src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; + dst = rm->rm_col[x].rc_abd; exp = 255 - (rm->rm_cols - 1 - x); + rq.q = abd_to_buf(src); + rq.exp = exp; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst ^= *src; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, exp); - } - } + (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } @@ -730,11 +901,13 @@ static int vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { - uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; - void *pdata, *qdata; - uint64_t xsize, ysize, i; + uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; + abd_t *pdata, *qdata; + uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; + abd_t *xd, *yd; + struct reconst_pq_struct rpq; ASSERT(ntgts == 2); ASSERT(x < y); @@ -750,15 +923,15 @@ * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. 
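/*
 * A stand-alone sketch of the Q-side finish step above. After the pre-
 * pass, the recovery buffer holds the surviving columns folded in with a
 * doubling per column; the post function XORs in Q and multiplies every
 * byte by 2^exp with exp = 255 - (rm_cols - 1 - x). The real code uses
 * log/exp tables via vdev_raidz_exp2(); repeated doubling below is the
 * table-free equivalent, since the multiplicative group of GF(2^8) has
 * order 255 (so 2^255 = 1 and 2^(255-k) is the inverse of 2^k).
 */
#include <stdint.h>

static uint8_t
gf256_mul2(uint8_t b)
{
	return ((uint8_t)(b << 1) ^ ((b & 0x80) ? 0x1d : 0));
}

static uint8_t
gf256_exp2(uint8_t b, int e)	/* b * 2^e; also maps 0 to 0 */
{
	while (e-- > 0)
		b = gf256_mul2(b);
	return (b);
}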
*/ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; xsize = rm->rm_col[x].rc_size; ysize = rm->rm_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); - rm->rm_col[VDEV_RAIDZ_Q].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); + rm->rm_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); rm->rm_col[x].rc_size = 0; rm->rm_col[y].rc_size = 0; @@ -767,12 +940,12 @@ rm->rm_col[x].rc_size = xsize; rm->rm_col[y].rc_size = ysize; - p = pdata; - q = qdata; - pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - xd = rm->rm_col[x].rc_data; - yd = rm->rm_col[y].rc_data; + p = abd_to_buf(pdata); + q = abd_to_buf(qdata); + pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + xd = rm->rm_col[x].rc_abd; + yd = rm->rm_col[y].rc_abd; /* * We now have: @@ -796,24 +969,27 @@ aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ - vdev_raidz_exp2(*q ^ *qxy, bexp); + ASSERT3U(xsize, >=, ysize); + rpq.p = p; + rpq.q = q; + rpq.pxy = pxy; + rpq.qxy = qxy; + rpq.aexp = aexp; + rpq.bexp = bexp; + + (void) abd_iterate_func2(xd, yd, 0, 0, ysize, + vdev_raidz_reconst_pq_func, &rpq); + (void) abd_iterate_func(xd, ysize, xsize - ysize, + vdev_raidz_reconst_pq_tail_func, &rpq); - if (i < ysize) - *yd = *p ^ *pxy ^ *xd; - } - - zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, - rm->rm_col[VDEV_RAIDZ_P].rc_size); - zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; + rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1131,7 +1307,7 @@ c = used[i]; ASSERT3U(c, <, rm->rm_cols); - src = rm->rm_col[c].rc_data; + src = abd_to_buf(rm->rm_col[c].rc_abd); ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; @@ -1139,7 +1315,7 @@ ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); - dst[j] = rm->rm_col[cc].rc_data; + dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); dcount[j] = rm->rm_col[cc].rc_size; } @@ -1187,8 +1363,25 @@ uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; + abd_t **bufs = NULL; + int code = 0; + /* + * Matrix reconstruction can't use scatter ABDs yet, so we allocate + * temporary linear ABDs. 
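/*
 * A worked sketch of the two-erasure algebra wired up above, in the same
 * notation the surrounding comments use. With data columns x and y
 * missing, parity is regenerated over the survivors only (pxy, qxy, via
 * the zero-length trick above), and with g = 2 the generator of GF(2^8):
 *
 *	p ^ pxy = x ^ y
 *	q ^ qxy = g^cx * x ^ g^cy * y	(cx, cy: column distances)
 *
 * Solving the two equations for the two unknowns gives
 *
 *	x = a * (p ^ pxy) ^ b * (q ^ qxy)
 *	y = (p ^ pxy) ^ x
 *
 * where a and b depend only on cx and cy. That is why the code hoists
 * them into the byte exponents aexp/bexp and can stream the per-byte loop
 * through abd_iterate_func2(); the xsize > ysize tail evaluates the x
 * expression alone, exactly as vdev_raidz_reconst_pq_tail_func() does.
 */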
+ */ + if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { + bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bufs[c] = col->rc_abd; + col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], col->rc_size); + } + } n = rm->rm_cols - rm->rm_firstdatacol; @@ -1275,6 +1468,20 @@ kmem_free(p, psize); + /* + * copy back from temporary linear abds and free them + */ + if (bufs) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + col->rc_abd = bufs[c]; + } + kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + } + return (code); } @@ -1321,7 +1528,6 @@ dt = &tgts[nbadparity]; - /* Reconstruct using the new math implementation */ ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) @@ -1479,7 +1685,7 @@ rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } @@ -1536,7 +1742,7 @@ if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } @@ -1552,6 +1758,7 @@ static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { + void *buf; vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -1565,9 +1772,11 @@ zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; + buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + rc->rc_offset, rc->rc_size, buf, bad_data, &zbc); + abd_return_buf(rc->rc_abd, buf, rc->rc_size); } } @@ -1616,7 +1825,7 @@ if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig[c], rc->rc_size); + abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); } vdev_raidz_generate_parity(rm); @@ -1625,7 +1834,7 @@ rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1728,7 +1937,8 @@ ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; - bcopy(rc->rc_data, orig[i], rc->rc_size); + abd_copy_to_buf(orig[i], rc->rc_abd, + rc->rc_size); } /* @@ -1758,7 +1968,8 @@ for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; - bcopy(orig[i], rc->rc_data, rc->rc_size); + abd_copy_from_buf(rc->rc_abd, orig[i], + rc->rc_size); } do { @@ -1997,7 +2208,7 @@ continue; zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); @@ -2077,7 +2288,7 @@ continue; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, 
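/*
 * A minimal sketch of the borrow/return idiom used above wherever a flat
 * pointer is still required (the ereport payload, the general matrix
 * solver), assuming the calls as used in this patch. abd_borrow_buf_copy()
 * returns a contiguous copy for scatter ABDs (for linear ABDs it is the
 * buffer itself), and abd_return_buf() releases it without copying changes
 * back -- the right variant for read-only users. Name is illustrative.
 */
static void
inspect_column_sketch(raidz_col_t *rc)
{
	void *buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);

	/* ... read-only use, e.g. attach to a checksum ereport ... */

	abd_return_buf(rc->rc_abd, buf, rc->rc_size);
}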
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_aarch64_neon.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_aarch64_neon.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_aarch64_neon.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_aarch64_neon.c 2017-01-20 18:18:28.000000000 +0000 @@ -23,11 +23,38 @@ */ #include +#include #if defined(__aarch64__) #include "vdev_raidz_math_aarch64_neon_common.h" +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define MUL_D 0, 1, 2, 3 + #define GEN_P_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_33_36() @@ -38,15 +65,12 @@ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_10_11() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQ_STRIDE 4 #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 #define GEN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ @@ -54,69 +78,115 @@ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ - GEN_X_DEFINE_31() \ - GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() \ - GEN_X_DEFINE_0_3() \ +#define SYN_Q_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() \ +#define SYN_R_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() \ +#define SYN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define SYN_PQ_STRIDE 4 +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 #define REC_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ - GEN_X_DEFINE_16() \ - GEN_X_DEFINE_17() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() #define REC_PQ_STRIDE 2 #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() REC_PQ_DEFINE() +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + 
GEN_X_DEFINE_33_36() #define REC_PR_STRIDE 2 #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() REC_PQ_DEFINE() +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() #define REC_QR_STRIDE 2 #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 -#define REC_PQR_DEFINE() \ +#define SYN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() @@ -124,7 +194,6 @@ #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 -#define REC_PQR_D 6, 7 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_aarch64_neon_common.h zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_aarch64_neon_common.h --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_aarch64_neon_common.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_aarch64_neon_common.h 2017-01-20 18:18:28.000000000 +0000 @@ -125,7 +125,7 @@ #define ASM_BUG() ASSERT(0) -#define OFFSET(ptr, val) (((unsigned char *)ptr)+val) +#define OFFSET(ptr, val) (((unsigned char *)(ptr))+val) extern const uint8_t gf_clmul_mod_lt[4*256][16]; @@ -135,20 +135,6 @@ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) \ -{ \ - __asm( \ - "prfm pstl1strm, %[MEM]\n" \ - : : [MEM] "Q" (*(ptr + offset))); \ -} - -#define PREFETCH(ptr, offset) \ -{ \ - __asm( \ - "prfm pldl1keep, %[MEM]\n" \ - : : [MEM] "Q" (*(ptr + offset))); \ -} - #define XOR_ACC(src, r...) \ { \ switch (REG_CNT(r)) { \ @@ -242,6 +228,19 @@ #define ZERO(r...) 
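/*
 * A stand-alone illustration of why the OFFSET() change above wraps ptr
 * in parentheses: without them the cast binds to the first token only, so
 * passing a pointer expression computes the wrong address. Compilable as
 * is; both asserts pass.
 */
#include <assert.h>

#define OFFSET_OLD(ptr, val)	(((unsigned char *)ptr)+val)
#define OFFSET_NEW(ptr, val)	(((unsigned char *)(ptr))+val)

int
main(void)
{
	int a[4];

	/* old: ((unsigned char *)a) + 1 + 2 -- cast eats only 'a' */
	assert(OFFSET_OLD(a + 1, 2) == (unsigned char *)a + 3);
	/* new: ((unsigned char *)(a + 1)) + 2 -- whole expression cast */
	assert(OFFSET_NEW(a + 1, 2) == (unsigned char *)(a + 1) + 2);
	return (0);
}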
\ { \ switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ + "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \ + "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \ + "eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n" \ + "eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n" \ + "eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n" \ + "eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ + WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \ + break; \ case 4: \ __asm( \ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_aarch64_neonx2.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_aarch64_neonx2.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_aarch64_neonx2.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_aarch64_neonx2.c 2017-01-20 18:18:28.000000000 +0000 @@ -28,111 +28,179 @@ #include "vdev_raidz_math_aarch64_neon_common.h" -#define GEN_P_DEFINE() \ +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 8 +#define ZERO_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() -#define GEN_P_STRIDE 8 -#define GEN_P_P 0, 1, 2, 3, 4, 5, 6, 7 +#define ZERO_D 0, 1, 2, 3, 4, 5, 6, 7 -#define GEN_PQ_DEFINE() \ +#define COPY_STRIDE 8 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() +#define COPY_D 0, 1, 2, 3, 4, 5, 6, 7 + +#define ADD_STRIDE 8 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() +#define ADD_D 0, 1, 2, 3, 4, 5, 6, 7 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define GEN_P_STRIDE 4 +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_10_11() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQ_STRIDE 4 #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 #define GEN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_22_23() \ - GEN_X_DEFINE_24_27() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQR_STRIDE 4 #define GEN_PQR_D 0, 1, 2, 3 -#define GEN_PQR_P 4, 5, 6, 7 -#define GEN_PQR_Q 8, 9, 22, 23 -#define GEN_PQR_R 24, 25, 26, 27 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() \ +#define SYN_Q_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() \ +#define SYN_R_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() \ +#define SYN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define 
SYN_PQ_STRIDE 4 +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 #define REC_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_16() \ - GEN_X_DEFINE_17() \ GEN_X_DEFINE_22_23() \ GEN_X_DEFINE_33_36() #define REC_PQ_STRIDE 4 #define REC_PQ_X 0, 1, 2, 3 #define REC_PQ_Y 4, 5, 6, 7 -#define REC_PQ_D 8, 9, 22, 23 +#define REC_PQ_T 8, 9, 22, 23 -#define REC_PR_DEFINE() REC_PQ_DEFINE() +#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_33_36() #define REC_PR_STRIDE 4 #define REC_PR_X 0, 1, 2, 3 #define REC_PR_Y 4, 5, 6, 7 -#define REC_PR_D 8, 9, 22, 23 +#define REC_PR_T 8, 9, 22, 23 -#define REC_QR_DEFINE() REC_PQ_DEFINE() +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_33_36() #define REC_QR_STRIDE 4 #define REC_QR_X 0, 1, 2, 3 #define REC_QR_Y 4, 5, 6, 7 -#define REC_QR_D 8, 9, 22, 23 +#define REC_QR_T 8, 9, 22, 23 -#define REC_PQR_DEFINE() \ +#define SYN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ - GEN_X_DEFINE_22_23() \ - GEN_X_DEFINE_24_27() \ - GEN_X_DEFINE_28_30() \ - GEN_X_DEFINE_31() \ GEN_X_DEFINE_33_36() -#define REC_PQR_STRIDE 4 -#define REC_PQR_X 0, 1, 2, 3 -#define REC_PQR_Y 4, 5, 6, 7 -#define REC_PQR_Z 8, 9, 22, 23 -#define REC_PQR_D 24, 25, 26, 27 -#define REC_PQR_XS 24, 25, 26, 27 -#define REC_PQR_YS 28, 29, 30, 31 +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 +#define REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQR_STRIDE 2 +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 #include #include "vdev_raidz_math_impl.h" diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_avx2.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_avx2.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_avx2.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_avx2.c 2017-01-20 18:18:28.000000000 +0000 @@ -21,7 +21,6 @@ /* * Copyright (C) 2016 Gvozden NeÅ¡ković. All rights reserved. */ - #include #if defined(__x86_64) && defined(HAVE_AVX2) @@ -66,19 +65,6 @@ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) \ -{ \ - __asm( \ - "prefetchnta " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} - -#define PREFETCH(ptr, offset) \ -{ \ - __asm( \ - "prefetcht0 " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} #define XOR_ACC(src, r...) \ { \ @@ -122,25 +108,7 @@ } \ } -#define ZERO(r...) 
\ -{ \ - switch (REG_CNT(r)) { \ - case 4: \ - __asm( \ - "vpxor %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \ - "vpxor %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \ - "vpxor %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \ - "vpxor %" VR3(r) ", %" VR3(r)", %" VR3(r)); \ - break; \ - case 2: \ - __asm( \ - "vpxor %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \ - "vpxor %" VR1(r) ", %" VR1(r)", %" VR1(r)); \ - break; \ - default: \ - ASM_BUG(); \ - } \ -} +#define ZERO(r...) XOR(r, r) #define COPY(r...) \ { \ @@ -335,59 +303,86 @@ kfpu_end(); \ } -#define GEN_P_DEFINE() {} + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + #define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 -#define GEN_PQ_DEFINE() {} #define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 - -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 - -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 - -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 -#define REC_PQ_DEFINE() {} #define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() {} #define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() {} #define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 -#define REC_PQR_DEFINE() {} #define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 -#define REC_PQR_D 6, 7 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_avx512bw.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_avx512bw.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_avx512bw.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_avx512bw.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,410 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
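/*
 * A stand-alone demonstration of the argument-counting trick behind
 * REG_CNT() above: the call site's arguments push the padding constants
 * 8..1 rightward, and selecting the ninth parameter yields how many
 * registers were passed -- which is what lets XOR()/COPY()/LOAD() switch
 * on their operand count. Compilable as is (GNU named-variadic syntax,
 * matching the surrounding code).
 */
#include <assert.h>

#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)

int
main(void)
{
	assert(REG_CNT(0, 1) == 2);
	assert(REG_CNT(0, 1, 2, 3) == 4);
	assert(REG_CNT(0, 1, 2, 3, 4, 5, 6, 7) == 8);
	return (0);
}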
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Romain Dolbeau. All rights reserved. + * Copyright (C) 2016 Gvozden NeÅ¡ković. All rights reserved. + */ + +#include + +#if defined(__x86_64) && defined(HAVE_AVX512BW) + +#include +#include + +#define __asm __asm__ __volatile__ + +#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N +#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) + +#define VR0_(REG, ...) "zmm"#REG +#define VR1_(_1, REG, ...) "zmm"#REG +#define VR2_(_1, _2, REG, ...) "zmm"#REG +#define VR3_(_1, _2, _3, REG, ...) "zmm"#REG +#define VR4_(_1, _2, _3, _4, REG, ...) "zmm"#REG +#define VR5_(_1, _2, _3, _4, _5, REG, ...) "zmm"#REG +#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "zmm"#REG +#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "zmm"#REG + +#define VR0(r...) VR0_(r) +#define VR1(r...) VR1_(r) +#define VR2(r...) VR2_(r, 1) +#define VR3(r...) VR3_(r, 1, 2) +#define VR4(r...) VR4_(r, 1, 2) +#define VR5(r...) VR5_(r, 1, 2, 3) +#define VR6(r...) VR6_(r, 1, 2, 3, 4) +#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ASM_BUG() ASSERT(0) + +extern const uint8_t gf_clmul_mod_lt[4*256][16]; + +#define ELEM_SIZE 64 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + +#define XOR_ACC(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + "vpxorq 0x80(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \ + "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ASM_BUG(); \ + } \ +} + +#define XOR(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vpxorq %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \ + "vpxorq %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \ + "vpxorq %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \ + "vpxorq %" VR3(r) ", %" VR7(r)", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \ + "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \ + break; \ + default: \ + ASM_BUG(); \ + } \ +} + +#define ZERO(r...) XOR(r, r) + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vmovdqa64 %" VR0(r) ", %" VR4(r) "\n" \ + "vmovdqa64 %" VR1(r) ", %" VR5(r) "\n" \ + "vmovdqa64 %" VR2(r) ", %" VR6(r) "\n" \ + "vmovdqa64 %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \ + "vmovdqa64 %" VR1(r) ", %" VR3(r)); \ + break; \ + default: \ + ASM_BUG(); \ + } \ +} + +#define LOAD(src, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \ + "vmovdqa64 0x80(%[SRC]), %%" VR2(r) "\n" \ + "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ASM_BUG(); \ + } \ +} + +#define STORE(dst, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \ + "vmovdqa64 %%" VR2(r) ", 0x80(%[DST])\n" \ + "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 2: \ + __asm( \ + "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + default: \ + ASM_BUG(); \ + } \ +} + +#define MUL2_SETUP() \ +{ \ + __asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \ + __asm("vpbroadcastq %xmm22, %zmm22"); \ + __asm("vpxord %zmm23, %zmm23 ,%zmm23"); \ +} + +#define _MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \ + "vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \ + "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \ + "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \ + "vpxord %zmm22, %" VR0(r)", %zmm12\n" \ + "vpxord %zmm22, %" VR1(r)", %zmm13\n" \ + "vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \ + "vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \ + break; \ + default: \ + ASM_BUG(); \ + } \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2(R_01(r)); \ + _MUL2(R_23(r)); \ + break; \ + case 2: \ + _MUL2(r); \ + break; \ + default: \ + ASM_BUG(); \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +#define _0f "zmm15" +#define _as "zmm14" +#define _bs "zmm13" +#define _ltmod "zmm12" +#define _ltmul "zmm11" +#define _ta "zmm10" +#define _tb "zmm15" + +static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F; + +#define _MULx2(c, r...) 
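/*
 * A stand-alone sketch of what the masked _MUL2() above computes per
 * byte: vpaddb doubles every byte, vpcmpb $1 against zero (signed
 * less-than) sets a lane mask exactly where the original byte had its top
 * bit set, and the masked vmovdqu8 merges in the 0x1d-reduced variant for
 * those lanes only. The scalar form of that per-byte operation:
 */
#include <stdint.h>

static uint8_t
mul2_byte(uint8_t x)
{
	uint8_t doubled = (uint8_t)(x << 1);	/* vpaddb x, x, x */

	/* (int8_t)x < 0 is the vpcmpb "signed < 0" lane test */
	return ((int8_t)x < 0 ? (uint8_t)(doubled ^ 0x1d) : doubled);
}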
\ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpbroadcastb (%[mask]), %%" _0f "\n" \ + /* upper bits */ \ + "vbroadcasti32x4 0x00(%[lt]), %%" _ltmod "\n" \ + "vbroadcasti32x4 0x10(%[lt]), %%" _ltmul "\n" \ + \ + "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n" \ + "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n" \ + "vpandq %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpandq %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \ + "vpandq %%" _0f ", %%" _as ", %%" _as "\n" \ + "vpandq %%" _0f ", %%" _bs ", %%" _bs "\n" \ + \ + "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n" \ + "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n" \ + "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n" \ + "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n" \ + /* lower bits */ \ + "vbroadcasti32x4 0x20(%[lt]), %%" _ltmod "\n" \ + "vbroadcasti32x4 0x30(%[lt]), %%" _ltmul "\n" \ + \ + "vpxorq %%" _ta ", %%" _as ", %%" _as "\n" \ + "vpxorq %%" _tb ", %%" _bs ", %%" _bs "\n" \ + \ + "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n" \ + "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n" \ + "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\ + "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\ + \ + "vpxorq %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpxorq %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpxorq %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \ + "vpxorq %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \ + : : [mask] "r" (&_mul_mask), \ + [lt] "r" (gf_clmul_mod_lt[4*(c)])); \ + break; \ + default: \ + ASM_BUG(); \ + } \ +} + +#define MUL(c, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MULx2(c, R_01(r)); \ + _MULx2(c, R_23(r)); \ + break; \ + case 2: \ + _MULx2(c, R_01(r)); \ + break; \ + default: \ + ASM_BUG(); \ + } \ +} + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() kfpu_end() + +/* + * ZERO, COPY, and MUL operations are already 2x unrolled, which means that + * the stride of these operations for avx512 must not exceed 4. Otherwise, a + * single step would exceed 512B block size. 
+ */ + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_DEFINE() {} +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + + +#include +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(avx512bw); +DEFINE_REC_METHODS(avx512bw); + +static boolean_t +raidz_will_avx512bw_work(void) +{ + return (zfs_avx_available() && + zfs_avx512f_available() && + zfs_avx512bw_available()); +} + +const raidz_impl_ops_t vdev_raidz_avx512bw_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(avx512bw), + .rec = RAIDZ_REC_METHODS(avx512bw), + .is_supported = &raidz_will_avx512bw_work, + .name = "avx512bw" +}; + +#endif /* defined(__x86_64) && defined(HAVE_AVX512BW) */ diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_avx512f.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_avx512f.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_avx512f.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_avx512f.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,487 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Romain Dolbeau. All rights reserved. 
+ * Copyright (C) 2016 Gvozden NeÅ¡ković. All rights reserved. + */ + +#include + +#if defined(__x86_64) && defined(HAVE_AVX512F) + +#include +#include + +#define __asm __asm__ __volatile__ + +#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N +#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) + +#define VR0_(REG, ...) "zmm"#REG +#define VR1_(_1, REG, ...) "zmm"#REG +#define VR2_(_1, _2, REG, ...) "zmm"#REG +#define VR3_(_1, _2, _3, REG, ...) "zmm"#REG +#define VR4_(_1, _2, _3, _4, REG, ...) "zmm"#REG +#define VR5_(_1, _2, _3, _4, _5, REG, ...) "zmm"#REG +#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "zmm"#REG +#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "zmm"#REG + +#define VR0(r...) VR0_(r) +#define VR1(r...) VR1_(r) +#define VR2(r...) VR2_(r, 1) +#define VR3(r...) VR3_(r, 1, 2) +#define VR4(r...) VR4_(r, 1, 2) +#define VR5(r...) VR5_(r, 1, 2, 3) +#define VR6(r...) VR6_(r, 1, 2, 3, 4) +#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5) + +#define VRy0_(REG, ...) "ymm"#REG +#define VRy1_(_1, REG, ...) "ymm"#REG +#define VRy2_(_1, _2, REG, ...) "ymm"#REG +#define VRy3_(_1, _2, _3, REG, ...) "ymm"#REG +#define VRy4_(_1, _2, _3, _4, REG, ...) "ymm"#REG +#define VRy5_(_1, _2, _3, _4, _5, REG, ...) "ymm"#REG +#define VRy6_(_1, _2, _3, _4, _5, _6, REG, ...) "ymm"#REG +#define VRy7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "ymm"#REG + +#define VRy0(r...) VRy0_(r) +#define VRy1(r...) VRy1_(r) +#define VRy2(r...) VRy2_(r, 1) +#define VRy3(r...) VRy3_(r, 1, 2) +#define VRy4(r...) VRy4_(r, 1, 2) +#define VRy5(r...) VRy5_(r, 1, 2, 3) +#define VRy6(r...) VRy6_(r, 1, 2, 3, 4) +#define VRy7(r...) VRy7_(r, 1, 2, 3, 4, 5) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ELEM_SIZE 64 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + + +#define XOR_ACC(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + "vpxorq 0x80(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \ + "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + } \ +} + +#define XOR(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vpxorq %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \ + "vpxorq %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \ + "vpxorq %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \ + "vpxorq %" VR3(r) ", %" VR7(r)", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \ + "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \ + break; \ + } \ +} + + +#define ZERO(r...) XOR(r, r) + + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vmovdqa64 %" VR0(r) ", %" VR4(r) "\n" \ + "vmovdqa64 %" VR1(r) ", %" VR5(r) "\n" \ + "vmovdqa64 %" VR2(r) ", %" VR6(r) "\n" \ + "vmovdqa64 %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \ + "vmovdqa64 %" VR1(r) ", %" VR3(r)); \ + break; \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \ + "vmovdqa64 0x80(%[SRC]), %%" VR2(r) "\n" \ + "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + } \ +} + +#define STORE(dst, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \ + "vmovdqa64 %%" VR2(r) ", 0x80(%[DST])\n" \ + "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + } \ +} + +#define MUL2_SETUP() \ +{ \ + __asm("vmovq %0, %%xmm31" :: "r"(0x1d1d1d1d1d1d1d1d)); \ + __asm("vpbroadcastq %xmm31, %zmm31"); \ + __asm("vmovq %0, %%xmm30" :: "r"(0x8080808080808080)); \ + __asm("vpbroadcastq %xmm30, %zmm30"); \ + __asm("vmovq %0, %%xmm29" :: "r"(0xfefefefefefefefe)); \ + __asm("vpbroadcastq %xmm29, %zmm29"); \ +} + +#define _MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpandq %" VR0(r)", %zmm30, %zmm26\n" \ + "vpandq %" VR1(r)", %zmm30, %zmm25\n" \ + "vpsrlq $7, %zmm26, %zmm28\n" \ + "vpsrlq $7, %zmm25, %zmm27\n" \ + "vpsllq $1, %zmm26, %zmm26\n" \ + "vpsllq $1, %zmm25, %zmm25\n" \ + "vpsubq %zmm28, %zmm26, %zmm26\n" \ + "vpsubq %zmm27, %zmm25, %zmm25\n" \ + "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \ + "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \ + "vpandq %zmm26, %zmm31, %zmm26\n" \ + "vpandq %zmm25, %zmm31, %zmm25\n" \ + "vpternlogd $0x6c,%zmm29, %zmm26, %" VR0(r) "\n" \ + "vpternlogd $0x6c,%zmm29, %zmm25, %" VR1(r)); \ + break; \ + } \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2(R_01(r)); \ + _MUL2(R_23(r)); \ + break; \ + case 2: \ + _MUL2(r); \ + break; \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + + +/* General multiplication by adding powers of two */ + +#define _mul_x2_in 21, 22 +#define _mul_x2_acc 23, 24 + +#define _MUL_PARAM(x, in, acc) \ +{ \ + if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \ + if (x & 0xfe) { MUL2(in); } \ + if (x & 0x02) { XOR(in, acc); } \ + if (x & 0xfc) { MUL2(in); } \ + if (x & 0x04) { XOR(in, acc); } \ + if (x & 0xf8) { MUL2(in); } \ + if (x & 0x08) { XOR(in, acc); } \ + if (x & 0xf0) { MUL2(in); } \ + if (x & 0x10) { XOR(in, acc); } \ + if (x & 0xe0) { MUL2(in); } \ + if (x & 0x20) { XOR(in, acc); } \ + if (x & 0xc0) { MUL2(in); } \ + if (x & 0x40) { XOR(in, acc); } \ + if (x & 0x80) { MUL2(in); XOR(in, acc); } \ +} + +#define MUL_x2_DEFINE(x) \ +static void \ +mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); } + + +MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3); +MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7); +MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11); +MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15); +MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19); +MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23); +MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27); +MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31); +MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35); +MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39); +MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43); +MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47); +MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51); +MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55); +MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59); +MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63); +MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); 
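/*
 * A stand-alone scalar analogue of _MUL_PARAM() above: multiplying by a
 * compile-time constant c in GF(2^8) decomposes into doublings (MUL2) and
 * conditional accumulations (XOR), one pair per bit of c, lowest bit
 * first. The macro unrolls this with the if()s folded away at compile
 * time to produce the 256 mul_x2_N() routines; the loop below is the same
 * double-and-add one byte at a time.
 */
#include <stdint.h>

static uint8_t
gf256_mulc(uint8_t in, uint8_t c)
{
	uint8_t acc = 0;
	int bit;

	for (bit = 0; bit < 8; bit++) {
		if (c & (1 << bit))
			acc ^= in;	/* XOR(in, acc) */
		/* MUL2(in): double with 0x1d reduction */
		in = (uint8_t)(in << 1) ^ ((in & 0x80) ? 0x1d : 0);
	}
	return (acc);
}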
MUL_x2_DEFINE(66); MUL_x2_DEFINE(67); +MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71); +MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75); +MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79); +MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83); +MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87); +MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91); +MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95); +MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99); +MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103); +MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107); +MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111); +MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115); +MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119); +MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123); +MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127); +MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131); +MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135); +MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139); +MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143); +MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147); +MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151); +MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155); +MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159); +MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163); +MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167); +MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171); +MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175); +MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179); +MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183); +MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187); +MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191); +MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195); +MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199); +MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203); +MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207); +MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211); +MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215); +MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219); +MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223); +MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227); +MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231); +MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235); +MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239); +MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); 
MUL_x2_DEFINE(243); +MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247); +MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251); +MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255); + + +typedef void (*mul_fn_ptr_t)(void); + +static const mul_fn_ptr_t __attribute__((aligned(256))) +gf_x2_mul_fns[256] = { + mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5, + mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11, + mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17, + mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23, + mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29, + mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35, + mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41, + mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47, + mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53, + mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59, + mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65, + mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71, + mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77, + mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83, + mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89, + mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95, + mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101, + mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107, + mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113, + mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119, + mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125, + mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131, + mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137, + mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143, + mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149, + mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155, + mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161, + mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167, + mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173, + mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179, + mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185, + mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191, + mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197, + mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203, + mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209, + mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215, + mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221, + mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227, + mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233, + mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239, + mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245, + mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251, + mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255 +}; + +#define MUL(c, r...) 
\
+{ \
+	switch (REG_CNT(r)) { \
+	case 4: \
+		COPY(R_01(r), _mul_x2_in); \
+		gf_x2_mul_fns[c](); \
+		COPY(_mul_x2_acc, R_01(r)); \
+		COPY(R_23(r), _mul_x2_in); \
+		gf_x2_mul_fns[c](); \
+		COPY(_mul_x2_acc, R_23(r)); \
+	} \
+}
+
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() MUL2_SETUP()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 4
+#define REC_PQ_DEFINE() MUL2_SETUP()
+#define REC_PQ_X 0, 1, 2, 3
+#define REC_PQ_Y 4, 5, 6, 7
+#define REC_PQ_T 8, 9, 10, 11
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 4
+#define REC_PR_DEFINE() MUL2_SETUP()
+#define REC_PR_X 0, 1, 2, 3
+#define REC_PR_Y 4, 5, 6, 7
+#define REC_PR_T 8, 9, 10, 11
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 4
+#define REC_QR_DEFINE() MUL2_SETUP()
+#define REC_QR_X 0, 1, 2, 3
+#define REC_QR_Y 4, 5, 6, 7
+#define REC_QR_T 8, 9, 10, 11
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 4
+#define REC_PQR_DEFINE() MUL2_SETUP()
+#define REC_PQR_X 0, 1, 2, 3
+#define REC_PQR_Y 4, 5, 6, 7
+#define REC_PQR_Z 8, 9, 10, 11
+#define REC_PQR_XS 12, 13, 14, 15
+#define REC_PQR_YS 16, 17, 18, 19
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(avx512f);
+DEFINE_REC_METHODS(avx512f);
+
+static boolean_t
+raidz_will_avx512f_work(void)
+{
+	return (zfs_avx_available() &&
+	    zfs_avx2_available() &&
+	    zfs_avx512f_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
+	.init = NULL,
+	.fini = NULL,
+	.gen = RAIDZ_GEN_METHODS(avx512f),
+	.rec = RAIDZ_REC_METHODS(avx512f),
+	.is_supported = &raidz_will_avx512f_work,
+	.name = "avx512f"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */
diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math.c
--- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math.c	2016-10-26 17:36:33.000000000 +0000
+++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math.c	2017-01-20 18:18:28.000000000 +0000
@@ -57,6 +57,12 @@
 #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
 	&vdev_raidz_avx2_impl,
 #endif
+#if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */
+	&vdev_raidz_avx512f_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
+	&vdev_raidz_avx512bw_impl,
+#endif
 #if defined(__aarch64__)
 	&vdev_raidz_aarch64_neon_impl,
 	&vdev_raidz_aarch64_neonx2_impl,
@@ -118,10 +124,10 @@
 		break;
 #endif
 	case IMPL_ORIGINAL:
-		ops = (raidz_impl_ops_t *) &vdev_raidz_original_impl;
+		ops
= (raidz_impl_ops_t *)&vdev_raidz_original_impl; break; case IMPL_SCALAR: - ops = (raidz_impl_ops_t *) &vdev_raidz_scalar_impl; + ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl; break; default: ASSERT3U(impl, <, raidz_supp_impl_cnt); @@ -156,7 +162,7 @@ default: gen_parity = NULL; cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", - raidz_parity(rm)); + raidz_parity(rm)); break; } @@ -190,7 +196,7 @@ return (rm->rm_ops->rec[RAIDZ_REC_Q]); } } else if (nbaddata == 2 && - parity_valid[CODE_P] && parity_valid[CODE_Q]) { + parity_valid[CODE_P] && parity_valid[CODE_Q]) { return (rm->rm_ops->rec[RAIDZ_REC_PQ]); } return ((raidz_rec_f) NULL); @@ -217,8 +223,8 @@ return (rm->rm_ops->rec[RAIDZ_REC_QR]); } } else if (nbaddata == 3 && - parity_valid[CODE_P] && parity_valid[CODE_Q] && - parity_valid[CODE_R]) { + parity_valid[CODE_P] && parity_valid[CODE_Q] && + parity_valid[CODE_R]) { return (rm->rm_ops->rec[RAIDZ_REC_PQR]); } return ((raidz_rec_f) NULL); @@ -234,17 +240,17 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, const int *dt, const int nbaddata) { - raidz_rec_f rec_data = NULL; + raidz_rec_f rec_fn = NULL; switch (raidz_parity(rm)) { case PARITY_P: - rec_data = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); + rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); break; case PARITY_PQ: - rec_data = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); + rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); break; case PARITY_PQR: - rec_data = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); + rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", @@ -252,10 +258,10 @@ break; } - if (rec_data == NULL) + if (rec_fn == NULL) return (RAIDZ_ORIGINAL_IMPL); else - return (rec_data(rm, dt)); + return (rec_fn(rm, dt)); } const char *raidz_gen_name[] = { @@ -294,8 +300,8 @@ static int raidz_math_kstat_data(char *buf, size_t size, void *data) { - raidz_impl_kstat_t * fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; - raidz_impl_kstat_t * cstat = (raidz_impl_kstat_t *) data; + raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; + raidz_impl_kstat_t *cstat = (raidz_impl_kstat_t *)data; ssize_t off = 0; int i; @@ -322,11 +328,11 @@ for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) off += snprintf(buf + off, size - off, "%-16llu", - (u_longlong_t) cstat->gen[i]); + (u_longlong_t)cstat->gen[i]); for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) off += snprintf(buf + off, size - off, "%-16llu", - (u_longlong_t) cstat->rec[i]); + (u_longlong_t)cstat->rec[i]); } (void) snprintf(buf + off, size - off, "\n"); @@ -386,7 +392,7 @@ uint64_t run_cnt, speed, best_speed = 0; hrtime_t t_start, t_diff; raidz_impl_ops_t *curr_impl; - raidz_impl_kstat_t * fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; + raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; int impl, i; for (impl = 0; impl < raidz_supp_impl_cnt; impl++) { @@ -440,14 +446,14 @@ /* move supported impl into raidz_supp_impl */ for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { - curr_impl = (raidz_impl_ops_t *) raidz_all_maths[i]; + curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; /* initialize impl */ if (curr_impl->init) curr_impl->init(); if (curr_impl->is_supported()) - raidz_supp_impl[c++] = (raidz_impl_ops_t *) curr_impl; + raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl; } membar_producer(); /* complete raidz_supp_impl[] init */ raidz_supp_impl_cnt = c; /* number of supported impl */ 
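[Editor's note: the sketch below is not part of the patch. The hunks on
either side of this point belong to vdev_raidz_math_init()'s benchmark: it
fakes a zio (now backed by an ABD rather than a raw buffer, per the next
hunk), times every supported RAID-Z implementation, and keeps the fastest
one per operation. A minimal C sketch of that selection pattern, under the
assumption that the hypothetical names raidz_bench_fn and pick_fastest are
stand-ins and not the module's actual API:

	#include <stdint.h>

	/* hypothetical: run one implementation, return its speed */
	typedef uint64_t (*raidz_bench_fn)(void);

	static int
	pick_fastest(raidz_bench_fn *bench, int nimpl)
	{
		int i, best = 0;
		uint64_t speed, best_speed = 0;

		for (i = 0; i < nimpl; i++) {
			speed = bench[i]();	/* benchmark one impl */
			if (speed > best_speed) {
				best_speed = speed;
				best = i;	/* remember the fastest */
			}
		}
		return (best);
	}
]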
@@ -465,13 +471,12 @@ return; #endif - /* Fake an zio and run the benchmark on it */ + /* Fake an zio and run the benchmark on a warmed up buffer */ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); bench_zio->io_offset = 0; bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ - bench_zio->io_data = zio_data_buf_alloc(BENCH_ZIO_SIZE); - VERIFY(bench_zio->io_data); - memset(bench_zio->io_data, 0xAA, BENCH_ZIO_SIZE); /* warm up */ + bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); + memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); /* Benchmark parity generation methods */ for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { @@ -495,12 +500,12 @@ vdev_raidz_map_free(bench_rm); /* cleanup the bench zio */ - zio_data_buf_free(bench_zio->io_data, BENCH_ZIO_SIZE); + abd_free(bench_zio->io_abd); kmem_free(bench_zio, sizeof (zio_t)); /* install kstats for all impl */ raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (raidz_math_kstat != NULL) { raidz_math_kstat->ks_data = NULL; @@ -537,7 +542,7 @@ } static const struct { - char *name; + char *name; uint32_t sel; } math_impl_opts[] = { #if !defined(_KERNEL) diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_impl.h zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_impl.h --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_impl.h 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_impl.h 2017-01-20 18:18:28.000000000 +0000 @@ -32,250 +32,14 @@ #define noinline __attribute__((noinline)) #endif -/* Calculate data offset in raidz column, offset is in bytes */ -#define COL_OFF(col, off) ((v_t *)(((char *)(col)->rc_data) + (off))) - -/* - * PARITY CALCULATION - * An optimized function is called for a full length of data columns - * If RAIDZ map contains remainder columns (shorter columns) the same function - * is called for reminder of full columns. - * - * GEN_[P|PQ|PQR]_BLOCK() functions are designed to be efficiently in-lined by - * the compiler. This removes a lot of conditionals from the inside loop which - * makes the code faster, especially for vectorized code. - * They are also highly parametrized, allowing for each implementation to define - * most optimal stride, and register allocation. 
- */ - -static raidz_inline void -GEN_P_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int ncols) -{ - int c; - size_t ioff; - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t *col; - - GEN_P_DEFINE(); - - for (ioff = off; ioff < end; ioff += (GEN_P_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(&(rm->rm_col[1]), ioff), GEN_P_P); - - for (c = 2; c < ncols; c++) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), GEN_P_P); - } - - STORE(COL_OFF(pcol, ioff), GEN_P_P); - } -} - -/* - * Generate P parity (RAIDZ1) - * - * @rm RAIDZ map - */ -static raidz_inline void -raidz_generate_p_impl(raidz_map_t * const rm) -{ - const int ncols = raidz_ncols(rm); - const size_t psize = raidz_big_size(rm); - const size_t short_size = raidz_short_size(rm); - - raidz_math_begin(); - - /* short_size */ - GEN_P_BLOCK(rm, 0, short_size, ncols); - - /* fullcols */ - GEN_P_BLOCK(rm, short_size, psize, raidz_nbigcols(rm)); - - raidz_math_end(); -} - -static raidz_inline void -GEN_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int ncols, const int nbigcols) -{ - int c; - size_t ioff; - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t *col; - - GEN_PQ_DEFINE(); - - MUL2_SETUP(); - - for (ioff = off; ioff < end; ioff += (GEN_PQ_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(&rm->rm_col[2], ioff), GEN_PQ_P); - COPY(GEN_PQ_P, GEN_PQ_Q); - - for (c = 3; c < nbigcols; c++) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), GEN_PQ_D); - MUL2(GEN_PQ_Q); - XOR(GEN_PQ_D, GEN_PQ_P); - XOR(GEN_PQ_D, GEN_PQ_Q); - } - - STORE(COL_OFF(pcol, ioff), GEN_PQ_P); - - for (; c < ncols; c++) - MUL2(GEN_PQ_Q); - - STORE(COL_OFF(qcol, ioff), GEN_PQ_Q); - } -} - -/* - * Generate PQ parity (RAIDZ2) - * - * @rm RAIDZ map - */ -static raidz_inline void -raidz_generate_pq_impl(raidz_map_t * const rm) -{ - const int ncols = raidz_ncols(rm); - const size_t psize = raidz_big_size(rm); - const size_t short_size = raidz_short_size(rm); - - raidz_math_begin(); - - /* short_size */ - GEN_PQ_BLOCK(rm, 0, short_size, ncols, ncols); - - /* fullcols */ - GEN_PQ_BLOCK(rm, short_size, psize, ncols, raidz_nbigcols(rm)); - - raidz_math_end(); -} - - -static raidz_inline void -GEN_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int ncols, const int nbigcols) -{ - int c; - size_t ioff; - raidz_col_t *col; - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - - GEN_PQR_DEFINE(); - - MUL2_SETUP(); - - for (ioff = off; ioff < end; ioff += (GEN_PQR_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(&rm->rm_col[3], ioff), GEN_PQR_P); - COPY(GEN_PQR_P, GEN_PQR_Q); - COPY(GEN_PQR_P, GEN_PQR_R); - - for (c = 4; c < nbigcols; c++) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), GEN_PQR_D); - MUL2(GEN_PQR_Q); - MUL4(GEN_PQR_R); - XOR(GEN_PQR_D, GEN_PQR_P); - XOR(GEN_PQR_D, GEN_PQR_Q); - XOR(GEN_PQR_D, GEN_PQR_R); - } - - STORE(COL_OFF(pcol, ioff), GEN_PQR_P); - - for (; c < ncols; c++) { - MUL2(GEN_PQR_Q); - MUL4(GEN_PQR_R); - } - - STORE(COL_OFF(qcol, ioff), GEN_PQR_Q); - STORE(COL_OFF(rcol, ioff), GEN_PQR_R); - } -} - - -/* - * Generate PQR parity (RAIDZ3) - * - * @rm RAIDZ map - */ -static raidz_inline void -raidz_generate_pqr_impl(raidz_map_t * const rm) -{ - const int ncols = raidz_ncols(rm); - const size_t psize = raidz_big_size(rm); - const size_t short_size = raidz_short_size(rm); - - 
raidz_math_begin(); - - /* short_size */ - GEN_PQR_BLOCK(rm, 0, short_size, ncols, ncols); - - /* fullcols */ - GEN_PQR_BLOCK(rm, short_size, psize, ncols, raidz_nbigcols(rm)); - - raidz_math_end(); -} - -/* - * DATA RECONSTRUCTION - * - * Data reconstruction process consists of two phases: - * - Syndrome calculation - * - Data reconstruction - * - * Syndrome is calculated by generating parity using available data columns - * and zeros in places of erasure. Existing parity is added to corresponding - * syndrome value to obtain the [P|Q|R]syn values from equation: - * P = Psyn + Dx + Dy + Dz - * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz - * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz - * - * For data reconstruction phase, the corresponding equations are solved - * for missing data (Dx, Dy, Dz). This generally involves multiplying known - * symbols by an coefficient and adding them together. The multiplication - * constant coefficients are calculated ahead of the operation in - * raidz_rec_[q|r|pq|pq|qr|pqr]_coeff() functions. - * - * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big" - * and "short" columns. - * For this reason, reconstruction is performed in minimum of - * two steps. First, from offset 0 to short_size, then from short_size to - * short_size. Calculation functions REC_[*]_BLOCK() are implemented to work - * over both ranges. The split also enables removal of conditional expressions - * from loop bodies, improving throughput of SIMD implementations. - * For the best performance, all functions marked with raidz_inline attribute - * must be inlined by compiler. - * - * parity data - * columns columns - * <----------> <------------------> - * x y <----+ missing columns (x, y) - * | | - * +---+---+---+---+-v-+---+-v-+---+ ^ 0 - * | | | | | | | | | | - * | | | | | | | | | | - * | P | Q | R | D | D | D | D | D | | - * | | | | 0 | 1 | 2 | 3 | 4 | | - * | | | | | | | | | v - * | | | | | +---+---+---+ ^ short_size - * | | | | | | | - * +---+---+---+---+---+ v big_size - * <------------------> <----------> - * big columns short columns - * - */ - /* * Functions calculate multiplication constants for data reconstruction. * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and * used parity columns for reconstruction. * @rm RAIDZ map * @tgtidx array of missing data indexes - * @coeff output array of coefficients. Array must be user - * provided and must hold minimum MUL_CNT values + * @coeff output array of coefficients. Array must be provided by + * user and must hold minimum MUL_CNT values. */ static noinline void raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) @@ -383,240 +147,602 @@ coeff[MUL_PQR_YQ] = yd; } +/* + * Method for zeroing a buffer (can be implemented using SIMD). + * This method is used by multiple for gen/rec functions. 
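+ * Editor's note, not in the patch: abd_iterate_func() walks a possibly
+ * scattered ABD and hands each linear segment to this callback, so a
+ * plain-C model of one invocation is simply
+ *
+ *	(void) memset(dc, 0, dsize);
+ *
+ * while the SIMD body below stores 2 * ZERO_STRIDE zeroed vectors per
+ * loop iteration.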
+ * + * @dc Destination buffer + * @dsize Destination buffer size + * @private Unused + */ +static int +raidz_zero_abd_cb(void *dc, size_t dsize, void *private) +{ + v_t *dst = (v_t *)dc; + size_t i; + + ZERO_DEFINE(); + + (void) private; /* unused */ + + ZERO(ZERO_D); + + for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) { + STORE(dst + i, ZERO_D); + STORE(dst + i + ZERO_STRIDE, ZERO_D); + } + + return (0); +} + +#define raidz_zero(dabd, size) \ +{ \ + abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \ +} /* - * Reconstruction using P parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @ncols number of column + * Method for copying two buffers (can be implemented using SIMD). + * This method is used by multiple for gen/rec functions. + * + * @dc Destination buffer + * @sc Source buffer + * @dsize Destination buffer size + * @ssize Source buffer size + * @private Unused */ -static raidz_inline void -REC_P_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int ncols) +static int +raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private) { - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t *col; - - REC_P_DEFINE(); - - for (ioff = off; ioff < end; ioff += (REC_P_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(pcol, ioff), REC_P_X); - - for (c = firstdc; c < x; c++) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_P_X); - } + v_t *dst = (v_t *)dc; + const v_t *src = (v_t *)sc; + size_t i; - for (c++; c < ncols; c++) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_P_X); - } + COPY_DEFINE(); - STORE(COL_OFF(xcol, ioff), REC_P_X); + (void) private; /* unused */ + + for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) { + LOAD(src + i, COPY_D); + STORE(dst + i, COPY_D); + + LOAD(src + i + COPY_STRIDE, COPY_D); + STORE(dst + i + COPY_STRIDE, COPY_D); } + + return (0); +} + + +#define raidz_copy(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\ } /* - * Reconstruct single data column using P parity - * @rec_method REC_P_BLOCK() + * Method for adding (XORing) two buffers. + * Source and destination are XORed together and result is stored in + * destination buffer. This method is used by multiple for gen/rec functions. * - * @rm RAIDZ map - * @tgtidx array of missing data indexes + * @dc Destination buffer + * @sc Source buffer + * @dsize Destination buffer size + * @ssize Source buffer size + * @private Unused + */ +static int +raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private) +{ + v_t *dst = (v_t *)dc; + const v_t *src = (v_t *)sc; + size_t i; + + ADD_DEFINE(); + + (void) private; /* unused */ + + for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) { + LOAD(dst + i, ADD_D); + XOR_ACC(src + i, ADD_D); + STORE(dst + i, ADD_D); + + LOAD(dst + i + ADD_STRIDE, ADD_D); + XOR_ACC(src + i + ADD_STRIDE, ADD_D); + STORE(dst + i + ADD_STRIDE, ADD_D); + } + + return (0); +} + +#define raidz_add(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\ +} + +/* + * Method for multiplying a buffer with a constant in GF(2^8). + * Symbols from buffer are multiplied by a constant and result is stored + * back in the same buffer. + * + * @dc In/Out data buffer. 
+ * @size Size of the buffer + * @private pointer to the multiplication constant (unsigned) + */ +static int +raidz_mul_abd_cb(void *dc, size_t size, void *private) +{ + const unsigned mul = *((unsigned *)private); + v_t *d = (v_t *)dc; + size_t i; + + MUL_DEFINE(); + + for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) { + LOAD(d + i, MUL_D); + MUL(mul, MUL_D); + STORE(d + i, MUL_D); + + LOAD(d + i + MUL_STRIDE, MUL_D); + MUL(mul, MUL_D); + STORE(d + i + MUL_STRIDE, MUL_D); + } + + return (0); +} + + +/* + * Syndrome generation/update macros + * + * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros */ -static raidz_inline int -raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) +#define P_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + STORE((t), T); \ +} + +#define R_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define R_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + STORE((t), T); \ +} + + +/* + * PARITY CALCULATION + * + * Macros *_SYNDROME are used for parity/syndrome calculation. + * *_D_SYNDROME() macros are used to calculate syndrome between 0 and + * length of data column, and *_SYNDROME() macros are only for updating + * the parity/syndrome if data column is shorter. + * + * P parity is calculated using raidz_add_abd(). + */ + +/* + * Generate P parity (RAIDZ1) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_p_impl(raidz_map_t * const rm) { - const int x = tgtidx[TARGET_X]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t short_size = raidz_short_size(rm); + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t psize = rm->rm_col[CODE_P].rc_size; + abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + size_t size; + abd_t *dabd; raidz_math_begin(); - /* 0 - short_size */ - REC_P_BLOCK(rm, 0, short_size, x, ncols); + /* start with first data column */ + raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); - /* short_size - xsize */ - REC_P_BLOCK(rm, short_size, xsize, x, nbigcols); + for (c = 2; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + size = rm->rm_col[c].rc_size; - raidz_math_end(); + /* add data column */ + raidz_add(pabd, dabd, size); + } - return (1 << CODE_P); + raidz_math_end(); } + /* - * Reconstruct using Q parity - */ + * Generate PQ parity (RAIDZ2) + * The function is called per data column. 
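+ * Editor's note, not in the patch: abd_raidz_gen_iterate() calls this
+ * once per chunk of a data column. A scalar model of the two loops,
+ * with gf_mul2() standing for GF(2^8) multiplication by 2:
+ *
+ *	while (d < dend) { *p++ ^= *d; *q = gf_mul2(*q) ^ *d++; q++; }
+ *	while (q < qend) { *q = gf_mul2(*q); q++; }
+ *
+ * The second loop keeps scaling Q past the end of a short column, which
+ * is exactly what a zero-padded column would contribute: a zero data
+ * word leaves P unchanged and only multiplies Q.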
+ * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column + */ +static void +raidz_gen_pq_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *)c[0]; + v_t *q = (v_t *)c[1]; + const v_t *d = (v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQ_DEFINE(); -#define REC_Q_SYN_UPDATE() MUL2(REC_Q_X) + MUL2_SETUP(); -#define REC_Q_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - REC_Q_SYN_UPDATE(); \ - XOR_ACC(COL_OFF(col, ioff), REC_Q_X); \ + for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE, + q += GEN_PQ_STRIDE) { + LOAD(d, GEN_PQ_D); + P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p); + Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q); + } + for (; q < qend; q += GEN_PQ_STRIDE) { + Q_SYNDROME(GEN_PQ_C, q); + } } + /* - * Reconstruction using Q parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns + * Generate PQ parity (RAIDZ2) + * + * @rm RAIDZ map */ static raidz_inline void -REC_Q_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const unsigned *coeff, const int ncols, const int nbigcols) +raidz_generate_pq_impl(raidz_map_t * const rm) { - int c; - size_t ioff = 0; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t *col; + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; - REC_Q_DEFINE(); + raidz_math_begin(); - for (ioff = off; ioff < end; ioff += (REC_Q_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); + raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); - ZERO(REC_Q_X); + for (c = 3; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_Q_INNER_LOOP(c); + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, + raidz_gen_pq_add); + } - REC_Q_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_Q_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_Q_SYN_UPDATE(); - if (x != c) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_Q_X); - } - } - for (; c < ncols; c++) - REC_Q_SYN_UPDATE(); - } + raidz_math_end(); +} + + +/* + * Generate PQR parity (RAIDZ3) + * The function is called per data column. 
+ * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column + */ +static void +raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *)c[0]; + v_t *q = (v_t *)c[1]; + v_t *r = (v_t *)c[CODE_R]; + const v_t *d = (v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQR_DEFINE(); - XOR_ACC(COL_OFF(qcol, ioff), REC_Q_X); - MUL(coeff[MUL_Q_X], REC_Q_X); - STORE(COL_OFF(xcol, ioff), REC_Q_X); + MUL2_SETUP(); + + for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE, + q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + LOAD(d, GEN_PQR_D); + P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p); + Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q); + R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r); + } + for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + Q_SYNDROME(GEN_PQR_C, q); + R_SYNDROME(GEN_PQR_C, r); } } + /* - * Reconstruct single data column using Q parity - * @rec_method REC_Q_BLOCK() + * Generate PQR parity (RAIDZ2) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_pqr_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + + raidz_math_begin(); + + raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); + + for (c = 4; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, + raidz_gen_pqr_add); + } + + raidz_math_end(); +} + + +/* + * DATA RECONSTRUCTION + * + * Data reconstruction process consists of two phases: + * - Syndrome calculation + * - Data reconstruction + * + * Syndrome is calculated by generating parity using available data columns + * and zeros in places of erasure. Existing parity is added to corresponding + * syndrome value to obtain the [P|Q|R]syn values from equation: + * P = Psyn + Dx + Dy + Dz + * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz + * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz + * + * For data reconstruction phase, the corresponding equations are solved + * for missing data (Dx, Dy, Dz). This generally involves multiplying known + * symbols by an coefficient and adding them together. The multiplication + * constant coefficients are calculated ahead of the operation in + * raidz_rec_[q|r|pq|pq|qr|pqr]_coeff() functions. + * + * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big" + * and "short" columns. + * For this reason, reconstruction is performed in minimum of + * two steps. First, from offset 0 to short_size, then from short_size to + * short_size. Calculation functions REC_[*]_BLOCK() are implemented to work + * over both ranges. The split also enables removal of conditional expressions + * from loop bodies, improving throughput of SIMD implementations. + * For the best performance, all functions marked with raidz_inline attribute + * must be inlined by compiler. 
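+ * Editor's example, not part of the patch: the 2^x and 4^x coefficients
+ * above are powers in GF(2^8) with reduction polynomial 0x11d, the field
+ * that the 0x1d constant in MUL2_SETUP() encodes. A scalar model of
+ * MUL2() and of the shift-and-add multiply that _MUL_PARAM() unrolls:
+ *
+ *	static inline uint8_t
+ *	gf_mul2(uint8_t a)		-- multiply by 2
+ *	{
+ *		return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
+ *	}
+ *
+ *	static uint8_t
+ *	gf_mul(uint8_t a, uint8_t c)	-- multiply by constant c
+ *	{
+ *		uint8_t acc = 0;
+ *
+ *		for (; c != 0; c >>= 1) {
+ *			if (c & 1)
+ *				acc ^= a;
+ *			a = gf_mul2(a);
+ *		}
+ *		return (acc);
+ *	}
+ *
+ * For instance, gf_mul2(0x80) == 0x1d, and multiplying by 4 is simply
+ * gf_mul2() applied twice, matching MUL4().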
+ * + * parity data + * columns columns + * <----------> <------------------> + * x y <----+ missing columns (x, y) + * | | + * +---+---+---+---+-v-+---+-v-+---+ ^ 0 + * | | | | | | | | | | + * | | | | | | | | | | + * | P | Q | R | D | D | D | D | D | | + * | | | | 0 | 1 | 2 | 3 | 4 | | + * | | | | | | | | | v + * | | | | | +---+---+---+ ^ short_size + * | | | | | | | + * +---+---+---+---+---+ v big_size + * <------------------> <----------> + * big columns short columns + * + */ + + + + +/* + * Reconstruct single data column using P parity + * + * @syn_method raidz_add_abd() + * @rec_method not applicable * * @rm RAIDZ map * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t short_size = raidz_short_size(rm); - unsigned coeff[MUL_CNT]; - - raidz_rec_q_coeff(rm, tgtidx, coeff); + size_t c; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + size_t size; + abd_t *dabd; raidz_math_begin(); - /* 0 - short_size */ - REC_Q_BLOCK(rm, 0, short_size, x, coeff, ncols, ncols); + /* copy P into target */ + raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); + + /* generate p_syndrome */ + for (c = firstdc; c < ncols; c++) { + if (c == x) + continue; - /* short_size - xsize */ - REC_Q_BLOCK(rm, short_size, xsize, x, coeff, ncols, nbigcols); + dabd = rm->rm_col[c].rc_abd; + size = MIN(rm->rm_col[c].rc_size, xsize); + + raidz_add(xabd, dabd, size); + } raidz_math_end(); - return (1 << CODE_Q); + return (1 << CODE_P); } + /* - * Reconstruct using R parity - */ + * Generate Q syndrome (Qsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @xsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, + const size_t dsize) +{ + v_t *x = (v_t *)xc[TARGET_X]; + const v_t *d = (v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (xsize / sizeof (v_t)); + + SYN_Q_DEFINE(); -#define REC_R_SYN_UPDATE() MUL4(REC_R_X) -#define REC_R_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - REC_R_SYN_UPDATE(); \ - XOR_ACC(COL_OFF(col, ioff), REC_R_X); \ + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_Q_D); + Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + Q_SYNDROME(SYN_Q_X, x); + } } + /* - * Reconstruction using R parity + * Reconstruct single data column using Q parity + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd_cb() + * * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns + * @tgtidx array of missing data indexes */ -static raidz_inline void -REC_R_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const unsigned *coeff, const int ncols, const int nbigcols) +static raidz_inline int +raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) { - int c; - size_t ioff = 0; + size_t c; + size_t dsize; + abd_t *dabd; const 
size_t firstdc = raidz_parity(rm); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t *col; + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *tabds[] = { xabd }; - REC_R_DEFINE(); - - for (ioff = off; ioff < end; ioff += (REC_R_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); + unsigned coeff[MUL_CNT]; + raidz_rec_q_coeff(rm, tgtidx, coeff); - ZERO(REC_R_X); + raidz_math_begin(); - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_R_INNER_LOOP(c); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } - REC_R_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_R_INNER_LOOP(c); + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; } else { - for (c = firstdc; c < nbigcols; c++) { - REC_R_SYN_UPDATE(); - if (c != x) { - col = &rm->rm_col[c]; - XOR_ACC(COL_OFF(col, ioff), REC_R_X); - } - } - for (; c < ncols; c++) - REC_R_SYN_UPDATE(); + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; } - XOR_ACC(COL_OFF(rcol, ioff), REC_R_X); - MUL(coeff[MUL_R_X], REC_R_X); - STORE(COL_OFF(xcol, ioff), REC_R_X); + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_q_abd); } + + /* add Q to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); + + raidz_math_end(); + + return (1 << CODE_Q); } + +/* + * Generate R syndrome (Rsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)xc[TARGET_X]; + const v_t *d = (v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (tsize / sizeof (v_t)); + + SYN_R_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_R_D); + R_D_SYNDROME(SYN_R_D, SYN_R_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + R_SYNDROME(SYN_R_X, x); + } +} + + /* * Reconstruct single data column using R parity - * @rec_method REC_R_BLOCK() + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd_cb() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -624,122 +750,136 @@ static raidz_inline int raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t short_size = raidz_short_size(rm); - unsigned coeff[MUL_CNT]; + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *tabds[] = { xabd }; + unsigned coeff[MUL_CNT]; raidz_rec_r_coeff(rm, tgtidx, coeff); raidz_math_begin(); - /* 0 - short_size */ - REC_R_BLOCK(rm, 0, short_size, x, coeff, ncols, ncols); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + 
raidz_zero(xabd, xsize); + } - /* short_size - xsize */ - REC_R_BLOCK(rm, short_size, xsize, x, coeff, ncols, nbigcols); + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_r_abd); + } + + /* add R to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); raidz_math_end(); return (1 << CODE_R); } + /* - * Reconstruct using PQ parity - */ + * Generate P and Q syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)tc[TARGET_X]; + v_t *y = (v_t *)tc[TARGET_Y]; + const v_t *d = (v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); -#define REC_PQ_SYN_UPDATE() MUL2(REC_PQ_Y) -#define REC_PQ_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - LOAD(COL_OFF(col, ioff), REC_PQ_D); \ - REC_PQ_SYN_UPDATE(); \ - XOR(REC_PQ_D, REC_PQ_X); \ - XOR(REC_PQ_D, REC_PQ_Y); \ + SYN_PQ_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x); + Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + Q_SYNDROME(SYN_PQ_X, y); + } } /* - * Reconstruction using PQ parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column - */ -static raidz_inline void -REC_PQ_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const unsigned *coeff, const int ncols, - const int nbigcols, const boolean_t calcy) -{ - int c; - size_t ioff = 0; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t *col; + * Reconstruct data using PQ parity and PQ syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)tc[TARGET_X]; + v_t *y = (v_t *)tc[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; REC_PQ_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_PQ_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(pcol, ioff), REC_PQ_X); - ZERO(REC_PQ_Y); - MUL2_SETUP(); - - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_PQ_INNER_LOOP(c); - - REC_PQ_SYN_UPDATE(); - for (c++; c < y; c++) - REC_PQ_INNER_LOOP(c); - - REC_PQ_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_PQ_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_PQ_SYN_UPDATE(); - if (c != x && c != y) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), 
REC_PQ_D); - XOR(REC_PQ_D, REC_PQ_X); - XOR(REC_PQ_D, REC_PQ_Y); - } - } - for (; c < ncols; c++) - REC_PQ_SYN_UPDATE(); - } + for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE, + p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) { + LOAD(x, REC_PQ_X); + LOAD(y, REC_PQ_Y); - XOR_ACC(COL_OFF(qcol, ioff), REC_PQ_Y); + XOR_ACC(p, REC_PQ_X); + XOR_ACC(q, REC_PQ_Y); /* Save Pxy */ - COPY(REC_PQ_X, REC_PQ_D); + COPY(REC_PQ_X, REC_PQ_T); /* Calc X */ - MUL(coeff[MUL_PQ_X], REC_PQ_X); - MUL(coeff[MUL_PQ_Y], REC_PQ_Y); + MUL(mul[MUL_PQ_X], REC_PQ_X); + MUL(mul[MUL_PQ_Y], REC_PQ_Y); XOR(REC_PQ_Y, REC_PQ_X); - STORE(COL_OFF(xcol, ioff), REC_PQ_X); + STORE(x, REC_PQ_X); - if (calcy) { - /* Calc Y */ - XOR(REC_PQ_D, REC_PQ_X); - STORE(COL_OFF(ycol, ioff), REC_PQ_X); - } + /* Calc Y */ + XOR(REC_PQ_T, REC_PQ_X); + STORE(y, REC_PQ_X); } } + /* * Reconstruct two data columns using PQ parity - * @rec_method REC_PQ_BLOCK() + * + * @syn_method raidz_syn_pq_abd() + * @rec_method raidz_rec_pq_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -747,126 +887,156 @@ static raidz_inline int raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t short_size = raidz_short_size(rm); - unsigned coeff[MUL_CNT]; + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; + unsigned coeff[MUL_CNT]; raidz_rec_pq_coeff(rm, tgtidx, coeff); + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. 
+ */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_PQ_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } - /* short_size - xsize */ - REC_PQ_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols, - xsize == ysize); + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pq_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff); + + /* Copy shorter targets back to the original abd buffer */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); + if (ysize < xsize) + abd_free(yabd); + return ((1 << CODE_P) | (1 << CODE_Q)); } + /* - * Reconstruct using PR parity - */ + * Generate P and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + const v_t *d = (v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); + + SYN_PR_DEFINE(); -#define REC_PR_SYN_UPDATE() MUL4(REC_PR_Y) -#define REC_PR_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - LOAD(COL_OFF(col, ioff), REC_PR_D); \ - REC_PR_SYN_UPDATE(); \ - XOR(REC_PR_D, REC_PR_X); \ - XOR(REC_PR_D, REC_PR_Y); \ + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PR_D); + P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x); + R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + R_SYNDROME(SYN_PR_X, y); + } } /* - * Reconstruction using PR parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column - */ -static raidz_inline void -REC_PR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const unsigned *coeff, const int ncols, - const int nbigcols, const boolean_t calcy) -{ - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t *col; + * Reconstruct data using PR parity and PR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; REC_PR_DEFINE(); - for (ioff = 
off; ioff < end; ioff += (REC_PR_STRIDE * sizeof (v_t))) { - LOAD(COL_OFF(pcol, ioff), REC_PR_X); - ZERO(REC_PR_Y); - MUL2_SETUP(); - - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_PR_INNER_LOOP(c); - - REC_PR_SYN_UPDATE(); - for (c++; c < y; c++) - REC_PR_INNER_LOOP(c); - - REC_PR_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_PR_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_PR_SYN_UPDATE(); - if (c != x && c != y) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_PR_D); - XOR(REC_PR_D, REC_PR_X); - XOR(REC_PR_D, REC_PR_Y); - } - } - for (; c < ncols; c++) - REC_PR_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(rcol, ioff), REC_PR_Y); + for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE, + p += REC_PR_STRIDE, q += REC_PR_STRIDE) { + LOAD(x, REC_PR_X); + LOAD(y, REC_PR_Y); + XOR_ACC(p, REC_PR_X); + XOR_ACC(q, REC_PR_Y); /* Save Pxy */ - COPY(REC_PR_X, REC_PR_D); + COPY(REC_PR_X, REC_PR_T); /* Calc X */ - MUL(coeff[MUL_PR_X], REC_PR_X); - MUL(coeff[MUL_PR_Y], REC_PR_Y); + MUL(mul[MUL_PR_X], REC_PR_X); + MUL(mul[MUL_PR_Y], REC_PR_Y); XOR(REC_PR_Y, REC_PR_X); - STORE(COL_OFF(xcol, ioff), REC_PR_X); + STORE(x, REC_PR_X); - if (calcy) { - /* Calc Y */ - XOR(REC_PR_D, REC_PR_X); - STORE(COL_OFF(ycol, ioff), REC_PR_X); - } + /* Calc Y */ + XOR(REC_PR_T, REC_PR_X); + STORE(y, REC_PR_X); } } /* * Reconstruct two data columns using PR parity - * @rec_method REC_PR_BLOCK() + * + * @syn_method raidz_syn_pr_abd() + * @rec_method raidz_rec_pr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes @@ -874,134 +1044,162 @@ static raidz_inline int raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) { - const int x = tgtidx[TARGET_X]; - const int y = tgtidx[TARGET_Y]; - const int ncols = raidz_ncols(rm); - const int nbigcols = raidz_nbigcols(rm); - const size_t xsize = raidz_col_size(rm, x); - const size_t ysize = raidz_col_size(rm, y); - const size_t short_size = raidz_short_size(rm); + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[0]; + const size_t y = tgtidx[1]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; unsigned coeff[MUL_CNT]; - raidz_rec_pr_coeff(rm, tgtidx, coeff); + /* + * Check if some of targets are shorter then others. + * They need to be replaced with a new buffer so that syndrome can + * be calculated on full length. 
+ */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_PR_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } - /* short_size - xsize */ - REC_PR_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols, - xsize == ysize); + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); - return ((1 << CODE_P) | (1 << CODE_R)); + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_P) | (1 << CODE_Q)); } /* - * Reconstruct using QR parity - */ + * Generate Q and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *d = (v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); -#define REC_QR_SYN_UPDATE() \ -{ \ - MUL2(REC_QR_X); \ - MUL4(REC_QR_Y); \ -} + SYN_QR_DEFINE(); -#define REC_QR_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[c]; \ - LOAD(COL_OFF(col, ioff), REC_QR_D); \ - REC_QR_SYN_UPDATE(); \ - XOR(REC_QR_D, REC_QR_X); \ - XOR(REC_QR_D, REC_QR_Y); \ + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x); + R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y); + } + for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) { + Q_SYNDROME(SYN_QR_X, x); + R_SYNDROME(SYN_QR_X, y); + } } + /* - * Reconstruction using QR parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column - */ -static raidz_inline void -REC_QR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const unsigned *coeff, const int ncols, - const int nbigcols, const boolean_t calcy) -{ - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t *col; + * Reconstruct data using QR parity and QR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_qr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + const v_t * const xend = x 
+	    (tsize / sizeof (v_t));
+	const v_t *p = (v_t *)c[CODE_P];
+	const v_t *q = (v_t *)c[CODE_Q];
 
 	REC_QR_DEFINE();
 
-	for (ioff = off; ioff < end; ioff += (REC_QR_STRIDE * sizeof (v_t))) {
-		MUL2_SETUP();
-		ZERO(REC_QR_X);
-		ZERO(REC_QR_Y);
-
-		if (ncols == nbigcols) {
-			for (c = firstdc; c < x; c++)
-				REC_QR_INNER_LOOP(c);
-
-			REC_QR_SYN_UPDATE();
-			for (c++; c < y; c++)
-				REC_QR_INNER_LOOP(c);
-
-			REC_QR_SYN_UPDATE();
-			for (c++; c < nbigcols; c++)
-				REC_QR_INNER_LOOP(c);
-		} else {
-			for (c = firstdc; c < nbigcols; c++) {
-				REC_QR_SYN_UPDATE();
-				if (c != x && c != y) {
-					col = &rm->rm_col[c];
-					LOAD(COL_OFF(col, ioff), REC_QR_D);
-					XOR(REC_QR_D, REC_QR_X);
-					XOR(REC_QR_D, REC_QR_Y);
-				}
-			}
-			for (; c < ncols; c++)
-				REC_QR_SYN_UPDATE();
-		}
+	for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE,
+	    p += REC_QR_STRIDE, q += REC_QR_STRIDE) {
+		LOAD(x, REC_QR_X);
+		LOAD(y, REC_QR_Y);
 
-		XOR_ACC(COL_OFF(qcol, ioff), REC_QR_X);
-		XOR_ACC(COL_OFF(rcol, ioff), REC_QR_Y);
+		XOR_ACC(p, REC_QR_X);
+		XOR_ACC(q, REC_QR_Y);
 
-		/* Save Qxy */
-		COPY(REC_QR_X, REC_QR_D);
+		/* Save Pxy */
+		COPY(REC_QR_X, REC_QR_T);
 
 		/* Calc X */
-		MUL(coeff[MUL_QR_XQ], REC_QR_X);	/* X = Q * xqm */
-		XOR(REC_QR_Y, REC_QR_X);		/* X = R ^ X   */
-		MUL(coeff[MUL_QR_X], REC_QR_X);		/* X = X * xm */
-		STORE(COL_OFF(xcol, ioff), REC_QR_X);
-
-		if (calcy) {
-			/* Calc Y */
-			MUL(coeff[MUL_QR_YQ], REC_QR_D);	/* X = Q * xqm */
-			XOR(REC_QR_Y, REC_QR_D);		/* X = R ^ X   */
-			MUL(coeff[MUL_QR_Y], REC_QR_D);		/* X = X * xm */
-			STORE(COL_OFF(ycol, ioff), REC_QR_D);
-		}
+		MUL(mul[MUL_QR_XQ], REC_QR_X);	/* X = Q * xqm */
+		XOR(REC_QR_Y, REC_QR_X);	/* X = R ^ X   */
+		MUL(mul[MUL_QR_X], REC_QR_X);	/* X = X * xm */
+		STORE(x, REC_QR_X);
+
+		/* Calc Y */
+		MUL(mul[MUL_QR_YQ], REC_QR_T);	/* X = Q * xqm */
+		XOR(REC_QR_Y, REC_QR_T);	/* X = R ^ X   */
+		MUL(mul[MUL_QR_Y], REC_QR_T);	/* X = X * xm */
+		STORE(y, REC_QR_T);
 	}
 }
 
+
 /*
  * Reconstruct two data columns using QR parity
- * @rec_method	REC_QR_BLOCK()
+ *
+ * @syn_method	raidz_syn_qr_abd()
+ * @rec_method	raidz_rec_qr_abd()
  *
  * @rm	RAIDZ map
  * @tgtidx	array of missing data indexes
@@ -1009,158 +1207,182 @@
 static raidz_inline int
 raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
 {
-	const int x = tgtidx[TARGET_X];
-	const int y = tgtidx[TARGET_Y];
-	const int ncols = raidz_ncols(rm);
-	const int nbigcols = raidz_nbigcols(rm);
-	const size_t xsize = raidz_col_size(rm, x);
-	const size_t ysize = raidz_col_size(rm, y);
-	const size_t short_size = raidz_short_size(rm);
+	size_t c;
+	size_t dsize;
+	abd_t *dabd;
+	const size_t firstdc = raidz_parity(rm);
+	const size_t ncols = raidz_ncols(rm);
+	const size_t x = tgtidx[TARGET_X];
+	const size_t y = tgtidx[TARGET_Y];
+	const size_t xsize = rm->rm_col[x].rc_size;
+	const size_t ysize = rm->rm_col[y].rc_size;
+	abd_t *xabd = rm->rm_col[x].rc_abd;
+	abd_t *yabd = rm->rm_col[y].rc_abd;
+	abd_t *tabds[2] = { xabd, yabd };
+	abd_t *cabds[] = {
+		rm->rm_col[CODE_Q].rc_abd,
+		rm->rm_col[CODE_R].rc_abd
+	};
 	unsigned coeff[MUL_CNT];
-
 	raidz_rec_qr_coeff(rm, tgtidx, coeff);
 
+	/*
+	 * Check if any of the targets is shorter than the others.
+	 * In this case, the shorter target needs to be replaced with
+	 * a new buffer so that the syndrome can be calculated.
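+	 * For example, when the y target is the map's short trailing
+	 * column, yabd below becomes a temporary buffer of xsize bytes;
+	 * only its first ysize bytes are copied back after reconstruction.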
+ */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + raidz_math_begin(); - /* 0 - short_size */ - REC_QR_BLOCK(rm, 0, short_size, x, y, coeff, ncols, ncols, B_TRUE); + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_qr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff); - /* short_size - xsize */ - REC_QR_BLOCK(rm, short_size, xsize, x, y, coeff, ncols, nbigcols, - xsize == ysize); + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_Q) | (1 << CODE_R)); } + /* - * Reconstruct using PQR parity - */ + * Generate P, Q, and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + v_t *z = (v_t *)c[TARGET_Z]; + const v_t * const yend = y + (tsize / sizeof (v_t)); + const v_t *d = (v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); -#define REC_PQR_SYN_UPDATE() \ -{ \ - MUL2(REC_PQR_Y); \ - MUL4(REC_PQR_Z); \ -} + SYN_PQR_DEFINE(); + + MUL2_SETUP(); -#define REC_PQR_INNER_LOOP(c) \ -{ \ - col = &rm->rm_col[(c)]; \ - LOAD(COL_OFF(col, ioff), REC_PQR_D); \ - REC_PQR_SYN_UPDATE(); \ - XOR(REC_PQR_D, REC_PQR_X); \ - XOR(REC_PQR_D, REC_PQR_Y); \ - XOR(REC_PQR_D, REC_PQR_Z); \ + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE, + z += SYN_STRIDE) { + LOAD(d, SYN_PQR_D); + P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x) + Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y); + R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z); + } + for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) { + Q_SYNDROME(SYN_PQR_X, y); + R_SYNDROME(SYN_PQR_X, z); + } } + /* - * Reconstruction using PQR parity - * @rm RAIDZ map - * @off starting offset - * @end ending offset - * @x missing data column - * @y missing data column - * @z missing data column - * @coeff multiplication coefficients - * @ncols number of column - * @nbigcols number of big columns - * @calcy calculate second data column - * @calcz calculate third data column - */ -static raidz_inline void -REC_PQR_BLOCK(raidz_map_t * const rm, const size_t off, const size_t end, - const int x, const int y, const int z, const unsigned *coeff, - const int ncols, const int nbigcols, const boolean_t calcy, - const boolean_t calcz) -{ - int c; - size_t ioff; - const size_t firstdc = raidz_parity(rm); - raidz_col_t * const pcol = raidz_col_p(rm, CODE_P); - raidz_col_t * const qcol = raidz_col_p(rm, CODE_Q); - raidz_col_t * const rcol = raidz_col_p(rm, CODE_R); - raidz_col_t * const xcol = raidz_col_p(rm, x); - raidz_col_t * const ycol = raidz_col_p(rm, y); - raidz_col_t * const zcol = raidz_col_p(rm, z); - raidz_col_t *col; + * Reconstruct data using PRQ parity and PQR syndromes + * + * @tc 
syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, + const unsigned * const mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + v_t *z = (v_t *)t[TARGET_Z]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + const v_t *r = (v_t *)c[CODE_R]; REC_PQR_DEFINE(); - for (ioff = off; ioff < end; ioff += (REC_PQR_STRIDE * sizeof (v_t))) { - MUL2_SETUP(); - LOAD(COL_OFF(pcol, ioff), REC_PQR_X); - ZERO(REC_PQR_Y); - ZERO(REC_PQR_Z); - - if (ncols == nbigcols) { - for (c = firstdc; c < x; c++) - REC_PQR_INNER_LOOP(c); - - REC_PQR_SYN_UPDATE(); - for (c++; c < y; c++) - REC_PQR_INNER_LOOP(c); - - REC_PQR_SYN_UPDATE(); - for (c++; c < z; c++) - REC_PQR_INNER_LOOP(c); - - REC_PQR_SYN_UPDATE(); - for (c++; c < nbigcols; c++) - REC_PQR_INNER_LOOP(c); - } else { - for (c = firstdc; c < nbigcols; c++) { - REC_PQR_SYN_UPDATE(); - if (c != x && c != y && c != z) { - col = &rm->rm_col[c]; - LOAD(COL_OFF(col, ioff), REC_PQR_D); - XOR(REC_PQR_D, REC_PQR_X); - XOR(REC_PQR_D, REC_PQR_Y); - XOR(REC_PQR_D, REC_PQR_Z); - } - } - for (; c < ncols; c++) - REC_PQR_SYN_UPDATE(); - } - - XOR_ACC(COL_OFF(qcol, ioff), REC_PQR_Y); - XOR_ACC(COL_OFF(rcol, ioff), REC_PQR_Z); + for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE, + z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE, + r += REC_PQR_STRIDE) { + LOAD(x, REC_PQR_X); + LOAD(y, REC_PQR_Y); + LOAD(z, REC_PQR_Z); + + XOR_ACC(p, REC_PQR_X); + XOR_ACC(q, REC_PQR_Y); + XOR_ACC(r, REC_PQR_Z); /* Save Pxyz and Qxyz */ COPY(REC_PQR_X, REC_PQR_XS); COPY(REC_PQR_Y, REC_PQR_YS); /* Calc X */ - MUL(coeff[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */ - MUL(coeff[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ + MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */ + MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ XOR(REC_PQR_Y, REC_PQR_X); - MUL(coeff[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */ + MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */ XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */ - STORE(COL_OFF(xcol, ioff), REC_PQR_X); + STORE(x, REC_PQR_X); - if (calcy) { - /* Calc Y */ - XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ - MUL(coeff[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ - XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */ - COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ - MUL(coeff[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ - MUL(coeff[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ - XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ - STORE(COL_OFF(ycol, ioff), REC_PQR_YS); - } - - if (calcz) { - /* Calc Z */ - XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ - STORE(COL_OFF(zcol, ioff), REC_PQR_YS); - } + /* Calc Y */ + XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ + MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ + XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */ + COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ + MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ + MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ + XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ + STORE(y, REC_PQR_YS); + + /* Calc Z */ + XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ + STORE(z, REC_PQR_YS); } } + /* * Reconstruct three data columns using PQR parity - * @rec_method REC_PQR_BLOCK() + * + * @syn_method raidz_syn_pqr_abd() + * @rec_method raidz_rec_pqr_abd() * * @rm RAIDZ map * 
@tgtidx	array of missing data indexes
@@ -1168,31 +1390,87 @@
 static raidz_inline int
 raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
 {
-	const int x = tgtidx[TARGET_X];
-	const int y = tgtidx[TARGET_Y];
-	const int z = tgtidx[TARGET_Z];
-	const int ncols = raidz_ncols(rm);
-	const int nbigcols = raidz_nbigcols(rm);
-	const size_t xsize = raidz_col_size(rm, x);
-	const size_t ysize = raidz_col_size(rm, y);
-	const size_t zsize = raidz_col_size(rm, z);
-	const size_t short_size = raidz_short_size(rm);
+	size_t c;
+	size_t dsize;
+	abd_t *dabd;
+	const size_t firstdc = raidz_parity(rm);
+	const size_t ncols = raidz_ncols(rm);
+	const size_t x = tgtidx[TARGET_X];
+	const size_t y = tgtidx[TARGET_Y];
+	const size_t z = tgtidx[TARGET_Z];
+	const size_t xsize = rm->rm_col[x].rc_size;
+	const size_t ysize = rm->rm_col[y].rc_size;
+	const size_t zsize = rm->rm_col[z].rc_size;
+	abd_t *xabd = rm->rm_col[x].rc_abd;
+	abd_t *yabd = rm->rm_col[y].rc_abd;
+	abd_t *zabd = rm->rm_col[z].rc_abd;
+	abd_t *tabds[] = { xabd, yabd, zabd };
+	abd_t *cabds[] = {
+		rm->rm_col[CODE_P].rc_abd,
+		rm->rm_col[CODE_Q].rc_abd,
+		rm->rm_col[CODE_R].rc_abd
+	};
 	unsigned coeff[MUL_CNT];
-
 	raidz_rec_pqr_coeff(rm, tgtidx, coeff);
 
+	/*
+	 * Check if any of the targets is shorter than the others.
+	 * In this case, the shorter target needs to be replaced with
+	 * a new buffer so that the syndrome can be calculated.
+	 */
+	if (ysize < xsize) {
+		yabd = abd_alloc(xsize, B_FALSE);
+		tabds[1] = yabd;
+	}
+	if (zsize < xsize) {
+		zabd = abd_alloc(xsize, B_FALSE);
+		tabds[2] = zabd;
+	}
+
 	raidz_math_begin();
 
-	/* 0 - short_size */
-	REC_PQR_BLOCK(rm, 0, short_size, x, y, z, coeff, ncols, ncols,
-	    B_TRUE, B_TRUE);
-
-	/* short_size - xsize */
-	REC_PQR_BLOCK(rm, short_size, xsize, x, y, z, coeff, ncols, nbigcols,
-	    xsize == ysize, xsize == zsize);
+	/* Start with first data column if present */
+	if (firstdc != x) {
+		raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize);
+	} else {
+		raidz_zero(xabd, xsize);
+		raidz_zero(yabd, xsize);
+		raidz_zero(zabd, xsize);
+	}
+
+	/* generate q_syndrome */
+	for (c = firstdc+1; c < ncols; c++) {
+		if (c == x || c == y || c == z) {
+			dabd = NULL;
+			dsize = 0;
+		} else {
+			dabd = rm->rm_col[c].rc_abd;
+			dsize = rm->rm_col[c].rc_size;
+		}
+
+		abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
+		    raidz_syn_pqr_abd);
+	}
+
+	abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff);
+
+	/*
+	 * Copy shorter targets back to the original abd buffer
+	 */
+	if (ysize < xsize)
+		raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+	if (zsize < xsize)
+		raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize);
 
 	raidz_math_end();
 
+	if (ysize < xsize)
+		abd_free(yabd);
+	if (zsize < xsize)
+		abd_free(zabd);
+
 	return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
 }
diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_scalar.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_scalar.c
--- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_scalar.c	2016-10-26 17:36:33.000000000 +0000
+++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_scalar.c	2017-01-20 18:18:28.000000000 +0000
@@ -154,71 +154,96 @@
 #define	raidz_math_begin()	{}
 #define	raidz_math_end()	{}
 
-#define	GEN_P_DEFINE()	v_t p0
-#define	GEN_P_STRIDE	1
-#define	GEN_P_P		p0
-
-#define	GEN_PQ_DEFINE()	v_t d0, p0, q0
-#define	GEN_PQ_STRIDE	1
-#define	GEN_PQ_D	d0
-#define	GEN_PQ_P	p0
-#define	GEN_PQ_Q	q0
-
-#define	GEN_PQR_DEFINE() v_t d0, p0, q0, r0
-#define 
GEN_PQR_STRIDE 1 -#define GEN_PQR_D d0 -#define GEN_PQR_P p0 -#define GEN_PQR_Q q0 -#define GEN_PQR_R r0 - -#define REC_P_DEFINE() v_t x0 -#define REC_P_STRIDE 1 -#define REC_P_X x0 - -#define REC_Q_DEFINE() v_t x0 -#define REC_Q_STRIDE 1 -#define REC_Q_X x0 - -#define REC_R_DEFINE() v_t x0 -#define REC_R_STRIDE 1 -#define REC_R_X x0 - -#define REC_PQ_DEFINE() v_t x0, y0, d0 -#define REC_PQ_STRIDE 1 -#define REC_PQ_X x0 -#define REC_PQ_Y y0 -#define REC_PQ_D d0 - -#define REC_PR_DEFINE() v_t x0, y0, d0 -#define REC_PR_STRIDE 1 -#define REC_PR_X x0 -#define REC_PR_Y y0 -#define REC_PR_D d0 - -#define REC_QR_DEFINE() v_t x0, y0, d0 -#define REC_QR_STRIDE 1 -#define REC_QR_X x0 -#define REC_QR_Y y0 -#define REC_QR_D d0 - -#define REC_PQR_DEFINE() v_t x0, y0, z0, d0, t0 -#define REC_PQR_STRIDE 1 -#define REC_PQR_X x0 -#define REC_PQR_Y y0 -#define REC_PQR_Z z0 -#define REC_PQR_D d0 -#define REC_PQR_XS d0 -#define REC_PQR_YS t0 +#define SYN_STRIDE 1 + +#define ZERO_DEFINE() v_t d0 +#define ZERO_STRIDE 1 +#define ZERO_D d0 + +#define COPY_DEFINE() v_t d0 +#define COPY_STRIDE 1 +#define COPY_D d0 + +#define ADD_DEFINE() v_t d0 +#define ADD_STRIDE 1 +#define ADD_D d0 + +#define MUL_DEFINE() v_t d0 +#define MUL_STRIDE 1 +#define MUL_D d0 + +#define GEN_P_STRIDE 1 +#define GEN_P_DEFINE() v_t p0 +#define GEN_P_P p0 + +#define GEN_PQ_STRIDE 1 +#define GEN_PQ_DEFINE() v_t d0, c0 +#define GEN_PQ_D d0 +#define GEN_PQ_C c0 + +#define GEN_PQR_STRIDE 1 +#define GEN_PQR_DEFINE() v_t d0, c0 +#define GEN_PQR_D d0 +#define GEN_PQR_C c0 + +#define SYN_Q_DEFINE() v_t d0, x0 +#define SYN_Q_D d0 +#define SYN_Q_X x0 + + +#define SYN_R_DEFINE() v_t d0, x0 +#define SYN_R_D d0 +#define SYN_R_X x0 + + +#define SYN_PQ_DEFINE() v_t d0, x0 +#define SYN_PQ_D d0 +#define SYN_PQ_X x0 -#include "vdev_raidz_math_impl.h" -/* - * If compiled with -O0, gcc doesn't do any stack frame coalescing - * and -Wframe-larger-than=1024 is triggered in debug mode. - * Starting with gcc 4.8, new opt level -Og is introduced for debugging, which - * does not trigger this warning. - */ -#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#define REC_PQ_STRIDE 1 +#define REC_PQ_DEFINE() v_t x0, y0, t0 +#define REC_PQ_X x0 +#define REC_PQ_Y y0 +#define REC_PQ_T t0 + + +#define SYN_PR_DEFINE() v_t d0, x0 +#define SYN_PR_D d0 +#define SYN_PR_X x0 + +#define REC_PR_STRIDE 1 +#define REC_PR_DEFINE() v_t x0, y0, t0 +#define REC_PR_X x0 +#define REC_PR_Y y0 +#define REC_PR_T t0 + + +#define SYN_QR_DEFINE() v_t d0, x0 +#define SYN_QR_D d0 +#define SYN_QR_X x0 + + +#define REC_QR_STRIDE 1 +#define REC_QR_DEFINE() v_t x0, y0, t0 +#define REC_QR_X x0 +#define REC_QR_Y y0 +#define REC_QR_T t0 + + +#define SYN_PQR_DEFINE() v_t d0, x0 +#define SYN_PQR_D d0 +#define SYN_PQR_X x0 + +#define REC_PQR_STRIDE 1 +#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0 +#define REC_PQR_X x0 +#define REC_PQR_Y y0 +#define REC_PQR_Z z0 +#define REC_PQR_XS xs0 +#define REC_PQR_YS ys0 + +#include "vdev_raidz_math_impl.h" DEFINE_GEN_METHODS(scalar); DEFINE_REC_METHODS(scalar); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_sse2.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_sse2.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_sse2.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_sse2.c 2017-01-20 18:18:28.000000000 +0000 @@ -58,9 +58,6 @@ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) {} -#define PREFETCH(ptr, offset) {} - #define XOR_ACC(src, r...) 
\ { \ switch (REG_CNT(r)) { \ @@ -106,27 +103,8 @@ break; \ } \ } -#define ZERO(r...) \ -{ \ - switch (REG_CNT(r)) { \ - case 4: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r) "\n" \ - "pxor %" VR1(r) ", %" VR1(r) "\n" \ - "pxor %" VR2(r) ", %" VR2(r) "\n" \ - "pxor %" VR3(r) ", %" VR3(r)); \ - break; \ - case 2: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r) "\n" \ - "pxor %" VR1(r) ", %" VR1(r)); \ - break; \ - case 1: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r)); \ - break; \ - } \ -} + +#define ZERO(r...) XOR(r, r) #define COPY(r...) \ { \ @@ -236,6 +214,10 @@ #define MUL2(r...) \ { \ switch (REG_CNT(r)) { \ + case 4: \ + _MUL2_x2(VR0(r), VR1(r)); \ + _MUL2_x2(VR2(r), VR3(r)); \ + break; \ case 2: \ _MUL2_x2(VR0(r), VR1(r)); \ break; \ @@ -255,7 +237,7 @@ #define _MUL_PARAM(x, in, acc) \ { \ - if (x & 0x01) { COPY(in, acc); } else { XOR(acc, acc); } \ + if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \ if (x & 0xfe) { MUL2(in); } \ if (x & 0x02) { XOR(in, acc); } \ if (x & 0xfc) { MUL2(in); } \ @@ -271,8 +253,8 @@ if (x & 0x80) { MUL2(in); XOR(in, acc); } \ } -#define _mul_x1_in 9 -#define _mul_x1_acc 11 +#define _mul_x1_in 11 +#define _mul_x1_acc 12 #define MUL_x1_DEFINE(x) \ static void \ @@ -533,61 +515,87 @@ #define raidz_math_begin() kfpu_begin() #define raidz_math_end() kfpu_end() -#define GEN_P_DEFINE() {} +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 2 +#define MUL_DEFINE() MUL2_SETUP() +#define MUL_D 0, 1 + #define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 +#define GEN_PQ_STRIDE 4 #define GEN_PQ_DEFINE() {} -#define GEN_PQ_STRIDE 2 -#define GEN_PQ_D 0, 1 -#define GEN_PQ_P 2, 3 -#define GEN_PQ_Q 4, 5 +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 - -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 - -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 2 -#define REC_Q_X 0, 1 - -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 2 -#define REC_R_X 0, 1 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 -#define REC_PQ_DEFINE() {} #define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() MUL2_SETUP() #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() {} #define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() MUL2_SETUP() #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() {} #define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() MUL2_SETUP() #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 -#define REC_PQR_DEFINE() {} #define REC_PQR_STRIDE 1 +#define 
REC_PQR_DEFINE() MUL2_SETUP() #define REC_PQR_X 0 #define REC_PQR_Y 1 #define REC_PQR_Z 2 -#define REC_PQR_D 3 -#define REC_PQR_XS 4 -#define REC_PQR_YS 5 +#define REC_PQR_XS 3 +#define REC_PQR_YS 4 #include diff -Nru zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_ssse3.c zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_ssse3.c --- zfs-linux-0.7.0-rc2/module/zfs/vdev_raidz_math_ssse3.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/vdev_raidz_math_ssse3.c 2017-01-20 18:18:28.000000000 +0000 @@ -66,19 +66,6 @@ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) \ -{ \ - __asm( \ - "prefetchnta " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} - -#define PREFETCH(ptr, offset) \ -{ \ - __asm( \ - "prefetcht0 " #offset "(%[MEM])\n" \ - : : [MEM] "r" (ptr)); \ -} #define XOR_ACC(src, r...) \ { \ @@ -122,25 +109,7 @@ } \ } -#define ZERO(r...) \ -{ \ - switch (REG_CNT(r)) { \ - case 4: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r) "\n" \ - "pxor %" VR1(r) ", %" VR1(r) "\n" \ - "pxor %" VR2(r) ", %" VR2(r) "\n" \ - "pxor %" VR3(r) ", %" VR3(r)); \ - break; \ - case 2: \ - __asm( \ - "pxor %" VR0(r) ", %" VR0(r) "\n" \ - "pxor %" VR1(r) ", %" VR1(r)); \ - break; \ - default: \ - ASM_BUG(); \ - } \ -} +#define ZERO(r...) XOR(r, r) #define COPY(r...) \ { \ @@ -337,59 +306,86 @@ #define raidz_math_begin() kfpu_begin() #define raidz_math_end() kfpu_end() -#define GEN_P_DEFINE() {} + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + #define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 -#define GEN_PQ_DEFINE() {} #define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 +#define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 - -#define REC_P_DEFINE() {} -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 - -#define REC_Q_DEFINE() {} -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 - -#define REC_R_DEFINE() {} -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 -#define REC_PQ_DEFINE() {} #define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() {} #define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() {} #define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 
+#define SYN_PQR_X 4, 5, 6, 7 -#define REC_PQR_DEFINE() {} #define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 -#define REC_PQR_D 6, 7 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 @@ -419,7 +415,8 @@ #endif /* defined(__x86_64) && defined(HAVE_SSSE3) */ -#if defined(__x86_64) && (defined(HAVE_SSSE3) || defined(HAVE_AVX2)) +#if defined(__x86_64) +#if defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) const uint8_t __attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { @@ -2473,4 +2470,5 @@ 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 } }; -#endif /* defined(__x86_64) && (defined(HAVE_SSSE3) || defined(HAVE_AVX2)) */ +#endif /* defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) */ +#endif /* defined(__x86_64) */ diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zap_leaf.c zfs-linux-0.7.0-rc3/module/zfs/zap_leaf.c --- zfs-linux-0.7.0-rc2/module/zfs/zap_leaf.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zap_leaf.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ /* @@ -538,7 +538,7 @@ int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf) + uint8_t integer_size, uint64_t num_integers, const void *buf) { int delta_chunks; zap_leaf_t *l = zeh->zeh_leaf; diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zap_micro.c zfs-linux-0.7.0-rc3/module/zfs/zap_micro.c --- zfs-linux-0.7.0-rc2/module/zfs/zap_micro.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zap_micro.c 2017-01-20 18:18:28.000000000 +0000 @@ -124,7 +124,7 @@ * Don't use all 64 bits, since we need some in the cookie for * the collision differentiator. We MUST use the high bits, * since those are the ones that we first pay attention to when - * chosing the bucket. + * choosing the bucket. 
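+	 * For example, with zap_hashbits() == 10 only the top ten bits
+	 * of h survive the mask below, so they alone select the bucket.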
*/ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); @@ -584,7 +584,7 @@ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); sz = zap->zap_dbuf->db_size; - mzp = zio_buf_alloc(sz); + mzp = vmem_alloc(sz, KM_SLEEP); bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; @@ -592,7 +592,7 @@ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err) { - zio_buf_free(mzp, sz); + vmem_free(mzp, sz); return (err); } } @@ -619,7 +619,7 @@ if (err) break; } - zio_buf_free(mzp, sz); + vmem_free(mzp, sz); *zapp = zap; return (err); } @@ -1128,34 +1128,30 @@ cmn_err(CE_PANIC, "out of entries!"); } -int -zap_add(objset_t *os, uint64_t zapobj, const char *key, +static int +zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) + const void *val, dmu_tx_t *tx, void *tag) { - zap_t *zap; - int err; + int err = 0; mzap_ent_t *mze; const uint64_t *intval = val; zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err) - return (err); zn = zap_name_alloc(zap, key, MT_EXACT); if (zn == NULL) { - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(key) >= MZAP_NAME_LEN) { - err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); + err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); if (err == 0) { err = fzap_add(zn, integer_size, num_integers, val, - FTAG, tx); + tag, tx); } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { @@ -1168,8 +1164,39 @@ } ASSERT(zap == zn->zn_zap); zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_add(objset_t *os, uint64_t zapobj, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); + /* zap_add_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_by_dnode(dnode_t *dn, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); + /* zap_add_impl() calls zap_unlockdir() */ return (err); } @@ -1288,23 +1315,17 @@ return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); } -int -zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, +static int +zap_remove_impl(zap_t *zap, const char *name, matchtype_t mt, dmu_tx_t *tx) { - zap_t *zap; - int err; mzap_ent_t *mze; zap_name_t *zn; + int err = 0; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); zn = zap_name_alloc(zap, name, mt); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); + if (zn == NULL) return (SET_ERROR(ENOTSUP)); - } if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { @@ -1319,6 +1340,34 @@ } } zap_name_free(zn); + return (err); +} + +int +zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, + 
matchtype_t mt, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + err = zap_remove_impl(zap, name, mt, tx); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + err = zap_remove_impl(zap, name, MT_EXACT, tx); zap_unlockdir(zap, FTAG); return (err); } @@ -1589,6 +1638,7 @@ EXPORT_SYMBOL(zap_create_claim_norm_dnsize); EXPORT_SYMBOL(zap_destroy); EXPORT_SYMBOL(zap_lookup); +EXPORT_SYMBOL(zap_lookup_by_dnode); EXPORT_SYMBOL(zap_lookup_norm); EXPORT_SYMBOL(zap_lookup_uint64); EXPORT_SYMBOL(zap_contains); @@ -1596,12 +1646,14 @@ EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_count_write_by_dnode); EXPORT_SYMBOL(zap_add); +EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); +EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); EXPORT_SYMBOL(zap_count); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfeature.c zfs-linux-0.7.0-rc3/module/zfs/zfeature.c --- zfs-linux-0.7.0-rc2/module/zfs/zfeature.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfeature.c 2017-01-20 18:18:28.000000000 +0000 @@ -277,7 +277,8 @@ static int -feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { +feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) +{ ASSERTV(uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj); ASSERT(zfeature_depends_on(feature->fi_feature, @@ -500,7 +501,8 @@ * Returns B_FALSE otherwise (i.e. if the feature is not enabled). 
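+ * For example, the block traversal code uses this txg as the point
+ * from which hole birth times can be trusted; holes born earlier must
+ * be treated as having unknown birth times.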
*/ boolean_t -spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) { +spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) +{ int err; ASSERT(VALID_FEATURE_FID(fid)); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfeature_common.c zfs-linux-0.7.0-rc3/module/zfs/zfeature_common.c --- zfs-linux-0.7.0-rc2/module/zfs/zfeature_common.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfeature_common.c 2017-01-20 18:18:28.000000000 +0000 @@ -122,7 +122,8 @@ } boolean_t -zfeature_depends_on(spa_feature_t fid, spa_feature_t check) { +zfeature_depends_on(spa_feature_t fid, spa_feature_t check) +{ zfeature_info_t *feature = &spa_feature_table[fid]; int i; @@ -133,6 +134,18 @@ return (B_FALSE); } +static boolean_t +deps_contains_feature(const spa_feature_t *deps, const spa_feature_t feature) +{ + int i; + + for (i = 0; deps[i] != SPA_FEATURE_NONE; i++) + if (deps[i] == feature) + return (B_TRUE); + + return (B_FALSE); +} + static void zfeature_register(spa_feature_t fid, const char *guid, const char *name, const char *desc, zfeature_flags_t flags, const spa_feature_t *deps) @@ -150,6 +163,9 @@ if (deps == NULL) deps = nodeps; + VERIFY(((flags & ZFEATURE_FLAG_PER_DATASET) == 0) || + (deps_contains_feature(deps, SPA_FEATURE_EXTENSIBLE_DATASET))); + feature->fi_feature = fid; feature->fi_guid = guid; feature->fi_uname = name; @@ -217,8 +233,8 @@ { static const spa_feature_t filesystem_limits_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_FS_SS_LIMIT, "com.joyent:filesystem_limits", "filesystem_limits", diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_acl.c zfs-linux-0.7.0-rc3/module/zfs/zfs_acl.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_acl.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_acl.c 2017-01-20 18:18:28.000000000 +0000 @@ -2492,15 +2492,8 @@ * If attribute then validate against base file */ if (is_attr) { - uint64_t parent; - - if ((error = sa_lookup(zp->z_sa_hdl, - SA_ZPL_PARENT(ZTOZSB(zp)), &parent, - sizeof (parent))) != 0) - return (error); - if ((error = zfs_zget(ZTOZSB(zp), - parent, &xzp)) != 0) { + zp->z_xattr_parent, &xzp)) != 0) { return (error); } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_ctldir.c zfs-linux-0.7.0-rc3/module/zfs/zfs_ctldir.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_ctldir.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_ctldir.c 2017-01-20 18:18:28.000000000 +0000 @@ -111,11 +111,6 @@ int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; int zfs_admin_snapshot = 1; -/* - * Dedicated task queue for unmounting snapshots. 
- */ -static taskq_t *zfs_expire_taskq; - typedef struct { char *se_name; /* full snapshot name */ char *se_path; /* full mount path */ @@ -147,7 +142,7 @@ se->se_spa = spa; se->se_objsetid = objsetid; se->se_root_dentry = root_dentry; - se->se_taskqid = -1; + se->se_taskqid = TASKQID_INVALID; refcount_create(&se->se_refcount); @@ -339,7 +334,7 @@ return; } - se->se_taskqid = -1; + se->se_taskqid = TASKQID_INVALID; (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); zfsctl_snapshot_rele(se); @@ -365,8 +360,8 @@ { ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); - if (taskq_cancel_id(zfs_expire_taskq, se->se_taskqid) == 0) { - se->se_taskqid = -1; + if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) { + se->se_taskqid = TASKQID_INVALID; zfsctl_snapshot_rele(se); } } @@ -377,13 +372,13 @@ static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) { - ASSERT3S(se->se_taskqid, ==, -1); + ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID); if (delay <= 0) return; zfsctl_snapshot_hold(se); - se->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq, + se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); } @@ -548,7 +543,6 @@ int zfsctl_create(zfs_sb_t *zsb) { -#if defined(CONFIG_64BIT) ASSERT(zsb->z_ctldir == NULL); zsb->z_ctldir = zfsctl_inode_alloc(zsb, ZFSCTL_INO_ROOT, @@ -557,9 +551,6 @@ return (SET_ERROR(ENOENT)); return (0); -#else - return (SET_ERROR(EOPNOTSUPP)); -#endif /* CONFIG_64BIT */ } /* @@ -873,7 +864,7 @@ ZFS_MAX_DATASET_NAME_LEN, from); if (error == 0) error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, - ZFS_MAX_DATASET_NAME_LEN, to); + ZFS_MAX_DATASET_NAME_LEN, to); if (error == 0) error = zfs_secpolicy_rename_perms(from, to, cr); if (error != 0) @@ -1261,9 +1252,6 @@ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_objsetid)); rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); - - zfs_expire_taskq = taskq_create("z_unmount", 1, defclsyspri, - 1, 8, TASKQ_PREPOPULATE); } /* @@ -1273,8 +1261,6 @@ void zfsctl_fini(void) { - taskq_destroy(zfs_expire_taskq); - avl_destroy(&zfs_snapshots_by_name); avl_destroy(&zfs_snapshots_by_objsetid); rw_destroy(&zfs_snapshot_lock); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_debug.c zfs-linux-0.7.0-rc3/module/zfs/zfs_debug.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_debug.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_debug.c 2017-01-20 18:18:28.000000000 +0000 @@ -62,7 +62,7 @@ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data; (void) snprintf(buf, size, "%-12llu %-s\n", - (u_longlong_t) zdm->zdm_timestamp, zdm->zdm_msg); + (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); return (0); } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_dir.c zfs-linux-0.7.0-rc3/module/zfs/zfs_dir.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_dir.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_dir.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
*/ @@ -240,7 +240,7 @@ mutex_enter(&dzp->z_lock); for (;;) { - if (dzp->z_unlinked) { + if (dzp->z_unlinked && !(flag & ZXATTR)) { mutex_exit(&dzp->z_lock); if (!(flag & ZHAVELOCK)) rw_exit(&dzp->z_name_lock); @@ -838,7 +838,7 @@ */ int zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, - boolean_t *unlinkedp) + boolean_t *unlinkedp) { znode_t *dzp = dl->dl_dzp; zfs_sb_t *zsb = ZTOZSB(dzp); @@ -998,8 +998,9 @@ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), &xzp->z_id, sizeof (xzp->z_id), tx)); - (void) zfs_log_create(zsb->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, acl_ids.z_fuidp, vap); + if (!zp->z_unlinked) + (void) zfs_log_create(zsb->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_fm.c zfs-linux-0.7.0-rc3/module/zfs/zfs_fm.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_fm.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_fm.c 2017-01-20 18:18:28.000000000 +0000 @@ -272,15 +272,13 @@ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, spa_load_state(spa), NULL); - if (spa != NULL) { - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, - DATA_TYPE_STRING, - spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? - FM_EREPORT_FAILMODE_WAIT : - spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? - FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, - NULL); - } + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, + DATA_TYPE_STRING, + spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? + FM_EREPORT_FAILMODE_WAIT : + spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? + FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, + NULL); if (vd != NULL) { vdev_t *pvd = vd->vdev_parent; diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_ioctl.c zfs-linux-0.7.0-rc3/module/zfs/zfs_ioctl.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_ioctl.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_ioctl.c 2017-01-20 18:18:28.000000000 +0000 @@ -467,6 +467,14 @@ dsl_dataset_t *ds; dsl_pool_t *dp; + /* + * First do a quick check for root in the global zone, which + * is allowed to do all write_perms. This ensures that zfs_ioc_* + * will get to handle nonexistent datasets. + */ + if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) + return (0); + error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); @@ -2033,8 +2041,10 @@ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); - if (error == EIO) + if (error == EIO) { + nvlist_free(nv); return (error); + } VERIFY0(error); } if (error == 0) @@ -3631,6 +3641,7 @@ zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl) { zfs_sb_t *zsb; + zvol_state_t *zv; int error; if (get_zfs_sb(fsname, &zsb) == 0) { @@ -3643,6 +3654,9 @@ error = error ? error : resume_err; } deactivate_super(zsb->z_sb); + } else if ((zv = zvol_suspend(fsname)) != NULL) { + error = dsl_dataset_rollback(fsname, zvol_tag(zv), outnvl); + zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, NULL, outnvl); } @@ -3881,7 +3895,7 @@ * because GRUB doesn't support them. */ if (zfs_is_bootfs(dsname) && - intval != ZFS_DNSIZE_LEGACY) { + intval != ZFS_DNSIZE_LEGACY) { return (SET_ERROR(EDOM)); } @@ -4230,6 +4244,7 @@ if (error == 0) { zfs_sb_t *zsb = NULL; + zvol_state_t *zv = NULL; if (get_zfs_sb(tofs, &zsb) == 0) { /* online recv */ @@ -4245,6 +4260,9 @@ error = zfs_resume_fs(zsb, tofs); error = error ? 
error : end_err; deactivate_super(zsb->z_sb); + } else if ((zv = zvol_suspend(tofs)) != NULL) { + error = dmu_recv_end(&drc, zvol_tag(zv)); + zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } @@ -4273,7 +4291,7 @@ *read_bytes = off - input_fp->f_offset; if (VOP_SEEK(input_fp->f_vnode, input_fp->f_offset, &off, NULL) == 0) - input_fp->f_offset = off; + input_fp->f_offset = off; #ifdef DEBUG if (zfs_ioc_recv_inject_err) { @@ -4305,7 +4323,7 @@ /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL - * explictly if we're restoring local properties cleared in the + * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origprops != NULL && @@ -4461,7 +4479,7 @@ return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", - (uchar_t **) &begin_record, &begin_record_size); + (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); @@ -5354,7 +5372,7 @@ do { error = zfs_zevent_next(ze, &event, - &zc->zc_nvlist_dst_size, &dropped); + &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); @@ -5560,7 +5578,7 @@ off = fp->f_offset; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, - fd, resumeobj, resumeoff, fp->f_vnode, &off); + fd, resumeobj, resumeoff, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; @@ -5629,7 +5647,7 @@ if (error != 0) goto out; error = dmu_send_estimate(tosnap, fromsnap, compressok, - &space); + &space); dsl_dataset_rele(fromsnap, FTAG); } else if (strchr(fromname, '#') != NULL) { /* @@ -5765,7 +5783,7 @@ static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy) + zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_log.c zfs-linux-0.7.0-rc3/module/zfs/zfs_log.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_log.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_log.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. */ @@ -212,6 +213,34 @@ } /* + * If zp is an xattr node, check whether the xattr owner is unlinked. + * We don't want to log anything if the owner is unlinked. + */ +static int +zfs_xattr_owner_unlinked(znode_t *zp) +{ + int unlinked = 0; + znode_t *dzp; + igrab(ZTOI(zp)); + /* + * if zp is XATTR node, keep walking up via z_xattr_parent until we + * get the owner + */ + while (zp->z_pflags & ZFS_XATTR) { + ASSERT3U(zp->z_xattr_parent, !=, 0); + if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &dzp) != 0) { + unlinked = 1; + break; + } + iput(ZTOI(zp)); + zp = dzp; + unlinked = zp->z_unlinked; + } + iput(ZTOI(zp)); + return (unlinked); +} + +/* * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and * TK_MKXATTR transactions. 
* @@ -247,7 +276,7 @@ size_t namesize = strlen(name) + 1; size_t fuidsz = 0; - if (zil_replaying(zilog, tx)) + if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp)) return; /* @@ -346,13 +375,13 @@ */ void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid) + znode_t *dzp, char *name, uint64_t foid) { itx_t *itx; lr_remove_t *lr; size_t namesize = strlen(name) + 1; - if (zil_replaying(zilog, tx)) + if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize); @@ -370,7 +399,7 @@ */ void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name) + znode_t *dzp, znode_t *zp, char *name) { itx_t *itx; lr_link_t *lr; @@ -425,7 +454,7 @@ */ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) { itx_t *itx; lr_rename_t *lr; @@ -455,15 +484,16 @@ void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t resid, int ioflag, - zil_callback_t callback, void *callback_data) + znode_t *zp, offset_t off, ssize_t resid, int ioflag, + zil_callback_t callback, void *callback_data) { itx_wr_state_t write_state; boolean_t slogging; uintptr_t fsync_cnt; ssize_t immediate_write_sz; - if (zil_replaying(zilog, tx) || zp->z_unlinked) { + if (zil_replaying(zilog, tx) || zp->z_unlinked || + zfs_xattr_owner_unlinked(zp)) { if (callback != NULL) callback(callback_data); return; @@ -538,12 +568,13 @@ */ void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len) + znode_t *zp, uint64_t off, uint64_t len) { itx_t *itx; lr_truncate_t *lr; - if (zil_replaying(zilog, tx) || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked || + zfs_xattr_owner_unlinked(zp)) return; itx = zil_itx_create(txtype, sizeof (*lr)); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_replay.c zfs-linux-0.7.0-rc3/module/zfs/zfs_replay.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_replay.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_replay.c 2017-01-20 18:18:28.000000000 +0000 @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 Cyril Plisko. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include @@ -58,7 +58,7 @@ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, - uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) + uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { bzero(vap, sizeof (*vap)); vap->va_mask = (uint_t)mask; @@ -870,7 +870,7 @@ * The FUID table index may no longer be valid and * during zfs_create() a new index may be assigned. * Because of this the log will contain the original - * doman+rid in order to create a new FUID. + * domain+rid in order to create a new FUID. * * The individual ACEs may contain an ephemeral uid/gid which is no * longer valid and will need to be replaced with an actual FUID. 
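Note: the zfs_log.c hunks above all add the same guard; a minimal sketch
of the resulting pattern, with zfs_log_example() as a hypothetical
stand-in for any of the amended logging functions:

	void
	zfs_log_example(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp)
	{
		/*
		 * Skip ZIL logging while replaying the log, or when the
		 * xattr owner has already been unlinked; in both cases
		 * the record must not be written.
		 */
		if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(zp))
			return;

		/* ... allocate, fill in, and assign the itx as before ... */
	}

zfs_xattr_owner_unlinked() pins each znode with igrab() before following
z_xattr_parent, so the chain cannot be evicted out from under the walk.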
diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_rlock.c zfs-linux-0.7.0-rc3/module/zfs/zfs_rlock.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_rlock.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_rlock.c 2017-01-20 18:18:28.000000000 +0000 @@ -65,7 +65,7 @@ * Otherwise, the proxy lock is split into smaller lock ranges and * new proxy locks created for non overlapping ranges. * The reference counts are adjusted accordingly. - * Meanwhile, the orginal lock is kept around (this is the callers handle) + * Meanwhile, the original lock is kept around (this is the callers handle) * and its offset and length are used when releasing the lock. * * Thread coordination @@ -87,7 +87,7 @@ * * Grow block handling * ------------------- - * ZFS supports multiple block sizes currently upto 128K. The smallest + * ZFS supports multiple block sizes currently up to 128K. The smallest * block size is used for the file which is grown as needed. During this * growth all other writers and readers must be excluded. * So if the block size needs to be grown then the whole file is diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_sa.c zfs-linux-0.7.0-rc3/module/zfs/zfs_sa.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_sa.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_sa.c 2017-01-20 18:18:28.000000000 +0000 @@ -203,13 +203,13 @@ return (error); } - obj = zio_buf_alloc(size); + obj = vmem_alloc(size, KM_SLEEP); error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), obj, size); if (error == 0) error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP); - zio_buf_free(obj, size); + vmem_free(obj, size); return (error); } @@ -233,7 +233,7 @@ if (error) goto out; - obj = zio_buf_alloc(size); + obj = vmem_alloc(size, KM_SLEEP); error = nvlist_pack(zp->z_xattr_cached, &obj, &size, NV_ENCODE_XDR, KM_SLEEP); @@ -253,7 +253,7 @@ dmu_tx_commit(tx); } out_free: - zio_buf_free(obj, size); + vmem_free(obj, size); out: return (error); } diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_vfsops.c zfs-linux-0.7.0-rc3/module/zfs/zfs_vfsops.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_vfsops.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_vfsops.c 2017-01-20 18:18:28.000000000 +0000 @@ -744,19 +744,17 @@ zsb = kmem_zalloc(sizeof (zfs_sb_t), KM_SLEEP); /* - * We claim to always be readonly so we can open snapshots; - * other ZPL code will prevent us from writing to snapshots. + * Optional temporary mount options, free'd in zfs_sb_free(). */ - error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zsb, &os); - if (error) { - kmem_free(zsb, sizeof (zfs_sb_t)); - return (error); - } + zsb->z_mntopts = (zmo ? zmo : zfs_mntopts_alloc()); /* - * Optional temporary mount options, free'd in zfs_sb_free(). + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. */ - zsb->z_mntopts = (zmo ? zmo : zfs_mntopts_alloc()); + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zsb, &os); + if (error) + goto out_zmo; /* * Initialize the zfs-specific filesystem structure. @@ -896,8 +894,9 @@ out: dmu_objset_disown(os, zsb); +out_zmo: *zsbp = NULL; - + zfs_mntopts_free(zsb->z_mntopts); kmem_free(zsb, sizeof (zfs_sb_t)); return (error); } @@ -912,13 +911,6 @@ if (error) return (error); - /* - * Set the objset user_ptr to track its zsb. 
- */ - mutex_enter(&zsb->z_os->os_user_ptr_lock); - dmu_objset_set_user(zsb->z_os, zsb); - mutex_exit(&zsb->z_os->os_user_ptr_lock); - zsb->z_log = zil_open(zsb->z_os, zfs_get_data); /* @@ -982,6 +974,13 @@ readonly_changed_cb(zsb, B_TRUE); } + /* + * Set the objset user_ptr to track its zsb. + */ + mutex_enter(&zsb->z_os->os_user_ptr_lock); + dmu_objset_set_user(zsb->z_os, zsb); + mutex_exit(&zsb->z_os->os_user_ptr_lock); + return (0); } EXPORT_SYMBOL(zfs_sb_setup); @@ -1222,8 +1221,9 @@ defined(SHRINKER_NUMA_AWARE) if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { *objects = 0; - for_each_online_node(sc.nid) + for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); + } } else { *objects = (*shrinker->scan_objects)(shrinker, &sc); } @@ -1344,7 +1344,7 @@ if (!unmounting) { mutex_enter(&zsb->z_znodes_lock); for (zp = list_head(&zsb->z_all_znodes); zp != NULL; - zp = list_next(&zsb->z_all_znodes, zp)) { + zp = list_next(&zsb->z_all_znodes, zp)) { if (zp->z_sa_hdl) zfs_znode_dmu_fini(zp); } @@ -1919,7 +1919,11 @@ void zfs_fini(void) { - taskq_wait_outstanding(system_taskq, 0); + /* + * we don't use outstanding because zpl_posix_acl_free might add more. + */ + taskq_wait(system_delay_taskq); + taskq_wait(system_taskq); unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); zfsctl_fini(); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_vnops.c zfs-linux-0.7.0-rc3/module/zfs/zfs_vnops.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_vnops.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_vnops.c 2017-01-20 18:18:28.000000000 +0000 @@ -857,7 +857,7 @@ /* * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the excute bits is set. + * privileged and at least one of the execute bits is set. 
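+	 * (This is the usual Unix rule: an unprivileged write to an
+	 * executable file drops the setuid/setgid bits.)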
* * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID @@ -952,7 +952,7 @@ if (atomic_read(&ip->i_count) == 1) VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)), - (task_func_t *)iput, ip, TQ_SLEEP) != 0); + (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); else iput(ip); } @@ -1086,7 +1086,7 @@ error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= zp->z_blksz); + ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O @@ -1313,6 +1313,9 @@ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); + if (name == NULL) + return (SET_ERROR(EINVAL)); + ZFS_ENTER(zsb); ZFS_VERIFY_ZP(dzp); os = zsb->z_os; @@ -1509,6 +1512,123 @@ } EXPORT_SYMBOL(zfs_create); +/* ARGSUSED */ +int +zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp = NULL, *dzp = ITOZ(dip); + zfs_sb_t *zsb = ITOZSB(dip); + objset_t *os; + dmu_tx_t *tx; + int error; + uid_t uid; + gid_t gid; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + gid = crgetgid(cr); + uid = crgetuid(cr); + + if (zsb->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zsb); + ZFS_VERIFY_ZP(dzp); + os = zsb->z_os; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zsb); + return (error); + } + } + +top: + *ipp = NULL; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + have_acl = B_TRUE; + + if (zfs_acl_ids_overquota(zsb, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); + + fuid_dirtied = zsb->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zsb, tx); + if (!zsb->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); + if (error) { + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zsb); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zsb, tx); + + /* Add to unlinked set */ + zp->z_unlinked = 1; + zfs_unlinked_add(zp, tx); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); +out: + + if (error) { + if (zp) + iput(ZTOI(zp)); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + *ipp = ZTOI(zp); + } + + ZFS_EXIT(zsb); + return (error); +} + /* * Remove an entry from a directory. 
 *
@@ -1550,6 +1670,9 @@
 	int zflg = ZEXISTS;
 	boolean_t waited = B_FALSE;
 
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
 	ZFS_ENTER(zsb);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zsb->z_log;
@@ -1801,6 +1924,9 @@
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
+	if (dirname == NULL)
+		return (SET_ERROR(EINVAL));
+
 	ZFS_ENTER(zsb);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zsb->z_log;
@@ -1963,6 +2089,9 @@
 	int zflg = ZEXISTS;
 	boolean_t waited = B_FALSE;
 
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
 	ZFS_ENTER(zsb);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zsb->z_log;
@@ -1998,7 +2127,7 @@
 	}
 
 	/*
-	 * Grab a lock on the directory to make sure that noone is
+	 * Grab a lock on the directory to make sure that no one is
 	 * trying to add (or lookup) entries while we are removing it.
 	 */
 	rw_enter(&zp->z_name_lock, RW_WRITER);
@@ -2497,7 +2626,7 @@
 	if (zsb->z_issnap) {
 		if (ip->i_sb->s_root->d_inode == ip)
 			sp->ino = ZFSCTL_INO_SNAPDIRS -
-				dmu_objset_id(zsb->z_os);
+			    dmu_objset_id(zsb->z_os);
 	}
 
 	ZFS_EXIT(zsb);
@@ -3265,6 +3394,9 @@
 	int zflg = 0;
 	boolean_t waited = B_FALSE;
 
+	if (snm == NULL || tnm == NULL)
+		return (SET_ERROR(EINVAL));
+
 	ZFS_ENTER(zsb);
 	ZFS_VERIFY_ZP(sdzp);
 	zilog = zsb->z_log;
@@ -3609,6 +3741,9 @@
 
 	ASSERT(S_ISLNK(vap->va_mode));
 
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
 	ZFS_ENTER(zsb);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zsb->z_log;
@@ -3802,9 +3937,16 @@
 	uint64_t	parent;
 	uid_t		owner;
 	boolean_t	waited = B_FALSE;
-
+	boolean_t	is_tmpfile = 0;
+	uint64_t	txg;
+#ifdef HAVE_TMPFILE
+	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
+#endif
 	ASSERT(S_ISDIR(tdip->i_mode));
 
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
 	ZFS_ENTER(zsb);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zsb->z_log;
@@ -3885,6 +4027,9 @@
 	tx = dmu_tx_create(zsb->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	if (is_tmpfile)
+		dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL);
+
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
@@ -3900,23 +4045,43 @@
 		ZFS_EXIT(zsb);
 		return (error);
 	}
-
+	/* unmark z_unlinked so zfs_link_create will not reject */
+	if (is_tmpfile)
+		szp->z_unlinked = 0;
 	error = zfs_link_create(dl, szp, tx, 0);
 
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
-		if (flags & FIGNORECASE)
-			txtype |= TX_CI;
-		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
+		/*
+		 * The tmpfile is created in z_unlinkedobj, so remove it.
+		 * Also, we don't log in ZIL, because all previous file
+		 * operations on the tmpfile are ignored by ZIL. Instead we
+		 * always wait for txg to sync to make sure all previous
+		 * operations are sync safe.
+		 */
+		if (is_tmpfile) {
+			VERIFY(zap_remove_int(zsb->z_os, zsb->z_unlinkedobj,
+			    szp->z_id, tx) == 0);
+		} else {
+			if (flags & FIGNORECASE)
+				txtype |= TX_CI;
+			zfs_log_link(zilog, tx, txtype, dzp, szp, name);
+		}
+	} else if (is_tmpfile) {
+		/* restore z_unlinked since linking failed */
+		szp->z_unlinked = 1;
 	}
-
+	txg = dmu_tx_get_txg(tx);
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
-	if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
+	if (!is_tmpfile && zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
+	if (is_tmpfile)
+		txg_wait_synced(dmu_objset_pool(zsb->z_os), txg);
+
 	zfs_inode_update(dzp);
 	zfs_inode_update(szp);
 	ZFS_EXIT(zsb);
@@ -4112,8 +4277,7 @@
 	 * writepages() normally handles the entire commit for
 	 * performance reasons.
*/ - if (zsb->z_log != NULL) - zil_commit(zsb->z_log, zp->z_id); + zil_commit(zsb->z_log, zp->z_id); } ZFS_EXIT(zsb); @@ -4768,6 +4932,7 @@ #endif /* HAVE_UIO_ZEROCOPY */ #if defined(_KERNEL) && defined(HAVE_SPL) +/* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); module_param(zfs_read_chunk_size, long, 0644); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zfs_znode.c zfs-linux-0.7.0-rc3/module/zfs/zfs_znode.c --- zfs-linux-0.7.0-rc2/module/zfs/zfs_znode.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zfs_znode.c 2017-01-20 18:18:28.000000000 +0000 @@ -119,6 +119,7 @@ zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; + zp->z_xattr_parent = 0; zp->z_moved = 0; return (0); } @@ -478,6 +479,34 @@ } } +void +zfs_set_inode_flags(znode_t *zp, struct inode *ip) +{ + /* + * Linux and Solaris have different sets of file attributes, so we + * restrict this conversion to the intersection of the two. + */ +#ifdef HAVE_INODE_SET_FLAGS + unsigned int flags = 0; + if (zp->z_pflags & ZFS_IMMUTABLE) + flags |= S_IMMUTABLE; + if (zp->z_pflags & ZFS_APPENDONLY) + flags |= S_APPEND; + + inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); +#else + if (zp->z_pflags & ZFS_IMMUTABLE) + ip->i_flags |= S_IMMUTABLE; + else + ip->i_flags &= ~S_IMMUTABLE; + + if (zp->z_pflags & ZFS_APPENDONLY) + ip->i_flags |= S_APPEND; + else + ip->i_flags &= ~S_APPEND; +#endif +} + /* * Update the embedded inode given the znode. We should work toward * eliminating this function as soon as possible by removing values @@ -574,9 +603,7 @@ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16); - if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || - tmp_gen == 0) { - + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; @@ -589,6 +616,11 @@ set_nlink(ip, (uint32_t)links); zfs_uid_write(ip, z_uid); zfs_gid_write(ip, z_gid); + zfs_set_inode_flags(zp, ip); + + /* Cache the xattr parent id */ + if (zp->z_pflags & ZFS_XATTR) + zp->z_xattr_parent = parent; ZFS_TIME_DECODE(&ip->i_atime, atime); ZFS_TIME_DECODE(&ip->i_mtime, mtime); @@ -759,7 +791,7 @@ links = 2; } else { size = 0; - links = 1; + links = (flag & IS_TMPFILE) ? 
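zfs_set_inode_flags() above is what makes ZFS_IMMUTABLE and ZFS_APPENDONLY visible through the generic VFS inode, so ordinary Linux tooling (chattr/lsattr) can observe them. A quick userspace check with the standard flags ioctl (nothing here is ZFS-specific):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int
main(int argc, char **argv)
{
	unsigned int flags;
	int fd;

	if (argc != 2)
		return (1);
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0) {
		perror(argv[1]);
		return (1);
	}
	printf("immutable=%d append-only=%d\n",
	    !!(flags & FS_IMMUTABLE_FL), !!(flags & FS_APPEND_FL));
	close(fd);
	return (0);
}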
0 : 1; } if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) @@ -915,6 +947,7 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; + boolean_t update_inode = B_FALSE; xoap = xva_getxoptattr(xvap); ASSERT(xoap); @@ -926,7 +959,6 @@ &times, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); @@ -952,11 +984,8 @@ zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); - ZTOI(zp)->i_flags |= S_IMMUTABLE; - } else { - ZTOI(zp)->i_flags &= ~S_IMMUTABLE; + update_inode = B_TRUE; } - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); @@ -967,12 +996,8 @@ zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); - ZTOI(zp)->i_flags |= S_APPEND; - } else { - - ZTOI(zp)->i_flags &= ~S_APPEND; + update_inode = B_TRUE; } - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); @@ -1012,6 +1037,9 @@ zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } + + if (update_inode) + zfs_set_inode_flags(zp, ZTOI(zp)); } int @@ -1060,34 +1088,30 @@ mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); - if (zp->z_unlinked) { - err = SET_ERROR(ENOENT); - } else { - /* - * If igrab() returns NULL the VFS has independently - * determined the inode should be evicted and has - * called iput_final() to start the eviction process. - * The SA handle is still valid but because the VFS - * requires that the eviction succeed we must drop - * our locks and references to allow the eviction to - * complete. The zfs_zget() may then be retried. - * - * This unlikely case could be optimized by registering - * a sops->drop_inode() callback. The callback would - * need to detect the active SA hold thereby informing - * the VFS that this inode should not be evicted. - */ - if (igrab(ZTOI(zp)) == NULL) { - mutex_exit(&zp->z_lock); - sa_buf_rele(db, NULL); - zfs_znode_hold_exit(zsb, zh); - /* inode might need this to finish evict */ - cond_resched(); - goto again; - } - *zpp = zp; - err = 0; + /* + * If igrab() returns NULL the VFS has independently + * determined the inode should be evicted and has + * called iput_final() to start the eviction process. + * The SA handle is still valid but because the VFS + * requires that the eviction succeed we must drop + * our locks and references to allow the eviction to + * complete. The zfs_zget() may then be retried. + * + * This unlikely case could be optimized by registering + * a sops->drop_inode() callback. The callback would + * need to detect the active SA hold thereby informing + * the VFS that this inode should not be evicted.
+ */ + if (igrab(ZTOI(zp)) == NULL) { + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zsb, zh); + /* inode might need this to finish evict */ + cond_resched(); + goto again; } + *zpp = zp; + err = 0; mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); zfs_znode_hold_exit(zsb, zh); @@ -1221,12 +1245,12 @@ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); set_nlink(ZTOI(zp), (uint32_t)links); + zfs_set_inode_flags(zp, ZTOI(zp)); zp->z_blksz = doi.doi_data_block_size; zp->z_atime_dirty = 0; zfs_inode_update(zp); - zfs_znode_hold_exit(zsb, zh); return (0); @@ -2141,6 +2165,7 @@ EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); +/* CSTYLED */ module_param(zfs_object_mutex_size, uint, 0644); MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); #endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zil.c zfs-linux-0.7.0-rc3/module/zfs/zil.c --- zfs-linux-0.7.0-rc2/module/zfs/zil.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zil.c 2017-01-20 18:18:28.000000000 +0000 @@ -40,6 +40,7 @@ #include #include #include +#include /* * The zfs intent log (ZIL) saves transaction records of system calls @@ -543,7 +544,7 @@ /* * Allocate an initial log block if: * - there isn't one already - * - the existing block is the wrong endianess + * - the existing block is the wrong endianness */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); @@ -878,6 +879,7 @@ * one in zil_commit_writer(). zil_sync() will only remove * the lwb if lwb_buf is null. */ + abd_put(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_zio = NULL; @@ -914,12 +916,14 @@ /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); if (lwb->lwb_zio == NULL) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); if (!lwb->lwb_fastwrite) { metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); lwb->lwb_fastwrite = 1; } lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), + 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_FASTWRITE, &zb); @@ -2266,6 +2270,7 @@ module_param(zfs_nocacheflush, int, 0644); MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes"); +/* CSTYLED */ module_param(zil_slog_limit, ulong, 0644); MODULE_PARM_DESC(zil_slog_limit, "Max commit bytes to separate log device"); #endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zio.c zfs-linux-0.7.0-rc3/module/zfs/zio.c --- zfs-linux-0.7.0-rc2/module/zfs/zio.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zio.c 2017-01-20 18:18:28.000000000 +0000 @@ -42,6 +42,7 @@ #include #include #include +#include /* * ========================================================================== @@ -56,7 +57,7 @@ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl" }; -int zio_dva_throttle_enabled = B_FALSE; +int zio_dva_throttle_enabled = B_TRUE; /* * ========================================================================== @@ -67,6 +68,11 @@ kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +#if defined(ZFS_DEBUG) && !defined(_KERNEL) +uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> 
SPA_MINBLOCKSHIFT]; +#endif + int zio_delay_max = ZIO_DELAY_MAX; #define ZIO_PIPELINE_CONTINUE 0x100 @@ -131,7 +137,7 @@ size_t align = 0; size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; -#ifdef _ILP32 +#if defined(_ILP32) && defined(_KERNEL) /* * Cache size limited to 1M on 32-bit platforms until ARC * buffers no longer require virtual address space. @@ -212,6 +218,13 @@ if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) break; #endif +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c]) + (void) printf("zio_fini: [%d] %llu != %llu\n", + (int)((c + 1) << SPA_MINBLOCKSHIFT), + (long long unsigned)zio_buf_cache_allocs[c], + (long long unsigned)zio_buf_cache_frees[c]); +#endif if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); @@ -251,6 +264,9 @@ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + atomic_add_64(&zio_buf_cache_allocs[c], 1); +#endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } @@ -271,26 +287,15 @@ return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } -/* - * Use zio_buf_alloc_flags when specific allocation flags are needed. e.g. - * passing KM_NOSLEEP when it is acceptable for an allocation to fail. - */ -void * -zio_buf_alloc_flags(size_t size, int flags) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - return (kmem_cache_alloc(zio_buf_cache[c], flags)); -} - void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + atomic_add_64(&zio_buf_cache_frees[c], 1); +#endif kmem_cache_free(zio_buf_cache[c], buf); } @@ -311,12 +316,18 @@ * ========================================================================== */ void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, - zio_transform_func_t *transform) +zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, + zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - zt->zt_orig_data = zio->io_data; + /* + * Ensure that anyone expecting this zio to contain a linear ABD isn't + * going to get a nasty surprise when they try to access the data. 
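The zio_buf_cache_allocs/zio_buf_cache_frees pair added above gives non-kernel debug builds a cheap leak detector: bump a per-size-class counter on every alloc and free, and compare the two at zio_fini(). The same pattern in portable C11 (names and sizes here are illustrative, not the ZFS ones):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NCLASSES 4
static atomic_ulong allocs[NCLASSES];
static atomic_ulong frees[NCLASSES];

static void *
buf_alloc(size_t c)
{
	atomic_fetch_add(&allocs[c], 1);
	return (malloc((c + 1) * 512));
}

static void
buf_free(size_t c, void *p)
{
	atomic_fetch_add(&frees[c], 1);
	free(p);
}

int
main(void)
{
	void *p = buf_alloc(2);
	buf_free(2, p);
	(void) buf_alloc(3);	/* deliberately leaked */

	/* The zio_fini()-style audit: report any class that is unbalanced. */
	for (size_t c = 0; c < NCLASSES; c++)
		if (allocs[c] != frees[c])
			printf("class %zu: %lu != %lu\n", c,
			    (unsigned long)allocs[c],
			    (unsigned long)frees[c]);
	return (0);
}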
+ */ + IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); + + zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; @@ -324,7 +335,7 @@ zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; - zio->io_data = data; + zio->io_abd = data; zio->io_size = size; } @@ -336,12 +347,12 @@ while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, - zt->zt_orig_data, zt->zt_orig_size); + zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) - zio_buf_free(zio->io_data, zt->zt_bufsize); + abd_free(zio->io_abd); - zio->io_data = zt->zt_orig_data; + zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; @@ -355,21 +366,26 @@ * ========================================================================== */ static void -zio_subblock(zio_t *zio, void *data, uint64_t size) +zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) - bcopy(zio->io_data, data, size); + abd_copy(data, zio->io_abd, size); } static void -zio_decompress(zio_t *zio, void *data, uint64_t size) +zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { - if (zio->io_error == 0 && - zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_data, data, zio->io_size, size) != 0) - zio->io_error = SET_ERROR(EIO); + if (zio->io_error == 0) { + void *tmp = abd_borrow_buf(data, size); + int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_abd, tmp, zio->io_size, size); + abd_return_buf_copy(data, tmp, size); + + if (ret != 0) + zio->io_error = SET_ERROR(EIO); + } } /* @@ -552,7 +568,7 @@ */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, + abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, @@ -580,6 +596,7 @@ offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); + metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; @@ -611,7 +628,7 @@ zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; - zio->io_orig_data = zio->io_data = data; + zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; @@ -641,6 +658,7 @@ static void zio_destroy(zio_t *zio) { + metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); @@ -755,7 +773,7 @@ zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -773,7 +791,7 @@ zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, + abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, @@ -814,7 +832,7 
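zio_decompress() above shows the borrow/return idiom this patch leans on wherever code still needs a flat buffer: abd_borrow_buf() hands out a linear view (allocating one when the ABD is scattered), and abd_return_buf_copy() copies the result back before releasing it. A toy linear-only model of that pairing (real ABDs may be scatter/gather and avoid the copy in the linear case; the toy_* names are invented for illustration):

#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdio.h>

typedef struct { char *buf; size_t size; } toy_abd_t;

static void *
toy_borrow_buf(toy_abd_t *abd, size_t n)
{
	assert(n <= abd->size);
	return (malloc(n));		/* scattered case: fresh linear buf */
}

static void
toy_return_buf_copy(toy_abd_t *abd, void *tmp, size_t n)
{
	memcpy(abd->buf, tmp, n);	/* publish the borrowed data back */
	free(tmp);
}

int
main(void)
{
	char storage[8] = {0};
	toy_abd_t abd = { storage, sizeof (storage) };

	char *tmp = toy_borrow_buf(&abd, 6);
	memcpy(tmp, "hello", 6);	/* "decompress" into the flat view */
	toy_return_buf_copy(&abd, tmp, 6);
	printf("%s\n", abd.buf);
	return (0);
}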
@@ } zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { @@ -967,7 +985,7 @@ zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -988,7 +1006,7 @@ zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -1011,8 +1029,9 @@ * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ - void *wbuf = zio_buf_alloc(size); - bcopy(data, wbuf, size); + abd_t *wbuf = abd_alloc_sametype(data, size); + abd_copy(wbuf, data, size); + zio_push_transform(zio, wbuf, size, size, NULL); } @@ -1024,7 +1043,7 @@ */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; @@ -1090,9 +1109,9 @@ } zio_t * -zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, - int type, zio_priority_t priority, enum zio_flag flags, - zio_done_func_t *done, void *private) +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, + int type, zio_priority_t priority, enum zio_flag flags, + zio_done_func_t *done, void *private) { zio_t *zio; @@ -1151,14 +1170,17 @@ !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(psize); - - zio_push_transform(zio, cbuf, psize, psize, zio_decompress); + zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), + psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + int psize = BPE_GET_PSIZE(bp); + void *data = abd_borrow_buf(zio->io_abd, psize); + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - decode_embedded_bp_compressed(bp, zio->io_data); + decode_embedded_bp_compressed(bp, data); + abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } @@ -1299,7 +1321,7 @@ /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && psize == lsize) { void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1337,9 +1359,11 @@ zio_buf_free(cbuf, lsize); psize = lsize; } else { - bzero((char *)cbuf + psize, rounded - psize); + abd_t *cdata = abd_get_from_buf(cbuf, lsize); + abd_take_ownership_of_buf(cdata, B_TRUE); + abd_zero_off(cdata, psize, rounded - psize); psize = rounded; - zio_push_transform(zio, cbuf, + zio_push_transform(zio, cdata, psize, lsize, NULL); } } @@ -1555,9 +1579,9 @@ * OpenZFS's timeout_generic(). 
*/ tid = taskq_dispatch_delay(system_taskq, - (task_func_t *) zio_interrupt, + (task_func_t *)zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); - if (!tid) { + if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. @@ -1942,26 +1966,38 @@ * ========================================================================== */ +static void +zio_gang_issue_func_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + static zio_t * -zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { if (gn != NULL) return (pio); - return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), - NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } -zio_t * -zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { zio_t *zio; if (gn != NULL) { + abd_t *gbh_abd = + abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will @@ -1972,8 +2008,12 @@ * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { + abd_t *buf = abd_get_offset(data, offset); + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), - data, BP_GET_PSIZE(bp)); + buf, BP_GET_PSIZE(bp)); + + abd_put(buf); } /* * If we are here to damage data for testing purposes, @@ -1983,7 +2023,8 @@ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + abd_get_offset(data, offset), BP_GET_PSIZE(bp), + zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } @@ -1991,16 +2032,18 @@ } /* ARGSUSED */ -zio_t * -zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ -zio_t * -zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); @@ -2064,13 +2107,14 @@ zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, - SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_gang_tree_assemble_done, gn, 
gio->io_priority, + ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void @@ -2087,13 +2131,16 @@ if (zio->io_error) return; + /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); + byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); - ASSERT(zio->io_data == gn->gn_gbh); + ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + abd_put(zio->io_abd); + for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) @@ -2103,7 +2150,8 @@ } static void -zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, + uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -2117,7 +2165,7 @@ * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ - zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); @@ -2126,13 +2174,14 @@ blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; - zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); - data = (char *)data + BP_GET_PSIZE(gbp); + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, + offset); + offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) - ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); + ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); @@ -2165,7 +2214,8 @@ ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, + 0); else zio_gang_tree_free(&zio->io_gang_tree); @@ -2205,6 +2255,12 @@ mutex_exit(&pio->io_lock); } +static void +zio_write_gang_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + static int zio_write_gang_block(zio_t *pio) { @@ -2215,6 +2271,7 @@ zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; + abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; @@ -2244,7 +2301,8 @@ } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, - bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio); + bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, + &pio->io_alloc_list, pio); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2275,12 +2333,14 @@ gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); + gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ - zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_write_gang_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. 
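In the gang-block hunks above, pointer arithmetic like (char *)data + offset is replaced by abd_get_offset() views, and each issue function gains a matching done callback (zio_gang_issue_func_done) whose only job is to abd_put() that view when the child I/O completes. A standalone stand-in for the acquire-in-issue / release-in-done shape (view_t and friends are invented; no real I/O happens):

#include <stdio.h>
#include <stdlib.h>

typedef struct view { const char *base; size_t off; } view_t;

static view_t *
view_get_offset(const char *base, size_t off)	/* like abd_get_offset() */
{
	view_t *v = malloc(sizeof (*v));
	v->base = base;
	v->off = off;
	return (v);
}

static void
view_put(view_t *v)		/* the "done" side of the pairing */
{
	free(v);
}

static void
issue_child(const char *base, size_t off, void (*done)(view_t *))
{
	view_t *v = view_get_offset(base, off);
	printf("child sees '%c'\n", v->base[v->off]);	/* pretend I/O */
	done(v);		/* completion releases the view */
}

int
main(void)
{
	const char data[] = "abcd";
	size_t off = 0;

	for (int g = 0; g < 4; g++) {
		issue_child(data, off, view_put);
		off += 1;	/* next gang member's offset */
	}
	return (0);
}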
@@ -2302,9 +2362,9 @@ zp.zp_nopwrite = B_FALSE; cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - (char *)pio->io_data + (pio->io_size - resid), lsize, - lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL, - &gn->gn_child[g], pio->io_priority, + abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, + lsize, &zp, zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { @@ -2320,7 +2380,6 @@ zp.zp_copies, cio, flags)); } zio_nowait(cio); - } /* @@ -2423,10 +2482,11 @@ ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ - if (zio->io_error == 0 && dde->dde_repair_data == NULL) - dde->dde_repair_data = zio->io_data; + + if (zio->io_error == 0 && dde->dde_repair_abd == NULL) + dde->dde_repair_abd = zio->io_abd; else - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } @@ -2459,16 +2519,16 @@ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, - zio_buf_alloc(zio->io_size), zio->io_size, - zio_ddt_child_read_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, - &zio->io_bookmark)); + abd_alloc_for_io(zio->io_size, B_TRUE), + zio->io_size, zio_ddt_child_read_done, dde, + zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | + ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, - zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, + zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); @@ -2498,8 +2558,9 @@ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } - if (dde->dde_repair_data != NULL) { - bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + if (dde->dde_repair_abd != NULL) { + abd_copy(zio->io_abd, dde->dde_repair_abd, + zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); @@ -2537,12 +2598,10 @@ if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || - bcmp(zio->io_data, lio->io_data, - zio->io_size) != 0); + abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || - bcmp(zio->io_orig_data, lio->io_orig_data, - zio->io_orig_size) != 0); + abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } @@ -2552,7 +2611,7 @@ if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; - void *tmpbuf; + abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); @@ -2563,19 +2622,19 @@ ddt_exit(ddt); - tmpbuf = zio_buf_alloc(psize); + tmpabd = abd_alloc_for_io(psize, B_TRUE); - error = zio_wait(zio_read(NULL, spa, &blk, tmpbuf, + error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { - if (bcmp(tmpbuf, zio->io_data, psize) != 0) + if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } - zio_buf_free(tmpbuf, psize); + abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (ddp->ddp_phys_birth != 0) { @@ -2597,7 +2656,7 @@ &aflags, &zio->io_bookmark); if (error == 0) { - if (bcmp(abuf->b_data, zio->io_orig_data, + if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = 
SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); @@ -2762,12 +2821,12 @@ return (ZIO_PIPELINE_CONTINUE); } - dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); + zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } @@ -2784,13 +2843,13 @@ ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } @@ -2955,7 +3014,8 @@ flags |= METASLAB_ASYNC_ALLOC; error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio); + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, + &zio->io_alloc_list, zio); if (error != 0) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " @@ -3021,18 +3081,24 @@ boolean_t use_slog) { int error = 1; + zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); + metaslab_trace_init(&io_alloc_list); + if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL); + new_bp, 1, txg, NULL, METASLAB_FASTWRITE, + &io_alloc_list, NULL); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, NULL, METASLAB_FASTWRITE, NULL); + new_bp, 1, txg, NULL, METASLAB_FASTWRITE, + &io_alloc_list, NULL); } + metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); @@ -3130,11 +3196,11 @@ P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf + zio->io_size, asize - zio->io_size); + abd_copy(abuf, zio->io_abd, zio->io_size); + abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -3264,7 +3330,7 @@ { void *buf = zio_buf_alloc(zio->io_size); - bcopy(zio->io_data, buf, zio->io_size); + abd_copy_to_buf(buf, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; @@ -3398,7 +3464,7 @@ } } - zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); + zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (ZIO_PIPELINE_CONTINUE); } @@ -3537,7 +3603,7 @@ if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { - ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); + ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } @@ -3616,6 +3682,7 @@ * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep.
*/ + uint64_t psize = zio->io_size; zio_t *pio, *pio_next; int c, w; zio_link_t *zl = NULL; @@ -3696,28 +3763,35 @@ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; - uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio->io_data; - - if (asize != zio->io_size) { - abuf = zio_buf_alloc(asize); - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf+zio->io_size, asize-zio->io_size); + uint64_t asize = P2ROUNDUP(psize, align); + char *abuf = NULL; + abd_t *adata = zio->io_abd; + + if (asize != psize) { + adata = abd_alloc_linear(asize, B_TRUE); + abd_copy(adata, zio->io_abd, psize); + abd_zero_off(adata, psize, asize - psize); } + if (adata != NULL) + abuf = abd_borrow_buf_copy(adata, asize); + zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); - if (asize != zio->io_size) - zio_buf_free(abuf, asize); + if (adata != NULL) + abd_return_buf(adata, abuf, asize); + + if (asize != psize) + abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ - vdev_stat_update(zio, zio->io_size); + vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding @@ -3738,9 +3812,9 @@ * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && - !vdev_is_dead(zio->io_vd)) + !vdev_is_dead(zio->io_vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, - zio->io_vd, zio, 0, 0); + zio->io_vd, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && @@ -4098,7 +4172,6 @@ EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); -EXPORT_SYMBOL(zio_buf_alloc_flags); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zio_checksum.c zfs-linux-0.7.0-rc3/module/zfs/zio_checksum.c --- zfs-linux-0.7.0-rc2/module/zfs/zio_checksum.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zio_checksum.c 2017-01-20 18:18:28.000000000 +0000 @@ -20,8 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 
*/ #include @@ -30,6 +30,7 @@ #include #include #include +#include #include /* @@ -92,45 +93,85 @@ /*ARGSUSED*/ static void -zio_checksum_off(const void *buf, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) +abd_checksum_off(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } +/*ARGSUSED*/ +void +abd_fletcher_2_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_native, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_byteswap, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_native, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_byteswap, zcp); +} + zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, + {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, - {{fletcher_4_native, fletcher_4_byteswap}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, - {{zio_checksum_off, zio_checksum_off}, + {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "noparity"}, - {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, + {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, - {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, - zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, + {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, + abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, - {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap}, - zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free, + {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, + abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, 
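The abd_fletcher_4_* wrappers above work because Fletcher-4 is pure running state: four accumulators that can be carried across abd_iterate_func() chunks, so a scattered buffer checksums to the same value as a flat one. A self-contained demonstration of that property (a plain C accumulator loop, not the SIMD implementations ZFS actually dispatches to):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint64_t a, b, c, d; } zcksum_t;

/* Incremental Fletcher-4 over 32-bit words; state carries across calls. */
static void
fletcher4_incr(zcksum_t *z, const void *buf, size_t size)
{
	const uint32_t *ip = buf;
	const uint32_t *end = ip + size / sizeof (uint32_t);

	for (; ip < end; ip++) {
		z->a += *ip;
		z->b += z->a;
		z->c += z->b;
		z->d += z->c;
	}
}

int
main(void)
{
	uint8_t data[4096];
	zcksum_t whole = {0}, chunked = {0};

	for (size_t i = 0; i < sizeof (data); i++)
		data[i] = (uint8_t)(i * 7 + 3);

	fletcher4_incr(&whole, data, sizeof (data));
	for (size_t off = 0; off < sizeof (data); off += 512)
		fletcher4_incr(&chunked, data + off, 512);	/* carry state */

	printf("%s\n", memcmp(&whole, &chunked, sizeof (whole)) == 0 ?
	    "chunked == whole" : "mismatch");
	return (0);
}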
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, }; @@ -251,7 +292,7 @@ */ void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size) + abd_t *abd, uint64_t size) { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; @@ -266,6 +307,7 @@ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; + void *data = abd_to_buf(abd); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -283,18 +325,18 @@ else bp->blk_cksum = eck->zec_cksum; eck->zec_magic = ZEC_MAGIC; - ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); eck->zec_cksum = cksum; } else { - ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &bp->blk_cksum); } } int zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, - void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) + abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; int byteswap; @@ -308,25 +350,32 @@ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; zio_cksum_t verifier; + size_t eck_offset; + uint64_t data_size = size; + void *data = abd_borrow_buf_copy(abd, data_size); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; uint64_t nused; eck = &zilc->zc_eck; - if (eck->zec_magic == ZEC_MAGIC) + if (eck->zec_magic == ZEC_MAGIC) { nused = zilc->zc_nused; - else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) + } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { nused = BSWAP_64(zilc->zc_nused); - else + } else { + abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); + } - if (nused > size) + if (nused > data_size) { + abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); + } size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)data + data_size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) @@ -341,11 +390,15 @@ if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; - ci->ci_func[byteswap](data, size, + abd_return_buf_copy(abd, data, data_size); + + ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); - eck->zec_cksum = expected_cksum; + abd_copy_from_buf_off(abd, &expected_cksum, + eck_offset, sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, @@ -354,7 +407,7 @@ } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; - ci->ci_func[byteswap](data, size, + ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } @@ -383,7 +436,7 @@ uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; - void *data = zio->io_data; + abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; error = zio_checksum_error_impl(spa, bp, checksum, data, size, diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zio_compress.c zfs-linux-0.7.0-rc3/module/zfs/zio_compress.c --- zfs-linux-0.7.0-rc2/module/zfs/zio_compress.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zio_compress.c 2017-01-20 18:18:28.000000000 +0000 @@ -28,7 +28,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. 
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include @@ -41,24 +41,23 @@ /* * Compression vectors. */ - zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {NULL, NULL, 0, "inherit"}, - {NULL, NULL, 0, "on"}, - {NULL, NULL, 0, "uncompressed"}, - {lzjb_compress, lzjb_decompress, 0, "lzjb"}, - {NULL, NULL, 0, "empty"}, - {gzip_compress, gzip_decompress, 1, "gzip-1"}, - {gzip_compress, gzip_decompress, 2, "gzip-2"}, - {gzip_compress, gzip_decompress, 3, "gzip-3"}, - {gzip_compress, gzip_decompress, 4, "gzip-4"}, - {gzip_compress, gzip_decompress, 5, "gzip-5"}, - {gzip_compress, gzip_decompress, 6, "gzip-6"}, - {gzip_compress, gzip_decompress, 7, "gzip-7"}, - {gzip_compress, gzip_decompress, 8, "gzip-8"}, - {gzip_compress, gzip_decompress, 9, "gzip-9"}, - {zle_compress, zle_decompress, 64, "zle"}, - {lz4_compress_zfs, lz4_decompress_zfs, 0, "lz4"}, + {"inherit", 0, NULL, NULL}, + {"on", 0, NULL, NULL}, + {"uncompressed", 0, NULL, NULL}, + {"lzjb", 0, lzjb_compress, lzjb_decompress}, + {"empty", 0, NULL, NULL}, + {"gzip-1", 1, gzip_compress, gzip_decompress}, + {"gzip-2", 2, gzip_compress, gzip_decompress}, + {"gzip-3", 3, gzip_compress, gzip_decompress}, + {"gzip-4", 4, gzip_compress, gzip_decompress}, + {"gzip-5", 5, gzip_compress, gzip_decompress}, + {"gzip-6", 6, gzip_compress, gzip_decompress}, + {"gzip-7", 7, gzip_compress, gzip_decompress}, + {"gzip-8", 8, gzip_compress, gzip_decompress}, + {"gzip-9", 9, gzip_compress, gzip_decompress}, + {"zle", 64, zle_compress, zle_decompress}, + {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs} }; enum zio_compress @@ -85,12 +84,26 @@ return (result); } +/*ARGSUSED*/ +static int +zio_compress_zeroed_cb(void *data, size_t len, void *private) +{ + uint64_t *end = (uint64_t *)((char *)data + len); + uint64_t *word; + + for (word = data; word < end; word++) + if (*word != 0) + return (1); + + return (0); +} + size_t -zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) +zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) { - uint64_t *word, *word_end; size_t c_len, d_len; zio_compress_info_t *ci = &zio_compress_table[c]; + void *tmp; ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); @@ -99,12 +112,7 @@ * If the data is all zeroes, we don't even need to allocate * a block for it. We indicate this by returning zero size. 
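zio_compress_zeroed_cb() above exploits the iterator contract: a callback returning nonzero makes abd_iterate_func() stop and propagate that value, so the all-zero test can bail at the first set word without ever flattening the buffer. The same contract modeled standalone (iterate_chunks is an invented stand-in for abd_iterate_func):

#include <stdint.h>
#include <stdio.h>

/* Iterate over fixed-size chunks; stop early when the callback returns != 0. */
static int
iterate_chunks(const void *buf, size_t len, size_t chunk,
    int (*func)(const void *, size_t, void *), void *private)
{
	const char *p = buf;

	for (size_t off = 0; off < len; off += chunk) {
		size_t n = (len - off < chunk) ? len - off : chunk;
		int ret = func(p + off, n, private);
		if (ret != 0)
			return (ret);
	}
	return (0);
}

static int
zeroed_cb(const void *data, size_t len, void *private)
{
	const uint64_t *word = data;
	const uint64_t *end = (const uint64_t *)((const char *)data + len);

	for (; word < end; word++)
		if (*word != 0)
			return (1);	/* non-zero found: abort iteration */
	return (0);
}

int
main(void)
{
	uint64_t buf[512] = {0};

	printf("all zero? %s\n",
	    iterate_chunks(buf, sizeof (buf), 256, zeroed_cb, NULL) == 0 ?
	    "yes" : "no");
	buf[300] = 1;
	printf("all zero? %s\n",
	    iterate_chunks(buf, sizeof (buf), 256, zeroed_cb, NULL) == 0 ?
	    "yes" : "no");
	return (0);
}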
*/ - word_end = (uint64_t *)((char *)src + s_len); - for (word = src; word < word_end; word++) - if (*word != 0) - break; - - if (word == word_end) + if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) return (0); if (c == ZIO_COMPRESS_EMPTY) @@ -112,7 +120,11 @@ /* Compress at least 12.5% */ d_len = s_len - (s_len >> 3); - c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); + + /* No compression algorithms can read from ABDs directly */ + tmp = abd_borrow_buf_copy(src, s_len); + c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level); + abd_return_buf(src, tmp, s_len); if (c_len > d_len) return (s_len); @@ -122,13 +134,23 @@ } int -zio_decompress_data(enum zio_compress c, void *src, void *dst, +zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len) { zio_compress_info_t *ci = &zio_compress_table[c]; - if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } + +int +zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len) +{ + void *tmp = abd_borrow_buf_copy(src, s_len); + int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); + abd_return_buf(src, tmp, s_len); + + return (ret); +} diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zio_inject.c zfs-linux-0.7.0-rc3/module/zfs/zio_inject.c --- zfs-linux-0.7.0-rc2/module/zfs/zio_inject.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zio_inject.c 2017-01-20 18:18:28.000000000 +0000 @@ -41,7 +41,7 @@ */ #include -#include +#include #include #include #include diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zpl_export.c zfs-linux-0.7.0-rc3/module/zfs/zpl_export.c --- zfs-linux-0.7.0-rc2/module/zfs/zpl_export.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zpl_export.c 2017-01-20 18:18:28.000000000 +0000 @@ -37,6 +37,7 @@ #else zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) { + /* CSTYLED */ struct inode *ip = dentry->d_inode; #endif /* HAVE_ENCODE_FH_WITH_INODE */ fstrans_cookie_t cookie; diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zpl_file.c zfs-linux-0.7.0-rc3/module/zfs/zpl_file.c --- zfs-linux-0.7.0-rc2/module/zfs/zpl_file.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zpl_file.c 2017-01-20 18:18:28.000000000 +0000 @@ -130,12 +130,15 @@ return (error); } +#ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { struct file *filp = kiocb->ki_filp; return (zpl_fsync(filp, file_dentry(filp), datasync)); } +#endif + #elif defined(HAVE_FSYNC_WITHOUT_DENTRY) /* * Linux 2.6.35 - 3.0 API, @@ -161,11 +164,14 @@ return (error); } +#ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, datasync)); } +#endif + #elif defined(HAVE_FSYNC_RANGE) /* * Linux 3.1 - 3.x API, @@ -196,11 +202,14 @@ return (error); } +#ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync)); } +#endif + #else #error "Unsupported fops->fsync() implementation" #endif @@ -728,8 +737,7 @@ * is outside of our jurisdiction. 
*/ -#define fchange(f0, f1, b0, b1) ((((f0) & (b0)) == (b0)) != \ - (((b1) & (f1)) == (f1))) +#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) static int zpl_ioctl_setflags(struct file *filp, void __user *arg) @@ -838,7 +846,9 @@ #endif .mmap = zpl_mmap, .fsync = zpl_fsync, +#ifdef HAVE_FILE_AIO_FSYNC .aio_fsync = zpl_aio_fsync, +#endif #ifdef HAVE_FILE_FALLOCATE .fallocate = zpl_fallocate, #endif /* HAVE_FILE_FALLOCATE */ diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zpl_inode.c zfs-linux-0.7.0-rc3/module/zfs/zpl_inode.c --- zfs-linux-0.7.0-rc2/module/zfs/zpl_inode.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zpl_inode.c 2017-01-20 18:18:28.000000000 +0000 @@ -214,6 +214,45 @@ return (error); } +#ifdef HAVE_TMPFILE +static int +zpl_tmpfile(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) +{ + cred_t *cr = CRED(); + struct inode *ip; + vattr_t *vap; + int error; + fstrans_cookie_t cookie; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, mode, cr); + + cookie = spl_fstrans_mark(); + error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL); + if (error == 0) { + /* d_tmpfile will do drop_nlink, so we should set it first */ + set_nlink(ip, 1); + d_tmpfile(dentry, ip); + + error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + if (error == 0) + error = zpl_init_acl(ip, dir); + /* + * don't need to handle error here, file is already in + * unlinked set. + */ + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} +#endif + static int zpl_unlink(struct inode *dir, struct dentry *dentry) { @@ -275,7 +314,7 @@ } static int -zpl_rmdir(struct inode * dir, struct dentry *dentry) +zpl_rmdir(struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); int error; @@ -340,7 +379,7 @@ if (vap->va_mask & ATTR_ATIME) ip->i_atime = timespec_trunc(ia->ia_atime, - ip->i_sb->s_time_gran); + ip->i_sb->s_time_gran); cookie = spl_fstrans_mark(); error = -zfs_setattr(ip, vap, 0, cr); @@ -618,6 +657,7 @@ zpl_revalidate(struct dentry *dentry, unsigned int flags) { #endif /* HAVE_D_REVALIDATE_NAMEIDATA */ + /* CSTYLED */ zfs_sb_t *zsb = dentry->d_sb->s_fs_info; int error; @@ -677,6 +717,9 @@ .fallocate = zpl_fallocate, #endif /* HAVE_INODE_FALLOCATE */ #if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) + .set_acl = zpl_set_acl, +#endif #if defined(HAVE_GET_ACL) .get_acl = zpl_get_acl, #elif defined(HAVE_CHECK_ACL) @@ -701,6 +744,9 @@ #else .rename = zpl_rename, #endif +#ifdef HAVE_TMPFILE + .tmpfile = zpl_tmpfile, +#endif .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR @@ -710,6 +756,9 @@ #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) + .set_acl = zpl_set_acl, +#endif #if defined(HAVE_GET_ACL) .get_acl = zpl_get_acl, #elif defined(HAVE_CHECK_ACL) @@ -721,7 +770,9 @@ }; const struct inode_operations zpl_symlink_inode_operations = { +#ifdef HAVE_GENERIC_READLINK .readlink = generic_readlink, +#endif #if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE) .get_link = zpl_get_link, #elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA) @@ -750,6 +801,9 @@ #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) + .set_acl = zpl_set_acl, +#endif #if defined(HAVE_GET_ACL) .get_acl = zpl_get_acl, #elif defined(HAVE_CHECK_ACL) diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zpl_xattr.c 
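The fchange() rewrite above is a genuine bug fix, not a style change: the old second clause tested ((b1) & (f1)) == (f1), i.e. "all bits of f1 lie within b1", rather than "the tracked bit b1 is set in f1", so it misfired whenever unrelated flag bits were set. The new form normalizes each side to 0/1 with ! before comparing. The difference is easy to sweep exhaustively:

#include <stdio.h>

#define FCHANGE_OLD(f0, f1, b0, b1) \
	((((f0) & (b0)) == (b0)) != (((b1) & (f1)) == (f1)))
#define FCHANGE_NEW(f0, f1, b0, b1) \
	(!((f0) & (b0)) != !((f1) & (b1)))

int
main(void)
{
	/* Track bit 0x2 in each flag word; sweep all small combinations. */
	for (unsigned f0 = 0; f0 < 4; f0++)
		for (unsigned f1 = 0; f1 < 4; f1++) {
			int o = FCHANGE_OLD(f0, f1, 0x2, 0x2);
			int n = FCHANGE_NEW(f0, f1, 0x2, 0x2);
			if (o != n)
				printf("f0=%u f1=%u old=%d new=%d\n",
				    f0, f1, o, n);
		}
	return (0);
}

Running this prints the disagreements, e.g. f0=0, f1=3: the bit actually changed, the new macro says 1, the old macro wrongly says 0 because f1's extra bit defeats its subset test.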
zfs-linux-0.7.0-rc3/module/zfs/zpl_xattr.c --- zfs-linux-0.7.0-rc2/module/zfs/zpl_xattr.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zpl_xattr.c 2017-01-20 18:18:28.000000000 +0000 @@ -50,7 +50,7 @@ * are the security.selinux xattrs which are less than 100 bytes and * exist for every file when xattr labeling is enabled. * - * The Linux xattr implemenation has been written to take advantage of + * The Linux xattr implementation has been written to take advantage of * this typical usage. When the dataset property 'xattr=sa' is set, * then xattrs will be preferentially stored as System Attributes (SA). * This allows tiny xattrs (~100 bytes) to be stored with the dnode and @@ -936,7 +936,7 @@ */ #ifdef CONFIG_FS_POSIX_ACL int -zpl_set_acl(struct inode *ip, int type, struct posix_acl *acl) +zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) { struct super_block *sb = ITOZSB(ip)->z_sb; char *name, *value = NULL; @@ -1140,7 +1140,7 @@ umode_t mode; if (S_ISDIR(ip->i_mode)) { - error = zpl_set_acl(ip, ACL_TYPE_DEFAULT, acl); + error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT); if (error) goto out; } @@ -1151,7 +1151,7 @@ ip->i_mode = mode; zfs_mark_inode_dirty(ip); if (error > 0) - error = zpl_set_acl(ip, ACL_TYPE_ACCESS, acl); + error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); } } out: @@ -1178,7 +1178,7 @@ error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); if (!error) - error = zpl_set_acl(ip, ACL_TYPE_ACCESS, acl); + error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); zpl_posix_acl_release(acl); @@ -1308,7 +1308,7 @@ acl = NULL; } - error = zpl_set_acl(ip, type, acl); + error = zpl_set_acl(ip, acl, type); zpl_posix_acl_release(acl); return (error); @@ -1348,7 +1348,7 @@ acl = NULL; } - error = zpl_set_acl(ip, type, acl); + error = zpl_set_acl(ip, acl, type); zpl_posix_acl_release(acl); return (error); @@ -1441,3 +1441,103 @@ return (NULL); } + +#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) +struct acl_rel_struct { + struct acl_rel_struct *next; + struct posix_acl *acl; + clock_t time; +}; + +#define ACL_REL_GRACE (60*HZ) +#define ACL_REL_WINDOW (1*HZ) +#define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW) + +/* + * Lockless multi-producer single-consumer fifo list. + * Nodes are added to tail and removed from head. Tail pointer is our + * synchronization point. It always points to the next pointer of the last + * node, or head if list is empty. + */ +static struct acl_rel_struct *acl_rel_head = NULL; +static struct acl_rel_struct **acl_rel_tail = &acl_rel_head; + +static void +zpl_posix_acl_free(void *arg) +{ + struct acl_rel_struct *freelist = NULL; + struct acl_rel_struct *a; + clock_t new_time; + boolean_t refire = B_FALSE; + + ASSERT3P(acl_rel_head, !=, NULL); + while (acl_rel_head) { + a = acl_rel_head; + if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) { + /* + * If a is the last node we need to reset tail, but we + * need to use cmpxchg to make sure it is still the + * last node. + */ + if (acl_rel_tail == &a->next) { + acl_rel_head = NULL; + if (cmpxchg(&acl_rel_tail, &a->next, + &acl_rel_head) == &a->next) { + ASSERT3P(a->next, ==, NULL); + a->next = freelist; + freelist = a; + break; + } + } + /* + * a is not last node, make sure next pointer is set + * by the adder and advance the head. + */ + while (ACCESS_ONCE(a->next) == NULL) + cpu_relax(); + acl_rel_head = a->next; + a->next = freelist; + freelist = a; + } else { + /* + * a is still in grace period. 
We are responsible to + * reschedule the free task, since adder will only do + * so if list is empty. + */ + new_time = a->time + ACL_REL_SCHED; + refire = B_TRUE; + break; + } + } + + if (refire) + taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, + NULL, TQ_SLEEP, new_time); + + while (freelist) { + a = freelist; + freelist = a->next; + kfree(a->acl); + kmem_free(a, sizeof (struct acl_rel_struct)); + } +} + +void +zpl_posix_acl_release_impl(struct posix_acl *acl) +{ + struct acl_rel_struct *a, **prev; + + a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP); + a->next = NULL; + a->acl = acl; + a->time = ddi_get_lbolt(); + /* atomically points tail to us and get the previous tail */ + prev = xchg(&acl_rel_tail, &a->next); + ASSERT3P(*prev, ==, NULL); + *prev = a; + /* if it was empty before, schedule the free task */ + if (prev == &acl_rel_head) + taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, + NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED); +} +#endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zrlock.c zfs-linux-0.7.0-rc3/module/zfs/zrlock.c --- zfs-linux-0.7.0-rc2/module/zfs/zrlock.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zrlock.c 2017-01-20 18:18:28.000000000 +0000 @@ -70,11 +70,7 @@ } void -#ifdef ZFS_DEBUG -zrl_add_debug(zrlock_t *zrl, const char *zc) -#else -zrl_add(zrlock_t *zrl) -#endif +zrl_add_impl(zrlock_t *zrl, const char *zc) { uint32_t n = (uint32_t)zrl->zr_refcount; @@ -199,11 +195,7 @@ #if defined(_KERNEL) && defined(HAVE_SPL) -#ifdef ZFS_DEBUG -EXPORT_SYMBOL(zrl_add_debug); -#else -EXPORT_SYMBOL(zrl_add); -#endif +EXPORT_SYMBOL(zrl_add_impl); EXPORT_SYMBOL(zrl_remove); #endif diff -Nru zfs-linux-0.7.0-rc2/module/zfs/zvol.c zfs-linux-0.7.0-rc3/module/zfs/zvol.c --- zfs-linux-0.7.0-rc2/module/zfs/zvol.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zfs/zvol.c 2017-01-20 18:18:28.000000000 +0000 @@ -61,12 +61,16 @@ static kmutex_t zvol_state_lock; static list_t zvol_state_list; -void *zvol_tag = "zvol_tag"; + +#define ZVOL_HT_SIZE 1024 +static struct hlist_head *zvol_htable; +#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)]) +static DEFINE_IDA(zvol_ida); /* * The in-core state of each volume. */ -typedef struct zvol_state { +struct zvol_state { char zv_name[MAXNAMELEN]; /* name */ uint64_t zv_volsize; /* advertised space */ uint64_t zv_volblocksize; /* volume block size */ @@ -81,7 +85,11 @@ struct gendisk *zv_disk; /* generic disk */ struct request_queue *zv_queue; /* request queue */ list_node_t zv_next; /* next zvol_state_t linkage */ -} zvol_state_t; + uint64_t zv_hash; /* name hash */ + struct hlist_node zv_hlink; /* hash link */ + atomic_t zv_suspend_ref; /* refcount for suspend */ + krwlock_t zv_suspend_lock; /* suspend lock */ +}; typedef enum { ZVOL_ASYNC_CREATE_MINORS, @@ -102,30 +110,17 @@ #define ZVOL_RDONLY 0x1 -/* - * Find the next available range of ZVOL_MINORS minor numbers. The - * zvol_state_list is kept in ascending minor order so we simply need - * to scan the list for the first gap in the sequence. This allows us - * to recycle minor number as devices are created and removed. 
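zpl_posix_acl_release_impl()/zpl_posix_acl_free() above form a lockless multi-producer, single-consumer FIFO: producers atomically xchg the tail to point at their node's next field and then publish by storing through the previous tail; the consumer uses cmpxchg to swing the tail back to the head when it thinks the list is empty, spinning briefly if a producer won that race but has not yet published. A minimal C11 model of just the enqueue/drain mechanism (no grace periods or taskqs; the demo itself is single-threaded):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct node {
	struct node *_Atomic next;
	int val;
} node_t;

static node_t *_Atomic head = NULL;
/* Tail always points at the last node's next field (or at head if empty). */
static node_t *_Atomic *_Atomic tail = &head;

static void
enqueue(int val)	/* safe from many producers concurrently */
{
	node_t *n = malloc(sizeof (*n));
	n->val = val;
	atomic_store(&n->next, NULL);
	/* Atomically claim the tail slot, then publish into the old one. */
	node_t *_Atomic *prev = atomic_exchange(&tail, &n->next);
	atomic_store(prev, n);
}

static void
drain(void)		/* single consumer */
{
	node_t *n;

	while ((n = atomic_load(&head)) != NULL) {
		node_t *next = atomic_load(&n->next);
		if (next == NULL) {
			/* Maybe last node: try to swing tail back to head. */
			atomic_store(&head, NULL);
			node_t *_Atomic *expect = &n->next;
			if (!atomic_compare_exchange_strong(&tail, &expect,
			    &head)) {
				/* A producer raced us; wait for its publish. */
				while ((next = atomic_load(&n->next)) == NULL)
					;
				atomic_store(&head, next);
			}
		} else {
			atomic_store(&head, next);
		}
		printf("freed %d\n", n->val);
		free(n);
	}
}

int
main(void)
{
	enqueue(1);
	enqueue(2);
	drain();
	return (0);
}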
- */ -static int -zvol_find_minor(unsigned *minor) +static uint64_t +zvol_name_hash(const char *name) { - zvol_state_t *zv; - - *minor = 0; - ASSERT(MUTEX_HELD(&zvol_state_lock)); - for (zv = list_head(&zvol_state_list); zv != NULL; - zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) { - if (MINOR(zv->zv_dev) != MINOR(*minor)) - break; + int i; + uint64_t crc = -1ULL; + uint8_t *p = (uint8_t *)name; + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; } - - /* All minors are in use */ - if (*minor >= (1 << MINORBITS)) - return (SET_ERROR(ENXIO)); - - return (0); + return (crc); } /* @@ -147,23 +142,33 @@ } /* - * Find a zvol_state_t given the name provided at zvol_alloc() time. + * Find a zvol_state_t given the name and hash generated by zvol_name_hash. */ static zvol_state_t * -zvol_find_by_name(const char *name) +zvol_find_by_name_hash(const char *name, uint64_t hash) { zvol_state_t *zv; + struct hlist_node *p; ASSERT(MUTEX_HELD(&zvol_state_lock)); - for (zv = list_head(&zvol_state_list); zv != NULL; - zv = list_next(&zvol_state_list, zv)) { - if (strncmp(zv->zv_name, name, MAXNAMELEN) == 0) + hlist_for_each(p, ZVOL_HT_HEAD(hash)) { + zv = hlist_entry(p, zvol_state_t, zv_hlink); + if (zv->zv_hash == hash && + strncmp(zv->zv_name, name, MAXNAMELEN) == 0) return (zv); } - return (NULL); } +/* + * Find a zvol_state_t given the name provided at zvol_alloc() time. + */ +static zvol_state_t * +zvol_find_by_name(const char *name) +{ + return (zvol_find_by_name_hash(name, zvol_name_hash(name))); +} + /* * Given a path, return TRUE if path is a ZVOL. @@ -281,7 +286,7 @@ return (SET_ERROR(EINVAL)); #ifdef _ILP32 - if (volsize - 1 > MAXOFFSET_T) + if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); @@ -369,6 +374,7 @@ if (zv != NULL) zv->zv_objset = os; } else { + rw_enter(&zv->zv_suspend_lock, RW_READER); os = zv->zv_objset; } @@ -388,6 +394,8 @@ dmu_objset_disown(os, FTAG); if (zv != NULL) zv->zv_objset = NULL; + } else { + rw_exit(&zv->zv_suspend_lock); } mutex_exit(&zvol_state_lock); return (error); @@ -453,6 +461,8 @@ goto out; } + rw_enter(&zv->zv_suspend_lock, RW_READER); + tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); @@ -467,6 +477,7 @@ if (error == 0) zv->zv_volblocksize = volblocksize; } + rw_exit(&zv->zv_suspend_lock); out: mutex_exit(&zvol_state_lock); @@ -568,9 +579,9 @@ return; immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) - ? 0 : zvol_immediate_write_sz; + ? 0 : zvol_immediate_write_sz; slogging = spa_has_slogs(zilog->zl_spa) && - (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); + (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); while (size) { itx_t *itx; @@ -781,6 +792,8 @@ #endif int error = 0; + rw_enter(&zv->zv_suspend_lock, RW_READER); + uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; uio.uio_skip = BIO_BI_SKIP(bio); uio.uio_resid = BIO_BI_SIZE(bio); @@ -832,6 +845,7 @@ generic_end_io_acct(rw, &zv->zv_disk->part0, start); out1: BIO_END_IO(bio, -error); + rw_exit(&zv->zv_suspend_lock); spl_fstrans_unmark(cookie); #ifdef HAVE_MAKE_REQUEST_FN_RET_INT return (0); @@ -921,60 +935,50 @@ } /* - * The zvol_state_t's are inserted in increasing MINOR(dev_t) order. + * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. 
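zvol_name_hash() above replaces the O(n) list scan with a 1024-bucket hash table keyed by a CRC64 of the volume name; in the kernel it reuses zfs_crc64_table, generated from ZFS_CRC64_POLY. A self-contained user-space sketch of the same update step and bucket selection, building the table locally since zfs_crc64_table is private to the module (the table size and sample name are illustrative):

#include <stdint.h>
#include <stdio.h>

#define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected */
#define	HT_SIZE		1024			/* must be a power of two */

static uint64_t crc64_table[256];

static void
crc64_init(void)
{
	uint64_t c;
	int i, j;

	for (i = 0; i < 256; i++) {
		for (c = i, j = 0; j < 8; j++)
			c = (c >> 1) ^ ((c & 1) ? ZFS_CRC64_POLY : 0);
		crc64_table[i] = c;
	}
}

/*
 * Same update step as zvol_name_hash(); the kernel version also
 * bounds the walk at MAXNAMELEN - 1.
 */
static uint64_t
name_hash(const char *name)
{
	const uint8_t *p = (const uint8_t *)name;
	uint64_t crc = -1ULL;

	for (; *p != '\0'; p++)
		crc = (crc >> 8) ^ crc64_table[(crc ^ *p) & 0xFF];
	return (crc);
}

int
main(void)
{
	const char *name = "pool/vol0";	/* example dataset name */
	uint64_t h;

	crc64_init();
	h = name_hash(name);
	(void) printf("%s -> hash %016llx, bucket %llu\n", name,
	    (unsigned long long)h, (unsigned long long)(h & (HT_SIZE - 1)));
	return (0);
}

Because lookups compare zv_hash before falling back to strncmp(), a full string comparison is only paid on rare bucket collisions.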
*/ static void -zvol_insert(zvol_state_t *zv_insert) +zvol_insert(zvol_state_t *zv) { - zvol_state_t *zv = NULL; - ASSERT(MUTEX_HELD(&zvol_state_lock)); - ASSERT3U(MINOR(zv_insert->zv_dev) & ZVOL_MINOR_MASK, ==, 0); - for (zv = list_head(&zvol_state_list); zv != NULL; - zv = list_next(&zvol_state_list, zv)) { - if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev)) - break; - } - - list_insert_before(&zvol_state_list, zv, zv_insert); + ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0); + list_insert_head(&zvol_state_list, zv); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. */ static void -zvol_remove(zvol_state_t *zv_remove) +zvol_remove(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zvol_state_lock)); - list_remove(&zvol_state_list, zv_remove); + list_remove(&zvol_state_list, zv); + hlist_del(&zv->zv_hlink); } +/* + * Setup zv after we just own the zv->objset + */ static int -zvol_first_open(zvol_state_t *zv) +zvol_setup_zv(zvol_state_t *zv) { - objset_t *os; uint64_t volsize; int error; uint64_t ro; - - /* lie and say we're read-only */ - error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os); - if (error) - return (SET_ERROR(-error)); - - zv->zv_objset = os; + objset_t *os = zv->zv_objset; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) - goto out_owned; + return (SET_ERROR(error)); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) - goto out_owned; + return (SET_ERROR(error)); - error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); + error = dmu_bonus_hold(os, ZVOL_OBJ, zv, &zv->zv_dbuf); if (error) - goto out_owned; + return (SET_ERROR(error)); set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; @@ -988,23 +992,20 @@ set_disk_ro(zv->zv_disk, 0); zv->zv_flags &= ~ZVOL_RDONLY; } - -out_owned: - if (error) { - dmu_objset_disown(os, zvol_tag); - zv->zv_objset = NULL; - } - - return (SET_ERROR(-error)); + return (0); } +/* + * Shutdown every zv_objset related stuff except zv_objset itself. + * The is the reverse of zvol_setup_zv. + */ static void -zvol_last_close(zvol_state_t *zv) +zvol_shutdown_zv(zvol_state_t *zv) { zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - dmu_buf_rele(zv->zv_dbuf, zvol_tag); + dmu_buf_rele(zv->zv_dbuf, zv); zv->zv_dbuf = NULL; /* @@ -1014,8 +1015,98 @@ !(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); +} + +/* + * return the proper tag for rollback and recv + */ +void * +zvol_tag(zvol_state_t *zv) +{ + ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + return (zv->zv_open_count > 0 ? zv : NULL); +} - dmu_objset_disown(zv->zv_objset, zvol_tag); +/* + * Suspend the zvol for recv and rollback. + */ +zvol_state_t * +zvol_suspend(const char *name) +{ + zvol_state_t *zv; + + mutex_enter(&zvol_state_lock); + zv = zvol_find_by_name(name); + if (zv == NULL) + goto out; + + /* block all I/O, release in zvol_resume. 
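The zv_suspend_lock/zv_suspend_ref pair introduced here gives every I/O and ioctl path a shared hold while zvol_suspend() takes the lock exclusively for receive and rollback; the extra atomic reference lets minor removal detect an in-flight suspend without holding zvol_state_lock. The shape of that protocol, as a hedged pthread sketch (field and function names are illustrative stand-ins, not the SPL primitives):

#include <pthread.h>
#include <stdatomic.h>

struct vol {
	pthread_rwlock_t suspend_lock;	/* plays zv_suspend_lock */
	atomic_int suspend_ref;		/* plays zv_suspend_ref */
};

/* every read/write/ioctl path brackets its work like this */
static void
vol_io(struct vol *v)
{
	pthread_rwlock_rdlock(&v->suspend_lock);
	/* ... perform I/O against stable dataset state ... */
	pthread_rwlock_unlock(&v->suspend_lock);
}

/* block all I/O until vol_resume(); cf. zvol_suspend() above */
static void
vol_suspend(struct vol *v)
{
	pthread_rwlock_wrlock(&v->suspend_lock);
	atomic_fetch_add(&v->suspend_ref, 1);
	/* ... tear down ZIL/dbuf state while all I/O is excluded ... */
}

static void
vol_resume(struct vol *v)
{
	/* ... rebuild dataset state ... */
	pthread_rwlock_unlock(&v->suspend_lock);
	/* dropped only after the unlock; removal paths re-check it */
	atomic_fetch_sub(&v->suspend_ref, 1);
}

int
main(void)
{
	struct vol v = { PTHREAD_RWLOCK_INITIALIZER, 0 };

	vol_io(&v);
	vol_suspend(&v);
	vol_resume(&v);
	return (0);
}

The reference is decremented only after the write lock is released, which is why zvol_remove_minors_impl() below checks zv_suspend_ref in addition to zv_open_count before tearing a zvol down.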
*/ + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + + atomic_inc(&zv->zv_suspend_ref); + + if (zv->zv_open_count > 0) + zvol_shutdown_zv(zv); +out: + mutex_exit(&zvol_state_lock); + return (zv); +} + +int +zvol_resume(zvol_state_t *zv) +{ + int error = 0; + + ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + if (zv->zv_open_count > 0) { + VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); + VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); + VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); + dmu_objset_rele(zv->zv_objset, zv); + + error = zvol_setup_zv(zv); + } + rw_exit(&zv->zv_suspend_lock); + /* + * We need this because we don't hold zvol_state_lock while releasing + * zv_suspend_lock. zvol_remove_minors_impl thus cannot check + * zv_suspend_lock to determine it is safe to free because rwlock is + * not inherent atomic. + */ + atomic_dec(&zv->zv_suspend_ref); + + return (SET_ERROR(error)); +} + +static int +zvol_first_open(zvol_state_t *zv) +{ + objset_t *os; + int error; + + /* lie and say we're read-only */ + error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zv, &os); + if (error) + return (SET_ERROR(-error)); + + zv->zv_objset = os; + + error = zvol_setup_zv(zv); + + if (error) { + dmu_objset_disown(os, zv); + zv->zv_objset = NULL; + } + + return (SET_ERROR(-error)); +} + +static void +zvol_last_close(zvol_state_t *zv) +{ + zvol_shutdown_zv(zv); + + dmu_objset_disown(zv->zv_objset, zv); zv->zv_objset = NULL; } @@ -1023,7 +1114,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; - int error = 0, drop_mutex = 0; + int error = 0, drop_mutex = 0, drop_suspend = 0; /* * If the caller is already holding the mutex do not take it @@ -1038,7 +1129,7 @@ /* * Obtain a copy of private_data under the lock to make sure - * that either the result of zvol_freeg() setting + * that either the result of zvol_free() setting * bdev->bd_disk->private_data to NULL is observed, or zvol_free() * is not called on this zv because of the positive zv_open_count. */ @@ -1049,6 +1140,10 @@ } if (zv->zv_open_count == 0) { + /* make sure zvol is not suspended when first open */ + rw_enter(&zv->zv_suspend_lock, RW_READER); + drop_suspend = 1; + error = zvol_first_open(zv); if (error) goto out_mutex; @@ -1066,8 +1161,9 @@ out_open_count: if (zv->zv_open_count == 0) zvol_last_close(zv); - out_mutex: + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); if (drop_mutex) mutex_exit(&zvol_state_lock); @@ -1091,9 +1187,15 @@ drop_mutex = 1; } + /* make sure zvol is not suspended when last close */ + if (zv->zv_open_count == 1) + rw_enter(&zv->zv_suspend_lock, RW_READER); + zv->zv_open_count--; - if (zv->zv_open_count == 0) + if (zv->zv_open_count == 0) { zvol_last_close(zv); + rw_exit(&zv->zv_suspend_lock); + } if (drop_mutex) mutex_exit(&zvol_state_lock); @@ -1112,6 +1214,7 @@ ASSERT(zv && zv->zv_open_count > 0); + rw_enter(&zv->zv_suspend_lock, RW_READER); switch (cmd) { case BLKFLSBUF: zil_commit(zv->zv_zilog, ZVOL_OBJ); @@ -1123,8 +1226,8 @@ default: error = -ENOTTY; break; - } + rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(error)); } @@ -1298,6 +1401,7 @@ strlcpy(zv->zv_name, name, MAXNAMELEN); zfs_rlock_init(&zv->zv_range_lock); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zv->zv_disk->major = zvol_major; zv->zv_disk->first_minor = (dev & MINORMASK); @@ -1318,14 +1422,16 @@ } /* - * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 
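zvol_open() and zvol_release() above take zv_suspend_lock as reader only across the first open and the last close, because those are the only transitions that run zvol_first_open()/zvol_last_close() and therefore touch objset state a concurrent zvol_suspend() would be tearing down; the drop_suspend flag mirrors the existing drop_mutex convention. Reduced to its locking skeleton (an illustrative pthread sketch, not the kernel code):

#include <pthread.h>

struct vol {
	pthread_mutex_t state_lock;	/* plays zvol_state_lock */
	pthread_rwlock_t suspend_lock;	/* plays zv_suspend_lock */
	unsigned int open_count;
};

/* stand-in for zvol_first_open(): own the objset, read properties */
static int
vol_first_open(struct vol *v)
{
	(void) v;
	return (0);
}

static int
vol_open(struct vol *v)
{
	int error = 0, drop_suspend = 0;

	pthread_mutex_lock(&v->state_lock);
	if (v->open_count == 0) {
		/*
		 * The 0 -> 1 transition is the only one that touches
		 * state a suspend may be rebuilding, so only it needs
		 * the shared suspend hold.
		 */
		pthread_rwlock_rdlock(&v->suspend_lock);
		drop_suspend = 1;
		error = vol_first_open(v);
	}
	if (error == 0)
		v->open_count++;
	if (drop_suspend)
		pthread_rwlock_unlock(&v->suspend_lock);
	pthread_mutex_unlock(&v->state_lock);
	return (error);
}

int
main(void)
{
	struct vol v = {
	    PTHREAD_MUTEX_INITIALIZER, PTHREAD_RWLOCK_INITIALIZER, 0
	};

	return (vol_open(&v));
}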
+ * Used for taskq, if used out side zvol_state_lock, you need to clear + * zv_disk->private_data inside lock first. */ static void -zvol_free(zvol_state_t *zv) +zvol_free_impl(void *arg) { - ASSERT(MUTEX_HELD(&zvol_state_lock)); + zvol_state_t *zv = arg; ASSERT(zv->zv_open_count == 0); + rw_destroy(&zv->zv_suspend_lock); zfs_rlock_destroy(&zv->zv_range_lock); zv->zv_disk->private_data = NULL; @@ -1334,10 +1440,21 @@ blk_cleanup_queue(zv->zv_queue); put_disk(zv->zv_disk); + ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS); kmem_free(zv, sizeof (zvol_state_t)); } /* + * Cleanup then free a zvol_state_t which was created by zvol_alloc(). + */ +static void +zvol_free(zvol_state_t *zv) +{ + ASSERT(MUTEX_HELD(&zvol_state_lock)); + zvol_free_impl(zv); +} + +/* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. @@ -1352,10 +1469,17 @@ uint64_t len; unsigned minor = 0; int error = 0; + int idx; + uint64_t hash = zvol_name_hash(name); + + idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); + if (idx < 0) + return (SET_ERROR(-idx)); + minor = idx << ZVOL_MINOR_BITS; mutex_enter(&zvol_state_lock); - zv = zvol_find_by_name(name); + zv = zvol_find_by_name_hash(name, hash); if (zv) { error = SET_ERROR(EEXIST); goto out; @@ -1363,7 +1487,7 @@ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); - error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); if (error) goto out_doi; @@ -1375,15 +1499,12 @@ if (error) goto out_dmu_objset_disown; - error = zvol_find_minor(&minor); - if (error) - goto out_dmu_objset_disown; - zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } + zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; @@ -1427,12 +1548,12 @@ if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, - ZIO_PRIORITY_SYNC_READ); + ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: - dmu_objset_disown(os, zvol_tag); + dmu_objset_disown(os, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); out: @@ -1449,6 +1570,7 @@ add_disk(zv->zv_disk); } else { mutex_exit(&zvol_state_lock); + ida_simple_remove(&zvol_ida, idx); } return (SET_ERROR(error)); @@ -1464,7 +1586,14 @@ ASSERT(MUTEX_HELD(&zvol_state_lock)); + rw_enter(&zv->zv_suspend_lock, RW_READER); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + rw_exit(&zv->zv_suspend_lock); + + /* move to new hashtable entry */ + zv->zv_hash = zvol_name_hash(zv->zv_name); + hlist_del(&zv->zv_hlink); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing @@ -1478,6 +1607,32 @@ set_disk_ro(zv->zv_disk, readonly); } +typedef struct minors_job { + list_t *list; + list_node_t link; + /* input */ + char *name; + /* output */ + int error; +} minors_job_t; + +/* + * Prefetch zvol dnodes for the minors_job + */ +static void +zvol_prefetch_minors_impl(void *arg) +{ + minors_job_t *job = arg; + char *dsname = job->name; + objset_t *os = NULL; + + job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, FTAG, + &os); + if (job->error == 0) { + dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_objset_disown(os, FTAG); + } +} /* * Mask errors to continue 
dmu_objset_find() traversal @@ -1485,7 +1640,9 @@ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { - const char *name = (const char *)arg; + minors_job_t *j = arg; + list_t *minors_list = j->list; + const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); @@ -1496,9 +1653,21 @@ /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " - "%s is not a shapshot name\n", dsname); + "%s is not a shapshot name\n", dsname); } else { - (void) zvol_create_minor_impl(dsname); + minors_job_t *job; + char *n = strdup(dsname); + if (n == NULL) + return (0); + + job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); + job->name = n; + job->list = minors_list; + job->error = 0; + list_insert_tail(minors_list, job); + /* don't care if dispatch fails, because job->error is 0 */ + taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, + TQ_SLEEP); } return (0); @@ -1512,6 +1681,7 @@ { uint64_t snapdev; int error; + list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); @@ -1527,23 +1697,32 @@ * snapshots and create device minor nodes for those. */ if (strchr(dsname, '@') == 0) { - /* create minor for the 'dsname' explicitly */ - error = zvol_create_minor_impl(dsname); - if ((error == 0 || error == EEXIST) && - (snapdev == ZFS_SNAPDEV_VISIBLE)) { - fstrans_cookie_t cookie = spl_fstrans_mark(); + minors_job_t *job; + char *n = strdup(dsname); + if (n == NULL) + return (0); + + job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); + job->name = n; + job->list = minors_list; + job->error = 0; + list_insert_tail(minors_list, job); + /* don't care if dispatch fails, because job->error is 0 */ + taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, + TQ_SLEEP); + + if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ error = dmu_objset_find((char *)dsname, - zvol_create_snap_minor_cb, (void *)dsname, + zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); - spl_fstrans_unmark(cookie); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", - dsname); + dsname); } return (0); @@ -1562,7 +1741,7 @@ * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * - * If the name represents a snapshot, a check is perfromed if the snapshot is + * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ @@ -1572,10 +1751,24 @@ int error = 0; fstrans_cookie_t cookie; char *atp, *parent; + list_t minors_list; + minors_job_t *job; if (zvol_inhibit_dev) return (0); + /* + * This is the list for prefetch jobs. Whenever we found a match + * during dmu_objset_find, we insert a minors_job to the list and do + * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need + * any lock because all list operation is done on the current thread. + * + * We will use this list to do zvol_create_minor_impl after prefetch + * so we don't have to traverse using dmu_objset_find again. 
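The minors_job list described above turns minor creation into a two-phase scan: a single dmu_objset_find() walk only queues names, a taskq prefetches each zvol's dnodes in parallel, and once taskq_wait_outstanding() drains, zvol_create_minor_impl() runs sequentially over the saved list instead of walking the dataset tree a second time. The dispatch-then-drain shape, sketched in user space with one thread per job standing in for the taskq (the job fields mirror minors_job_t; everything else is illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct job {
	struct job *next;
	char *name;
	int error;
	pthread_t tid;
};

/* phase 2, in parallel: own the objset, prefetch dnodes, record error */
static void *
prefetch(void *arg)
{
	struct job *j = arg;

	/* ... dmu_objset_own() + dmu_prefetch() would go here ... */
	j->error = 0;
	return (NULL);
}

int
main(void)
{
	const char *names[] = { "pool/vol0", "pool/vol1" };	/* example */
	struct job *list = NULL, *j;
	size_t i;

	/* phase 1: the single traversal only collects and dispatches */
	for (i = 0; i < sizeof (names) / sizeof (names[0]); i++) {
		j = calloc(1, sizeof (*j));	/* error checks elided */
		j->name = strdup(names[i]);
		j->next = list;
		list = j;
		pthread_create(&j->tid, NULL, prefetch, j);
	}

	/* phase 3: drain, then do the serialized work, no re-traversal */
	while ((j = list) != NULL) {
		list = j->next;
		pthread_join(j->tid, NULL);
		if (j->error == 0)
			(void) printf("create minor for %s\n", j->name);
		free(j->name);
		free(j);
	}
	return (0);
}

A failed dispatch is harmless by construction: the job stays queued with error still 0, so the sequential phase simply creates the minor without the benefit of the prefetch.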
+ */ + list_create(&minors_list, sizeof (minors_job_t), + offsetof(minors_job_t, link)); + parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) strlcpy(parent, name, MAXPATHLEN); @@ -1591,11 +1784,26 @@ } else { cookie = spl_fstrans_mark(); error = dmu_objset_find(parent, zvol_create_minors_cb, - NULL, DS_FIND_CHILDREN); + &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } kmem_free(parent, MAXPATHLEN); + taskq_wait_outstanding(system_taskq, 0); + + /* + * Prefetch is completed, we can do zvol_create_minor_impl + * sequentially. + */ + while ((job = list_head(&minors_list)) != NULL) { + list_remove(&minors_list, job); + if (!job->error) + zvol_create_minor_impl(job->name); + strfree(job->name); + kmem_free(job, sizeof (minors_job_t)); + } + + list_destroy(&minors_list); return (SET_ERROR(error)); } @@ -1608,6 +1816,7 @@ { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); + taskqid_t t, tid = TASKQID_INVALID; if (zvol_inhibit_dev) return; @@ -1623,15 +1832,27 @@ zv->zv_name[namelen] == '@'))) { /* If in use, leave alone */ - if (zv->zv_open_count > 0) + if (zv->zv_open_count > 0 || + atomic_read(&zv->zv_suspend_ref)) continue; zvol_remove(zv); - zvol_free(zv); + + /* clear this so zvol_open won't open it */ + zv->zv_disk->private_data = NULL; + + /* try parallel zv_free, if failed do it in place */ + t = taskq_dispatch(system_taskq, zvol_free_impl, zv, + TQ_SLEEP); + if (t == TASKQID_INVALID) + zvol_free(zv); + else + tid = t; } } - mutex_exit(&zvol_state_lock); + if (tid != TASKQID_INVALID) + taskq_wait_outstanding(system_taskq, tid); } /* Remove minor for this specific snapshot only */ @@ -1653,7 +1874,8 @@ if (strcmp(zv->zv_name, name) == 0) { /* If in use, leave alone */ - if (zv->zv_open_count > 0) + if (zv->zv_open_count > 0 || + atomic_read(&zv->zv_suspend_ref)) continue; zvol_remove(zv); zvol_free(zv); @@ -1848,7 +2070,7 @@ return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, - task, TQ_SLEEP); + task, TQ_SLEEP); return (0); } @@ -1895,7 +2117,7 @@ return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); - if ((async == B_FALSE) && (id != 0)) + if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } @@ -1910,7 +2132,7 @@ return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); - if ((async == B_FALSE) && (id != 0)) + if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } @@ -1926,23 +2148,32 @@ return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); - if ((async == B_FALSE) && (id != 0)) + if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } int zvol_init(void) { - int error; + int i, error; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); + zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), + KM_SLEEP); + if (!zvol_htable) { + error = ENOMEM; + goto out; + } + for (i = 0; i < ZVOL_HT_SIZE; i++) + INIT_HLIST_HEAD(&zvol_htable[i]); + error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); - goto out; + goto out_free; } blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, @@ -1950,6 +2181,8 @@ return (0); +out_free: + kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); out: mutex_destroy(&zvol_state_lock); list_destroy(&zvol_state_list); @@ -1964,11 
+2197,15 @@ blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); + kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); mutex_destroy(&zvol_state_lock); + + ida_destroy(&zvol_ida); } +/* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); @@ -1980,3 +2217,4 @@ module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); +/* END CSTYLED */ diff -Nru zfs-linux-0.7.0-rc2/module/zpios/pios.c zfs-linux-0.7.0-rc3/module/zpios/pios.c --- zfs-linux-0.7.0-rc2/module/zpios/pios.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/module/zpios/pios.c 2017-01-20 18:18:28.000000000 +0000 @@ -1,7 +1,7 @@ /* * ZPIOS is a heavily modified version of the original PIOS test code. * It is designed to have the test code running in the Linux kernel - * against ZFS while still being flexibly controled from user space. + * against ZFS while still being flexibly controlled from user space. * * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -179,7 +179,7 @@ rc = dmu_tx_assign(tx, TXG_WAIT); if (rc) { zpios_print(run_args->file, - "dmu_tx_assign() failed: %d\n", rc); + "dmu_tx_assign() failed: %d\n", rc); dmu_tx_abort(tx); return (rc); } @@ -187,7 +187,7 @@ rc = dmu_object_free(os, obj, tx); if (rc) { zpios_print(run_args->file, - "dmu_object_free() failed: %d\n", rc); + "dmu_object_free() failed: %d\n", rc); dmu_tx_abort(tx); return (rc); } @@ -213,14 +213,14 @@ rc = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL); if (rc) { zpios_print(run_args->file, "Error dmu_objset_create(%s, ...) " - "failed: %d\n", name, rc); + "failed: %d\n", name, rc); goto out; } rc = dmu_objset_own(name, DMU_OST_OTHER, 0, zpios_tag, &os); if (rc) { zpios_print(run_args->file, "Error dmu_objset_own(%s, ...) " - "failed: %d\n", name, rc); + "failed: %d\n", name, rc); goto out_destroy; } @@ -229,7 +229,7 @@ if (obj == 0) { rc = -EBADF; zpios_print(run_args->file, "Error zpios_dmu_" - "object_create() failed, %d\n", rc); + "object_create() failed, %d\n", rc); goto out_destroy; } } @@ -268,7 +268,7 @@ rc2 = dsl_destroy_head(name); if (rc2) zpios_print(run_args->file, "Error dsl_destroy_head" - "(%s, ...) failed: %d\n", name, rc2); + "(%s, ...) 
failed: %d\n", name, rc2); } out: t->stop = zpios_timespec_now(); @@ -497,7 +497,7 @@ continue; } zpios_print(run_args->file, - "Error in dmu_tx_assign(), %d", rc); + "Error in dmu_tx_assign(), %d", rc); dmu_tx_abort(tx); return (rc); } @@ -588,7 +588,7 @@ if (rc) { zpios_print(run_args->file, "IO error while doing " - "dmu_write(): %d\n", rc); + "dmu_write(): %d\n", rc); break; } @@ -651,13 +651,13 @@ t.start = zpios_timespec_now(); rc = zpios_dmu_read(run_args, obj.os, obj.obj, - offset, chunk_size, buf); + offset, chunk_size, buf); t.stop = zpios_timespec_now(); t.delta = zpios_timespec_sub(t.stop, t.start); if (rc) { zpios_print(run_args->file, "IO error while doing " - "dmu_read(): %d\n", rc); + "dmu_read(): %d\n", rc); break; } @@ -928,7 +928,7 @@ spin_lock_init(&info->info_lock); info->info_size = ZPIOS_INFO_BUFFER_SIZE; info->info_buffer = - (char *) vmem_alloc(ZPIOS_INFO_BUFFER_SIZE, KM_SLEEP); + (char *)vmem_alloc(ZPIOS_INFO_BUFFER_SIZE, KM_SLEEP); info->info_head = info->info_buffer; file->private_data = (void *)info; @@ -1035,7 +1035,7 @@ break; default: zpios_print(file, "Bad config command %d\n", - kcfg.cfg_cmd); + kcfg.cfg_cmd); rc = -EINVAL; break; } @@ -1055,7 +1055,7 @@ rc = copy_from_user(kcmd, (zpios_cfg_t *)arg, sizeof (zpios_cmd_t)); if (rc) { zpios_print(file, "Unable to copy command structure " - "from user to kernel memory, %d\n", rc); + "from user to kernel memory, %d\n", rc); goto out_cmd; } @@ -1074,7 +1074,7 @@ cmd_data_str)), kcmd->cmd_data_size); if (rc) { zpios_print(file, "Unable to copy data buffer " - "from user to kernel memory, %d\n", rc); + "from user to kernel memory, %d\n", rc); goto out_data; } } @@ -1090,7 +1090,7 @@ cmd_data_str)), data, kcmd->cmd_data_size); if (rc) { zpios_print(file, "Unable to copy data buffer " - "from kernel to user memory, %d\n", rc); + "from kernel to user memory, %d\n", rc); rc = -EFAULT; } diff -Nru zfs-linux-0.7.0-rc2/README.markdown zfs-linux-0.7.0-rc3/README.markdown --- zfs-linux-0.7.0-rc2/README.markdown 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/README.markdown 2017-01-20 18:18:28.000000000 +0000 @@ -1,10 +1,19 @@ -Native ZFS for Linux! - +
ZFS is an advanced file system and volume manager which was originally developed for Solaris and is now maintained by the Illumos community. ZFS on Linux, which is also known as ZoL, is currently feature complete. It -includes fully functional and stable SPA, DMU, ZVOL, and ZPL layers. +includes fully functional and stable SPA, DMU, ZVOL, and ZPL layers. And it's native! + +# Official Resources + * [Site](http://zfsonlinux.org) + * [Wiki](https://github.com/zfsonlinux/zfs/wiki) + * [Mailing lists](https://github.com/zfsonlinux/zfs/wiki/Mailing-Lists) + * [OpenZFS site](http://open-zfs.org/) +# Installation Full documentation for installing ZoL on your favorite Linux distribution can -be found at: +be found at [our site](http://zfsonlinux.org/). + +# Contribute & Develop +We have a separate document with [contribution guidelines](./.github/CONTRIBUTING.md). \ No newline at end of file diff -Nru zfs-linux-0.7.0-rc2/rpm/generic/zfs.spec.in zfs-linux-0.7.0-rc3/rpm/generic/zfs.spec.in --- zfs-linux-0.7.0-rc2/rpm/generic/zfs.spec.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/rpm/generic/zfs.spec.in 2017-01-20 18:18:28.000000000 +0000 @@ -39,6 +39,7 @@ # Generic enable switch for systemd %if %{with systemd} %define _systemd 1 +%define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target %endif # RHEL >= 7 comes with systemd @@ -73,7 +74,6 @@ Requires: libnvpair1 = %{version} Requires: libuutil1 = %{version} Requires: libzfs2 = %{version} -Requires: device-mapper Requires: %{name}-kmod = %{version} Provides: %{name}-kmod-common = %{version} @@ -85,7 +85,6 @@ BuildRequires: zlib-devel BuildRequires: libuuid-devel BuildRequires: libblkid-devel -BuildRequires: device-mapper-devel BuildRequires: libudev-devel BuildRequires: libattr-devel %endif @@ -239,7 +238,7 @@ %post %if 0%{?_systemd} -%systemd_post zfs.target +%systemd_post %{systemd_svcs} %else if [ -x /sbin/chkconfig ]; then /sbin/chkconfig --add zfs-import @@ -252,7 +251,7 @@ %preun %if 0%{?_systemd} -%systemd_preun zfs.target +%systemd_preun %{systemd_svcs} %else if [ $1 -eq 0 ] && [ -x /sbin/chkconfig ]; then /sbin/chkconfig --del zfs-import @@ -265,7 +264,7 @@ %postun %if 0%{?_systemd} -%systemd_postun zfs.target +%systemd_postun %{systemd_svcs} %endif %files diff -Nru zfs-linux-0.7.0-rc2/rpm/redhat/zfs.spec.in zfs-linux-0.7.0-rc3/rpm/redhat/zfs.spec.in --- zfs-linux-0.7.0-rc2/rpm/redhat/zfs.spec.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/rpm/redhat/zfs.spec.in 2017-01-20 18:18:28.000000000 +0000 @@ -39,6 +39,7 @@ # Generic enable switch for systemd %if %{with systemd} %define _systemd 1 +%define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target %endif # RHEL >= 7 comes with systemd @@ -73,7 +74,6 @@ Requires: libnvpair1 = %{version} Requires: libuutil1 = %{version} Requires: libzfs2 = %{version} -Requires: device-mapper Requires: %{name}-kmod = %{version} Provides: %{name}-kmod-common = %{version} @@ -85,7 +85,6 @@ BuildRequires: zlib-devel BuildRequires: libuuid-devel BuildRequires: libblkid-devel -BuildRequires: device-mapper-devel BuildRequires: libudev-devel BuildRequires: libattr-devel %endif @@ -239,7 +238,7 @@ %post %if 0%{?_systemd} -%systemd_post zfs.target +%systemd_post %{systemd_svcs} %else if [ -x /sbin/chkconfig ]; then /sbin/chkconfig --add zfs-import @@ -252,7 +251,7 @@ %preun %if 0%{?_systemd} -%systemd_preun zfs.target +%systemd_preun 
%{systemd_svcs} %else if [ $1 -eq 0 ] && [ -x /sbin/chkconfig ]; then /sbin/chkconfig --del zfs-import @@ -265,7 +264,7 @@ %postun %if 0%{?_systemd} -%systemd_postun zfs.target +%systemd_postun %{systemd_svcs} %endif %files diff -Nru zfs-linux-0.7.0-rc2/scripts/common.sh.in zfs-linux-0.7.0-rc3/scripts/common.sh.in --- zfs-linux-0.7.0-rc2/scripts/common.sh.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/scripts/common.sh.in 2017-01-20 18:18:28.000000000 +0000 @@ -567,8 +567,8 @@ udev_setup() { local SRC_PATH=$1 - # When running in tree manually contruct symlinks in tree to - # the proper devices. Symlinks are installed for all entires + # When running in tree manually construct symlinks in tree to + # the proper devices. Symlinks are installed for all entries # in the config file regardless of if that device actually # exists. When installed as a package udev can be relied on for # this and it will only create links for devices which exist. diff -Nru zfs-linux-0.7.0-rc2/scripts/cstyle.pl zfs-linux-0.7.0-rc3/scripts/cstyle.pl --- zfs-linux-0.7.0-rc2/scripts/cstyle.pl 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/scripts/cstyle.pl 2017-01-20 18:18:28.000000000 +0000 @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # CDDL HEADER START # @@ -51,6 +51,7 @@ # require 5.0; +use warnings; use IO::File; use Getopt::Std; use strict; @@ -64,7 +65,7 @@ -C don't check anything in header block comments -P check for use of non-POSIX types -o constructs - allow a comma-seperated list of optional constructs: + allow a comma-separated list of optional constructs: doxygen allow doxygen-style block comments (/** /*!) splint allow splint-style lint comments (/*@ ... @*/) "; diff -Nru zfs-linux-0.7.0-rc2/scripts/paxcheck.sh zfs-linux-0.7.0-rc3/scripts/paxcheck.sh --- zfs-linux-0.7.0-rc2/scripts/paxcheck.sh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/scripts/paxcheck.sh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,43 @@ +#!/bin/sh + +if ! type scanelf > /dev/null 2>&1; then + echo "scanelf (from pax-utils) is required for these checks." >&2 + exit 3 +fi + +RET=0 + +# check for exec stacks +OUT="$(scanelf -qyRAF '%e %p' $1)" + +if [ x"${OUT}" != x ]; then + RET=2 + echo "The following files contain writable and executable sections" + echo " Files with such sections will not work properly (or at all!) on some" + echo " architectures/operating systems." + echo " For more information, see:" + echo " https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart" + echo + echo "${OUT}" + echo +fi + + +# check for TEXTRELS +OUT="$(scanelf -qyRAF '%T %p' $1)" + +if [ x"${OUT}" != x ]; then + RET=2 + echo "The following files contain runtime text relocations" + echo " Text relocations force the dynamic linker to perform extra" + echo " work at startup, waste system resources, and may pose a security" + echo " risk. On some architectures, the code may not even function" + echo " properly, if at all." 
+ echo " For more information, see:" + echo " https://wiki.gentoo.org/wiki/Hardened/HOWTO_locate_and_fix_textrels" + echo + echo "${OUT}" + echo +fi + +exit $RET diff -Nru zfs-linux-0.7.0-rc2/scripts/zfs-helpers.sh zfs-linux-0.7.0-rc3/scripts/zfs-helpers.sh --- zfs-linux-0.7.0-rc2/scripts/zfs-helpers.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/scripts/zfs-helpers.sh 2017-01-20 18:18:28.000000000 +0000 @@ -42,7 +42,7 @@ OPTIONS: -d Dry run -h Show this message - -i Install the helper utilties + -i Install the helper utilities -r Remove the helper utilities -v Verbose diff -Nru zfs-linux-0.7.0-rc2/scripts/zimport.sh zfs-linux-0.7.0-rc3/scripts/zimport.sh --- zfs-linux-0.7.0-rc2/scripts/zimport.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/scripts/zimport.sh 2017-01-20 18:18:28.000000000 +0000 @@ -273,7 +273,7 @@ fi # Given the available images in the zfs-images directory substitute the -# list of available images for the reserved keywork 'all'. +# list of available images for the reserved keyword 'all'. for TAG in $POOL_TAGS; do if [ "$TAG" = "all" ]; then diff -Nru zfs-linux-0.7.0-rc2/scripts/zpios-profile/zpios-profile.sh zfs-linux-0.7.0-rc3/scripts/zpios-profile/zpios-profile.sh --- zfs-linux-0.7.0-rc2/scripts/zpios-profile/zpios-profile.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/scripts/zpios-profile/zpios-profile.sh 2017-01-20 18:18:28.000000000 +0000 @@ -91,7 +91,7 @@ } # NOTE: This whole process is crazy slow but it will do for now -aquire_pids() { +acquire_pids() { echo "--- Aquiring ZFS pids ---" for PID in `ls /proc/ | grep [0-9] | sort -n -u`; do @@ -218,7 +218,7 @@ done } -aquire_pids +acquire_pids log_pids # rm ${PROFILE_PID} diff -Nru zfs-linux-0.7.0-rc2/TEST zfs-linux-0.7.0-rc3/TEST --- zfs-linux-0.7.0-rc2/TEST 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/TEST 2017-01-20 18:18:28.000000000 +0000 @@ -76,7 +76,13 @@ case "$BB_NAME" in Amazon*) ;; -CentOS*) +CentOS-7*) + # ZFS enabled xfstests fails to build + TEST_XFSTESTS_SKIP="yes" + # Sporadic VERIFY(!zilog_is_dirty(zilog)) failed + TEST_ZILTEST_SKIP="yes" + ;; +CentOS-6*) # Sporadic VERIFY(!zilog_is_dirty(zilog)) failed TEST_ZILTEST_SKIP="yes" ;; @@ -88,8 +94,24 @@ ;; SUSE*) ;; +Ubuntu-16.04*) + # ZFS enabled xfstests fails to build + TEST_XFSTESTS_SKIP="yes" + TEST_FILEBENCH_SKIP="yes" + ;; Ubuntu*) ;; *) ;; esac + +### +# +# Disable the following test suites on 32-bit systems. 
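The guard below keys these skips off getconf LONG_BIT; the equivalent probe from C, should a harness ever need one (a trivial illustrative check, not part of the ZFS tree):

#include <limits.h>
#include <stdio.h>

int
main(void)
{
	/* matches getconf LONG_BIT: 32 on ILP32, 64 on LP64 */
	(void) printf("%d\n", (int)(sizeof (long) * CHAR_BIT));
	return (0);
}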
+# +if [ $(getconf LONG_BIT) = "32" ]; then + TEST_ZTEST_SKIP="yes" + TEST_FILEBENCH_SKIP="yes" + TEST_XFSTESTS_SKIP="yes" + TEST_ZFSSTRESS_SKIP="yes" +fi diff -Nru zfs-linux-0.7.0-rc2/tests/runfiles/linux.run zfs-linux-0.7.0-rc3/tests/runfiles/linux.run --- zfs-linux-0.7.0-rc2/tests/runfiles/linux.run 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/runfiles/linux.run 2017-01-20 18:18:28.000000000 +0000 @@ -60,6 +60,9 @@ [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values'] +[tests/functional/chattr] +tests = ['chattr_001_pos', 'chattr_002_neg'] + [tests/functional/checksum] tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'filetest_001_pos'] @@ -95,7 +98,7 @@ 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', - 'zfs_create_013_pos'] + 'zfs_create_013_pos', 'zfs_create_014_pos'] # DISABLED: # zfs_destroy_005_neg - busy mountpoint behavior @@ -144,7 +147,7 @@ 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', - 'zfs_receive_013_pos'] + 'zfs_receive_013_pos', 'zfs_receive_014_pos'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', @@ -168,25 +171,25 @@ 'zfs_send_007_pos'] # DISABLED: -# mountpoint_003_pos - needs investigation -# ro_props_001_pos - https://github.com/zfsonlinux/zfs/issues/5201 -# user_property_002_pos - needs investigation +# ro_props_001_pos - https://github.com/zfsonlinux/zfs/issues/5511 [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', - 'mountpoint_002_pos', 'reservation_001_neg', + 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', - 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos'] + 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', + 'mountpoint_003_pos'] # DISABLED: Tests need to be updated for Linux share behavior -#[tests/functional/cli_root/zfs_share] -#tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', -# 'zfs_share_004_pos', 'zfs_share_005_pos', 'zfs_share_006_pos', -# 'zfs_share_007_neg', 'zfs_share_008_neg', 'zfs_share_009_neg', -# 'zfs_share_010_neg', 'zfs_share_011_pos'] +# zfs_share_005_pos - needs investigation, probably unsupported NFS share format +[tests/functional/cli_root/zfs_share] +tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', + 'zfs_share_004_pos', 'zfs_share_006_pos', + 'zfs_share_007_neg', 'zfs_share_008_neg', 'zfs_share_009_neg', + 'zfs_share_010_neg', 'zfs_share_011_pos'] [tests/functional/cli_root/zfs_snapshot] tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', @@ -204,9 +207,11 @@ 'zfs_unmount_007_neg', 'zfs_unmount_008_neg'] # DISABLED: Tests need to be updated for Linux unshare behavior -#[tests/functional/cli_root/zfs_unshare] -#tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', -# 'zfs_unshare_004_neg', 'zfs_unshare_005_neg'] +# zfs_unshare_002_pos - zfs set 
sharenfs=off won't unshare if it was already off +# zfs_unshare_006_pos - some distros come with Samba "user shares" disabled +[tests/functional/cli_root/zfs_unshare] +tests = ['zfs_unshare_001_pos', 'zfs_unshare_003_pos', + 'zfs_unshare_004_neg', 'zfs_unshare_005_neg'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', @@ -292,7 +297,7 @@ 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', - 'zpool_import_013_neg', + 'zpool_import_013_neg', 'zpool_import_014_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', 'zpool_import_features_003_pos','zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', @@ -362,7 +367,8 @@ [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', - 'zpool_iostat_003_neg', 'zpool_iostat_004_pos'] + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', + 'zpool_iostat_005_pos'] user = [tests/functional/cli_user/zpool_list] @@ -476,11 +482,11 @@ tests = ['enospc_001_pos'] # DISABLED: +# nopwrite_volume - https://github.com/zfsonlinux/zfs/issues/5510 # nopwrite_varying_compression - needs investigation [tests/functional/nopwrite] tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', - 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', - 'nopwrite_volume'] + 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync'] # DISABLED: needs investigation #[tests/functional/online_offline] @@ -586,6 +592,9 @@ #[tests/functional/threadsappend] #tests = ['threadsappend_001_pos'] +[tests/functional/tmpfile] +tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos'] + [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos'] @@ -614,7 +623,7 @@ [tests/functional/write_dirs] tests = ['write_dirs_001_pos'] -# DISABLED: No 'runat' command, replace the Linux equivilant and add xattrtest +# DISABLED: No 'runat' command, replace the Linux equivalent and add xattrtest #[tests/functional/xattr] #tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', # 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_008_pos', diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/cmd/dir_rd_update/dir_rd_update.c zfs-linux-0.7.0-rc3/tests/zfs-tests/cmd/dir_rd_update/dir_rd_update.c --- zfs-linux-0.7.0-rc2/tests/zfs-tests/cmd/dir_rd_update/dir_rd_update.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/cmd/dir_rd_update/dir_rd_update.c 2017-01-20 18:18:28.000000000 +0000 @@ -65,7 +65,7 @@ cp1 = argv[1]; if (strlen(cp1) >= (sizeof (dirpath) - strlen("TMP_DIR"))) { (void) printf("The string length of mount point is " - "too large\n"); + "too large\n"); exit(-1); } (void) strcpy(&dirpath[0], (const char *)cp1); diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/cmd/mkfile/mkfile.c zfs-linux-0.7.0-rc3/tests/zfs-tests/cmd/mkfile/mkfile.c --- zfs-linux-0.7.0-rc2/tests/zfs-tests/cmd/mkfile/mkfile.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/cmd/mkfile/mkfile.c 2017-01-20 18:18:28.000000000 +0000 @@ -215,7 +215,7 @@ (ssize_t)bytes) { saverr = errno; if (result < 0) - result = 0; + result = 0; written += result; (void) fprintf(stderr, gettext( "%s: initialized %lu of %lu bytes: %s\n"), @@ -269,7 +269,7 @@ static void usage() { (void) fprintf(stderr, gettext( - 
"Usage: mkfile [-nv] [g|k|b|m] [] ...\n")); + "Usage: mkfile [-nv] [g|k|b|m] [] ...\n")); exit(1); /* NOTREACHED */ } diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/cmd/xattrtest/xattrtest.c zfs-linux-0.7.0-rc3/tests/zfs-tests/cmd/xattrtest/xattrtest.c --- zfs-linux-0.7.0-rc2/tests/zfs-tests/cmd/xattrtest/xattrtest.c 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/cmd/xattrtest/xattrtest.c 2017-01-20 18:18:28.000000000 +0000 @@ -289,7 +289,8 @@ } else if (pid > 0) { int status; - while ((rc = waitpid(pid, &status, 0)) == -1 && errno == EINTR); + while ((rc = waitpid(pid, &status, 0)) == -1 && + errno == EINTR) { } if (rc < 0 || !WIFEXITED(status)) return (-1); @@ -369,8 +370,8 @@ file = malloc(PATH_MAX); if (file == NULL) { rc = ENOMEM; - ERROR("Error %d: malloc(%d) bytes for file name\n", - rc, PATH_MAX); + ERROR("Error %d: malloc(%d) bytes for file name\n", rc, + PATH_MAX); goto out; } @@ -392,7 +393,7 @@ rc = open(file, O_CREAT, 0644); if (rc == -1) { ERROR("Error %d: open(%s, O_CREATE, 0644)\n", - errno, file); + errno, file); rc = errno; goto out; } @@ -454,16 +455,16 @@ value = malloc(XATTR_SIZE_MAX); if (value == NULL) { rc = ENOMEM; - ERROR("Error %d: malloc(%d) bytes for xattr value\n", - rc, XATTR_SIZE_MAX); + ERROR("Error %d: malloc(%d) bytes for xattr value\n", rc, + XATTR_SIZE_MAX); goto out; } file = malloc(PATH_MAX); if (file == NULL) { rc = ENOMEM; - ERROR("Error %d: malloc(%d) bytes for file name\n", - rc, PATH_MAX); + ERROR("Error %d: malloc(%d) bytes for file name\n", rc, + PATH_MAX); goto out; } @@ -525,16 +526,16 @@ verify_value = malloc(XATTR_SIZE_MAX); if (verify_value == NULL) { rc = ENOMEM; - ERROR("Error %d: malloc(%d) bytes for xattr verify\n", - rc, XATTR_SIZE_MAX); + ERROR("Error %d: malloc(%d) bytes for xattr verify\n", rc, + XATTR_SIZE_MAX); goto out; } value = malloc(XATTR_SIZE_MAX); if (value == NULL) { rc = ENOMEM; - ERROR("Error %d: malloc(%d) bytes for xattr value\n", - rc, XATTR_SIZE_MAX); + ERROR("Error %d: malloc(%d) bytes for xattr value\n", rc, + XATTR_SIZE_MAX); goto out; } @@ -544,8 +545,8 @@ file = malloc(PATH_MAX); if (file == NULL) { rc = ENOMEM; - ERROR("Error %d: malloc(%d) bytes for file name\n", - rc, PATH_MAX); + ERROR("Error %d: malloc(%d) bytes for file name\n", rc, + PATH_MAX); goto out; } diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/include/commands.cfg.in zfs-linux-0.7.0-rc3/tests/zfs-tests/include/commands.cfg.in --- zfs-linux-0.7.0-rc2/tests/zfs-tests/include/commands.cfg.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/include/commands.cfg.in 2017-01-20 18:18:28.000000000 +0000 @@ -48,7 +48,6 @@ export HOSTNAME="@HOSTNAME@" export ID="@ID@" export IOSTAT="@IOSTAT@" -export ISAINFO="@ISAINFO@" export KILL="@KILL@" export KSH="@KSH@" export KSTAT="@KSTAT@" @@ -68,6 +67,7 @@ export MPSTAT="@MPSTAT@" export MV="@MV@" export NAWK="@AWK@" +export NET="@NET@" export NEWFS="@NEWFS@" export NPROC="@NPROC@" export PAGESIZE="@PAGESIZE@" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/include/libtest.shlib zfs-linux-0.7.0-rc3/tests/zfs-tests/include/libtest.shlib --- zfs-linux-0.7.0-rc2/tests/zfs-tests/include/libtest.shlib 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/include/libtest.shlib 2017-01-20 18:18:28.000000000 +0000 @@ -44,6 +44,32 @@ fi } +# Determine if this is a 32-bit system +# +# Return 0 if platform is 32-bit, 1 if otherwise + +function is_32bit +{ + if [[ $(getconf LONG_BIT) == "32" ]]; then + return 0 + else + return 1 + fi +} + +# Determine 
if kmemleak is enabled +# +# Return 0 if kmemleak is enabled, 1 if otherwise + +function is_kmemleak +{ + if is_linux && [[ -e /sys/kernel/debug/kmemleak ]]; then + return 0 + else + return 1 + fi +} + # Determine whether a dataset is mounted # # $1 dataset name @@ -888,7 +914,7 @@ # filenum: the maximum number of files per subdirectory # bytes: number of bytes to write # num_writes: numer of types to write out bytes -# data: the data that will be writen +# data: the data that will be written # # E.g. # file_fs /testdir 20 25 1024 256 0 @@ -1039,7 +1065,7 @@ } # -# Given a mountpoint, or a dataset name, determine if it is shared. +# Given a mountpoint, or a dataset name, determine if it is shared via NFS. # # Returns 0 if shared, 1 otherwise. # @@ -1048,11 +1074,6 @@ typeset fs=$1 typeset mtpt - if is_linux; then - log_unsupported "Currently unsupported by the test framework" - return 1 - fi - if [[ $fs != "/"* ]] ; then if datasetnonexists "$fs" ; then return 1 @@ -1067,6 +1088,15 @@ fi fi + if is_linux; then + for mtpt in `$SHARE | $AWK '{print $1}'` ; do + if [[ $mtpt == $fs ]] ; then + return 0 + fi + done + return 1 + fi + for mtpt in `$SHARE | $AWK '{print $2}'` ; do if [[ $mtpt == $fs ]] ; then return 0 @@ -1082,18 +1112,42 @@ } # -# Given a mountpoint, determine if it is not shared. +# Given a dataset name determine if it is shared via SMB. # -# Returns 0 if not shared, 1 otherwise. +# Returns 0 if shared, 1 otherwise. # -function not_shared +function is_shared_smb { typeset fs=$1 + typeset mtpt + + if datasetnonexists "$fs" ; then + return 1 + else + fs=$(echo $fs | sed 's@/@_@g') + fi if is_linux; then + for mtpt in `$NET usershare list | $AWK '{print $1}'` ; do + if [[ $mtpt == $fs ]] ; then + return 0 + fi + done + return 1 + else log_unsupported "Currently unsupported by the test framework" return 1 fi +} + +# +# Given a mountpoint, determine if it is not shared via NFS. +# +# Returns 0 if not shared, 1 otherwise. +# +function not_shared +{ + typeset fs=$1 is_shared $fs if (($? == 0)); then @@ -1104,18 +1158,30 @@ } # -# Helper function to unshare a mountpoint. +# Given a dataset determine if it is not shared via SMB. # -function unshare_fs #fs +# Returns 0 if not shared, 1 otherwise. +# +function not_shared_smb { typeset fs=$1 - if is_linux; then - log_unsupported "Currently unsupported by the test framework" + is_shared_smb $fs + if (($? == 0)); then return 1 fi - is_shared $fs + return 0 +} + +# +# Helper function to unshare a mountpoint. +# +function unshare_fs #fs +{ + typeset fs=$1 + + is_shared $fs || is_shared_smb $fs if (($? == 0)); then log_must $ZFS unshare $fs fi @@ -1124,6 +1190,78 @@ } # +# Helper function to share a NFS mountpoint. +# +function share_nfs #fs +{ + typeset fs=$1 + + if is_linux; then + is_shared $fs + if (($? != 0)); then + log_must $SHARE "*:$fs" + fi + else + is_shared $fs + if (($? != 0)); then + log_must $SHARE -F nfs $fs + fi + fi + + return 0 +} + +# +# Helper function to unshare a NFS mountpoint. +# +function unshare_nfs #fs +{ + typeset fs=$1 + + if is_linux; then + is_shared $fs + if (($? == 0)); then + log_must $UNSHARE -u "*:$fs" + fi + else + is_shared $fs + if (($? == 0)); then + log_must $UNSHARE -F nfs $fs + fi + fi + + return 0 +} + +# +# Helper function to show NFS shares. +# +function showshares_nfs +{ + if is_linux; then + $SHARE -v + else + $SHARE -F nfs + fi + + return 0 +} + +# +# Helper function to show SMB shares. 
+# +function showshares_smb +{ + if is_linux; then + $NET usershare list + else + $SHARE -F smb + fi + + return 0 +} + +# # Check NFS server status and trigger it online. # function setup_nfs_server @@ -1136,7 +1274,7 @@ fi if is_linux; then - log_unsupported "Currently unsupported by the test framework" + log_note "NFS server must started prior to running test framework." return fi @@ -1698,7 +1836,7 @@ } # -# Use create_pool()/destroy_pool() to clean up the infomation in +# Use create_pool()/destroy_pool() to clean up the information in # in the given disk to avoid slice overlapping. # function cleanup_devices #vdevs diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/acl/acl_common.kshlib zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/acl/acl_common.kshlib --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/acl/acl_common.kshlib 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/acl/acl_common.kshlib 2017-01-20 18:18:28.000000000 +0000 @@ -237,7 +237,7 @@ } # -# Count how many ACEs for the speficied file or directory. +# Count how many ACEs for the specified file or directory. # # $1 file or directroy name # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/bootfs/bootfs_008_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/bootfs/bootfs_008_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/bootfs/bootfs_008_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/bootfs/bootfs_008_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -34,7 +34,7 @@ # # STRATEGY: # 1. create pools based on a valid vdev -# 2. create a filesytem on this pool and set the compression property to gzip1-9 +# 2. create a filesystem on this pool and set the compression property to gzip1-9 # 3. set the pool's bootfs property to filesystem we just configured which # should fail # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/casenorm/case_all_values.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/casenorm/case_all_values.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/casenorm/case_all_values.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/casenorm/case_all_values.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -21,7 +21,7 @@ # Check that we can create FS with any supported casesensitivity value. # # STRATEGY: -# For all suported casesensitivity values: +# For all supported casesensitivity values: # 1. Create FS with given casesensitivity value. verify_runnable "global" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/chattr_001_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/chattr_001_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/chattr_001_pos.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/chattr_001_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,75 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/userquota/userquota_common.kshlib + +# +# +# DESCRIPTION: +# Check whether chattr works as expected +# +# +# STRATEGY: +# 1. Create 3 files +# 2. Use chattr to make them writable, immutable and appendonly +# 3. Try to write and append to each file +# + +set -A files writable immutable append + +function cleanup +{ + for i in ${files[*]}; do + log_must chattr -ia $TESTDIR/$i + log_must rm -f $TESTDIR/$i + done +} + +log_onexit cleanup + +log_assert "Check whether chattr works as expected" + +log_must touch $TESTDIR/writable +log_must touch $TESTDIR/immutable +log_must touch $TESTDIR/append + +log_must chattr -i $TESTDIR/writable +log_must chattr +i $TESTDIR/immutable +log_must chattr +a $TESTDIR/append + +log_must echo test > $TESTDIR/writable +log_must echo test >> $TESTDIR/writable +log_mustnot echo test > $TESTDIR/immutable +log_mustnot echo test >> $TESTDIR/immutable +log_mustnot echo test > $TESTDIR/append +log_must echo test >> $TESTDIR/append + +log_pass "chattr works as expected" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/chattr_002_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/chattr_002_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/chattr_002_neg.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/chattr_002_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,81 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/userquota/userquota_common.kshlib + +# +# +# DESCRIPTION: +# Check whether unprivileged user can chattr +# +# +# STRATEGY: +# 1. Create 3 files +# 2. Use chattr to make them writable, immutable and appendonly +# 3. 
Try to chattr with unprivileged user +# + +set -A files writable immutable append + +function cleanup +{ + for i in ${files[*]}; do + log_must chattr -ia $TESTDIR/$i + log_must rm -f $TESTDIR/$i + done + log_must $CHMOD 0755 $TESTDIR +} + +log_onexit cleanup + +log_assert "Check whether unprivileged user can chattr" + +log_must $CHMOD 0777 $TESTDIR + +log_must user_run $QUSER1 touch $TESTDIR/writable +log_must user_run $QUSER1 touch $TESTDIR/immutable +log_must user_run $QUSER1 touch $TESTDIR/append + +log_must chattr -i $TESTDIR/writable +log_must chattr +i $TESTDIR/immutable +log_must chattr +a $TESTDIR/append + +log_must user_run $QUSER1 chattr -i $TESTDIR/writable +log_must user_run $QUSER1 chattr -a $TESTDIR/writable +log_must user_run $QUSER1 chattr +i $TESTDIR/immutable +log_must user_run $QUSER1 chattr +a $TESTDIR/append + +log_mustnot user_run $QUSER1 chattr +i $TESTDIR/writable +log_mustnot user_run $QUSER1 chattr +a $TESTDIR/writable +log_mustnot user_run $QUSER1 chattr -i $TESTDIR/immutable +log_mustnot user_run $QUSER1 chattr -a $TESTDIR/append + +log_pass "Unprivileged user cannot chattr as expected" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/cleanup.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/cleanup.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/cleanup.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/cleanup.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,37 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/userquota/userquota_common.kshlib + +log_must clean_user_group + +default_cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/Makefile.am zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/Makefile.am --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,6 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/chattr +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + chattr_001_pos.ksh \ + chattr_002_neg.ksh diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/setup.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/setup.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/chattr/setup.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/chattr/setup.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,44 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/userquota/userquota_common.kshlib + +verify_runnable "both" + +log_must clean_user_group + +log_must add_group $QGROUP +log_must add_user $QGROUP $QUSER1 +log_must add_user $QGROUP $QUSER2 + +DISK=${DISKS%% *} +default_setup $DISK diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_001_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -37,7 +37,7 @@ # return an error. # # STRATEGY: -# 1. Create an array containg bad zdb parameters. +# 1. Create an array containing bad zdb parameters. # 2. For each element, execute the sub-command. # 3. Verify it returns an error. 
# diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -35,7 +35,7 @@ # DESCRIPTION: # 'zfs clone' should fail with inapplicable scenarios, including: # * Null arguments -# * non-existant snapshots. +# * non-existent snapshots. # * invalid characters in ZFS namesapec # * Leading slash in the target clone name # * The argument contains an empty component. diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -146,7 +146,7 @@ i=1 log_must setup_ds -log_note "Verify zfs clone propery for multiple clones" +log_note "Verify zfs clone property for multiple clones" names=$($ZFS list -rt all -o name $TESTPOOL) log_must verify_clones 3 0 diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -17,4 +17,5 @@ zfs_create_010_neg.ksh \ zfs_create_011_pos.ksh \ zfs_create_012_pos.ksh \ - zfs_create_013_pos.ksh + zfs_create_013_pos.ksh \ + zfs_create_014_pos.ksh diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_014_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_014_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_014_pos.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_014_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,59 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# check 'zfs create <filesystem>' works at the name length boundary +# +# STRATEGY: +# 1. Verify creating filesystem with name length 255 would succeed +# 2. Verify creating filesystem with name length 256 would fail +# 3. Verify the pool can be re-imported + +verify_runnable "both" + +# namelen 255 and 256 +TESTFS1=$(for i in $(seq $((254 - ${#TESTPOOL}))); do echo z ; done | tr -d '\n') +TESTFS2=$(for i in $(seq $((255 - ${#TESTPOOL}))); do echo z ; done | tr -d '\n') + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && + log_must $ZFS destroy $TESTPOOL/$TESTFS1 +} + +log_onexit cleanup + +log_assert "'zfs create <filesystem>' can create a ZFS filesystem with name length 255." + +log_must $ZFS create $TESTPOOL/$TESTFS1 +log_mustnot $ZFS create $TESTPOOL/$TESTFS2 +log_must $ZPOOL export $TESTPOOL +log_must $ZPOOL import $TESTPOOL + +log_pass "'zfs create <filesystem>' works as expected." diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -73,7 +73,7 @@ log_must $ZFS create -V $VOLSIZE $TESTPOOL/$TESTVOL # Max volume size is 1TB on 32-bit systems - [[ $($ISAINFO -b) == 32 ]] && \ + is_32bit && \ BIGVOLSIZE=1Tb log_must $ZFS create -sV $BIGVOLSIZE $TESTPOOL/$TESTVOL1 fi diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -129,25 +129,25 @@ done log_note "Verify that 'zfs destroy -r' fails to destroy dataset " \ - "with clone dependant outside it." + "with dependent clone outside it." for obj in $child_fs $child_fs1 $ctr $ctr1; do log_mustnot $ZFS destroy -r $obj datasetexists $obj || \ - log_fail "'zfs destroy -r' fails to keep clone " \ - "dependant outside the hirearchy." + log_fail "'zfs destroy -r' fails to keep dependent " \ + "clone outside the hierarchy." done log_note "Verify that 'zfs destroy -R' succeeds to destroy dataset " \ - "with clone dependant outside it." + "with dependent clone outside it." log_must $ZFS destroy -R $ctr1 datasetexists $ctr1 && \ log_fail "'zfs destroy -R' fails to destroy dataset with clone outside it." log_note "Verify that 'zfs destroy -r' succeeds to destroy dataset " \ - "without clone dependant outside it." + "without dependent clone outside it."
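# ('zfs destroy -r' recurses through child datasets and their snapshots only,
# so it must refuse to run when a snapshot inside the target hierarchy has a
# clone outside it; 'zfs destroy -R' additionally destroys such external
# dependents, which is why the -r cases above fail while the -R case below
# succeeds.)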
log_must $ZFS destroy -r $ctr datasetexists $ctr && \ diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -35,7 +35,7 @@ # # DESCRIPTION: -# Seperately verify 'zfs destroy -f|-r|-rf|-R|-rR ' will fail in +# Separately verify 'zfs destroy -f|-r|-rf|-R|-rR ' will fail in # different conditions. # # STRATEGY: @@ -50,7 +50,7 @@ verify_runnable "both" -log_assert "Seperately verify 'zfs destroy -f|-r|-rf|-R|-rR ' will " \ +log_assert "Separately verify 'zfs destroy -f|-r|-rf|-R|-rR ' will " \ "fail in different conditions." log_onexit cleanup_testenv diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -30,7 +30,7 @@ ################################################################################ # -# When using 'zfs destroy -R' on a file system heirarchy that inclues a +# When using 'zfs destroy -R' on a file system hierarchy that includes a # snapshot and a clone of that snapshot, and the snapshot has been # defer-destroyed, make sure that the 'zfs destroy -R' works as expected. # In particular make sure that libzfs is not confused by the fact that the diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -90,7 +90,7 @@ verify_snapshots done -log_note "Verify invalid arguements" +log_note "Verify invalid arguments" setup_snapshots for args in $invalid_args; do log_mustnot $ZFS destroy $TESTPOOL/$TESTFS1$args @@ -98,7 +98,7 @@ log_must verify_snapshots 1 done -log_note "Destroy the begining range" +log_note "Destroy the beginning range" log_must $ZFS destroy $TESTPOOL/$TESTFS1@%snap3 log_must $ZFS destroy $TESTPOOL/$TESTVOL@%snap3 diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib 2017-01-20 18:18:28.000000000 +0000 @@ -113,7 +113,7 @@ # # Delete volume and related datasets from list, if the test cases was -# runing in local zone. Then check them are existed or non-exists. +# running in local zone. 
Then check whether they exist or not. # # $1 function name # $2-n datasets name diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -36,11 +36,16 @@ # 1. Create a multiple depth filesystem. # 2. 'zfs get -d ' to get the output. # 3. 'zfs get -r|egrep' to get the expected output. -# 4. Compare the two outputs, they shoud be same. +# 4. Compare the two outputs, they should be the same. # verify_runnable "both" +# See issue: https://github.com/zfsonlinux/zfs/issues/5479 +if is_kmemleak; then + log_unsupported "Test case runs slowly when kmemleak is enabled" +fi + log_assert "'zfs get -d ' should get expected output." log_onexit depth_fs_cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -85,7 +85,7 @@ log_must $ZFS mount $fs1 log_must $LS $testfile1 $mntpnt1/$TESTFILE2 -# Verify $TESTFILE2 was not created in $fs, and $fs is accessable again. +# Verify $TESTFILE2 was not created in $fs, and $fs is accessible again. log_mustnot $LS $mntpnt/$TESTFILE2 log_must $LS $testfile diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -40,7 +40,7 @@ # 1. Create multiple snapshots and a clone to a middle point snapshot # 2. Promote the clone filesystem # 3. Verify the origin filesystem and promoted filesystem include -# correct datasets seperated by the clone point. +# correct datasets separated by the clone point.
# verify_runnable "both" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -14,4 +14,5 @@ zfs_receive_010_pos.ksh \ zfs_receive_011_pos.ksh \ zfs_receive_012_pos.ksh \ - zfs_receive_013_pos.ksh + zfs_receive_013_pos.ksh \ + zfs_receive_014_pos.ksh diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,122 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2016, loli10K. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib + +# +# DESCRIPTION: +# Verify ZFS successfully receive and restore properties. +# +# STRATEGY: +# 1. Create a filesystem. +# 2. Create a full stream with properties and receive it. +# 3. Create also an incremental stream without some properties and a truncated +# stream. +# 4. Fail to receive the truncated incremental stream and verify previously +# received properties are still present. +# 5. Receive the complete incremental send stream and verify that sent +# properties are successfully received. +# + +verify_runnable "both" + +orig=$TESTPOOL/$TESTFS1 +dest=$TESTPOOL/$TESTFS2 +typeset userprop=$(valid_user_property 8) +typeset userval=$(user_property_value 8) +typeset streamfile_full=/var/tmp/streamfile_full.$$ +typeset streamfile_incr=/var/tmp/streamfile_incr.$$ +typeset streamfile_trun=/var/tmp/streamfile_trun.$$ + +function cleanup +{ + log_must $RM $streamfile_full + log_must $RM $streamfile_incr + log_must $RM $streamfile_trun + log_must $ZFS destroy -rf $orig + log_must $ZFS destroy -rf $dest +} + +# +# Verify property $2 is set from source $4 on dataset $1 and has value $3. +# +# $1 checked dataset +# $2 user property +# $3 property value +# $4 source +# +function check_prop_source +{ + typeset dataset=$1 + typeset prop=$2 + typeset value=$3 + typeset source=$4 + typeset chk_value=$(get_prop "$prop" "$dataset") + typeset chk_source=$(get_source "$prop" "$dataset") + if [[ "$chk_value" != "$value" || \ + "$chk_source" != "$4" ]] + then + return 1 + else + return 0 + fi +} + +log_assert "ZFS successfully receive and restore properties." +log_onexit cleanup + +# 1. Create a filesystem. +log_must eval "$ZFS create $orig" +mntpnt=$(get_prop mountpoint $orig) + +# 2. Create a full stream with properties and receive it. 
+log_must eval "$ZFS set compression='gzip-1' $orig" +log_must eval "$ZFS set '$userprop'='$userval' $orig" +log_must eval "$ZFS snapshot $orig@snap1" +log_must eval "$ZFS send -p $orig@snap1 > $streamfile_full" +log_must eval "$ZFS recv $dest < $streamfile_full" +log_must eval "check_prop_source $dest compression 'gzip-1' received" +log_must eval "check_prop_source $dest '$userprop' '$userval' received" + +# 3. Create also an incremental stream without some properties and a truncated +# stream. +log_must eval "$ZFS set compression='gzip-2' $orig" +log_must eval "$ZFS inherit '$userprop' $orig" +log_must eval "$DD if=/dev/urandom of=$mntpnt/file bs=1024k count=10" +log_must eval "$ZFS snapshot $orig@snap2" +log_must eval "$ZFS send -p -i $orig@snap1 $orig@snap2 > $streamfile_incr" +log_must eval "$DD if=$streamfile_incr of=$streamfile_trun bs=1024k count=9" +log_must eval "$ZFS snapshot $orig@snap3" +log_must eval "$ZFS send -p -i $orig@snap1 $orig@snap3 > $streamfile_incr" + +# 4. Fail to receive the truncated incremental stream and verify previously +# received properties are still present. +log_mustnot eval "$ZFS recv -F $dest < $streamfile_trun" +log_must eval "check_prop_source $dest compression 'gzip-1' received" +log_must eval "check_prop_source $dest '$userprop' '$userval' received" + +# 5. Receive the complete incremental send stream and verify that sent +# properties are successfully received. +log_must eval "$ZFS recv -F $dest < $streamfile_incr" +log_must eval "check_prop_source $dest compression 'gzip-2' received" +log_must eval "check_prop_source $dest '$userprop' '-' '-'" + +log_pass "ZFS properties are successfully received and restored." diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -30,7 +30,7 @@ # # DESCRIPTION: -# Seperately verify 'zfs rollback ''|-f|-r|-rf|-R|-rR will fail in +# Separately verify 'zfs rollback ''|-f|-r|-rf|-R|-rR will fail in # different conditions. # # STRATEGY: @@ -53,7 +53,7 @@ done } -log_assert "Seperately verify 'zfs rollback ''|-f|-r|-rf will fail in " \ +log_assert "Separately verify 'zfs rollback ''|-f|-r|-rf will fail in " \ "different conditions." 
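# (For reference: 'zfs rollback -r' destroys snapshots more recent than the
# rollback target, -R also destroys any clones of those snapshots, and -f
# forces unmounting of the clones being destroyed, so each flag combination
# is expected to fail here for a different reason.)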
log_onexit cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -61,13 +61,13 @@ # if is_linux; then set -A args \ - "dev" "/dev/" "nodev" "/nodev/" \ - "exec" "/exec/" "noexec" "/noexec/" \ - "mand" "/mand/" "nomand" "/nomand/" \ - "ro" "read only" "rw" "read/write" \ - "suid" "/suid/" "nosuid" "/nosuid/" \ - "xattr" "/xattr/" "noxattr" "/noxattr/" \ - "atime" "/atime/" "noatime" "/noatime/" + "nodev" "dev" \ + "noexec" "exec" \ + "mand" "nomand" \ + "ro" "rw" \ + "nosuid" "suid" \ + "xattr" "noxattr" \ + "atime" "noatime" else set -A args \ "devices" "/devices/" "nodevices" "/nodevices/" \ @@ -90,26 +90,40 @@ while ((i < ${#args[@]})); do if is_linux; then log_must $MOUNT -t zfs -o ${args[$i]} $testfs $tmpmnt + + msg=$($MOUNT | $GREP "$tmpmnt ") + + $ECHO $msg | $GREP "${args[((i))]}" > /dev/null 2>&1 + if (($? != 0)) ; then + $ECHO $msg | $GREP "${args[((i-1))]}" > /dev/null 2>&1 + if (($? == 0)) ; then + log_fail "Expected option: ${args[((i))]} \n" \ + "Real option: $msg" + fi + fi + + log_must $UMOUNT $tmpmnt + ((i += 1)) else log_must $MOUNT -F zfs -o ${args[$i]} $testfs $tmpmnt - fi - msg=$($MOUNT | $GREP "^$tmpmnt ") - if ! is_linux; then + msg=$($MOUNT | $GREP "^$tmpmnt ") + # In LZ, a user with all zone privileges can never "devices" if ! is_global_zone && [[ ${args[$i]} == devices ]] ; then args[((i+1))]="/nodevices/" fi - fi - $ECHO $msg | $GREP "${args[((i+1))]}" > /dev/null 2>&1 - if (($? != 0)) ; then - log_fail "Expected option: ${args[((i+1))]} \n" \ - "Real option: $msg" - fi + $ECHO $msg | $GREP "${args[((i+1))]}" > /dev/null 2>&1 + if (($? != 0)) ; then + log_fail "Expected option: ${args[((i+1))]} \n" \ + "Real option: $msg" + fi + - log_must $UMOUNT $tmpmnt - ((i += 2)) + log_must $UMOUNT $tmpmnt + ((i += 2)) + fi done log_pass "With legacy mount, FSType-specific option works well passed." 
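For reference, the Linux-branch check above boils down to the following standalone sketch; the $MOUNT/$GREP/$ECHO/$UMOUNT wrappers and log_* helpers come from the test library, and the dataset $testfs, mountpoint $tmpmnt, and the ro/rw option pair are illustrative assumptions, not part of the patch:

# Mount with one member of an option pair, then inspect what the kernel
# reports for that mountpoint.
log_must $MOUNT -t zfs -o ro $testfs $tmpmnt
msg=$($MOUNT | $GREP "$tmpmnt ")
# Only fail when the opposite option is reported; an option that the
# kernel simply does not echo back is tolerated, as in the loop above.
if ! $ECHO $msg | $GREP "ro" > /dev/null 2>&1; then
        $ECHO $msg | $GREP "rw" > /dev/null 2>&1 && \
                log_fail "Expected option: ro \n Real option: $msg"
fi
log_must $UMOUNT $tmpmnt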
diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -72,6 +72,8 @@ # Create filesystem and volume's snapshot create_snapshot $TESTPOOL/$TESTFS $TESTSNAP create_snapshot $TESTPOOL/$TESTVOL $TESTSNAP +sync_pool $TESTPOOL +$SLEEP 5 typeset -i i=0 typeset -i j=0 diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib 2017-01-20 18:18:28.000000000 +0000 @@ -124,7 +124,7 @@ } # -# Random select charactor from the specified charactor set and combine into a +# Random select character from the specified character set and combine into a # random string # # $1 character set name diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_005_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_005_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_005_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_005_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -68,7 +68,7 @@ log_fail "get sharenfs failed. ($option != ${shareopts[i]})" fi - $SHARE | $GREP $option > /dev/null 2>&1 + showshares_nfs | $GREP $option > /dev/null 2>&1 if (( $? != 0 )); then log_fail "The '$option' option was not found in share output." fi diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -59,7 +59,7 @@ log_note "Setting sharenfs=${badopts[i]} $i " log_mustnot $ZFS set sharenfs="${badopts[i]}" $TESTPOOL/$TESTFS - $SHARE | $GREP $option > /dev/null 2>&1 + showshares_nfs | $GREP $option > /dev/null 2>&1 if (( $? == 0 )); then log_fail "An invalid setting '$option' was propagated." fi diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_009_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_009_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_009_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_009_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -59,7 +59,7 @@ log_must $ZFS set sharenfs=on $fs fi -$SHARE | $GREP $mpt >/dev/null 2>&1 +showshares_nfs | $GREP $mpt >/dev/null 2>&1 if (( $? 
!= 0 )); then log_must $ZFS share $fs fi diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -32,7 +32,7 @@ # are not in the same pool. # # STRATEGY: -# 1. Create 2 separate zpools, zpool name lenghts must be the same. +# 1. Create 2 separate zpools, zpool name lengths must be the same. # 2. Attempt to simultaneously create a snapshot of each pool. # 3. Veriy the snapshot creation failed. # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -20,7 +20,7 @@ # # STRATEGY # 1. Create multiple datasets -# 2. Create mutiple snapshots with a list of valid and invalid +# 2. Create multiple snapshots with a list of valid and invalid # snapshot names # 3. Verify the valid snpashot creation diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/Makefile.am zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/Makefile.am --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -6,4 +6,5 @@ zfs_unshare_002_pos.ksh \ zfs_unshare_003_pos.ksh \ zfs_unshare_004_neg.ksh \ - zfs_unshare_005_neg.ksh + zfs_unshare_005_neg.ksh \ + zfs_unshare_006_pos.ksh diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -88,7 +88,7 @@ if [[ $prop_value == "off" ]]; then not_shared $mntp || - log_must $UNSHARE -F nfs $mntp + log_must eval "unshare_nfs $mntp" log_must $ZFS set sharenfs=on $filesystem is_shared $mntp || \ log_fail "'$ZFS set sharenfs=on' fails to make" \ diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -45,7 +45,7 @@ typeset -i i=0 while (( i < ${#mntp_fs[*]} )); do is_shared ${mntp_fs[i]} && \ - log_must $UNSHARE -F nfs ${mntp_fs[i]} + log_must eval 
"unshare_nfs ${mntp_fs[i]}" ((i = i + 2)) done @@ -86,7 +86,7 @@ log_fail "'zfs set sharenfs=off' fails to make ZFS " \ "filesystem $filesystem unshared." - log_must $SHARE -F nfs $mntp + log_must eval "share_nfs $mntp" is_shared $mntp || \ log_fail "'share' command fails to share ZFS file system." # @@ -150,7 +150,7 @@ # i=0 while (( i < ${#mntp_fs[*]} )); do - $SHARE -F nfs ${mntp_fs[i]} + share_nfs ${mntp_fs[i]} is_shared ${mntp_fs[i]} || \ log_fail "'$SHARE' shares ZFS filesystem failed." diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -29,7 +29,7 @@ # # DESCRIPTION: -# Verify that a file system and its dependant are unshared when turn off sharenfs +# Verify that a file system and its dependent are unshared when turn off sharenfs # property. # # STRATEGY: @@ -68,7 +68,7 @@ prop_value=$(get_prop "sharenfs" $filesystem) if [[ $prop_value == "off" ]]; then - is_shared $mntp || $UNSHARE -F nfs $mntp + is_shared $mntp || unshare_nfs $mntp log_must $ZFS set sharenfs=on $filesystem fi @@ -81,10 +81,10 @@ log_fail "Snapshot $mntpt@snapshot is shared (set sharenfs)." } -log_assert "Verify that a file system and its dependant are unshared." +log_assert "Verify that a file system and its dependent are unshared." log_onexit cleanup log_must $ZFS snapshot $TESTPOOL/$TESTFS@snapshot test_snap_unshare $TESTDIR $TESTPOOL/$TESTFS -log_pass "A file system and its dependant are both unshared as expected." +log_pass "A file system and its dependent are both unshared as expected." diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,88 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2016, loli10K. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that 'zfs unshare [nfs|smb] -a' unshares only filesystems shared by the +# specified protocol. +# +# STRATEGY: +# 1. Share filesystems with different protocols. +# 2. 
Invoke 'zfs unshare nfs -a' to unshare filesystems. +# 3. Verify that only nfs filesystems are unshared. +# 4. Share all filesystems again. +# 5. Invoke 'zfs unshare smb -a' and verify only smb filesystems are unshared. +# + +verify_runnable "global" + +function cleanup +{ + log_must $ZFS unshare -a + log_must $ZFS destroy -f $TESTPOOL/$TESTFS/shared1 + log_must $ZFS destroy -f $TESTPOOL/$TESTFS/shared2 + log_must $ZFS destroy -f $TESTPOOL/$TESTFS/shared3 +} + +log_assert "Verify '$ZFS unshare [nfs|smb] -a' only works on the specified "\ + "protocol." +log_onexit cleanup + +# 1. Share filesystems with different protocols. +log_must $ZFS create $TESTPOOL/$TESTFS/shared1 +log_must $ZFS create $TESTPOOL/$TESTFS/shared2 +log_must $ZFS create $TESTPOOL/$TESTFS/shared3 +log_must $ZFS set mountpoint=$TESTDIR/1 $TESTPOOL/$TESTFS/shared1 +log_must $ZFS set mountpoint=$TESTDIR/2 $TESTPOOL/$TESTFS/shared2 +log_must $ZFS set mountpoint=$TESTDIR/3 $TESTPOOL/$TESTFS/shared3 +log_must $ZFS set sharenfs=on $TESTPOOL/$TESTFS/shared1 +log_must $ZFS set sharenfs=on $TESTPOOL/$TESTFS/shared2 +log_must $ZFS set sharesmb=on $TESTPOOL/$TESTFS/shared2 +log_must $ZFS set sharesmb=on $TESTPOOL/$TESTFS/shared3 +log_must $ZFS share -a + +# 2. Invoke 'zfs unshare nfs -a' to unshare filesystems. +log_must $ZFS unshare nfs -a + +# 3. Verify that only nfs filesystems are unshared. +log_must eval "not_shared $TESTPOOL/$TESTFS/shared1" +log_must eval "not_shared $TESTPOOL/$TESTFS/shared2" +log_must eval "is_shared_smb $TESTPOOL/$TESTFS/shared2" +log_must eval "is_shared_smb $TESTPOOL/$TESTFS/shared3" + +# 4. Share all filesystems again. +log_must $ZFS share -a + +# 5. Invoke 'zfs unshare smb -a' and verify only smb filesystems are unshared. +log_must $ZFS unshare smb -a +log_must eval "is_shared $TESTPOOL/$TESTFS/shared1" +log_must eval "is_shared $TESTPOOL/$TESTFS/shared2" +log_must eval "not_shared_smb $TESTPOOL/$TESTFS/shared2" +log_must eval "not_shared_smb $TESTPOOL/$TESTFS/shared3" + +log_pass "'$ZFS unshare [nfs|smb] -a' only works on the specified protocol." diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -37,7 +37,7 @@ # should run successfully. # # STRATEGY: -# 1. Create an array containg each zpool options. +# 1. Create an array containing each zpool options. # 2. For each element, execute the zpool command. # 3. Verify it run successfully. # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_add/setup.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_add/setup.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_add/setup.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_add/setup.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -45,7 +45,7 @@ if [[ -n $DISK ]]; then # - # Use 'zpool create' to clean up the infomation in + # Use 'zpool create' to clean up the information in # in the given disk to avoid slice overlapping. 
# cleanup_devices $DISK diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -45,6 +45,11 @@ verify_runnable "global" +# See issue: https://github.com/zfsonlinux/zfs/issues/5479 +if is_kmemleak; then + log_unsupported "Test case runs slowly when kmemleak is enabled" +fi + function cleanup { poolexists $TESTPOOL1 && \ @@ -55,7 +60,6 @@ done } - log_assert "Verify 'zpool clear' can clear errors of a storage pool." log_onexit cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -47,7 +47,7 @@ if [[ -n $DISK ]]; then # - # Use 'zpool create' to clean up the infomation in + # Use 'zpool create' to clean up the information in # in the given disk to avoid slice overlapping. # cleanup_devices $DISK diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -39,6 +39,11 @@ verify_runnable "global" +# See issue: https://github.com/zfsonlinux/zfs/issues/5479 +if is_kmemleak; then + log_unsupported "Test case runs slowly when kmemleak is enabled" +fi + function cleanup { if [[ -n "$child_pids" ]]; then diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -16,6 +16,7 @@ zpool_import_011_neg.ksh \ zpool_import_012_pos.ksh \ zpool_import_013_neg.ksh \ + zpool_import_014_pos.ksh \ zpool_import_all_001_pos.ksh \ zpool_import_features_001_pos.ksh \ zpool_import_features_002_neg.ksh \ diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_012_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_012_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_012_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_012_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -100,7 +100,7 @@ can be restored after import [-Df]." 
setup_filesystem "$DEVICE_FILES" $TESTPOOL1 $TESTFS $TESTDIR1 -# create a heirarchy of filesystem +# create a hierarchy of filesystem for pool in ${pools[@]} ; do log_must $ZFS create $pool/$TESTFS/$TESTCTR log_must $ZFS create $pool/$TESTFS/$TESTCTR/$TESTCTR1 diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_014_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_014_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_014_pos.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_014_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2016, loli10K. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# Temporary pool names should not be persisted on devices. +# +# STRATEGY: +# 1. Create pool A, then export it. +# 2. Re-import the pool with a temporary name B, then export it. +# 3. Verify device labels still contain the expected pool name (A). +# + +verify_runnable "global" + +function cleanup +{ + typeset dt + for dt in $poolB $poolA; do + destroy_pool $dt + done + + log_must $RM -rf $DEVICE_DIR/* + typeset i=0 + while (( i < $MAX_NUM )); do + log_must $MKFILE $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + ((i += 1)) + done +} + +# +# Verify name of (exported) pool from device $1 label is equal to $2 +# $1 device +# $2 pool name +# +function verify_pool_name +{ + typeset device=$1 + typeset poolname=$2 + typeset labelname + + $ZDB -e -l $device | $GREP " name:" | { + while read labelname ; do + if [[ "name: '$poolname'" != "$labelname" ]]; then + return 1 + fi + done + } + return 0 +} + +log_assert "Temporary pool names should not be persisted on devices." +log_onexit cleanup + +poolA=poolA.$$; poolB=poolB.$$; + +log_must $ZPOOL create $poolA $VDEV0 +log_must $ZPOOL export $poolA + +log_must $ZPOOL import -t $poolA $poolB -d $DEVICE_DIR +log_must $ZPOOL export $poolB + +log_must eval "verify_pool_name $VDEV0 $poolA" + +log_pass "Temporary pool names are not persisted on devices." 
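For reference, the behaviour verified above can be reproduced by hand; this is only a sketch outside the test harness, with an illustrative file vdev and pool names:

# Create a file-backed pool, export it, and re-import it under a
# temporary name; 'zpool import -t' renames only the running pool and
# leaves the name stored in the on-disk label untouched.
truncate -s 128M /var/tmp/vdev0
zpool create tank /var/tmp/vdev0
zpool export tank
zpool import -d /var/tmp -t tank tmptank
zpool export tmptank
# The exported label should still carry the original name:
zdb -e -l /var/tmp/vdev0 | grep " name:"    # expect: name: 'tank'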
diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -48,6 +48,11 @@ verify_runnable "global" +# See issue: https://github.com/zfsonlinux/zfs/issues/5444 +if is_32bit; then + log_unsupported "Test case fails on 32-bit systems" +fi + log_assert "Resilver prevent scrub from starting until the resilver completes" log_must $ZPOOL detach $TESTPOOL $DISK2 diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_user/misc/zpool_status_001_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_user/misc/zpool_status_001_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_user/misc/zpool_status_001_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_user/misc/zpool_status_001_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -69,6 +69,21 @@ log_must eval "$ZPOOL status -v $TESTPOOL > /tmp/pool-status.$$" check_pool_status +# Make sure -c option works, and that VDEV_PATH and VDEV_UPATH get set. +# +# grep for '^\s+/' to just get the vdevs (not pools). All vdevs will start with +# a '/' when we specify the path (-P) flag. We check for "{}" to see if one +# of the VDEV variables isn't set. +C1=$($ZPOOL status -P | $GREP -E '^\s+/' | $WC -l) +C2=$($ZPOOL status -P -c 'echo vdev_test{$VDEV_PATH}{$VDEV_UPATH}' | \ + $GREP -E '^\s+/' | $GREP -v '{}' | $WC -l) + +if [ "$C1" != "$C2" ] ; then + log_fail "zpool status -c option failed. Expected $C1 vdevs, got $C2" +else + log_pass "zpool status -c option passed. Expected $C1 vdevs, got $C2" +fi + # $TESTPOOL.virt has an offline device, so -x will show it log_must eval "$ZPOOL status -x $TESTPOOL.virt > /tmp/pool-status.$$" check_pool_status diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_007_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_007_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_007_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_007_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -38,7 +38,7 @@ # STRATEGY: # 1. 'zfs list -d ' to get the output. # 2. 'zfs list -r|egrep' to get the expected output. -# 3. Compare the two outputs, they shoud be same. +# 3. Compare the two outputs, they should be same. 
# verify_runnable "both" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -5,4 +5,5 @@ zpool_iostat_001_neg.ksh \ zpool_iostat_002_pos.ksh \ zpool_iostat_003_neg.ksh \ - zpool_iostat_004_pos.ksh + zpool_iostat_004_pos.ksh \ + zpool_iostat_005_pos.ksh diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +# +# Copyright (c) 2016 by Lawrence Livermore National Security, LLC. +# + + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +typeset testpool +if is_global_zone ; then + testpool=$TESTPOOL +else + testpool=${TESTPOOL%%/*} +fi + +# +# DESCRIPTION: +# Verify 'zpool iostat -c CMD' works, and that VDEV_PATH and VDEV_UPATH get set. +# +# STRATEGY: +# grep for '^\s+/' to just get the vdevs (not pools). All vdevs will start with +# a '/' when we specify the path (-P) flag. We check for "{}" to see if one +# of the VDEV variables isn't set. +# +C1=$($ZPOOL iostat -Pv $testpool | $GREP -E '^\s+/' | $WC -l) +C2=$($ZPOOL iostat -Pv -c 'echo vdev_test{$VDEV_PATH}{$VDEV_UPATH}' $testpool \ + | $GREP -E '^\s+/' | $GREP -v '{}' | $WC -l) +if [ "$C1" != "$C2" ] ; then + log_fail "zpool iostat -c failed, expected $C1 vdevs, got $C2" +else + log_note "zpool iostat -c passed, expected $C1 vdevs, got $C2" +fi + +# Call iostat on only a specific vdev, and verify that the command only gets +# run on the vdev. We write the command results to a temp file to verify that +# the command actually gets run, rather than just verifying that the results +# are *displayed* for the specific vdev. 
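+# (One $VDEV_PATH line is appended to the temp file for every vdev the -c
+# command actually runs against, so a word count of exactly 1 below proves
+# the command executed once, on the single vdev passed, rather than merely
+# being displayed for it.)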
+TMP=$($MKTEMP) +FIRST_VDEV=$($ZPOOL iostat -Pv $testpool | $GREP -Eo '^\s+/[^ ]+' | $HEAD -n 1) +log_must $ZPOOL iostat -Pv -c "echo \$VDEV_PATH >> $TMP" $testpool \ + $FIRST_VDEV > /dev/null +C2=$($WC -w < $TMP) +$RM $TMP +if [ "$C2" != "1" ] ; then + log_fail "zpool iostat -c failed, expected 1 vdev, got $C2" +else + log_note "zpool iostat -c passed, expected 1 vdev, got $C2" +fi diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/compression/compress_003_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/compression/compress_003_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/compression/compress_003_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/compression/compress_003_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -50,7 +50,7 @@ $RM -f $TESTDIR/* } -log_assert "Changing blocksize doesn't casue system panic with compression settings" +log_assert "Changing blocksize doesn't cause system panic with compression settings" log_onexit cleanup fs=$TESTPOOL/$TESTFS diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -60,8 +60,8 @@ # - rename - mount(8) does not permit non-superuser mounts # - zoned - zones are not supported # - destroy - umount(8) does not permit non-superuser umounts -# - sharenfs - sharing requires superuser priviliges -# - share - sharing requires superuser priviliges +# - sharenfs - sharing requires superuser privileges +# - share - sharing requires superuser privileges # - readonly - mount(8) does not permit non-superuser remounts # set -A perms create true false \ diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/devices/devices_common.kshlib zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/devices/devices_common.kshlib --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/devices/devices_common.kshlib 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/devices/devices_common.kshlib 2017-01-20 18:18:28.000000000 +0000 @@ -32,7 +32,7 @@ . $STF_SUITE/include/libtest.shlib # -# Create block file or charactor file according to parameter. +# Create block file or character file according to parameter. 
# # $1 device file type # $2 file name diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -46,6 +46,11 @@ verify_runnable "both" +# See issue: https://github.com/zfsonlinux/zfs/issues/5479 +if is_kmemleak; then + log_unsupported "Test case runs slowly when kmemleak is enabled" +fi + function cleanup { datasetexists $TEST_FS && log_must $ZFS destroy $TEST_FS diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/features/async_destroy/setup.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/features/async_destroy/setup.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/features/async_destroy/setup.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/features/async_destroy/setup.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -31,6 +31,10 @@ . $STF_SUITE/include/libtest.shlib +if is_32bit; then + log_unsupported "Test case fails on 32-bit systems" +fi + DISK=${DISKS%% *} default_setup $DISK diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -27,9 +27,11 @@ TEST_SEND_FS=$TESTPOOL/send_large_dnode TEST_RECV_FS=$TESTPOOL/recv_large_dnode TEST_SNAP=$TEST_SEND_FS@ldnsnap +TEST_SNAPINCR=$TEST_SEND_FS@ldnsnap_incr TEST_STREAM=$TESTDIR/ldnsnap +TEST_STREAMINCR=$TESTDIR/ldnsnap_incr TEST_FILE=foo - +TEST_FILEINCR=bar function cleanup { @@ -42,6 +44,7 @@ fi rm -f $TEST_STREAM + rm -f $TEST_STREAMINCR } log_onexit cleanup @@ -49,10 +52,13 @@ log_assert "zfs send stream with large dnodes accepted by new pool" log_must $ZFS create -o dnodesize=1k $TEST_SEND_FS -log_must touch /$TEST_SEND_FS/$TEST_FILE -log_must $ZFS umount $TEST_SEND_FS +log_must $TOUCH /$TEST_SEND_FS/$TEST_FILE log_must $ZFS snap $TEST_SNAP log_must $ZFS send $TEST_SNAP > $TEST_STREAM +log_must $RM -f /$TEST_SEND_FS/$TEST_FILE +log_must $TOUCH /$TEST_SEND_FS/$TEST_FILEINCR +log_must $ZFS snap $TEST_SNAPINCR +log_must $ZFS send -i $TEST_SNAP $TEST_SNAPINCR > $TEST_STREAMINCR log_must eval "$ZFS recv $TEST_RECV_FS < $TEST_STREAM" inode=$(ls -li /$TEST_RECV_FS/$TEST_FILE | awk '{print $1}') @@ -61,4 +67,9 @@ log_fail "dnode size is $dnsize (expected 1K)" fi +log_must eval "$ZFS recv -F $TEST_RECV_FS < $TEST_STREAMINCR" +log_must $DIFF -r /$TEST_SEND_FS /$TEST_RECV_FS +log_must $ZFS umount $TEST_SEND_FS +log_must $ZFS umount $TEST_RECV_FS + log_pass diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ 
zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -45,6 +45,11 @@ verify_runnable "global" +# See issue: https://github.com/zfsonlinux/zfs/issues/5479 +if is_kmemleak; then + log_unsupported "Test case runs slowly when kmemleak is enabled" +fi + log_assert "Test properties are inherited correctly" # @@ -152,10 +157,10 @@ # The mountpoint property is slightly different from other properties and # so is handled here. For all other properties if they are set to a specific # value at a higher level in the data hierarchy (i.e. checksum=on) then that -# value propogates down the hierarchy unchanged, with the source field being +# value propagates down the hierarchy unchanged, with the source field being # set to 'inherited from '. # -# The mountpoint property is different in that while the value propogates +# The mountpoint property is different in that while the value propagates # down the hierarchy, the value at each level is determined by a combination # of the top-level value and the current level in the hierarchy. # @@ -322,7 +327,7 @@ # set up correctly as specified in the # configX.cfg file (which includes 'set'ting # properties at a higher level and checking - # that they propogate down to the lower levels. + # that they propagate down to the lower levels. # # Note in a few places here, we use # check_failure, rather than log_must - this diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/README.config zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/README.config --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/README.config 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/README.config 2017-01-20 18:18:28.000000000 +0000 @@ -29,7 +29,7 @@ # or set locally. # # Format for this file is as follows: -# +# # # - must be the full dataset name # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state001.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state001.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state001.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state001.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -33,7 +33,7 @@ # No command is actually run (hence '-:-') but rather this state file is # used to verify that the property that was set on the top level pool # via the 'local' keyword (in the config1.cfg file) has correctly -# propogated down the hierarchy. +# propagated down the hierarchy. # # *** ASSERTION DESCRIPTION *** # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state002.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state002.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state002.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state002.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -33,7 +33,7 @@ # No command is actually run (hence '-:-') but rather this state file is # used to verify that the property that was set on the middle level # dataset via the 'local' keyword (in the configX.cfg file) has -# correctly propogated down the hierarchy to the filesystem underneath, +# correctly propagated down the hierarchy to the filesystem underneath, # while leaving the top level pools properties unchanged. 
# # *** ASSERTION DESCRIPTION *** diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state012.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state012.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state012.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state012.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -32,12 +32,12 @@ # # Verify that running 'zfs inherit -r' at each level of the data hierarchy # when the bottom filesystem level properties have been set locally results -# in the top level property values being propogated down the data +# in the top level property values being propagated down the data # hierarchy. # # Executing inherit -r at the middle level and bottom levels after # running it at the top level is somewhat redundant as the top level value -# should propogate down the entire data hierarchy. Done for completeness +# should propagate down the entire data hierarchy. Done for completeness # sake. # # *** ASSERTION DESCRIPTION *** diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state014.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state014.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state014.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state014.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -32,12 +32,12 @@ # # Verify that running 'zfs inherit -r' at each level of the data hierarchy # when the bottom and middle level properties have been set locally results -# in the top level property values being propogated down the data +# in the top level property values being propagated down the data # hierarchy. # # Note : executing inherit -r at the middle level and bottom levels after # running it at the top level is somewhat redundant as the top level value -# should propogate down the entire data hierarchy. Done for completeness +# should propagate down the entire data hierarchy. Done for completeness # sake. # # *** ASSERTION DESCRIPTION *** diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state015.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state015.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state015.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state015.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -40,7 +40,7 @@ # the bottom level. # # Executing 'zfs inherit' at the bottom level is somewhat redundant but -# is done for completness sake. +# is done for completeness sake. # # *** ASSERTION DESCRIPTION *** # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state016.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state016.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state016.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state016.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -37,7 +37,7 @@ # to the top level (default) values. # # Executing 'zfs inherit -r' at the bottom and middle levels after executing -# at the top level is somewhat redundant but ss done for completness sake. +# at the top level is somewhat redundant but ss done for completeness sake. 
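The recursive form these state files exercise can be sketched in a few lines. The pool and dataset names here are hypothetical, and this only illustrates the behaviour being asserted, not the test harness itself:

    # A property set locally at a lower level...
    $ZFS set compression=on tank/fs/subfs
    # ...is cleared by 'zfs inherit -r' run once at the top of the hierarchy.
    $ZFS inherit -r compression tank
    $ZFS get -H -o name,value,source -r compression tank
    # every dataset now reports the default value with source 'default'

Because the top-level dataset has no parent to inherit from, inheriting there reverts it to the default, and -r propagates that single operation down the whole tree; this is why the extra middle- and bottom-level invocations the comments mention are redundant.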
# # *** ASSERTION DESCRIPTION *** # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state017.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state017.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state017.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state017.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -41,7 +41,7 @@ # the values down to the bottom level. # # Executing 'zfs inherit' at the bottom level is somewhat redundant but -# is done for completness sake. +# is done for completeness sake. # # *** ASSERTION DESCRIPTION *** # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state018.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state018.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state018.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state018.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -34,11 +34,11 @@ # when the top level and middle level datasets properties are set locally, # and the bottom level has inherited its properties from the middle # level, results in the top level properties reverting back to their -# default values and being propogated down to the other datasets in the +# default values and being propagated down to the other datasets in the # hierarchy. # # Executing 'zfs inherit -r' at the middle and bottom levels after executing -# it at the top level is somewhat redundant but is done for completness sake. +# it at the top level is somewhat redundant but is done for completeness sake. # # *** ASSERTION DESCRIPTION *** # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state019.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state019.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state019.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state019.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -37,7 +37,7 @@ # levels inheriting the changed values. # # Executing 'zfs inherit' at the middle and bottom levels is somewhat -# redundant but is done for completness sake. +# redundant but is done for completeness sake. # # *** ASSERTION DESCRIPTION *** # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state020.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state020.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state020.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state020.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -37,7 +37,7 @@ # levels inheriting the changed values. # # Executing 'zfs inherit -r' at the middle and bottom levels is somewhat -# redundant but is done for completness sake. +# redundant but is done for completeness sake. # # *** ASSERTION DESCRIPTION *** # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state022.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state022.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state022.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state022.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -37,7 +37,7 @@ # levels inheriting the changed values. 
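For contrast with the recursive case, the plain 'zfs inherit' used by some of these state files affects only the one dataset it is given, pulling in whatever its immediate ancestry provides. A hedged sketch with hypothetical names:

    $ZFS set checksum=sha256 tank         # local at the top level
    $ZFS set checksum=fletcher4 tank/fs   # local at the middle level
    $ZFS inherit checksum tank/fs         # clear only the middle-level local value
    $ZFS get -H -o value,source checksum tank/fs
    # sha256    inherited from tank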
# # Executing 'zfs inherit -r' at the middle and bottom levels is somewhat -# redundant but is done for completness sake. +# redundant but is done for completeness sake. # # *** ASSERTION DESCRIPTION *** # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state024.cfg zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state024.cfg --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inheritance/state024.cfg 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inheritance/state024.cfg 2017-01-20 18:18:28.000000000 +0000 @@ -33,7 +33,7 @@ # Verify that executing 'zfs inherit -r' at the top level in the hierarchy # when each levels properties are set locally, results in the top level # properties reverting back to their default values, and the changed -# values being propogated down the hierarchy. +# values being propagated down the hierarchy. # # Executing 'zfs inherit -r' at the middle and bottom levels after doing so # at the top level is somewhat redundant but is done for completeness. diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inuse/inuse_004_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inuse/inuse_004_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/inuse/inuse_004_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/inuse/inuse_004_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -48,7 +48,7 @@ function cleanup { # - # Essentailly this is the default_cleanup rountine but I cannot get it + # Essentailly this is the default_cleanup routine but I cannot get it # to work correctly. So its reproduced below. Still need to full # understand why default_cleanup does not work correctly from here. # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/largest_pool/largest_pool_001_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/largest_pool/largest_pool_001_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/largest_pool/largest_pool_001_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/largest_pool/largest_pool_001_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -126,7 +126,7 @@ str=$($ZFS create -sV $volsize $TESTPOOL2/$TESTVOL 2>&1) ret=$? if (( ret != 0 )); then - if [[ $($ISAINFO -b) == 32 && \ + if [[ is_32bit && \ $str == *${VOL_LIMIT_KEYWORD1}* || \ $str == *${VOL_LIMIT_KEYWORD2}* || \ $str == *${VOL_LIMIT_KEYWORD3}* ]] diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/Makefile.am zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/Makefile.am --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/Makefile.am 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -5,6 +5,7 @@ cache \ cachefile \ casenorm \ + chattr \ checksum \ clean_mirror \ cli_root \ @@ -50,6 +51,7 @@ snapused \ sparse \ threadsappend \ + tmpfile \ truncate \ upgrade \ userquota \ diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/migration/setup.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/migration/setup.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/migration/setup.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/migration/setup.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -34,10 +34,6 @@ verify_runnable "global" -if ! 
$(is_physical_device $ZFS_DISK) ; then - log_unsupported "Only partitionable physical disks can be used" -fi - case $DISK_COUNT in 0) log_untested "Need at least 1 disk device for test" @@ -50,10 +46,7 @@ ;; esac -set_partition ${ZFSSIDE_DISK##*s} "" $FS_SIZE $ZFS_DISK -set_partition ${NONZFSSIDE_DISK##*s} "" $FS_SIZE $NONZFS_DISK - -create_pool $TESTPOOL "$ZFSSIDE_DISK" +create_pool $TESTPOOL "$ZFS_DISK" $RM -rf $TESTDIR || log_unresolved Could not remove $TESTDIR $MKDIR -p $TESTDIR || log_unresolved Could not create $TESTDIR @@ -64,10 +57,10 @@ $RM -rf $NONZFS_TESTDIR || log_unresolved Could not remove $NONZFS_TESTDIR $MKDIR -p $NONZFS_TESTDIR || log_unresolved Could not create $NONZFS_TESTDIR -$ECHO "y" | $NEWFS -v ${DEV_DSKDIR}/$NONZFSSIDE_DISK +$ECHO "y" | $NEWFS -v ${DEV_DSKDIR}/$NONZFS_DISK (( $? != 0 )) && log_untested "Unable to setup a UFS file system" -log_must $MOUNT ${DEV_DSKDIR}/$NONZFSSIDE_DISK $NONZFS_TESTDIR +log_must $MOUNT ${DEV_DSKDIR}/$NONZFS_DISK $NONZFS_TESTDIR log_pass diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/nopwrite/nopwrite_volume.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/nopwrite/nopwrite_volume.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/nopwrite/nopwrite_volume.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/nopwrite/nopwrite_volume.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -46,7 +46,7 @@ log_must $ZFS set compress=on $origin log_must $ZFS set checksum=sha256 $origin $DD if=/dev/urandom of=$vol bs=8192 count=4096 conv=notrunc >/dev/null \ - 2>&1 || log_fail "dd into $orgin failed." + 2>&1 || log_fail "dd into $origin failed." $ZFS snapshot $origin@a || log_fail "zfs snap failed" log_must $ZFS clone $origin@a $clone log_must $ZFS set compress=on $clone diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/refquota/refquota_005_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/refquota/refquota_005_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/refquota/refquota_005_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/refquota/refquota_005_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -37,7 +37,7 @@ # # STRATEGY: # 1. Setting refquota < quota for parent -# 2. Create file in sub-filesytem, take snapshot and remove the file +# 2. Create file in sub-filesystem, take snapshot and remove the file # 3. Verify sub-filesystem snapshot will not consume refquota # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -37,7 +37,7 @@ # # STRATEGY: # 1. Create volume on filesystem -# 2. Setting quota for parenet filesytem +# 2. Setting quota for parent filesystem # 3. Verify volume refreservation is only limited by volsize # 4. 
Verify volume refreservation can be changed when volsize changed # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rename_dirs/rename_dirs_001_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rename_dirs/rename_dirs_001_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rename_dirs/rename_dirs_001_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rename_dirs/rename_dirs_001_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -33,7 +33,7 @@ # # DESCRIPTION: -# Create two directory trees in ZFS filesystem, and concurently rename +# Create two directory trees in ZFS filesystem, and concurrently rename # directory across the two trees. ZFS should be able to handle the race # situation. # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/reservation/reservation_007_pos.sh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/reservation/reservation_007_pos.sh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/reservation/reservation_007_pos.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/reservation/reservation_007_pos.sh 2017-01-20 18:18:28.000000000 +0000 @@ -75,7 +75,7 @@ # for. # # Any special arguments for create are passed in via the args -# paramater. +# parameter. # function create_resv_destroy { # args1 dataset1 args2 dataset2 diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/reservation/reservation_013_pos.sh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/reservation/reservation_013_pos.sh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/reservation/reservation_013_pos.sh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/reservation/reservation_013_pos.sh 2017-01-20 18:18:28.000000000 +0000 @@ -82,7 +82,7 @@ # When initially created, a regular volume's reservation property is set # equal to its size (unlike a sparse volume), so we don't need to set it -# explictly later on +# explicitly later on log_must $ZFS create -V $resv_set $TESTPOOL/$TESTVOL log_must $ZFS create -s -V $sparse_vol_set_size $TESTPOOL/$TESTVOL2 diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rootpool/rootpool_003_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rootpool/rootpool_003_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rootpool/rootpool_003_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rootpool/rootpool_003_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -47,7 +47,7 @@ # verify_runnable "global" -log_assert "system related filesytems can not be renamed or destroyed" +log_assert "system related filesystems can not be renamed or destroyed" typeset rootpool=$(get_rootpool) typeset rootfs=$(get_rootfs) diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_008_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_008_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_008_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_008_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -36,7 +36,7 @@ # Changes made by 'zfs promote' can be properly received. # # STRATEGY: -# 1. Seperatly promote pool clone, filesystem clone and volume clone. +# 1. Separately promote pool clone, filesystem clone and volume clone. # 2. Recursively backup all the POOL and restore in POOL2 # 3. 
Verify all the datesets and property be properly received. # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -28,7 +28,7 @@ # 2. Mess up the contents of the stream state file on disk # 3. Try ZFS receive, which should fail with a checksum mismatch error # 4. ZFS send to the stream state file again using the receive_resume_token -# 5. ZFS receieve and verify the receive completes successfully +# 5. ZFS receive and verify the receive completes successfully # 6. Repeat steps on an incremental ZFS send # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -28,7 +28,7 @@ # 2. Mess up the contents of the stream state file on disk # 3. Try ZFS receive, which should fail with a checksum mismatch error # 4. ZFS send to the stream state file again using the receive_resume_token -# 5. ZFS receieve and verify the receive completes successfully +# 5. ZFS receive and verify the receive completes successfully # verify_runnable "both" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -29,7 +29,7 @@ # 2. Mess up the contents of the stream state file on disk # 3. Try ZFS receive, which should fail with a checksum mismatch error # 4. ZFS send to the stream state file again using the receive_resume_token -# 5. ZFS receieve and verify the receive completes successfully +# 5. ZFS receive and verify the receive completes successfully # 6. Repeat steps on an incremental ZFS send # diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -33,7 +33,7 @@ # 6. Mess up the contents of the stream state file on disk # 7. Try ZFS receive, which should fail with a checksum mismatch error # 8. ZFS send to the stream state file again using the receive_resume_token -# 9. ZFS receieve and verify the receive completes successfully +# 9. 
ZFS receive and verify the receive completes successfully # verify_runnable "both" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -30,7 +30,7 @@ # 4. Mess up the contents of the stream state file on disk # 5. Try ZFS receive, which should fail with a checksum mismatch error # 6. ZFS send to the stream state file again using the receive_resume_token -# 7. ZFS receieve and verify the receive completes successfully +# 7. ZFS receive and verify the receive completes successfully # verify_runnable "both" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend.kshlib zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend.kshlib --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/rsend/rsend.kshlib 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/rsend/rsend.kshlib 2017-01-20 18:18:28.000000000 +0000 @@ -172,7 +172,7 @@ } # -# Compare all the directores and files in two filesystems +# Compare all the directories and files in two filesystems # # $1 source filesystem # $2 destination filesystem diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -55,7 +55,7 @@ mntpnt=$(get_prop mountpoint $TESTPOOL) # - # Create file in pool to trigger writting in slog devices + # Create file in pool to trigger writing in slog devices # log_must $DD if=/dev/urandom of=$mntpnt/testfile.$$ count=100 diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/threadsappend/threadsappend_001_pos.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/threadsappend/threadsappend_001_pos.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/threadsappend/threadsappend_001_pos.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/threadsappend/threadsappend_001_pos.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -74,7 +74,7 @@ # SIZE=`$LS -l ${TESTDIR}/${TESTFILE} | $AWK '{print $5}'` if [[ $SIZE -ne $FILE_SIZE ]]; then - log_fail "'The length of ${TESTDIR}/${TESTFILE}' doesnt equal 1310720." + log_fail "'The length of ${TESTDIR}/${TESTFILE}' doesn't equal 1310720." fi log_pass "Multiple thread appends succeeded. File size as expected" diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/cleanup.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/cleanup.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/cleanup.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/cleanup.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/.gitignore zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/.gitignore --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/.gitignore 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/.gitignore 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,3 @@ +/tmpfile_test +/tmpfile_001_pos +/tmpfile_002_pos diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/Makefile.am zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/Makefile.am --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/Makefile.am 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/Makefile.am 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,15 @@ +include $(top_srcdir)/config/Rules.am + +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/tmpfile + +dist_pkgdata_SCRIPTS = \ + cleanup.ksh \ + setup.ksh + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/tmpfile + +pkgexec_PROGRAMS = tmpfile_test tmpfile_001_pos tmpfile_002_pos tmpfile_003_pos +tmpfile_test_SOURCES= tmpfile_test.c +tmpfile_001_pos_SOURCES = tmpfile_001_pos.c +tmpfile_002_pos_SOURCES = tmpfile_002_pos.c +tmpfile_003_pos_SOURCES = tmpfile_003_pos.c diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/setup.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/setup.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/setup.ksh 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/setup.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,39 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. 
+# + +. $STF_SUITE/include/libtest.shlib + +if ! $STF_SUITE/tests/functional/tmpfile/tmpfile_test /tmp; then + log_unsupported "The kernel doesn't support O_TMPFILE." +fi + +DISK=${DISKS%% *} +default_setup $DISK diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* backward compat in case it's not defined */ +#ifndef O_TMPFILE +#define O_TMPFILE (020000000|O_DIRECTORY) +#endif + +/* + * DESCRIPTION: + * Verify we can create tmpfile. + * + * STRATEGY: + * 1. open(2) with O_TMPFILE. + * 2. write(2) random data to it, then read(2) and compare. + * 3. fsetxattr(2) random data, then fgetxattr(2) and compare. + * 4. Verify the above operations run successfully. + * + */ + +#define BSZ 64 + +void +fill_random(char *buf, int len) +{ + int i; + srand(time(NULL)); + for (i = 0; i < len; i++) { + buf[i] = (char)rand(); + } +} + +int +main(int argc, char *argv[]) +{ + int i, fd; + char buf1[BSZ], buf2[BSZ] = {}; + char *penv[] = {"TESTDIR"}; + + (void) fprintf(stdout, "Verify O_TMPFILE is working properly.\n"); + + /* + * Get the environment variable values. + */ + for (i = 0; i < sizeof (penv) / sizeof (char *); i++) { + if ((penv[i] = getenv(penv[i])) == NULL) { + (void) fprintf(stderr, "getenv(penv[%d])\n", i); + exit(1); + } + } + + fill_random(buf1, BSZ); + + fd = open(penv[0], O_RDWR|O_TMPFILE, 0666); + if (fd < 0) { + perror("open"); + exit(2); + } + + if (write(fd, buf1, BSZ) < 0) { + perror("write"); + close(fd); + exit(3); + } + + if (pread(fd, buf2, BSZ, 0) < 0) { + perror("pread"); + close(fd); + exit(4); + } + + if (memcmp(buf1, buf2, BSZ) != 0) { + fprintf(stderr, "data corrupted\n"); + close(fd); + exit(5); + } + + memset(buf2, 0, BSZ); + + if (fsetxattr(fd, "user.test", buf1, BSZ, 0) < 0) { + perror("fsetxattr"); + close(fd); + exit(6); + } + + if (fgetxattr(fd, "user.test", buf2, BSZ) < 0) { + perror("fgetxattr"); + close(fd); + exit(7); + } + + if (memcmp(buf1, buf2, BSZ) != 0) { + fprintf(stderr, "xattr corrupted\n"); + close(fd); + exit(8); + } + + close(fd); + + return (0); +} diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/tmpfile_002_pos.c zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/tmpfile_002_pos.c --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/tmpfile_002_pos.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/tmpfile_002_pos.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,98 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* backward compat in case it's not defined */ +#ifndef O_TMPFILE +#define O_TMPFILE (020000000|O_DIRECTORY) +#endif + +/* + * DESCRIPTION: + * Verify we can link tmpfile. + * + * STRATEGY: + * 1. open(2) with O_TMPFILE. + * 2. linkat(2). + * 3. freeze the pool, export and re-import the pool. + * 3. stat(2) the path to verify it has been created. 
+ * + */ + +int +main(int argc, char *argv[]) +{ + int i, fd, ret; + char spath[1024], dpath[1024]; + char *penv[] = {"TESTDIR", "TESTFILE0"}; + struct stat sbuf; + + (void) fprintf(stdout, "Verify O_TMPFILE file can be linked.\n"); + + /* + * Get the environment variable values. + */ + for (i = 0; i < sizeof (penv) / sizeof (char *); i++) { + if ((penv[i] = getenv(penv[i])) == NULL) { + (void) fprintf(stderr, "getenv(penv[%d])\n", i); + exit(1); + } + } + + fd = open(penv[0], O_RDWR|O_TMPFILE, 0666); + if (fd < 0) { + perror("open"); + exit(2); + } + + snprintf(spath, 1024, "/proc/self/fd/%d", fd); + snprintf(dpath, 1024, "%s/%s", penv[0], penv[1]); + if (linkat(AT_FDCWD, spath, AT_FDCWD, dpath, AT_SYMLINK_FOLLOW) < 0) { + perror("linkat"); + close(fd); + exit(3); + } + + if ((ret = system("sudo zpool freeze $TESTPOOL"))) { + if (ret == -1) + perror("system \"zpool freeze\""); + else + fprintf(stderr, "zpool freeze exits with %d\n", + WEXITSTATUS(ret)); + exit(4); + } + + close(fd); + + if ((ret = system("sudo zpool export $TESTPOOL"))) { + if (ret == -1) + perror("system \"zpool export\""); + else + fprintf(stderr, "zpool export exits with %d\n", + WEXITSTATUS(ret)); + exit(4); + } + + if ((ret = system("sudo zpool import $TESTPOOL"))) { + if (ret == -1) + perror("system \"zpool import\""); + else + fprintf(stderr, "zpool import exits with %d\n", + WEXITSTATUS(ret)); + exit(4); + } + + if (stat(dpath, &sbuf) < 0) { + perror("stat"); + unlink(dpath); + exit(5); + } + unlink(dpath); + + return (0); +} diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/tmpfile_003_pos.c zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/tmpfile_003_pos.c --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/tmpfile_003_pos.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/tmpfile_003_pos.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* backward compat in case it's not defined */ +#ifndef O_TMPFILE +#define O_TMPFILE (020000000|O_DIRECTORY) +#endif + +/* + * DESCRIPTION: + * Verify O_EXCL tmpfile cannot be linked. + * + * STRATEGY: + * 1. open(2) with O_TMPFILE|O_EXCL. + * 2. linkat(2). + * 3. stat(2) the path to verify it wasn't created. + * + */ + +int +main(int argc, char *argv[]) +{ + int i, fd; + char spath[1024], dpath[1024]; + char *penv[] = {"TESTDIR", "TESTFILE0"}; + struct stat sbuf; + + (void) fprintf(stdout, "Verify O_EXCL tmpfile cannot be linked.\n"); + + /* + * Get the environment variable values. 
+ */ + for (i = 0; i < sizeof (penv) / sizeof (char *); i++) { + if ((penv[i] = getenv(penv[i])) == NULL) { + (void) fprintf(stderr, "getenv(penv[%d])\n", i); + exit(1); + } + } + + fd = open(penv[0], O_RDWR|O_TMPFILE|O_EXCL, 0666); + if (fd < 0) { + perror("open"); + exit(2); + } + + snprintf(spath, 1024, "/proc/self/fd/%d", fd); + snprintf(dpath, 1024, "%s/%s", penv[0], penv[1]); + if (linkat(AT_FDCWD, spath, AT_FDCWD, dpath, AT_SYMLINK_FOLLOW) == 0) { + fprintf(stderr, "linkat returns successfully\n"); + close(fd); + exit(3); + } + + if (stat(dpath, &sbuf) == 0) { + fprintf(stderr, "stat returns successfully\n"); + close(fd); + exit(4); + } + close(fd); + + return (0); +} diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/tmpfile_test.c zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/tmpfile_test.c --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/functional/tmpfile/tmpfile_test.c 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/functional/tmpfile/tmpfile_test.c 2017-01-20 18:18:28.000000000 +0000 @@ -0,0 +1,52 @@ +#include +#include +#include +#include +#include +#include + +/* backward compat in case it's not defined */ +#ifndef O_TMPFILE +#define O_TMPFILE (020000000|O_DIRECTORY) +#endif + +/* + * DESCRIPTION: + * Check if the kernel support O_TMPFILE. + */ + +int +main(int argc, char *argv[]) +{ + int fd; + struct stat buf; + + if (argc < 2) { + fprintf(stderr, "Usage: %s dir\n", argv[0]); + return (2); + } + if (stat(argv[1], &buf) < 0) { + perror("stat"); + return (2); + } + if (!S_ISDIR(buf.st_mode)) { + fprintf(stderr, "\"%s\" is not a directory\n", argv[1]); + return (2); + } + + fd = open(argv[1], O_TMPFILE | O_WRONLY, 0666); + if (fd < 0) { + /* + * Only fail on EISDIR. If we get EOPNOTSUPP, that means + * kernel support O_TMPFILE, but the path at argv[1] doesn't. 
+ */ + if (errno == EISDIR) { + fprintf(stderr, "kernel doesn't support O_TMPFILE\n"); + return (1); + } + perror("open"); + } else { + close(fd); + } + return (0); +} diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/random_reads.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/random_reads.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/random_reads.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/random_reads.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -31,9 +31,15 @@ function cleanup { - log_must $ZFS destroy $TESTFS + # kill fio and iostat + $PKILL ${FIO##*/} + $PKILL ${IOSTAT##*/} + log_must_busy $ZFS destroy $TESTFS + log_must_busy $ZPOOL destroy $PERFPOOL } +trap "log_fail \"Measure IO stats during random read load\"" SIGTERM + log_assert "Measure IO stats during random read load" log_onexit cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/random_readwrite.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/random_readwrite.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/random_readwrite.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/random_readwrite.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -31,9 +31,15 @@ function cleanup { - log_must $ZFS destroy $TESTFS + # kill fio and iostat + $PKILL ${FIO##*/} + $PKILL ${IOSTAT##*/} + log_must_busy $ZFS destroy $TESTFS + log_must_busy $ZPOOL destroy $PERFPOOL } +trap "log_fail \"Measure IO stats during random read load\"" SIGTERM + log_assert "Measure IO stats during random read-write load" log_onexit cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/random_writes.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/random_writes.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/random_writes.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/random_writes.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -30,9 +30,15 @@ function cleanup { - log_must $ZFS destroy $TESTFS + # kill fio and iostat + $PKILL ${FIO##*/} + $PKILL ${IOSTAT##*/} + log_must_busy $ZFS destroy $TESTFS + log_must_busy $ZPOOL destroy $PERFPOOL } +trap "log_fail \"Measure IO stats during random read load\"" SIGTERM + log_assert "Measure IO stats during random write load" log_onexit cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -36,9 +36,15 @@ function cleanup { - log_must $ZFS destroy $TESTFS + # kill fio and iostat + $PKILL ${FIO##*/} + $PKILL ${IOSTAT##*/} + log_must_busy $ZFS destroy $TESTFS + log_must_busy $ZPOOL destroy $PERFPOOL } +trap "log_fail \"Measure IO stats during random read load\"" SIGTERM + log_assert "Measure IO stats during sequential read load" log_onexit cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh 
2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -30,9 +30,15 @@ function cleanup { - log_must $ZFS destroy $TESTFS + # kill fio and iostat + $PKILL ${FIO##*/} + $PKILL ${IOSTAT##*/} + log_must_busy $ZFS destroy $TESTFS + log_must_busy $ZPOOL destroy $PERFPOOL } +trap "log_fail \"Measure IO stats during random read load\"" SIGTERM + log_assert "Measure IO stats during sequential read load" log_onexit cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/sequential_reads.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/sequential_reads.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/sequential_reads.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/sequential_reads.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -31,9 +31,15 @@ function cleanup { - log_must $ZFS destroy $TESTFS + # kill fio and iostat + $PKILL ${FIO##*/} + $PKILL ${IOSTAT##*/} + log_must_busy $ZFS destroy $TESTFS + log_must_busy $ZPOOL destroy $PERFPOOL } +trap "log_fail \"Measure IO stats during random read load\"" SIGTERM + log_assert "Measure IO stats during sequential read load" log_onexit cleanup diff -Nru zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/sequential_writes.ksh zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/sequential_writes.ksh --- zfs-linux-0.7.0-rc2/tests/zfs-tests/tests/perf/regression/sequential_writes.ksh 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/tests/zfs-tests/tests/perf/regression/sequential_writes.ksh 2017-01-20 18:18:28.000000000 +0000 @@ -33,9 +33,15 @@ function cleanup { - log_must $ZFS destroy $TESTFS + # kill fio and iostat + $PKILL ${FIO##*/} + $PKILL ${IOSTAT##*/} + log_must_busy $ZFS destroy $TESTFS + log_must_busy $ZPOOL destroy $PERFPOOL } +trap "log_fail \"Measure IO stats during random read load\"" SIGTERM + export TESTFS=$PERFPOOL/testfs recreate_perfpool log_must $ZFS create $PERF_FS_OPTS $TESTFS diff -Nru zfs-linux-0.7.0-rc2/udev/rules.d/60-zvol.rules.in zfs-linux-0.7.0-rc3/udev/rules.d/60-zvol.rules.in --- zfs-linux-0.7.0-rc2/udev/rules.d/60-zvol.rules.in 2016-10-26 17:36:33.000000000 +0000 +++ zfs-linux-0.7.0-rc3/udev/rules.d/60-zvol.rules.in 2017-01-20 18:18:28.000000000 +0000 @@ -1,6 +1,6 @@ # Persistent links for zvol # # persistent disk links: /dev/zvol/dataset_name -# also creates compatibilty symlink of /dev/dataset_name +# also creates compatibility symlink of /dev/dataset_name KERNEL=="zd*" SUBSYSTEM=="block" ACTION=="add|change" PROGRAM="@udevdir@/zvol_id $tempnode" SYMLINK+="zvol/%c %c"
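Finally, the rsend_019 through rsend_024 strategies above describe the resumable-receive workflow only in prose. A rough ksh sketch of that flow, with hypothetical pool names and stream paths, and no claim to match the tests' exact commands:

    # A receive that dies partway (e.g. on a corrupted stream) fails with a
    # checksum mismatch, but -s preserves the partial receive state on disk.
    $ZFS receive -s $POOL2/fs < /backup/stream           # fails mid-way
    token=$($ZFS get -H -o value receive_resume_token $POOL2/fs)
    # Regenerate just the missing tail of the stream from the token...
    $ZFS send -t $token > /backup/stream.resume
    # ...and the second receive completes successfully.
    $ZFS receive -s $POOL2/fs < /backup/stream.resume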