diff -Nru fio-2.1.3/.appveyor.yml fio-3.16/.appveyor.yml --- fio-2.1.3/.appveyor.yml 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/.appveyor.yml 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,30 @@ +clone_depth: 1 # NB: this stops FIO-VERSION-GEN making tag based versions + +environment: + CYG_MIRROR: http://cygwin.mirror.constant.com + CYG_ROOT: C:\cygwin64 + MAKEFLAGS: -j 2 + matrix: + - platform: x64 + PACKAGE_ARCH: x86_64 + CONFIGURE_OPTIONS: + - platform: x86 + PACKAGE_ARCH: i686 + CONFIGURE_OPTIONS: --build-32bit-win --target-win-ver=xp + +install: + - '%CYG_ROOT%\setup-x86_64.exe --quiet-mode --no-shortcuts --only-site --site "%CYG_MIRROR%" --packages "mingw64-%PACKAGE_ARCH%-zlib" > NUL' + - SET PATH=%CYG_ROOT%\bin;%PATH% # NB: Changed env variables persist to later sections + +build_script: + - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && ./configure --disable-native --extra-cflags=\"-Werror\" ${CONFIGURE_OPTIONS} && make.exe' + +after_build: + - cd os\windows && dobuild.cmd %PLATFORM% + +test_script: + - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && file.exe fio.exe && make.exe test' + +artifacts: + - path: os\windows\*.msi + name: msi diff -Nru fio-2.1.3/arch/arch-aarch64.h fio-3.16/arch/arch-aarch64.h --- fio-2.1.3/arch/arch-aarch64.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/arch/arch-aarch64.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,30 @@ +#ifndef ARCH_AARCH64_H +#define ARCH_AARCH64_H + +#include +#include +#include +#include + +#define FIO_ARCH (arch_aarch64) + +#define nop do { __asm__ __volatile__ ("yield"); } while (0) +#define read_barrier() do { __sync_synchronize(); } while (0) +#define write_barrier() do { __sync_synchronize(); } while (0) + +static inline int arch_ffz(unsigned long bitmask) +{ + unsigned long count, reversed_bits; + if (~bitmask == 0) /* ffz() in lib/ffz.h does this. 
*/ + return 63; + + __asm__ __volatile__ ("rbit %1, %2\n" + "clz %0, %1\n" : + "=r"(count), "=&r"(reversed_bits) : + "r"(~bitmask)); + return count; +} + +#define ARCH_HAVE_FFZ + +#endif diff -Nru fio-2.1.3/arch/arch-alpha.h fio-3.16/arch/arch-alpha.h --- fio-2.1.3/arch/arch-alpha.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-alpha.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,21 +3,6 @@ #define FIO_ARCH (arch_alpha) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 442 -#define __NR_ioprio_get 443 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 413 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 468 -#define __NR_sys_tee 470 -#define __NR_sys_vmsplice 471 -#endif - #define nop do { } while (0) #define read_barrier() __asm__ __volatile__("mb": : :"memory") #define write_barrier() __asm__ __volatile__("wmb": : :"memory") diff -Nru fio-2.1.3/arch/arch-arm.h fio-3.16/arch/arch-arm.h --- fio-2.1.3/arch/arch-arm.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-arm.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,31 +3,20 @@ #define FIO_ARCH (arch_arm) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 314 -#define __NR_ioprio_get 315 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 270 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 340 -#define __NR_sys_tee 342 -#define __NR_sys_vmsplice 343 -#endif - #if defined (__ARM_ARCH_4__) || defined (__ARM_ARCH_4T__) \ - || defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || defined (__ARM_ARCH_5TE__) || defined (__ARM_ARCH_5TEJ__) \ - || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) + || defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || defined (__ARM_ARCH_5E__)\ + || defined (__ARM_ARCH_5TE__) || defined (__ARM_ARCH_5TEJ__) \ + || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6KZ__) || 
defined(__ARM_ARCH_6K__) #define nop __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t") #define read_barrier() __asm__ __volatile__ ("" : : : "memory") #define write_barrier() __asm__ __volatile__ ("" : : : "memory") -#elif defined(__ARM_ARCH_7A__) +#elif defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A__) #define nop __asm__ __volatile__ ("nop") #define read_barrier() __sync_synchronize() #define write_barrier() __sync_synchronize() +#else +#error "unsupported ARM architecture" #endif #endif diff -Nru fio-2.1.3/arch/arch.h fio-3.16/arch/arch.h --- fio-2.1.3/arch/arch.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,9 +1,11 @@ #ifndef ARCH_H #define ARCH_H +#include "../lib/types.h" + enum { arch_x86_64 = 1, - arch_i386, + arch_x86, arch_ppc, arch_ia64, arch_s390, @@ -14,6 +16,7 @@ arch_sh, arch_hppa, arch_mips, + arch_aarch64, arch_generic, @@ -29,6 +32,9 @@ extern unsigned long arch_flags; +#define ARCH_CPU_CLOCK_WRAPS + +/* IWYU pragma: begin_exports */ #if defined(__i386__) #include "arch-x86.h" #elif defined(__x86_64__) @@ -53,16 +59,15 @@ #include "arch-sh.h" #elif defined(__hppa__) #include "arch-hppa.h" +#elif defined(__aarch64__) +#include "arch-aarch64.h" #else #warning "Unknown architecture, attempting to use generic model." 
#include "arch-generic.h" #endif -#ifdef ARCH_HAVE_FFZ -#define ffz(bitmask) arch_ffz(bitmask) -#else #include "../lib/ffz.h" -#endif +/* IWYU pragma: end_exports */ #ifndef ARCH_HAVE_INIT static inline int arch_init(char *envp[]) diff -Nru fio-2.1.3/arch/arch-hppa.h fio-3.16/arch/arch-hppa.h --- fio-2.1.3/arch/arch-hppa.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-hppa.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,21 +3,6 @@ #define FIO_ARCH (arch_hppa) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 267 -#define __NR_ioprio_get 268 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 236 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 291 -#define __NR_sys_tee 293 -#define __NR_sys_vmsplice 294 -#endif - #define nop do { } while (0) #define read_barrier() __asm__ __volatile__ ("" : : : "memory") diff -Nru fio-2.1.3/arch/arch-ia64.h fio-3.16/arch/arch-ia64.h --- fio-2.1.3/arch/arch-ia64.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-ia64.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,21 +3,6 @@ #define FIO_ARCH (arch_ia64) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 1274 -#define __NR_ioprio_get 1275 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 1234 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 1297 -#define __NR_sys_tee 1301 -#define __NR_sys_vmsplice 1302 -#endif - #define nop asm volatile ("hint @pause" ::: "memory"); #define read_barrier() asm volatile ("mf" ::: "memory") #define write_barrier() asm volatile ("mf" ::: "memory") @@ -43,10 +28,10 @@ } #define ARCH_HAVE_INIT -extern int tsc_reliable; +extern bool tsc_reliable; static inline int arch_init(char *envp[]) { - tsc_reliable = 1; + tsc_reliable = true; return 0; } diff -Nru fio-2.1.3/arch/arch-mips.h fio-3.16/arch/arch-mips.h --- fio-2.1.3/arch/arch-mips.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-mips.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,21 +3,6 @@ #define FIO_ARCH (arch_mips) -#ifndef __NR_ioprio_set 
-#define __NR_ioprio_set 314 -#define __NR_ioprio_get 315 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 215 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 263 -#define __NR_sys_tee 265 -#define __NR_sys_vmsplice 266 -#endif - #define read_barrier() __asm__ __volatile__("": : :"memory") #define write_barrier() __asm__ __volatile__("": : :"memory") #define nop __asm__ __volatile__("": : :"memory") diff -Nru fio-2.1.3/arch/arch-ppc.h fio-3.16/arch/arch-ppc.h --- fio-2.1.3/arch/arch-ppc.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-ppc.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,5 +1,5 @@ #ifndef ARCH_PPC_H -#define ARCH_PPH_H +#define ARCH_PPC_H #include #include @@ -8,21 +8,6 @@ #define FIO_ARCH (arch_ppc) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 273 -#define __NR_ioprio_get 274 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 233 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 283 -#define __NR_sys_tee 284 -#define __NR_sys_vmsplice 285 -#endif - #define nop do { } while (0) #ifdef __powerpc64__ @@ -33,18 +18,24 @@ #define write_barrier() __asm__ __volatile__ ("sync" : : : "memory") +#ifdef __powerpc64__ +#define PPC_CNTLZL "cntlzd" +#else +#define PPC_CNTLZL "cntlzw" +#endif + static inline int __ilog2(unsigned long bitmask) { int lz; - asm ("cntlzw %0,%1" : "=r" (lz) : "r" (bitmask)); - return 31 - lz; + asm (PPC_CNTLZL " %0,%1" : "=r" (lz) : "r" (bitmask)); + return BITS_PER_LONG - 1 - lz; } static inline int arch_ffz(unsigned long bitmask) { if ((bitmask = ~bitmask) == 0) - return 32; + return BITS_PER_LONG; return __ilog2(bitmask & -bitmask); } @@ -61,6 +52,22 @@ #define SPRN_ATBL 0x20E /* Alternate Time Base Lower */ #define SPRN_ATBU 0x20F /* Alternate Time Base Upper */ +#ifdef __powerpc64__ +static inline unsigned long long get_cpu_clock(void) +{ + unsigned long long rval; + + asm volatile( + "90: mfspr %0, %1;\n" + " cmpwi %0,0;\n" + " beq- 90b;\n" + : "=r" (rval) + : "i" (SPRN_TBRL) + : 
"cr0"); + + return rval; +} +#else static inline unsigned long long get_cpu_clock(void) { unsigned int tbl, tbu0, tbu1; @@ -81,7 +88,9 @@ ret = (((unsigned long long)tbu0) << 32) | tbl; return ret; } +#endif +#if 0 static void atb_child(void) { arch_flags |= ARCH_FLAG_1; @@ -106,14 +115,17 @@ arch_flags |= ARCH_FLAG_1; } } +#endif #define ARCH_HAVE_INIT -extern int tsc_reliable; +extern bool tsc_reliable; static inline int arch_init(char *envp[]) { - tsc_reliable = 1; +#if 0 + tsc_reliable = true; atb_clocktest(); +#endif return 0; } @@ -126,4 +138,12 @@ * #define ARCH_HAVE_CPU_CLOCK */ +/* + * Let's have it defined for ppc64 + */ + +#ifdef __powerpc64__ +#define ARCH_HAVE_CPU_CLOCK +#endif + #endif diff -Nru fio-2.1.3/arch/arch-s390.h fio-3.16/arch/arch-s390.h --- fio-2.1.3/arch/arch-s390.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-s390.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,22 +3,7 @@ #define FIO_ARCH (arch_s390) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 282 -#define __NR_ioprio_get 283 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 253 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 306 -#define __NR_sys_tee 308 -#define __NR_sys_vmsplice 309 -#endif - -#define nop asm volatile ("diag 0,0,68" : : : "memory") +#define nop asm volatile("nop" : : : "memory") #define read_barrier() asm volatile("bcr 15,0" : : : "memory") #define write_barrier() asm volatile("bcr 15,0" : : : "memory") @@ -26,18 +11,28 @@ { unsigned long long clk; +#ifdef CONFIG_S390_Z196_FACILITIES + /* + * Fio needs monotonic (never lower), but not strict monotonic (never + * the same) so store clock fast is enough. 
+ */ + __asm__ __volatile__("stckf %0" : "=Q" (clk) : : "cc"); +#else __asm__ __volatile__("stck %0" : "=Q" (clk) : : "cc"); - return clk; +#endif + return clk>>12; } +#define ARCH_CPU_CLOCK_CYCLES_PER_USEC 1 +#define ARCH_HAVE_CPU_CLOCK +#undef ARCH_CPU_CLOCK_WRAPS + #define ARCH_HAVE_INIT -extern int tsc_reliable; +extern bool tsc_reliable; static inline int arch_init(char *envp[]) { - tsc_reliable = 1; + tsc_reliable = true; return 0; } -#define ARCH_HAVE_CPU_CLOCK - #endif diff -Nru fio-2.1.3/arch/arch-sh.h fio-3.16/arch/arch-sh.h --- fio-2.1.3/arch/arch-sh.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-sh.h 2019-09-20 01:01:52.000000000 +0000 @@ -5,21 +5,6 @@ #define FIO_ARCH (arch_sh) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 288 -#define __NR_ioprio_get 289 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 250 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 313 -#define __NR_sys_tee 315 -#define __NR_sys_vmsplice 316 -#endif - #define nop __asm__ __volatile__ ("nop": : :"memory") #define mb() \ diff -Nru fio-2.1.3/arch/arch-sparc64.h fio-3.16/arch/arch-sparc64.h --- fio-2.1.3/arch/arch-sparc64.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-sparc64.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,21 +3,6 @@ #define FIO_ARCH (arch_sparc64) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 196 -#define __NR_ioprio_get 218 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 209 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 232 -#define __NR_sys_tee 280 -#define __NR_sys_vmsplice 25 -#endif - #define nop do { } while (0) #define membar_safe(type) \ diff -Nru fio-2.1.3/arch/arch-sparc.h fio-3.16/arch/arch-sparc.h --- fio-2.1.3/arch/arch-sparc.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-sparc.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,21 +3,6 @@ #define FIO_ARCH (arch_sparc) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 196 -#define __NR_ioprio_get 218 -#endif - -#ifndef 
__NR_fadvise64 -#define __NR_fadvise64 209 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 232 -#define __NR_sys_tee 280 -#define __NR_sys_vmsplice 25 -#endif - #define nop do { } while (0) #define read_barrier() __asm__ __volatile__ ("" : : : "memory") diff -Nru fio-2.1.3/arch/arch-x86_64.h fio-3.16/arch/arch-x86_64.h --- fio-2.1.3/arch/arch-x86_64.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-x86_64.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,46 +1,24 @@ -#ifndef ARCH_X86_64_h -#define ARCH_X86_64_h +#ifndef ARCH_X86_64_H +#define ARCH_X86_64_H static inline void do_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { asm volatile("cpuid" - : "=a" (*eax), "=b" (*ebx), "=r" (*ecx), "=d" (*edx) + : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (*eax), "2" (*ecx) : "memory"); } -#include "arch-x86-common.h" +#include "arch-x86-common.h" /* IWYU pragma: export */ #define FIO_ARCH (arch_x86_64) -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 251 -#define __NR_ioprio_get 252 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 221 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 275 -#define __NR_sys_tee 276 -#define __NR_sys_vmsplice 278 -#endif - -#ifndef __NR_shmget -#define __NR_shmget 29 -#define __NR_shmat 30 -#define __NR_shmctl 31 -#define __NR_shmdt 67 -#endif - #define FIO_HUGE_PAGE 2097152 #define nop __asm__ __volatile__("rep;nop": : :"memory") -#define read_barrier() __asm__ __volatile__("lfence":::"memory") -#define write_barrier() __asm__ __volatile__("sfence":::"memory") +#define read_barrier() __asm__ __volatile__("":::"memory") +#define write_barrier() __asm__ __volatile__("":::"memory") static inline unsigned long arch_ffz(unsigned long bitmask) { @@ -60,4 +38,34 @@ #define ARCH_HAVE_SSE4_2 #define ARCH_HAVE_CPU_CLOCK +#define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" +#define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8" +#define RDRAND_RETRY 100 + +static inline int 
arch_rand_long(unsigned long *val) +{ + int ok; + + asm volatile("1: " RDRAND_LONG "\n\t" + "jc 2f\n\t" + "decl %0\n\t" + "jnz 1b\n\t" + "2:" + : "=r" (ok), "=a" (*val) + : "0" (RDRAND_RETRY)); + + return ok; +} + +static inline int arch_rand_seed(unsigned long *seed) +{ + unsigned char ok; + + asm volatile(RDSEED_LONG "\n\t" + "setc %0" + : "=qm" (ok), "=a" (*seed)); + + return 0; +} + #endif diff -Nru fio-2.1.3/arch/arch-x86-common.h fio-3.16/arch/arch-x86-common.h --- fio-2.1.3/arch/arch-x86-common.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-x86-common.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,6 +3,16 @@ #include +#ifndef __NR_sys_io_uring_setup +#define __NR_sys_io_uring_setup 425 +#endif +#ifndef __NR_sys_io_uring_enter +#define __NR_sys_io_uring_enter 426 +#endif +#ifndef __NR_sys_io_uring_register +#define __NR_sys_io_uring_register 427 +#endif + static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) @@ -13,10 +23,12 @@ } #define ARCH_HAVE_INIT +#define ARCH_HAVE_IOURING -extern int tsc_reliable; +extern bool tsc_reliable; +extern int arch_random; -static inline int arch_init_intel(unsigned int level) +static inline void arch_init_intel(void) { unsigned int eax, ebx, ecx = 0, edx; @@ -26,46 +38,51 @@ eax = 1; do_cpuid(&eax, &ebx, &ecx, &edx); if (!(edx & (1U << 4))) - return 0; + return; /* * Check for constant rate and synced (across cores) TSC */ eax = 0x80000007; do_cpuid(&eax, &ebx, &ecx, &edx); - return edx & (1U << 8); + tsc_reliable = (edx & (1U << 8)) != 0; + + /* + * Check for FDRAND + */ + eax = 0x1; + do_cpuid(&eax, &ebx, &ecx, &edx); + arch_random = (ecx & (1U << 30)) != 0; } -static inline int arch_init_amd(unsigned int level) +static inline void arch_init_amd(void) { unsigned int eax, ebx, ecx, edx; cpuid(0x80000000, &eax, &ebx, &ecx, &edx); if (eax < 0x80000007) - return 0; + return; cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - if (edx & (1 << 8)) - return 1; - - 
return 0; + tsc_reliable = (edx & (1U << 8)) != 0; } -static inline int arch_init(char *envp[]) +static inline void arch_init(char *envp[]) { unsigned int level; - char str[12]; + char str[13]; + + arch_random = tsc_reliable = 0; cpuid(0, &level, (unsigned int *) &str[0], (unsigned int *) &str[8], (unsigned int *) &str[4]); + str[12] = '\0'; if (!strcmp(str, "GenuineIntel")) - tsc_reliable = arch_init_intel(level); - else if (!strcmp(str, "AuthenticAMD")) - tsc_reliable = arch_init_amd(level); - - return 0; + arch_init_intel(); + else if (!strcmp(str, "AuthenticAMD") || !strcmp(str, "HygonGenuine")) + arch_init_amd(); } #endif diff -Nru fio-2.1.3/arch/arch-x86.h fio-3.16/arch/arch-x86.h --- fio-2.1.3/arch/arch-x86.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/arch/arch-x86.h 2019-09-20 01:01:52.000000000 +0000 @@ -10,24 +10,9 @@ : "memory"); } -#include "arch-x86-common.h" +#include "arch-x86-common.h" /* IWYU pragma: export */ -#define FIO_ARCH (arch_i386) - -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 289 -#define __NR_ioprio_get 290 -#endif - -#ifndef __NR_fadvise64 -#define __NR_fadvise64 250 -#endif - -#ifndef __NR_sys_splice -#define __NR_sys_splice 313 -#define __NR_sys_tee 315 -#define __NR_sys_vmsplice 316 -#endif +#define FIO_ARCH (arch_x86) #define FIO_HUGE_PAGE 4194304 diff -Nru fio-2.1.3/backend.c fio-3.16/backend.c --- fio-2.1.3/backend.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/backend.c 2019-09-20 01:01:52.000000000 +0000 @@ -18,49 +18,43 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* */ #include -#include #include -#include #include -#include -#include #include -#include #include #include #include -#include -#include +#include +#include #include "fio.h" -#ifndef FIO_NO_HAVE_SHM_H -#include -#endif -#include "hash.h" #include "smalloc.h" #include "verify.h" -#include "trim.h" #include "diskutil.h" #include "cgroup.h" #include "profile.h" #include "lib/rand.h" -#include "memalign.h" +#include "lib/memalign.h" #include "server.h" #include "lib/getrusage.h" #include "idletime.h" +#include "err.h" +#include "workqueue.h" +#include "lib/mountcheck.h" +#include "rate-submit.h" +#include "helper_thread.h" +#include "pshared.h" +#include "zone-dist.h" -static pthread_t disk_util_thread; -static struct fio_mutex *disk_thread_mutex; -static struct fio_mutex *startup_mutex; -static struct fio_mutex *writeout_mutex; +static struct fio_sem *startup_sem; static struct flist_head *cgroup_list; -static char *cgroup_mnt; +static struct cgroup_mnt *cgroup_mnt; static int exit_value; -static volatile int fio_abort; +static volatile bool fio_abort; static unsigned int nr_process = 0; static unsigned int nr_thread = 0; @@ -72,10 +66,7 @@ int shm_id = 0; int temp_stall_ts; unsigned long done_secs = 0; -volatile int disk_util_exit = 0; - -#define PAGE_ALIGN(buf) \ - (char *) (((uintptr_t) (buf) + page_mask) & ~page_mask) +pthread_mutex_t overlap_check = PTHREAD_MUTEX_INITIALIZER; #define JOB_START_TIMEOUT (5 * 1000) @@ -86,7 +77,7 @@ fio_server_got_signal(sig); else { log_info("\nfio: terminating on signal %d\n", sig); - fflush(stdout); + log_info_flush(); exit_value = 128; } @@ -94,7 +85,7 @@ } } -static void sig_show_status(int sig) +void sig_show_status(int sig) { show_running_run_stats(); } @@ -137,8 +128,8 @@ /* * Check if we are above the minimum rate given. 
*/ -static int __check_min_rate(struct thread_data *td, struct timeval *now, - enum fio_ddir ddir) +static bool __check_min_rate(struct thread_data *td, struct timespec *now, + enum fio_ddir ddir) { unsigned long long bytes = 0; unsigned long iops = 0; @@ -151,13 +142,13 @@ assert(ddir_rw(ddir)); if (!td->o.ratemin[ddir] && !td->o.rate_iops_min[ddir]) - return 0; + return false; /* * allow a 2 second settle period in the beginning */ if (mtime_since(&td->start, now) < 2000) - return 0; + return false; iops += td->this_io_blocks[ddir]; bytes += td->this_io_bytes[ddir]; @@ -171,24 +162,27 @@ if (td->rate_bytes[ddir] || td->rate_blocks[ddir]) { spent = mtime_since(&td->lastrate[ddir], now); if (spent < td->o.ratecycle) - return 0; + return false; - if (td->o.rate[ddir]) { + if (td->o.rate[ddir] || td->o.ratemin[ddir]) { /* * check bandwidth specified rate */ if (bytes < td->rate_bytes[ddir]) { - log_err("%s: min rate %u not met\n", td->o.name, - ratemin); - return 1; + log_err("%s: rate_min=%uB/s not met, only transferred %lluB\n", + td->o.name, ratemin, bytes); + return true; } else { - rate = ((bytes - td->rate_bytes[ddir]) * 1000) / spent; + if (spent) + rate = ((bytes - td->rate_bytes[ddir]) * 1000) / spent; + else + rate = 0; + if (rate < ratemin || bytes < td->rate_bytes[ddir]) { - log_err("%s: min rate %u not met, got" - " %luKB/sec\n", td->o.name, - ratemin, rate); - return 1; + log_err("%s: rate_min=%uB/s not met, got %luB/s\n", + td->o.name, ratemin, rate); + return true; } } } else { @@ -196,16 +190,20 @@ * checks iops specified rate */ if (iops < rate_iops) { - log_err("%s: min iops rate %u not met\n", - td->o.name, rate_iops); - return 1; + log_err("%s: rate_iops_min=%u not met, only performed %lu IOs\n", + td->o.name, rate_iops, iops); + return true; } else { - rate = ((iops - td->rate_blocks[ddir]) * 1000) / spent; + if (spent) + rate = ((iops - td->rate_blocks[ddir]) * 1000) / spent; + else + rate = 0; + if (rate < rate_iops_min || iops < 
td->rate_blocks[ddir]) { - log_err("%s: min iops rate %u not met," - " got %lu\n", td->o.name, - rate_iops_min, rate); + log_err("%s: rate_iops_min=%u not met, got %lu IOPS\n", + td->o.name, rate_iops_min, rate); + return true; } } } @@ -214,19 +212,18 @@ td->rate_bytes[ddir] = bytes; td->rate_blocks[ddir] = iops; memcpy(&td->lastrate[ddir], now, sizeof(*now)); - return 0; + return false; } -static int check_min_rate(struct thread_data *td, struct timeval *now, - uint64_t *bytes_done) +static bool check_min_rate(struct thread_data *td, struct timespec *now) { - int ret = 0; + bool ret = false; - if (bytes_done[DDIR_READ]) + if (td->bytes_done[DDIR_READ]) ret |= __check_min_rate(td, now, DDIR_READ); - if (bytes_done[DDIR_WRITE]) + if (td->bytes_done[DDIR_WRITE]) ret |= __check_min_rate(td, now, DDIR_WRITE); - if (bytes_done[DDIR_TRIM]) + if (td->bytes_done[DDIR_TRIM]) ret |= __check_min_rate(td, now, DDIR_TRIM); return ret; @@ -240,10 +237,13 @@ { int r; + if (td->error) + return; + /* * get immediately available events, if any */ - r = io_u_queued_complete(td, 0, NULL); + r = io_u_queued_complete(td, 0); if (r < 0) return; @@ -264,53 +264,52 @@ } if (td->cur_depth) - r = io_u_queued_complete(td, td->cur_depth, NULL); + r = io_u_queued_complete(td, td->cur_depth); } /* * Helper to handle the final sync of a file. Works just like the normal * io path, just does everything sync. 
*/ -static int fio_io_sync(struct thread_data *td, struct fio_file *f) +static bool fio_io_sync(struct thread_data *td, struct fio_file *f) { struct io_u *io_u = __get_io_u(td); - int ret; + enum fio_q_status ret; if (!io_u) - return 1; + return true; io_u->ddir = DDIR_SYNC; io_u->file = f; if (td_io_prep(td, io_u)) { put_io_u(td, io_u); - return 1; + return true; } requeue: ret = td_io_queue(td, io_u); - if (ret < 0) { - td_verror(td, io_u->error, "td_io_queue"); - put_io_u(td, io_u); - return 1; - } else if (ret == FIO_Q_QUEUED) { - if (io_u_queued_complete(td, 1, NULL) < 0) - return 1; - } else if (ret == FIO_Q_COMPLETED) { + switch (ret) { + case FIO_Q_QUEUED: + td_io_commit(td); + if (io_u_queued_complete(td, 1) < 0) + return true; + break; + case FIO_Q_COMPLETED: if (io_u->error) { td_verror(td, io_u->error, "td_io_queue"); - return 1; + return true; } - if (io_u_sync_complete(td, io_u, NULL) < 0) - return 1; - } else if (ret == FIO_Q_BUSY) { - if (td_io_commit(td)) - return 1; + if (io_u_sync_complete(td, io_u) < 0) + return true; + break; + case FIO_Q_BUSY: + td_io_commit(td); goto requeue; } - return 0; + return false; } static int fio_file_fsync(struct thread_data *td, struct fio_file *f) @@ -328,31 +327,48 @@ return ret; } -static inline void __update_tv_cache(struct thread_data *td) +static inline void __update_ts_cache(struct thread_data *td) { - fio_gettime(&td->tv_cache, NULL); + fio_gettime(&td->ts_cache, NULL); } -static inline void update_tv_cache(struct thread_data *td) +static inline void update_ts_cache(struct thread_data *td) { - if ((++td->tv_cache_nr & td->tv_cache_mask) == td->tv_cache_mask) - __update_tv_cache(td); + if ((++td->ts_cache_nr & td->ts_cache_mask) == td->ts_cache_mask) + __update_ts_cache(td); } -static inline int runtime_exceeded(struct thread_data *td, struct timeval *t) +static inline bool runtime_exceeded(struct thread_data *td, struct timespec *t) { if (in_ramp_time(td)) - return 0; + return false; if (!td->o.timeout) - 
return 0; - if (mtime_since(&td->epoch, t) >= td->o.timeout * 1000) - return 1; + return false; + if (utime_since(&td->epoch, t) >= td->o.timeout) + return true; - return 0; + return false; +} + +/* + * We need to update the runtime consistently in ms, but keep a running + * tally of the current elapsed time in microseconds for sub millisecond + * updates. + */ +static inline void update_runtime(struct thread_data *td, + unsigned long long *elapsed_us, + const enum fio_ddir ddir) +{ + if (ddir == DDIR_WRITE && td_write(td) && td->o.verify_only) + return; + + td->ts.runtime[ddir] -= (elapsed_us[ddir] + 999) / 1000; + elapsed_us[ddir] += utime_since_now(&td->start); + td->ts.runtime[ddir] += (elapsed_us[ddir] + 999) / 1000; } -static int break_on_this_error(struct thread_data *td, enum fio_ddir ddir, - int *retptr) +static bool break_on_this_error(struct thread_data *td, enum fio_ddir ddir, + int *retptr) { int ret = *retptr; @@ -365,7 +381,7 @@ eb = td_error_type(ddir, err); if (!(td->o.continue_on_error & (1 << eb))) - return 1; + return true; if (td_non_fatal_error(td, eb, err)) { /* @@ -375,26 +391,26 @@ update_error_count(td, err); td_clear_error(td); *retptr = 0; - return 0; + return false; } else if (td->o.fill_device && err == ENOSPC) { /* * We expect to hit this error if * fill_device option is set. */ td_clear_error(td); - td->terminate = 1; - return 1; + fio_mark_td_terminate(td); + return true; } else { /* * Stop the I/O in case of a fatal * error. 
*/ update_error_count(td, err); - return 1; + return true; } } - return 0; + return false; } static void check_update_rusage(struct thread_data *td) @@ -402,8 +418,199 @@ if (td->update_rusage) { td->update_rusage = 0; update_rusage_stat(td); - fio_mutex_up(td->rusage_sem); + fio_sem_up(td->rusage_sem); + } +} + +static int wait_for_completions(struct thread_data *td, struct timespec *time) +{ + const int full = queue_full(td); + int min_evts = 0; + int ret; + + if (td->flags & TD_F_REGROW_LOGS) + return io_u_quiesce(td); + + /* + * if the queue is full, we MUST reap at least 1 event + */ + min_evts = min(td->o.iodepth_batch_complete_min, td->cur_depth); + if ((full && !min_evts) || !td->o.iodepth_batch_complete_min) + min_evts = 1; + + if (time && __should_check_rate(td)) + fio_gettime(time, NULL); + + do { + ret = io_u_queued_complete(td, min_evts); + if (ret < 0) + break; + } while (full && (td->cur_depth > td->o.iodepth_low)); + + return ret; +} + +int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret, + enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify, + struct timespec *comp_time) +{ + switch (*ret) { + case FIO_Q_COMPLETED: + if (io_u->error) { + *ret = -io_u->error; + clear_io_u(td, io_u); + } else if (io_u->resid) { + long long bytes = io_u->xfer_buflen - io_u->resid; + struct fio_file *f = io_u->file; + + if (bytes_issued) + *bytes_issued += bytes; + + if (!from_verify) + trim_io_piece(io_u); + + /* + * zero read, fail + */ + if (!bytes) { + if (!from_verify) + unlog_io_piece(td, io_u); + td_verror(td, EIO, "full resid"); + put_io_u(td, io_u); + break; + } + + io_u->xfer_buflen = io_u->resid; + io_u->xfer_buf += bytes; + io_u->offset += bytes; + + if (ddir_rw(io_u->ddir)) + td->ts.short_io_u[io_u->ddir]++; + + if (io_u->offset == f->real_file_size) + goto sync_done; + + requeue_io_u(td, &io_u); + } else { +sync_done: + if (comp_time && __should_check_rate(td)) + fio_gettime(comp_time, NULL); + + *ret = io_u_sync_complete(td, 
io_u); + if (*ret < 0) + break; + } + + if (td->flags & TD_F_REGROW_LOGS) + regrow_logs(td); + + /* + * when doing I/O (not when verifying), + * check for any errors that are to be ignored + */ + if (!from_verify) + break; + + return 0; + case FIO_Q_QUEUED: + /* + * if the engine doesn't have a commit hook, + * the io_u is really queued. if it does have such + * a hook, it has to call io_u_queued() itself. + */ + if (td->io_ops->commit == NULL) + io_u_queued(td, io_u); + if (bytes_issued) + *bytes_issued += io_u->xfer_buflen; + break; + case FIO_Q_BUSY: + if (!from_verify) + unlog_io_piece(td, io_u); + requeue_io_u(td, &io_u); + td_io_commit(td); + break; + default: + assert(*ret < 0); + td_verror(td, -(*ret), "td_io_queue"); + break; + } + + if (break_on_this_error(td, ddir, ret)) + return 1; + + return 0; +} + +static inline bool io_in_polling(struct thread_data *td) +{ + return !td->o.iodepth_batch_complete_min && + !td->o.iodepth_batch_complete_max; +} +/* + * Unlinks files from thread data fio_file structure + */ +static int unlink_all_files(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + int ret = 0; + + for_each_file(td, f, i) { + if (f->filetype != FIO_TYPE_FILE) + continue; + ret = td_io_unlink_file(td, f); + if (ret) + break; } + + if (ret) + td_verror(td, ret, "unlink_all_files"); + + return ret; +} + +/* + * Check if io_u will overlap an in-flight IO in the queue + */ +bool in_flight_overlap(struct io_u_queue *q, struct io_u *io_u) +{ + bool overlap; + struct io_u *check_io_u; + unsigned long long x1, x2, y1, y2; + int i; + + x1 = io_u->offset; + x2 = io_u->offset + io_u->buflen; + overlap = false; + io_u_qiter(q, check_io_u, i) { + if (check_io_u->flags & IO_U_F_FLIGHT) { + y1 = check_io_u->offset; + y2 = check_io_u->offset + check_io_u->buflen; + + if (x1 < y2 && y1 < x2) { + overlap = true; + dprint(FD_IO, "in-flight overlap: %llu/%llu, %llu/%llu\n", + x1, io_u->buflen, + y1, check_io_u->buflen); + break; + } + } + } + + return 
overlap; +} + +static enum fio_q_status io_u_submit(struct thread_data *td, struct io_u *io_u) +{ + /* + * Check for overlap if the user asked us to, and we have + * at least one IO in flight besides this one. + */ + if (td->o.serialize_overlap && td->cur_depth > 1 && + in_flight_overlap(&td->io_u_all, io_u)) + return FIO_Q_BUSY; + + return td_io_queue(td, io_u); } /* @@ -412,7 +619,6 @@ */ static void do_verify(struct thread_data *td, uint64_t verify_bytes) { - uint64_t bytes_done[DDIR_RWDIR_CNT] = { 0, 0, 0 }; struct fio_file *f; struct io_u *io_u; int ret, min_events; @@ -438,20 +644,29 @@ if (td->error) return; + /* + * verify_state needs to be reset before verification + * proceeds so that expected random seeds match actual + * random seeds in headers. The main loop will reset + * all random number generators if randrepeat is set. + */ + if (!td->o.rand_repeatable) + td_fill_verify_state_seed(td); + td_set_runstate(td, TD_VERIFYING); io_u = NULL; while (!td->terminate) { enum fio_ddir ddir; - int ret2, full; + int full; - update_tv_cache(td); + update_ts_cache(td); check_update_rusage(td); - if (runtime_exceeded(td, &td->tv_cache)) { - __update_tv_cache(td); - if (runtime_exceeded(td, &td->tv_cache)) { - td->terminate = 1; + if (runtime_exceeded(td, &td->ts_cache)) { + __update_ts_cache(td); + if (runtime_exceeded(td, &td->ts_cache)) { + fio_mark_td_terminate(td); break; } } @@ -474,10 +689,16 @@ break; } } else { - if (ddir_rw_sum(bytes_done) + td->o.rw_min_bs > verify_bytes) + if (ddir_rw_sum(td->bytes_done) + td->o.rw_min_bs > verify_bytes) break; while ((io_u = get_io_u(td)) != NULL) { + if (IS_ERR_OR_NULL(io_u)) { + io_u = NULL; + ret = FIO_Q_BUSY; + goto reap; + } + /* * We are only interested in the places where * we wrote or trimmed IOs. 
Turn those into @@ -493,10 +714,11 @@ continue; } else if (io_u->ddir == DDIR_TRIM) { io_u->ddir = DDIR_READ; - io_u->flags |= IO_U_F_TRIMMED; + io_u_set(td, io_u, IO_U_F_TRIMMED); break; } else if (io_u->ddir == DDIR_WRITE) { io_u->ddir = DDIR_READ; + populate_verify_io_u(td, io_u); break; } else { put_io_u(td, io_u); @@ -508,65 +730,23 @@ break; } + if (verify_state_should_stop(td, io_u)) { + put_io_u(td, io_u); + break; + } + if (td->o.verify_async) io_u->end_io = verify_io_u_async; else io_u->end_io = verify_io_u; ddir = io_u->ddir; + if (!td->o.disable_slat) + fio_gettime(&io_u->start_time, NULL); - ret = td_io_queue(td, io_u); - switch (ret) { - case FIO_Q_COMPLETED: - if (io_u->error) { - ret = -io_u->error; - clear_io_u(td, io_u); - } else if (io_u->resid) { - int bytes = io_u->xfer_buflen - io_u->resid; - - /* - * zero read, fail - */ - if (!bytes) { - td_verror(td, EIO, "full resid"); - put_io_u(td, io_u); - break; - } - - io_u->xfer_buflen = io_u->resid; - io_u->xfer_buf += bytes; - io_u->offset += bytes; - - if (ddir_rw(io_u->ddir)) - td->ts.short_io_u[io_u->ddir]++; - - f = io_u->file; - if (io_u->offset == f->real_file_size) - goto sync_done; - - requeue_io_u(td, &io_u); - } else { -sync_done: - ret = io_u_sync_complete(td, io_u, bytes_done); - if (ret < 0) - break; - } - continue; - case FIO_Q_QUEUED: - break; - case FIO_Q_BUSY: - requeue_io_u(td, &io_u); - ret2 = td_io_commit(td); - if (ret2 < 0) - ret = ret2; - break; - default: - assert(ret < 0); - td_verror(td, -ret, "td_io_queue"); - break; - } + ret = io_u_submit(td, io_u); - if (break_on_this_error(td, ddir, &ret)) + if (io_queue_event(td, io_u, &ret, ddir, NULL, 1, NULL)) break; /* @@ -574,28 +754,11 @@ * completed io_u's first. Note that we can get BUSY even * without IO queued, if the system is resource starved. 
*/ +reap: full = queue_full(td) || (ret == FIO_Q_BUSY && td->cur_depth); - if (full || !td->o.iodepth_batch_complete) { - min_events = min(td->o.iodepth_batch_complete, - td->cur_depth); - /* - * if the queue is full, we MUST reap at least 1 event - */ - if (full && !min_events) - min_events = 1; + if (full || io_in_polling(td)) + ret = wait_for_completions(td, NULL); - do { - /* - * Reap required number of io units, if any, - * and do the verification on them through - * the callback handler - */ - if (io_u_queued_complete(td, min_events, bytes_done) < 0) { - ret = -1; - break; - } - } while (full && (td->cur_depth > td->o.iodepth_low)); - } if (ret < 0) break; } @@ -606,7 +769,7 @@ min_events = td->cur_depth; if (min_events) - ret = io_u_queued_complete(td, min_events, NULL); + ret = io_u_queued_complete(td, min_events); } else cleanup_pending_aio(td); @@ -615,20 +778,125 @@ dprint(FD_VERIFY, "exiting loop\n"); } -static int io_bytes_exceeded(struct thread_data *td) +static bool exceeds_number_ios(struct thread_data *td) { - unsigned long long bytes; + unsigned long long number_ios; + + if (!td->o.number_ios) + return false; + + number_ios = ddir_rw_sum(td->io_blocks); + number_ios += td->io_u_queued + td->io_u_in_flight; + + return number_ios >= (td->o.number_ios * td->loops); +} + +static bool io_bytes_exceeded(struct thread_data *td, uint64_t *this_bytes) +{ + unsigned long long bytes, limit; if (td_rw(td)) - bytes = td->this_io_bytes[DDIR_READ] + td->this_io_bytes[DDIR_WRITE]; + bytes = this_bytes[DDIR_READ] + this_bytes[DDIR_WRITE]; else if (td_write(td)) - bytes = td->this_io_bytes[DDIR_WRITE]; + bytes = this_bytes[DDIR_WRITE]; else if (td_read(td)) - bytes = td->this_io_bytes[DDIR_READ]; + bytes = this_bytes[DDIR_READ]; else - bytes = td->this_io_bytes[DDIR_TRIM]; + bytes = this_bytes[DDIR_TRIM]; - return bytes >= td->o.size; + if (td->o.io_size) + limit = td->o.io_size; + else + limit = td->o.size; + + limit *= td->loops; + return bytes >= limit || 
exceeds_number_ios(td); +} + +static bool io_issue_bytes_exceeded(struct thread_data *td) +{ + return io_bytes_exceeded(td, td->io_issue_bytes); +} + +static bool io_complete_bytes_exceeded(struct thread_data *td) +{ + return io_bytes_exceeded(td, td->this_io_bytes); +} + +/* + * used to calculate the next io time for rate control + * + */ +static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir) +{ + uint64_t bps = td->rate_bps[ddir]; + + assert(!(td->flags & TD_F_CHILD)); + + if (td->o.rate_process == RATE_PROCESS_POISSON) { + uint64_t val, iops; + + iops = bps / td->o.bs[ddir]; + val = (int64_t) (1000000 / iops) * + -logf(__rand_0_1(&td->poisson_state[ddir])); + if (val) { + dprint(FD_RATE, "poisson rate iops=%llu, ddir=%d\n", + (unsigned long long) 1000000 / val, + ddir); + } + td->last_usec[ddir] += val; + return td->last_usec[ddir]; + } else if (bps) { + uint64_t bytes = td->rate_io_issue_bytes[ddir]; + uint64_t secs = bytes / bps; + uint64_t remainder = bytes % bps; + + return remainder * 1000000 / bps + secs * 1000000; + } + + return 0; +} + +static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir) +{ + unsigned long long b; + uint64_t total; + int left; + + b = ddir_rw_sum(td->io_blocks); + if (b % td->o.thinktime_blocks) + return; + + io_u_quiesce(td); + + total = 0; + if (td->o.thinktime_spin) + total = usec_spin(td->o.thinktime_spin); + + left = td->o.thinktime - total; + if (left) + total += usec_sleep(td, left); + + /* + * If we're ignoring thinktime for the rate, add the number of bytes + * we would have done while sleeping, minus one block to ensure we + * start issuing immediately after the sleep. 
+ */ + if (total && td->rate_bps[ddir] && td->o.rate_ign_think) { + uint64_t missed = (td->rate_bps[ddir] * total) / 1000000ULL; + uint64_t bs = td->o.min_bs[ddir]; + uint64_t usperop = bs * 1000000ULL / td->rate_bps[ddir]; + uint64_t over; + + if (usperop <= total) + over = bs; + else + over = (usperop - total) / usperop * -bs; + + td->rate_io_issue_bytes[ddir] += (missed - over); + /* adjust for rate_process=poisson */ + td->last_usec[ddir] += total; + } } /* @@ -637,25 +905,49 @@ * * Returns number of bytes written and trimmed. */ -static uint64_t do_io(struct thread_data *td) +static void do_io(struct thread_data *td, uint64_t *bytes_done) { - uint64_t bytes_done[DDIR_RWDIR_CNT] = { 0, 0, 0 }; unsigned int i; int ret = 0; - uint64_t bytes_issued = 0; + uint64_t total_bytes, bytes_issued = 0; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + bytes_done[i] = td->bytes_done[i]; if (in_ramp_time(td)) td_set_runstate(td, TD_RAMP); else td_set_runstate(td, TD_RUNNING); + lat_target_init(td); + + total_bytes = td->o.size; + /* + * Allow random overwrite workloads to write up to io_size + * before starting verification phase as 'size' doesn't apply. + */ + if (td_write(td) && td_random(td) && td->o.norandommap) + total_bytes = max(total_bytes, (uint64_t) td->o.io_size); + /* + * If verify_backlog is enabled, we'll run the verify in this + * handler as well. For that case, we may need up to twice the + * amount of bytes. 
+ */ + if (td->o.verify != VERIFY_NONE && + (td_write(td) && td->o.verify_backlog)) + total_bytes += td->o.size; + + /* In trimwrite mode, each byte is trimmed and then written, so + * allow total_bytes to be twice as big */ + if (td_trimwrite(td)) + total_bytes += td->total_io_size; + while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) || - (!flist_empty(&td->trim_list)) || !io_bytes_exceeded(td) || + (!flist_empty(&td->trim_list)) || !io_issue_bytes_exceeded(td) || td->o.time_based) { - struct timeval comp_time; - int min_evts = 0; + struct timespec comp_time; struct io_u *io_u; - int ret2, full; + int full; enum fio_ddir ddir; check_update_rusage(td); @@ -663,12 +955,12 @@ if (td->terminate || td->done) break; - update_tv_cache(td); + update_ts_cache(td); - if (runtime_exceeded(td, &td->tv_cache)) { - __update_tv_cache(td); - if (runtime_exceeded(td, &td->tv_cache)) { - td->terminate = 1; + if (runtime_exceeded(td, &td->ts_cache)) { + __update_ts_cache(td); + if (runtime_exceeded(td, &td->ts_cache)) { + fio_mark_td_terminate(td); break; } } @@ -676,12 +968,35 @@ if (flow_threshold_exceeded(td)) continue; - if (bytes_issued >= (uint64_t) td->o.size) + /* + * Break if we exceeded the bytes. The exception is time + * based runs, but we still need to break out of the loop + * for those to run verification, if enabled. + * Jobs read from iolog do not use this stop condition. 
+ */ + if (bytes_issued >= total_bytes && + !td->o.read_iolog_file && + (!td->o.time_based || + (td->o.time_based && td->o.verify != VERIFY_NONE))) break; io_u = get_io_u(td); - if (!io_u) + if (IS_ERR_OR_NULL(io_u)) { + int err = PTR_ERR(io_u); + + io_u = NULL; + ddir = DDIR_INVAL; + if (err == -EBUSY) { + ret = FIO_Q_BUSY; + goto reap; + } + if (td->o.latency_target) + goto reap; break; + } + + if (io_u->ddir == DDIR_WRITE && td->flags & TD_F_DO_VERIFY) + populate_verify_io_u(td, io_u); ddir = io_u->ddir; @@ -692,6 +1007,18 @@ */ if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_READ && ((io_u->flags & IO_U_F_VER_LIST) || !td_rw(td))) { + + if (!td->o.verify_pattern_bytes) { + io_u->rand_seed = __rand(&td->verify_state); + if (sizeof(int) != sizeof(long *)) + io_u->rand_seed *= __rand(&td->verify_state); + } + + if (verify_state_should_stop(td, io_u)) { + put_io_u(td, io_u); + break; + } + if (td->o.verify_async) io_u->end_io = verify_io_u_async; else @@ -702,134 +1029,75 @@ else td_set_runstate(td, TD_RUNNING); - ret = td_io_queue(td, io_u); - switch (ret) { - case FIO_Q_COMPLETED: - if (io_u->error) { - ret = -io_u->error; - clear_io_u(td, io_u); - } else if (io_u->resid) { - int bytes = io_u->xfer_buflen - io_u->resid; - struct fio_file *f = io_u->file; + /* + * Always log IO before it's issued, so we know the specific + * order of it. The logged unit will track when the IO has + * completed. 
+ */ + if (td_write(td) && io_u->ddir == DDIR_WRITE && + td->o.do_verify && + td->o.verify != VERIFY_NONE && + !td->o.experimental_verify) + log_io_piece(td, io_u); + + if (td->o.io_submit_mode == IO_MODE_OFFLOAD) { + const unsigned long long blen = io_u->xfer_buflen; + const enum fio_ddir __ddir = acct_ddir(io_u); - bytes_issued += bytes; - /* - * zero read, fail - */ - if (!bytes) { - td_verror(td, EIO, "full resid"); - put_io_u(td, io_u); - break; - } + if (td->error) + break; - io_u->xfer_buflen = io_u->resid; - io_u->xfer_buf += bytes; - io_u->offset += bytes; + workqueue_enqueue(&td->io_wq, &io_u->work); + ret = FIO_Q_QUEUED; - if (ddir_rw(io_u->ddir)) - td->ts.short_io_u[io_u->ddir]++; + if (ddir_rw(__ddir)) { + td->io_issues[__ddir]++; + td->io_issue_bytes[__ddir] += blen; + td->rate_io_issue_bytes[__ddir] += blen; + } - if (io_u->offset == f->real_file_size) - goto sync_done; + if (should_check_rate(td)) + td->rate_next_io_time[__ddir] = usec_for_io(td, __ddir); - requeue_io_u(td, &io_u); - } else { -sync_done: - if (__should_check_rate(td, DDIR_READ) || - __should_check_rate(td, DDIR_WRITE) || - __should_check_rate(td, DDIR_TRIM)) - fio_gettime(&comp_time, NULL); + } else { + ret = io_u_submit(td, io_u); - ret = io_u_sync_complete(td, io_u, bytes_done); - if (ret < 0) - break; - bytes_issued += io_u->xfer_buflen; - } - break; - case FIO_Q_QUEUED: - /* - * if the engine doesn't have a commit hook, - * the io_u is really queued. if it does have such - * a hook, it has to call io_u_queued() itself. 
- */ - if (td->io_ops->commit == NULL) - io_u_queued(td, io_u); - bytes_issued += io_u->xfer_buflen; - break; - case FIO_Q_BUSY: - requeue_io_u(td, &io_u); - ret2 = td_io_commit(td); - if (ret2 < 0) - ret = ret2; - break; - default: - assert(ret < 0); - put_io_u(td, io_u); - break; - } + if (should_check_rate(td)) + td->rate_next_io_time[ddir] = usec_for_io(td, ddir); - if (break_on_this_error(td, ddir, &ret)) - break; + if (io_queue_event(td, io_u, &ret, ddir, &bytes_issued, 0, &comp_time)) + break; - /* - * See if we need to complete some commands. Note that we - * can get BUSY even without IO queued, if the system is - * resource starved. - */ - full = queue_full(td) || (ret == FIO_Q_BUSY && td->cur_depth); - if (full || !td->o.iodepth_batch_complete) { - min_evts = min(td->o.iodepth_batch_complete, - td->cur_depth); /* - * if the queue is full, we MUST reap at least 1 event + * See if we need to complete some commands. Note that + * we can get BUSY even without IO queued, if the + * system is resource starved. 
*/ - if (full && !min_evts) - min_evts = 1; - - if (__should_check_rate(td, DDIR_READ) || - __should_check_rate(td, DDIR_WRITE) || - __should_check_rate(td, DDIR_TRIM)) - fio_gettime(&comp_time, NULL); - - do { - ret = io_u_queued_complete(td, min_evts, bytes_done); - if (ret < 0) - break; - - } while (full && (td->cur_depth > td->o.iodepth_low)); +reap: + full = queue_full(td) || + (ret == FIO_Q_BUSY && td->cur_depth); + if (full || io_in_polling(td)) + ret = wait_for_completions(td, &comp_time); } - if (ret < 0) break; - if (!ddir_rw_sum(bytes_done) && !(td->io_ops->flags & FIO_NOIO)) + if (!ddir_rw_sum(td->bytes_done) && + !td_ioengine_flagged(td, FIO_NOIO)) continue; - if (!in_ramp_time(td) && should_check_rate(td, bytes_done)) { - if (check_min_rate(td, &comp_time, bytes_done)) { - if (exitall_on_terminate) + if (!in_ramp_time(td) && should_check_rate(td)) { + if (check_min_rate(td, &comp_time)) { + if (exitall_on_terminate || td->o.exitall_error) fio_terminate_threads(td->groupid); td_verror(td, EIO, "check_min_rate"); break; } } + if (!in_ramp_time(td) && td->o.latency_target) + lat_target_check(td); - if (td->o.thinktime) { - unsigned long long b; - - b = ddir_rw_sum(td->io_blocks); - if (!(b % td->o.thinktime_blocks)) { - int left; - - io_u_quiesce(td); - - if (td->o.thinktime_spin) - usec_spin(td->o.thinktime_spin); - - left = td->o.thinktime - td->o.thinktime_spin; - if (left) - usec_sleep(td, left); - } - } + if (ddir_rw(ddir) && td->o.thinktime) + handle_thinktime(td, ddir); } check_update_rusage(td); @@ -839,14 +1107,19 @@ if (td->o.fill_device && td->error == ENOSPC) { td->error = 0; - td->terminate = 1; + fio_mark_td_terminate(td); } if (!td->error) { struct fio_file *f; - i = td->cur_depth; + if (td->o.io_submit_mode == IO_MODE_OFFLOAD) { + workqueue_flush(&td->io_wq); + i = 0; + } else + i = td->cur_depth; + if (i) { - ret = io_u_queued_complete(td, i, bytes_done); + ret = io_u_queued_complete(td, i); if (td->o.fill_device && td->error == ENOSPC) 
td->error = 0; } @@ -871,7 +1144,43 @@ if (!ddir_rw_sum(td->this_io_bytes)) td->done = 1; - return bytes_done[DDIR_WRITE] + bytes_done[DDIR_TRIM]; + for (i = 0; i < DDIR_RWDIR_CNT; i++) + bytes_done[i] = td->bytes_done[i] - bytes_done[i]; +} + +static void free_file_completion_logging(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + + for_each_file(td, f, i) { + if (!f->last_write_comp) + break; + sfree(f->last_write_comp); + } +} + +static int init_file_completion_logging(struct thread_data *td, + unsigned int depth) +{ + struct fio_file *f; + unsigned int i; + + if (td->o.verify == VERIFY_NONE || !td->o.verify_state_save) + return 0; + + for_each_file(td, f, i) { + f->last_write_comp = scalloc(depth, sizeof(uint64_t)); + if (!f->last_write_comp) + goto cleanup; + } + + return 0; + +cleanup: + free_file_completion_logging(td); + log_err("fio: failed to alloc write comp data\n"); + return 1; } static void cleanup_io_u(struct thread_data *td) @@ -883,54 +1192,112 @@ if (td->io_ops->io_u_free) td->io_ops->io_u_free(td, io_u); - fio_memfree(io_u, sizeof(*io_u)); + fio_memfree(io_u, sizeof(*io_u), td_offload_overlap(td)); } free_io_mem(td); io_u_rexit(&td->io_u_requeues); - io_u_qexit(&td->io_u_freelist); - io_u_qexit(&td->io_u_all); + io_u_qexit(&td->io_u_freelist, false); + io_u_qexit(&td->io_u_all, td_offload_overlap(td)); + + free_file_completion_logging(td); } static int init_io_u(struct thread_data *td) { struct io_u *io_u; - unsigned int max_bs, min_write; int cl_align, i, max_units; - int data_xfer = 1, err; - char *p; + int err; max_units = td->o.iodepth; - max_bs = td_max_bs(td); - min_write = td->o.min_bs[DDIR_WRITE]; - td->orig_buffer_size = (unsigned long long) max_bs - * (unsigned long long) max_units; - - if ((td->io_ops->flags & FIO_NOIO) || !(td_read(td) || td_write(td))) - data_xfer = 0; err = 0; - err += io_u_rinit(&td->io_u_requeues, td->o.iodepth); - err += io_u_qinit(&td->io_u_freelist, td->o.iodepth); - err += 
io_u_qinit(&td->io_u_all, td->o.iodepth); + err += !io_u_rinit(&td->io_u_requeues, td->o.iodepth); + err += !io_u_qinit(&td->io_u_freelist, td->o.iodepth, false); + err += !io_u_qinit(&td->io_u_all, td->o.iodepth, td_offload_overlap(td)); if (err) { log_err("fio: failed setting up IO queues\n"); return 1; } + cl_align = os_cache_line_size(); + + for (i = 0; i < max_units; i++) { + void *ptr; + + if (td->terminate) + return 1; + + ptr = fio_memalign(cl_align, sizeof(*io_u), td_offload_overlap(td)); + if (!ptr) { + log_err("fio: unable to allocate aligned memory\n"); + break; + } + + io_u = ptr; + memset(io_u, 0, sizeof(*io_u)); + INIT_FLIST_HEAD(&io_u->verify_list); + dprint(FD_MEM, "io_u alloc %p, index %u\n", io_u, i); + + io_u->index = i; + io_u->flags = IO_U_F_FREE; + io_u_qpush(&td->io_u_freelist, io_u); + + /* + * io_u never leaves this stack, used for iteration of all + * io_u buffers. + */ + io_u_qpush(&td->io_u_all, io_u); + + if (td->io_ops->io_u_init) { + int ret = td->io_ops->io_u_init(td, io_u); + + if (ret) { + log_err("fio: failed to init engine data: %d\n", ret); + return 1; + } + } + } + + init_io_u_buffers(td); + + if (init_file_completion_logging(td, max_units)) + return 1; + + return 0; +} + +int init_io_u_buffers(struct thread_data *td) +{ + struct io_u *io_u; + unsigned long long max_bs, min_write; + int i, max_units; + int data_xfer = 1; + char *p; + + max_units = td->o.iodepth; + max_bs = td_max_bs(td); + min_write = td->o.min_bs[DDIR_WRITE]; + td->orig_buffer_size = (unsigned long long) max_bs + * (unsigned long long) max_units; + + if (td_ioengine_flagged(td, FIO_NOIO) || !(td_read(td) || td_write(td))) + data_xfer = 0; + /* * if we may later need to do address alignment, then add any * possible adjustment here so that we don't cause a buffer * overflow later. this adjustment may be too much if we get * lucky and the allocator gives us an aligned address. 
*/ - if (td->o.odirect || td->o.mem_align || (td->io_ops->flags & FIO_RAWIO)) + if (td->o.odirect || td->o.mem_align || td->o.oatomic || + td_ioengine_flagged(td, FIO_RAWIO)) td->orig_buffer_size += page_mask + td->o.mem_align; if (td->o.mem_type == MEM_SHMHUGE || td->o.mem_type == MEM_MMAPHUGE) { - unsigned long bs; + unsigned long long bs; bs = td->orig_buffer_size + td->o.hugepage_size - 1; td->orig_buffer_size = bs & ~(td->o.hugepage_size - 1); @@ -944,29 +1311,14 @@ if (data_xfer && allocate_io_mem(td)) return 1; - if (td->o.odirect || td->o.mem_align || - (td->io_ops->flags & FIO_RAWIO)) - p = PAGE_ALIGN(td->orig_buffer) + td->o.mem_align; + if (td->o.odirect || td->o.mem_align || td->o.oatomic || + td_ioengine_flagged(td, FIO_RAWIO)) + p = PTR_ALIGN(td->orig_buffer, page_mask) + td->o.mem_align; else p = td->orig_buffer; - cl_align = os_cache_line_size(); - for (i = 0; i < max_units; i++) { - void *ptr; - - if (td->terminate) - return 1; - - ptr = fio_memalign(cl_align, sizeof(*io_u)); - if (!ptr) { - log_err("fio: unable to allocate aligned memory\n"); - break; - } - - io_u = ptr; - memset(io_u, 0, sizeof(*io_u)); - INIT_FLIST_HEAD(&io_u->verify_list); + io_u = td->io_u_all.io_us[i]; dprint(FD_MEM, "io_u alloc %p, index %u\n", io_u, i); if (data_xfer) { @@ -980,45 +1332,31 @@ * Fill the buffer with the pattern if we are * going to be doing writes. */ - fill_pattern(td, io_u->buf, max_bs, io_u, 0, 0); - } - } - - io_u->index = i; - io_u->flags = IO_U_F_FREE; - io_u_qpush(&td->io_u_freelist, io_u); - - /* - * io_u never leaves this stack, used for iteration of all - * io_u buffers. - */ - io_u_qpush(&td->io_u_all, io_u); - - if (td->io_ops->io_u_init) { - int ret = td->io_ops->io_u_init(td, io_u); - - if (ret) { - log_err("fio: failed to init engine data: %d\n", ret); - return 1; + fill_verify_pattern(td, io_u->buf, max_bs, io_u, 0, 0); } } - p += max_bs; } return 0; } +/* + * This function is Linux specific. 
+ * FIO_HAVE_IOSCHED_SWITCH enabled currently means it's Linux. + */ static int switch_ioscheduler(struct thread_data *td) { - char tmp[256], tmp2[128]; +#ifdef FIO_HAVE_IOSCHED_SWITCH + char tmp[256], tmp2[128], *p; FILE *f; int ret; - if (td->io_ops->flags & FIO_DISKLESSIO) + if (td_ioengine_flagged(td, FIO_DISKLESSIO)) return 0; - sprintf(tmp, "%s/queue/scheduler", td->sysfs_root); + assert(td->files && td->files[0]); + sprintf(tmp, "%s/queue/scheduler", td->files[0]->du->sysfs_root); f = fopen(tmp, "r+"); if (!f) { @@ -1046,11 +1384,27 @@ /* * Read back and check that the selected scheduler is now the default. */ - ret = fread(tmp, 1, sizeof(tmp), f); + ret = fread(tmp, 1, sizeof(tmp) - 1, f); if (ferror(f) || ret < 0) { td_verror(td, errno, "fread"); fclose(f); - return 1; + return 1; + } + tmp[ret] = '\0'; + /* + * either a list of io schedulers or "none\n" is expected. Strip the + * trailing newline. + */ + p = tmp; + strsep(&p, "\n"); + + /* + * Write to "none" entry doesn't fail, so check the result here. + */ + if (!strcmp(tmp, "none")) { + log_err("fio: io scheduler is not tunable\n"); + fclose(f); + return 0; } sprintf(tmp2, "[%s]", td->o.ioscheduler); @@ -1063,39 +1417,57 @@ fclose(f); return 0; +#else + return 0; +#endif } -static int keep_running(struct thread_data *td) +static bool keep_running(struct thread_data *td) { + unsigned long long limit; + if (td->done) - return 0; + return false; + if (td->terminate) + return false; if (td->o.time_based) - return 1; + return true; if (td->o.loops) { td->o.loops--; - return 1; + return true; } + if (exceeds_number_ios(td)) + return false; + + if (td->o.io_size) + limit = td->o.io_size; + else + limit = td->o.size; - if (td->o.size != -1ULL && ddir_rw_sum(td->io_bytes) < td->o.size) { + if (limit != -1ULL && ddir_rw_sum(td->io_bytes) < limit) { uint64_t diff; /* - * If the difference is less than the minimum IO size, we + * If the difference is less than the maximum IO size, we * are done. 
*/ - diff = td->o.size - ddir_rw_sum(td->io_bytes); + diff = limit - ddir_rw_sum(td->io_bytes); if (diff < td_max_bs(td)) - return 0; + return false; - return 1; + if (fio_files_done(td) && !td->o.io_size) + return false; + + return true; } - return 0; + return false; } static int exec_string(struct thread_options *o, const char *string, const char *mode) { - int ret, newlen = strlen(string) + strlen(o->name) + strlen(mode) + 9 + 1; + size_t newlen = strlen(string) + strlen(o->name) + strlen(mode) + 9 + 1; + int ret; char *str; str = malloc(newlen); @@ -1111,25 +1483,78 @@ } /* + * Dry run to compute correct state of numberio for verification. + */ +static uint64_t do_dry_run(struct thread_data *td) +{ + td_set_runstate(td, TD_RUNNING); + + while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) || + (!flist_empty(&td->trim_list)) || !io_complete_bytes_exceeded(td)) { + struct io_u *io_u; + int ret; + + if (td->terminate || td->done) + break; + + io_u = get_io_u(td); + if (IS_ERR_OR_NULL(io_u)) + break; + + io_u_set(td, io_u, IO_U_F_FLIGHT); + io_u->error = 0; + io_u->resid = 0; + if (ddir_rw(acct_ddir(io_u))) + td->io_issues[acct_ddir(io_u)]++; + if (ddir_rw(io_u->ddir)) { + io_u_mark_depth(td, 1); + td->ts.total_io_u[io_u->ddir]++; + } + + if (td_write(td) && io_u->ddir == DDIR_WRITE && + td->o.do_verify && + td->o.verify != VERIFY_NONE && + !td->o.experimental_verify) + log_io_piece(td, io_u); + + ret = io_u_sync_complete(td, io_u); + (void) ret; + } + + return td->bytes_done[DDIR_WRITE] + td->bytes_done[DDIR_TRIM]; +} + +struct fork_data { + struct thread_data *td; + struct sk_out *sk_out; +}; + +/* * Entry point for the thread based jobs. The process based jobs end up * here as well, after a little setup. 
*/ static void *thread_main(void *data) { - unsigned long long elapsed; - struct thread_data *td = data; + struct fork_data *fd = data; + unsigned long long elapsed_us[DDIR_RWDIR_CNT] = { 0, }; + struct thread_data *td = fd->td; struct thread_options *o = &td->o; - pthread_condattr_t attr; - int clear_state; + struct sk_out *sk_out = fd->sk_out; + uint64_t bytes_done[DDIR_RWDIR_CNT]; + int deadlock_loop_cnt; + bool clear_state; int ret; + sk_out_assign(sk_out); + free(fd); + if (!o->use_thread) { setsid(); td->pid = getpid(); } else td->pid = gettid(); - fio_local_clock_init(o->use_thread); + fio_local_clock_init(); dprint(FD_PROCESS, "jobs pid=%d started\n", (int) td->pid); @@ -1140,27 +1565,25 @@ INIT_FLIST_HEAD(&td->io_hist_list); INIT_FLIST_HEAD(&td->verify_list); INIT_FLIST_HEAD(&td->trim_list); - INIT_FLIST_HEAD(&td->next_rand_list); - pthread_mutex_init(&td->io_u_lock, NULL); td->io_hist_tree = RB_ROOT; - pthread_condattr_init(&attr); - pthread_cond_init(&td->verify_cond, &attr); - pthread_cond_init(&td->free_cond, &attr); + ret = mutex_cond_init_pshared(&td->io_u_lock, &td->free_cond); + if (ret) { + td_verror(td, ret, "mutex_cond_init_pshared"); + goto err; + } + ret = cond_init_pshared(&td->verify_cond); + if (ret) { + td_verror(td, ret, "mutex_cond_pshared"); + goto err; + } td_set_runstate(td, TD_INITIALIZED); - dprint(FD_MUTEX, "up startup_mutex\n"); - fio_mutex_up(startup_mutex); - dprint(FD_MUTEX, "wait on td->mutex\n"); - fio_mutex_down(td->mutex); - dprint(FD_MUTEX, "done waiting on td->mutex\n"); - - /* - * the ->mutex mutex is now no longer used, close it to avoid - * eating a file descriptor - */ - fio_mutex_remove(td->mutex); - td->mutex = NULL; + dprint(FD_MUTEX, "up startup_sem\n"); + fio_sem_up(startup_sem); + dprint(FD_MUTEX, "wait on td->sem\n"); + fio_sem_down(td->sem); + dprint(FD_MUTEX, "done waiting on td->sem\n"); /* * A new gid requires privilege, so we need to do this before setting @@ -1175,6 +1598,16 @@ goto err; } + 
td_zone_gen_index(td); + + /* + * Do this early, we don't want the compress threads to be limited + * to the same CPUs as the IO workers. So do this before we set + * any potential CPU affinity + */ + if (iolog_compress_init(td, sk_out)) + goto err; + /* * If we have a gettimeofday() thread, make sure we exclude that * thread from this job @@ -1186,7 +1619,16 @@ * Set affinity first, in case it has an impact on the memory * allocations. */ - if (o->cpumask_set) { + if (fio_option_is_set(o, cpumask)) { + if (o->cpus_allowed_policy == FIO_CPUS_SPLIT) { + ret = fio_cpus_split(&o->cpumask, td->thread_number - 1); + if (!ret) { + log_err("fio: no CPUs set\n"); + log_err("fio: Try increasing number of available CPUs\n"); + td_verror(td, EINVAL, "cpus_split"); + goto err; + } + } ret = fio_setaffinity(td->pid, o->cpumask); if (ret == -1) { td_verror(td, errno, "cpu_set_affinity"); @@ -1196,16 +1638,19 @@ #ifdef CONFIG_LIBNUMA /* numa node setup */ - if (o->numa_cpumask_set || o->numa_memmask_set) { - int ret; + if (fio_option_is_set(o, numa_cpunodes) || + fio_option_is_set(o, numa_memnodes)) { + struct bitmask *mask; if (numa_available() < 0) { td_verror(td, errno, "Does not support NUMA API\n"); goto err; } - if (o->numa_cpumask_set) { - ret = numa_run_on_node_mask(o->numa_cpunodesmask); + if (fio_option_is_set(o, numa_cpunodes)) { + mask = numa_parse_nodestring(o->numa_cpunodes); + ret = numa_run_on_node_mask(mask); + numa_free_nodemask(mask); if (ret == -1) { td_verror(td, errno, \ "numa_run_on_node_mask failed\n"); @@ -1213,14 +1658,17 @@ } } - if (o->numa_memmask_set) { + if (fio_option_is_set(o, numa_memnodes)) { + mask = NULL; + if (o->numa_memnodes) + mask = numa_parse_nodestring(o->numa_memnodes); switch (o->numa_mem_mode) { case MPOL_INTERLEAVE: - numa_set_interleave_mask(o->numa_memnodesmask); + numa_set_interleave_mask(mask); break; case MPOL_BIND: - numa_set_membind(o->numa_memnodesmask); + numa_set_membind(mask); break; case MPOL_LOCAL: 
numa_set_localalloc(); @@ -1233,6 +1681,9 @@ break; } + if (mask) + numa_free_nodemask(mask); + } } #endif @@ -1244,16 +1695,23 @@ * May alter parameters that init_io_u() will use, so we need to * do this first. */ - if (init_iolog(td)) + if (!init_iolog(td)) + goto err; + + if (td_io_init(td)) goto err; if (init_io_u(td)) goto err; + if (td->io_ops->post_init && td->io_ops->post_init(td)) + goto err; + if (o->verify_async && verify_async_init(td)) goto err; - if (o->ioprio) { + if (fio_option_is_set(o, ioprio) || + fio_option_is_set(o, ioprio_class)) { ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, o->ioprio); if (ret == -1) { td_verror(td, errno, "ioprio_set"); @@ -1276,138 +1734,167 @@ if (!o->create_serialize && setup_files(td)) goto err; - if (td_io_init(td)) - goto err; - - if (init_random_map(td)) + if (!init_random_map(td)) goto err; if (o->exec_prerun && exec_string(o, o->exec_prerun, (const char *)"prerun")) goto err; - if (o->pre_read) { - if (pre_read_files(td) < 0) - goto err; - } + if (o->pre_read && !pre_read_files(td)) + goto err; fio_verify_init(td); - fio_gettime(&td->epoch, NULL); + if (rate_submit_init(td, sk_out)) + goto err; + + set_epoch_time(td, o->log_unix_epoch); fio_getrusage(&td->ru_start); - clear_state = 0; + memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch)); + + if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] || + o->ratemin[DDIR_TRIM]) { + memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time, + sizeof(td->bw_sample_time)); + memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time, + sizeof(td->bw_sample_time)); + memcpy(&td->lastrate[DDIR_TRIM], &td->bw_sample_time, + sizeof(td->bw_sample_time)); + } + + memset(bytes_done, 0, sizeof(bytes_done)); + clear_state = false; + while (keep_running(td)) { uint64_t verify_bytes; fio_gettime(&td->start, NULL); - memcpy(&td->bw_sample_time, &td->start, 
sizeof(td->start)); - memcpy(&td->iops_sample_time, &td->start, sizeof(td->start)); - memcpy(&td->tv_cache, &td->start, sizeof(td->start)); - - if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] || - o->ratemin[DDIR_TRIM]) { - memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time, - sizeof(td->bw_sample_time)); - memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time, - sizeof(td->bw_sample_time)); - memcpy(&td->lastrate[DDIR_TRIM], &td->bw_sample_time, - sizeof(td->bw_sample_time)); - } + memcpy(&td->ts_cache, &td->start, sizeof(td->start)); - if (clear_state) - clear_io_state(td); + if (clear_state) { + clear_io_state(td, 0); - prune_io_piece_log(td); + if (o->unlink_each_loop && unlink_all_files(td)) + break; + } - verify_bytes = do_io(td); + prune_io_piece_log(td); - clear_state = 1; + if (td->o.verify_only && td_write(td)) + verify_bytes = do_dry_run(td); + else { + do_io(td, bytes_done); - if (td_read(td) && td->io_bytes[DDIR_READ]) { - elapsed = utime_since_now(&td->start); - td->ts.runtime[DDIR_READ] += elapsed; - } - if (td_write(td) && td->io_bytes[DDIR_WRITE]) { - elapsed = utime_since_now(&td->start); - td->ts.runtime[DDIR_WRITE] += elapsed; - } - if (td_trim(td) && td->io_bytes[DDIR_TRIM]) { - elapsed = utime_since_now(&td->start); - td->ts.runtime[DDIR_TRIM] += elapsed; + if (!ddir_rw_sum(bytes_done)) { + fio_mark_td_terminate(td); + verify_bytes = 0; + } else { + verify_bytes = bytes_done[DDIR_WRITE] + + bytes_done[DDIR_TRIM]; + } } + /* + * If we took too long to shut down, the main thread could + * already consider us reaped/exited. If that happens, break + * out and clean up. + */ + if (td->runstate >= TD_EXITED) + break; + + clear_state = true; + + /* + * Make sure we've successfully updated the rusage stats + * before waiting on the stat mutex. Otherwise we could have + * the stat thread holding stat mutex and waiting for + * the rusage_sem, which would never get upped because + * this thread is waiting for the stat mutex. 
+ */ + deadlock_loop_cnt = 0; + do { + check_update_rusage(td); + if (!fio_sem_down_trylock(stat_sem)) + break; + usleep(1000); + if (deadlock_loop_cnt++ > 5000) { + log_err("fio seems to be stuck grabbing stat_sem, forcibly exiting\n"); + td->error = EDEADLK; + goto err; + } + } while (1); + + if (td_read(td) && td->io_bytes[DDIR_READ]) + update_runtime(td, elapsed_us, DDIR_READ); + if (td_write(td) && td->io_bytes[DDIR_WRITE]) + update_runtime(td, elapsed_us, DDIR_WRITE); + if (td_trim(td) && td->io_bytes[DDIR_TRIM]) + update_runtime(td, elapsed_us, DDIR_TRIM); + fio_gettime(&td->start, NULL); + fio_sem_up(stat_sem); + if (td->error || td->terminate) break; if (!o->do_verify || o->verify == VERIFY_NONE || - (td->io_ops->flags & FIO_UNIDIR)) + td_ioengine_flagged(td, FIO_UNIDIR)) continue; - clear_io_state(td); + clear_io_state(td, 0); fio_gettime(&td->start, NULL); do_verify(td, verify_bytes); - td->ts.runtime[DDIR_READ] += utime_since_now(&td->start); + /* + * See comment further up for why this is done here. 
+ */ + check_update_rusage(td); + + fio_sem_down(stat_sem); + update_runtime(td, elapsed_us, DDIR_READ); + fio_gettime(&td->start, NULL); + fio_sem_up(stat_sem); if (td->error || td->terminate) break; } + /* + * Acquire this lock if we were doing overlap checking in + * offload mode so that we don't clean up this job while + * another thread is checking its io_u's for overlap + */ + if (td_offload_overlap(td)) + pthread_mutex_lock(&overlap_check); + td_set_runstate(td, TD_FINISHING); + if (td_offload_overlap(td)) + pthread_mutex_unlock(&overlap_check); + update_rusage_stat(td); - td->ts.runtime[DDIR_READ] = (td->ts.runtime[DDIR_READ] + 999) / 1000; - td->ts.runtime[DDIR_WRITE] = (td->ts.runtime[DDIR_WRITE] + 999) / 1000; - td->ts.runtime[DDIR_TRIM] = (td->ts.runtime[DDIR_TRIM] + 999) / 1000; td->ts.total_run_time = mtime_since_now(&td->epoch); td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ]; td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE]; td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM]; + if (td->o.verify_state_save && !(td->flags & TD_F_VSTATE_SAVED) && + (td->o.verify != VERIFY_NONE && td_write(td))) + verify_save_state(td->thread_number); + fio_unpin_memory(td); - fio_mutex_down(writeout_mutex); - if (td->bw_log) { - if (o->bw_log_file) { - finish_log_named(td, td->bw_log, - o->bw_log_file, "bw"); - } else - finish_log(td, td->bw_log, "bw"); - } - if (td->lat_log) { - if (o->lat_log_file) { - finish_log_named(td, td->lat_log, - o->lat_log_file, "lat"); - } else - finish_log(td, td->lat_log, "lat"); - } - if (td->slat_log) { - if (o->lat_log_file) { - finish_log_named(td, td->slat_log, - o->lat_log_file, "slat"); - } else - finish_log(td, td->slat_log, "slat"); - } - if (td->clat_log) { - if (o->lat_log_file) { - finish_log_named(td, td->clat_log, - o->lat_log_file, "clat"); - } else - finish_log(td, td->clat_log, "clat"); - } - if (td->iops_log) { - if (o->iops_log_file) { - finish_log_named(td, td->iops_log, - o->iops_log_file, "iops"); - } 
else - finish_log(td, td->iops_log, "iops"); - } + td_writeout_logs(td, true); + + iolog_compress_exit(td); + rate_submit_exit(td); - fio_mutex_up(writeout_mutex); if (o->exec_postrun) exec_string(o, o->exec_postrun, (const char *)"postrun"); - if (exitall_on_terminate) + if (exitall_on_terminate || (o->exitall_error && td->error)) fio_terminate_threads(td->groupid); err: @@ -1421,12 +1908,14 @@ close_and_free_files(td); cleanup_io_u(td); close_ioengine(td); - cgroup_shutdown(td, &cgroup_mnt); - - if (o->cpumask_set) { - int ret = fio_cpuset_exit(&o->cpumask); - - td_verror(td, ret, "fio_cpuset_exit"); + cgroup_shutdown(td, cgroup_mnt); + verify_free_state(td); + td_zone_free_index(td); + + if (fio_option_is_set(o, cpumask)) { + ret = fio_cpuset_exit(&o->cpumask); + if (ret) + td_verror(td, ret, "fio_cpuset_exit"); } /* @@ -1434,50 +1923,26 @@ */ if (o->write_iolog_file) write_iolog_close(td); - - fio_mutex_remove(td->rusage_sem); - td->rusage_sem = NULL; + if (td->io_log_rfile) + fclose(td->io_log_rfile); td_set_runstate(td, TD_EXITED); - return (void *) (uintptr_t) td->error; -} - - -/* - * We cannot pass the td data into a forked process, so attach the td and - * pass it to the thread worker. - */ -static int fork_main(int shmid, int offset) -{ - struct thread_data *td; - void *data, *ret; - -#ifndef __hpux - data = shmat(shmid, NULL, 0); - if (data == (void *) -1) { - int __err = errno; - perror("shmat"); - return __err; - } -#else /* - * HP-UX inherits shm mappings? + * Do this last after setting our runstate to exited, so we + * know that the stat thread is signaled. */ - data = threads; -#endif + check_update_rusage(td); - td = data + offset * sizeof(struct thread_data); - ret = thread_main(td); - shmdt(data); - return (int) (uintptr_t) ret; + sk_out_drop(); + return (void *) (uintptr_t) td->error; } /* * Run over the job map and reap the threads that have exited, if any. 
*/ -static void reap_threads(unsigned int *nr_running, unsigned int *t_rate, - unsigned int *m_rate) +static void reap_threads(unsigned int *nr_running, uint64_t *t_rate, + uint64_t *m_rate) { struct thread_data *td; unsigned int cputhreads, realthreads, pending; @@ -1490,11 +1955,7 @@ for_each_td(td, i) { int flags = 0; - /* - * ->io_ops is NULL for a thread that has closed its - * io engine - */ - if (td->io_ops && !strcmp(td->io_ops->name, "cpuio")) + if (!strcmp(td->o.ioengine, "cpuio")) cputhreads++; else realthreads++; @@ -1551,6 +2012,22 @@ } /* + * If the job is stuck, do a forceful timeout of it and + * move on. + */ + if (td->terminate && + td->runstate < TD_FSYNCING && + time_since_now(&td->terminate_time) >= FIO_REAP_TIMEOUT) { + log_err("fio: job '%s' (state=%d) hasn't exited in " + "%lu seconds, it appears to be stuck. Doing " + "forceful exit of this job.\n", + td->o.name, td->runstate, + (unsigned long) time_since_now(&td->terminate_time)); + td_set_runstate(td, TD_REAPED); + goto reaped; + } + + /* * thread is not dead, continue */ pending++; @@ -1573,49 +2050,185 @@ fio_terminate_threads(TERMINATE_ALL); } +static bool __check_trigger_file(void) +{ + struct stat sb; + + if (!trigger_file) + return false; + + if (stat(trigger_file, &sb)) + return false; + + if (unlink(trigger_file) < 0) + log_err("fio: failed to unlink %s: %s\n", trigger_file, + strerror(errno)); + + return true; +} + +static bool trigger_timedout(void) +{ + if (trigger_timeout) + if (time_since_genesis() >= trigger_timeout) { + trigger_timeout = 0; + return true; + } + + return false; +} + +void exec_trigger(const char *cmd) +{ + int ret; + + if (!cmd || cmd[0] == '\0') + return; + + ret = system(cmd); + if (ret == -1) + log_err("fio: failed executing %s trigger\n", cmd); +} + +void check_trigger_file(void) +{ + if (__check_trigger_file() || trigger_timedout()) { + if (nr_clients) + fio_clients_send_trigger(trigger_remote_cmd); + else { + verify_save_state(IO_LIST_ALL); + 
fio_terminate_threads(TERMINATE_ALL); + exec_trigger(trigger_cmd); + } + } +} + +static int fio_verify_load_state(struct thread_data *td) +{ + int ret; + + if (!td->o.verify_state) + return 0; + + if (is_backend) { + void *data; + + ret = fio_server_get_verify_state(td->o.name, + td->thread_number - 1, &data); + if (!ret) + verify_assign_state(td, data); + } else + ret = verify_load_state(td, "local"); + + return ret; +} + static void do_usleep(unsigned int usecs) { check_for_running_stats(); + check_trigger_file(); usleep(usecs); } +static bool check_mount_writes(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + + if (!td_write(td) || td->o.allow_mounted_write) + return false; + + /* + * If FIO_HAVE_CHARDEV_SIZE is defined, it's likely that chrdevs + * are mkfs'd and mounted. + */ + for_each_file(td, f, i) { +#ifdef FIO_HAVE_CHARDEV_SIZE + if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) +#else + if (f->filetype != FIO_TYPE_BLOCK) +#endif + continue; + if (device_is_mounted(f->file_name)) + goto mounted; + } + + return false; +mounted: + log_err("fio: %s appears mounted, and 'allow_mounted_write' isn't set. Aborting.\n", f->file_name); + return true; +} + +static bool waitee_running(struct thread_data *me) +{ + const char *waitee = me->o.wait_for; + const char *self = me->o.name; + struct thread_data *td; + int i; + + if (!waitee) + return false; + + for_each_td(td, i) { + if (!strcmp(td->o.name, self) || strcmp(td->o.name, waitee)) + continue; + + if (td->runstate < TD_EXITED) { + dprint(FD_PROCESS, "%s fenced by %s(%s)\n", + self, td->o.name, + runstate_to_name(td->runstate)); + return true; + } + } + + dprint(FD_PROCESS, "%s: %s completed, can run\n", self, waitee); + return false; +} + /* * Main function for kicking off and reaping jobs, as needed. 
*/ -static void run_threads(void) +static void run_threads(struct sk_out *sk_out) { struct thread_data *td; - unsigned long spent; - unsigned int i, todo, nr_running, m_rate, t_rate, nr_started; + unsigned int i, todo, nr_running, nr_started; + uint64_t m_rate, t_rate; + uint64_t spent; if (fio_gtod_offload && fio_start_gtod_thread()) return; - + fio_idle_prof_init(); set_sig_handlers(); nr_thread = nr_process = 0; for_each_td(td, i) { + if (check_mount_writes(td)) + return; if (td->o.use_thread) nr_thread++; else nr_process++; } - if (output_format == FIO_OUTPUT_NORMAL) { - log_info("Starting "); + if (output_format & FIO_OUTPUT_NORMAL) { + struct buf_output out; + + buf_output_init(&out); + __log_buf(&out, "Starting "); if (nr_thread) - log_info("%d thread%s", nr_thread, + __log_buf(&out, "%d thread%s", nr_thread, nr_thread > 1 ? "s" : ""); if (nr_process) { if (nr_thread) - log_info(" and "); - log_info("%d process%s", nr_process, + __log_buf(&out, " and "); + __log_buf(&out, "%d process%s", nr_process, nr_process > 1 ? 
"es" : ""); } - log_info("\n"); - fflush(stdout); + __log_buf(&out, "\n"); + log_info_buf(out.buf, out.buflen); + buf_output_free(&out); } todo = thread_number; @@ -1629,12 +2242,16 @@ if (!td->o.create_serialize) continue; + if (fio_verify_load_state(td)) + goto reap; + /* * do file setup here so it happens sequentially, * we don't want X number of threads getting their * client data interspersed on disk */ if (setup_files(td)) { +reap: exit_value++; if (td->error) log_err("fio: pid=%d, err=%d/%s\n", @@ -1664,8 +2281,9 @@ while (todo) { struct thread_data *map[REAL_MAX_JOBS]; - struct timeval this_start; + struct timespec this_start; int this_jobs = 0, left; + struct fork_data *fd; /* * create threads (TD_NOT_CREATED -> TD_CREATED) @@ -1684,9 +2302,9 @@ } if (td->o.start_delay) { - spent = mtime_since_genesis(); + spent = utime_since_genesis(); - if (td->o.start_delay * 1000 > spent) + if (td->o.start_delay > spent) continue; } @@ -1696,9 +2314,15 @@ break; } + if (waitee_running(td)) { + dprint(FD_PROCESS, "%s: waiting for %s\n", + td->o.name, td->o.wait_for); + continue; + } + init_disk_util(td); - td->rusage_sem = fio_mutex_init(FIO_MUTEX_LOCKED); + td->rusage_sem = fio_sem_init(FIO_SEM_LOCKED); td->update_rusage = 0; /* @@ -1709,18 +2333,24 @@ map[this_jobs++] = td; nr_started++; + fd = calloc(1, sizeof(*fd)); + fd->td = td; + fd->sk_out = sk_out; + if (td->o.use_thread) { int ret; dprint(FD_PROCESS, "will pthread_create\n"); ret = pthread_create(&td->thread, NULL, - thread_main, td); + thread_main, fd); if (ret) { log_err("pthread_create: %s\n", strerror(ret)); + free(fd); nr_started--; break; } + fd = NULL; ret = pthread_detach(td->thread); if (ret) log_err("pthread_detach: %s", @@ -1730,21 +2360,23 @@ dprint(FD_PROCESS, "will fork\n"); pid = fork(); if (!pid) { - int ret = fork_main(shm_id, i); + int ret; + ret = (int)(uintptr_t)thread_main(fd); _exit(ret); } else if (i == fio_debug_jobno) *fio_debug_jobp = pid; } - dprint(FD_MUTEX, "wait on 
startup_mutex\n"); - if (fio_mutex_down_timeout(startup_mutex, 10)) { + dprint(FD_MUTEX, "wait on startup_sem\n"); + if (fio_sem_down_timeout(startup_sem, 10000)) { log_err("fio: job startup hung? exiting.\n"); fio_terminate_threads(TERMINATE_ALL); - fio_abort = 1; + fio_abort = true; nr_started--; + free(fd); break; } - dprint(FD_MUTEX, "done waiting on startup_mutex\n"); + dprint(FD_MUTEX, "done waiting on startup_sem\n"); } /* @@ -1803,7 +2435,7 @@ m_rate += ddir_rw_sum(td->o.ratemin); t_rate += ddir_rw_sum(td->o.rate); todo--; - fio_mutex_up(td->mutex); + fio_sem_up(td->sem); } reap_threads(&nr_running, &t_rate, &m_rate); @@ -1822,67 +2454,13 @@ update_io_ticks(); } -void wait_for_disk_thread_exit(void) -{ - fio_mutex_down(disk_thread_mutex); -} - static void free_disk_util(void) { - disk_util_start_exit(); - wait_for_disk_thread_exit(); disk_util_prune_entries(); + helper_thread_destroy(); } -static void *disk_thread_main(void *data) -{ - int ret = 0; - - fio_mutex_up(startup_mutex); - - while (threads && !ret) { - usleep(DISK_UTIL_MSEC * 1000); - if (!threads) - break; - ret = update_io_ticks(); - - if (!is_backend) - print_thread_status(); - } - - fio_mutex_up(disk_thread_mutex); - return NULL; -} - -static int create_disk_util_thread(void) -{ - int ret; - - setup_disk_util(); - - disk_thread_mutex = fio_mutex_init(FIO_MUTEX_LOCKED); - - ret = pthread_create(&disk_util_thread, NULL, disk_thread_main, NULL); - if (ret) { - fio_mutex_remove(disk_thread_mutex); - log_err("Can't create disk util thread: %s\n", strerror(ret)); - return 1; - } - - ret = pthread_detach(disk_util_thread); - if (ret) { - fio_mutex_remove(disk_thread_mutex); - log_err("Can't detatch disk util thread: %s\n", strerror(ret)); - return 1; - } - - dprint(FD_MUTEX, "wait on startup_mutex\n"); - fio_mutex_down(startup_mutex); - dprint(FD_MUTEX, "done waiting on startup_mutex\n"); - return 0; -} - -int fio_backend(void) +int fio_backend(struct sk_out *sk_out) { struct thread_data *td; int i; 
@@ -1897,49 +2475,63 @@ return 0; if (write_bw_log) { - setup_log(&agg_io_log[DDIR_READ], 0, IO_LOG_TYPE_BW); - setup_log(&agg_io_log[DDIR_WRITE], 0, IO_LOG_TYPE_BW); - setup_log(&agg_io_log[DDIR_TRIM], 0, IO_LOG_TYPE_BW); + struct log_params p = { + .log_type = IO_LOG_TYPE_BW, + }; + + setup_log(&agg_io_log[DDIR_READ], &p, "agg-read_bw.log"); + setup_log(&agg_io_log[DDIR_WRITE], &p, "agg-write_bw.log"); + setup_log(&agg_io_log[DDIR_TRIM], &p, "agg-trim_bw.log"); } - startup_mutex = fio_mutex_init(FIO_MUTEX_LOCKED); - if (startup_mutex == NULL) - return 1; - writeout_mutex = fio_mutex_init(FIO_MUTEX_UNLOCKED); - if (writeout_mutex == NULL) + startup_sem = fio_sem_init(FIO_SEM_LOCKED); + if (!sk_out) + is_local_backend = true; + if (startup_sem == NULL) return 1; set_genesis_time(); stat_init(); - create_disk_util_thread(); + helper_thread_create(startup_sem, sk_out); cgroup_list = smalloc(sizeof(*cgroup_list)); - INIT_FLIST_HEAD(cgroup_list); + if (cgroup_list) + INIT_FLIST_HEAD(cgroup_list); - run_threads(); + run_threads(sk_out); + + helper_thread_exit(); if (!fio_abort) { - show_run_stats(); + __show_run_stats(); if (write_bw_log) { - __finish_log(agg_io_log[DDIR_READ], "agg-read_bw.log"); - __finish_log(agg_io_log[DDIR_WRITE], - "agg-write_bw.log"); - __finish_log(agg_io_log[DDIR_TRIM], - "agg-write_bw.log"); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + struct io_log *log = agg_io_log[i]; + + flush_log(log, false); + free_log(log); + } } } - for_each_td(td, i) + for_each_td(td, i) { + steadystate_free(td); fio_options_free(td); + if (td->rusage_sem) { + fio_sem_remove(td->rusage_sem); + td->rusage_sem = NULL; + } + fio_sem_remove(td->sem); + td->sem = NULL; + } free_disk_util(); - cgroup_kill(cgroup_list); - sfree(cgroup_list); - sfree(cgroup_mnt); - - fio_mutex_remove(startup_mutex); - fio_mutex_remove(writeout_mutex); - fio_mutex_remove(disk_thread_mutex); + if (cgroup_list) { + cgroup_kill(cgroup_list); + sfree(cgroup_list); + } + + 
fio_sem_remove(startup_sem); stat_exit(); return exit_value; } diff -Nru fio-2.1.3/blktrace_api.h fio-3.16/blktrace_api.h --- fio-2.1.3/blktrace_api.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/blktrace_api.h 2019-09-20 01:01:52.000000000 +0000 @@ -9,7 +9,7 @@ enum { BLK_TC_READ = 1 << 0, /* reads */ BLK_TC_WRITE = 1 << 1, /* writes */ - BLK_TC_BARRIER = 1 << 2, /* barrier */ + BLK_TC_FLUSH = 1 << 2, /* flush */ BLK_TC_SYNC = 1 << 3, /* sync */ BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ BLK_TC_REQUEUE = 1 << 5, /* requeueing */ @@ -127,9 +127,4 @@ __u32 pid; }; -#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup) -#define BLKTRACESTART _IO(0x12,116) -#define BLKTRACESTOP _IO(0x12,117) -#define BLKTRACETEARDOWN _IO(0x12,118) - #endif diff -Nru fio-2.1.3/blktrace.c fio-3.16/blktrace.c --- fio-2.1.3/blktrace.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/blktrace.c 2019-09-20 01:01:52.000000000 +0000 @@ -3,12 +3,13 @@ */ #include #include -#include -#include +#include #include "flist.h" #include "fio.h" +#include "blktrace.h" #include "blktrace_api.h" +#include "oslib/linux-dev-lookup.h" #define TRACE_FIFO_SIZE 8192 @@ -71,91 +72,41 @@ * Check if this is a blktrace binary data file. We read a single trace * into memory and check for the magic signature. 
*/ -int is_blktrace(const char *filename) +bool is_blktrace(const char *filename, int *need_swap) { struct blk_io_trace t; int fd, ret; fd = open(filename, O_RDONLY); if (fd < 0) - return 0; + return false; ret = read(fd, &t, sizeof(t)); close(fd); if (ret < 0) { perror("read blktrace"); - return 0; + return false; } else if (ret != sizeof(t)) { log_err("fio: short read on blktrace file\n"); - return 0; + return false; } - if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) - return 1; - - return 0; -} - -static int lookup_device(struct thread_data *td, char *path, unsigned int maj, - unsigned int min) -{ - struct dirent *dir; - struct stat st; - int found = 0; - DIR *D; - - D = opendir(path); - if (!D) - return 0; - - while ((dir = readdir(D)) != NULL) { - char full_path[256]; - - if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) - continue; - - sprintf(full_path, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, dir->d_name); - if (lstat(full_path, &st) == -1) { - perror("lstat"); - break; - } - - if (S_ISDIR(st.st_mode)) { - found = lookup_device(td, full_path, maj, min); - if (found) { - strcpy(path, full_path); - break; - } - } - - if (!S_ISBLK(st.st_mode)) - continue; - - /* - * If replay_redirect is set then always return this device - * upon lookup which overrides the device lookup based on - * major minor in the actual blktrace - */ - if (td->o.replay_redirect) { - dprint(FD_BLKTRACE, "device lookup: %d/%d\n overridden" - " with: %s", maj, min, - td->o.replay_redirect); - strcpy(path, td->o.replay_redirect); - found = 1; - break; - } + if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) { + *need_swap = 0; + return true; + } - if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) { - dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min); - strcpy(path, full_path); - found = 1; - break; - } + /* + * Maybe it needs to be endian swapped... 
+ */ + t.magic = fio_swap32(t.magic); + if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) { + *need_swap = 1; + return true; } - closedir(D); - return found; + return false; } #define FMINORBITS 20 @@ -201,17 +152,36 @@ } strcpy(dev, "/dev"); - if (lookup_device(td, dev, maj, min)) { + if (blktrace_lookup_device(td->o.replay_redirect, dev, maj, min)) { int fileno; + if (td->o.replay_redirect) + dprint(FD_BLKTRACE, "device lookup: %d/%d\n overridden" + " with: %s\n", maj, min, + td->o.replay_redirect); + else + dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min); + dprint(FD_BLKTRACE, "add devices %s\n", dev); fileno = add_file_exclusive(td, dev); + td->o.open_files++; + td->files[fileno]->major = maj; + td->files[fileno]->minor = min; trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE); last_fileno = fileno; } + return last_fileno; } +static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t) +{ + if (!o->replay_align) + return; + + t->bytes = (t->bytes + o->replay_align - 1) & ~(o->replay_align - 1); +} + /* * Store blk_io_trace data in an ipo for later retrieval. */ @@ -219,14 +189,15 @@ unsigned int bytes, int rw, unsigned long long ttime, int fileno) { - struct io_piece *ipo = malloc(sizeof(*ipo)); + struct io_piece *ipo; + ipo = calloc(1, sizeof(*ipo)); init_ipo(ipo); - /* - * the 512 is wrong here, it should be the hardware sector size... 
- */ ipo->offset = offset * 512; + if (td->o.replay_scale) + ipo->offset = ipo->offset / td->o.replay_scale; + ipo_bytes_align(td->o.replay_align, ipo); ipo->len = bytes; ipo->delay = ttime / 1000; if (rw) @@ -245,10 +216,12 @@ { switch (t->action) { case BLK_TN_PROCESS: - printf("got process notify: %x, %d\n", t->action, t->pid); + dprint(FD_BLKTRACE, "got process notify: %x, %d\n", + t->action, t->pid); break; case BLK_TN_TIMESTAMP: - printf("got timestamp notify: %x, %d\n", t->action, t->pid); + dprint(FD_BLKTRACE, "got timestamp notify: %x, %d\n", + t->action, t->pid); break; case BLK_TN_MESSAGE: break; @@ -258,25 +231,33 @@ } } -static void handle_trace_discard(struct thread_data *td, struct blk_io_trace *t, - unsigned long long ttime, unsigned long *ios) +static void handle_trace_discard(struct thread_data *td, + struct blk_io_trace *t, + unsigned long long ttime, + unsigned long *ios, unsigned int *bs) { - struct io_piece *ipo = malloc(sizeof(*ipo)); + struct io_piece *ipo; int fileno; + if (td->o.replay_skip & (1u << DDIR_TRIM)) + return; + + ipo = calloc(1, sizeof(*ipo)); init_ipo(ipo); fileno = trace_add_file(td, t->device); - ios[DDIR_WRITE]++; + ios[DDIR_TRIM]++; + if (t->bytes > bs[DDIR_TRIM]) + bs[DDIR_TRIM] = t->bytes; + td->o.size += t->bytes; - memset(ipo, 0, sizeof(*ipo)); INIT_FLIST_HEAD(&ipo->list); - /* - * the 512 is wrong here, it should be the hardware sector size... 
- */ ipo->offset = t->sector * 512; + if (td->o.replay_scale) + ipo->offset = ipo->offset / td->o.replay_scale; + ipo_bytes_align(td->o.replay_align, ipo); ipo->len = t->bytes; ipo->delay = ttime / 1000; ipo->ddir = DDIR_TRIM; @@ -288,6 +269,11 @@ queue_io_piece(td, ipo); } +static void dump_trace(struct blk_io_trace *t) +{ + log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action); +} + static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t, unsigned long long ttime, unsigned long *ios, unsigned int *bs) @@ -299,6 +285,20 @@ rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; + if (rw) { + if (td->o.replay_skip & (1u << DDIR_WRITE)) + return; + } else { + if (td->o.replay_skip & (1u << DDIR_READ)) + return; + } + + if (!t->bytes) { + if (!fio_did_warn(FIO_WARN_BTRACE_ZERO)) + dump_trace(t); + return; + } + if (t->bytes > bs[rw]) bs[rw] = t->bytes; @@ -307,56 +307,157 @@ store_ipo(td, t->sector, t->bytes, rw, ttime, fileno); } +static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t, + unsigned long long ttime, unsigned long *ios) +{ + struct io_piece *ipo; + int fileno; + + if (td->o.replay_skip & (1u << DDIR_SYNC)) + return; + + ipo = calloc(1, sizeof(*ipo)); + init_ipo(ipo); + fileno = trace_add_file(td, t->device); + + ipo->delay = ttime / 1000; + ipo->ddir = DDIR_SYNC; + ipo->fileno = fileno; + + ios[DDIR_SYNC]++; + dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay); + queue_io_piece(td, ipo); +} + /* * We only care for queue traces, most of the others are side effects * due to internal workings of the block layer. 
*/ static void handle_trace(struct thread_data *td, struct blk_io_trace *t, - unsigned long long ttime, unsigned long *ios, - unsigned int *bs) + unsigned long *ios, unsigned int *bs) { + static unsigned long long last_ttime; + unsigned long long delay = 0; + if ((t->action & 0xffff) != __BLK_TA_QUEUE) return; - if (t->action & BLK_TC_ACT(BLK_TC_PC)) - return; + + if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) { + if (!last_ttime || td->o.no_stall) + delay = 0; + else if (td->o.replay_time_scale == 100) + delay = t->time - last_ttime; + else { + double tmp = t->time - last_ttime; + double scale; + + scale = (double) 100.0 / (double) td->o.replay_time_scale; + tmp *= scale; + delay = tmp; + } + last_ttime = t->time; + } + + t_bytes_align(&td->o, t); if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY)) handle_trace_notify(t); else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) - handle_trace_discard(td, t, ttime, ios); + handle_trace_discard(td, t, delay, ios, bs); + else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH)) + handle_trace_flush(td, t, delay, ios); else - handle_trace_fs(td, t, ttime, ios, bs); + handle_trace_fs(td, t, delay, ios, bs); +} + +static void byteswap_trace(struct blk_io_trace *t) +{ + t->magic = fio_swap32(t->magic); + t->sequence = fio_swap32(t->sequence); + t->time = fio_swap64(t->time); + t->sector = fio_swap64(t->sector); + t->bytes = fio_swap32(t->bytes); + t->action = fio_swap32(t->action); + t->pid = fio_swap32(t->pid); + t->device = fio_swap32(t->device); + t->cpu = fio_swap32(t->cpu); + t->error = fio_swap16(t->error); + t->pdu_len = fio_swap16(t->pdu_len); +} + +static bool t_is_write(struct blk_io_trace *t) +{ + return (t->action & BLK_TC_ACT(BLK_TC_WRITE | BLK_TC_DISCARD)) != 0; +} + +static enum fio_ddir t_get_ddir(struct blk_io_trace *t) +{ + if (t->action & BLK_TC_ACT(BLK_TC_READ)) + return DDIR_READ; + else if (t->action & BLK_TC_ACT(BLK_TC_WRITE)) + return DDIR_WRITE; + else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) + return DDIR_TRIM; + + 
return DDIR_INVAL; +} + +static void depth_inc(struct blk_io_trace *t, int *depth) +{ + enum fio_ddir ddir; + + ddir = t_get_ddir(t); + if (ddir != DDIR_INVAL) + depth[ddir]++; +} + +static void depth_dec(struct blk_io_trace *t, int *depth) +{ + enum fio_ddir ddir; + + ddir = t_get_ddir(t); + if (ddir != DDIR_INVAL) + depth[ddir]--; +} + +static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth) +{ + enum fio_ddir ddir = DDIR_INVAL; + + ddir = t_get_ddir(t); + if (ddir != DDIR_INVAL) { + depth[ddir] = max(depth[ddir], this_depth[ddir]); + this_depth[ddir] = 0; + } } /* * Load a blktrace file by reading all the blk_io_trace entries, and storing * them as io_pieces like the fio text version would do. */ -int load_blktrace(struct thread_data *td, const char *filename) +bool load_blktrace(struct thread_data *td, const char *filename, int need_swap) { - unsigned long long ttime, delay; struct blk_io_trace t; - unsigned long ios[2], skipped_writes; - unsigned int cpu; - unsigned int rw_bs[2]; + unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { }; + unsigned int rw_bs[DDIR_RWDIR_CNT] = { }; + unsigned long skipped_writes; struct fifo *fifo; - int fd, i; + int fd, i, old_state, max_depth; struct fio_file *f; + int this_depth[DDIR_RWDIR_CNT] = { }; + int depth[DDIR_RWDIR_CNT] = { }; fd = open(filename, O_RDONLY); if (fd < 0) { td_verror(td, errno, "open blktrace file"); - return 1; + return false; } fifo = fifo_alloc(TRACE_FIFO_SIZE); - td->o.size = 0; + old_state = td_bump_runstate(td, TD_SETTING_UP); - cpu = 0; - ttime = 0; - ios[0] = ios[1] = 0; - rw_bs[0] = rw_bs[1] = 0; + td->o.size = 0; skipped_writes = 0; do { int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t)); @@ -370,6 +471,9 @@ break; } + if (need_swap) + byteswap_trace(&t); + if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) { log_err("fio: bad magic in blktrace data: %x\n", t.magic); @@ -389,51 +493,58 @@ goto err; } if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) { - if (!ttime) { - ttime = 
t.time; - cpu = t.cpu; - } + if ((t.action & 0xffff) == __BLK_TA_QUEUE) + depth_inc(&t, this_depth); + else if (((t.action & 0xffff) == __BLK_TA_BACKMERGE) || + ((t.action & 0xffff) == __BLK_TA_FRONTMERGE)) + depth_dec(&t, this_depth); + else if ((t.action & 0xffff) == __BLK_TA_COMPLETE) + depth_end(&t, this_depth, depth); - delay = 0; - if (cpu == t.cpu) - delay = t.time - ttime; - if ((t.action & BLK_TC_ACT(BLK_TC_WRITE)) && read_only) + if (t_is_write(&t) && read_only) { skipped_writes++; - else { - /* - * set delay to zero if no_stall enabled for - * fast replay - */ - if (td->o.no_stall) - delay = 0; - - handle_trace(td, &t, delay, ios, rw_bs); + continue; } - - ttime = t.time; - cpu = t.cpu; - } else { - delay = 0; - handle_trace(td, &t, delay, ios, rw_bs); } + + handle_trace(td, &t, ios, rw_bs); } while (1); - for (i = 0; i < td->files_index; i++) { - f= td->files[i]; + for_each_file(td, f, i) trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE); - } fifo_free(fifo); close(fd); + td_restore_runstate(td, old_state); + + if (!td->files_index) { + log_err("fio: did not find replay device(s)\n"); + return false; + } + + /* + * For stacked devices, we don't always get a COMPLETE event so + * the depth grows to insane values. Limit it to something sane(r). 
+ */ + max_depth = 0; + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (depth[i] > 1024) + depth[i] = 1024; + else if (!depth[i] && ios[i]) + depth[i] = 1; + max_depth = max(depth[i], max_depth); + } + if (skipped_writes) log_err("fio: %s skips replay of %lu writes due to read-only\n", td->o.name, skipped_writes); - if (!ios[DDIR_READ] && !ios[DDIR_WRITE]) { + if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] && + !ios[DDIR_SYNC]) { log_err("fio: found no ios in blktrace data\n"); - return 1; - } else if (ios[DDIR_READ] && !ios[DDIR_READ]) { + return false; + } else if (ios[DDIR_READ] && !ios[DDIR_WRITE]) { td->o.td_ddir = TD_DDIR_READ; td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ]; } else if (!ios[DDIR_READ] && ios[DDIR_WRITE]) { @@ -443,17 +554,236 @@ td->o.td_ddir = TD_DDIR_RW; td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ]; td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE]; + td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM]; } /* * We need to do direct/raw ios to the device, to avoid getting - * read-ahead in our way. + * read-ahead in our way. But only do so if the minimum block size + * is a multiple of 4k, otherwise we don't know if it's safe to do so. 
*/ - td->o.odirect = 1; + if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095)) + td->o.odirect = 1; - return 0; + /* + * If depth wasn't manually set, use probed depth + */ + if (!fio_option_is_set(&td->o, iodepth)) + td->o.iodepth = td->o.iodepth_low = max_depth; + + return true; err: close(fd); fifo_free(fifo); - return 1; + return false; +} + +static int init_merge_param_list(fio_fp64_t *vals, struct blktrace_cursor *bcs, + int nr_logs, int def, size_t off) +{ + int i = 0, len = 0; + + while (len < FIO_IO_U_LIST_MAX_LEN && vals[len].u.f != 0.0) + len++; + + if (len && len != nr_logs) + return len; + + for (i = 0; i < nr_logs; i++) { + int *val = (int *)((char *)&bcs[i] + off); + *val = def; + if (len) + *val = (int)vals[i].u.f; + } + + return 0; + +} + +static int find_earliest_io(struct blktrace_cursor *bcs, int nr_logs) +{ + __u64 time = ~(__u64)0; + int idx = 0, i; + + for (i = 0; i < nr_logs; i++) { + if (bcs[i].t.time < time) { + time = bcs[i].t.time; + idx = i; + } + } + + return idx; +} + +static void merge_finish_file(struct blktrace_cursor *bcs, int i, int *nr_logs) +{ + bcs[i].iter++; + if (bcs[i].iter < bcs[i].nr_iter) { + lseek(bcs[i].fd, 0, SEEK_SET); + return; + } + + *nr_logs -= 1; + + /* close file */ + fifo_free(bcs[i].fifo); + close(bcs[i].fd); + + /* keep active files contiguous */ + memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i])); +} + +static int read_trace(struct thread_data *td, struct blktrace_cursor *bc) +{ + int ret = 0; + struct blk_io_trace *t = &bc->t; + +read_skip: + /* read an io trace */ + ret = trace_fifo_get(td, bc->fifo, bc->fd, t, sizeof(*t)); + if (ret < 0) { + return ret; + } else if (!ret) { + if (!bc->length) + bc->length = bc->t.time; + return ret; + } else if (ret < (int) sizeof(*t)) { + log_err("fio: short fifo get\n"); + return -1; + } + + if (bc->swap) + byteswap_trace(t); + + /* skip over actions that fio does not care about */ + if ((t->action & 0xffff) != __BLK_TA_QUEUE || + t_get_ddir(t) == 
DDIR_INVAL) { + ret = discard_pdu(td, bc->fifo, bc->fd, t); + if (ret < 0) { + td_verror(td, ret, "blktrace lseek"); + return ret; + } else if (t->pdu_len != ret) { + log_err("fio: discarded %d of %d\n", ret, + t->pdu_len); + return -1; + } + goto read_skip; + } + + t->time = (t->time + bc->iter * bc->length) * bc->scalar / 100; + + return ret; +} + +static int write_trace(FILE *fp, struct blk_io_trace *t) +{ + /* pdu is not used so just write out only the io trace */ + t->pdu_len = 0; + return fwrite((void *)t, sizeof(*t), 1, fp); +} + +int merge_blktrace_iologs(struct thread_data *td) +{ + int nr_logs = get_max_str_idx(td->o.read_iolog_file); + struct blktrace_cursor *bcs = malloc(sizeof(struct blktrace_cursor) * + nr_logs); + struct blktrace_cursor *bc; + FILE *merge_fp; + char *str, *ptr, *name, *merge_buf; + int i, ret; + + ret = init_merge_param_list(td->o.merge_blktrace_scalars, bcs, nr_logs, + 100, offsetof(struct blktrace_cursor, + scalar)); + if (ret) { + log_err("fio: merge_blktrace_scalars(%d) != nr_logs(%d)\n", + ret, nr_logs); + goto err_param; + } + + ret = init_merge_param_list(td->o.merge_blktrace_iters, bcs, nr_logs, + 1, offsetof(struct blktrace_cursor, + nr_iter)); + if (ret) { + log_err("fio: merge_blktrace_iters(%d) != nr_logs(%d)\n", + ret, nr_logs); + goto err_param; + } + + /* setup output file */ + merge_fp = fopen(td->o.merge_blktrace_file, "w"); + merge_buf = malloc(128 * 1024); + ret = setvbuf(merge_fp, merge_buf, _IOFBF, 128 * 1024); + if (ret) + goto err_out_file; + + /* setup input files */ + str = ptr = strdup(td->o.read_iolog_file); + nr_logs = 0; + for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) { + bcs[i].fd = open(name, O_RDONLY); + if (bcs[i].fd < 0) { + log_err("fio: could not open file: %s\n", name); + ret = bcs[i].fd; + goto err_file; + } + bcs[i].fifo = fifo_alloc(TRACE_FIFO_SIZE); + nr_logs++; + + if (!is_blktrace(name, &bcs[i].swap)) { + log_err("fio: file is not a blktrace: %s\n", name); + goto err_file; + } + + 
ret = read_trace(td, &bcs[i]); + if (ret < 0) { + goto err_file; + } else if (!ret) { + merge_finish_file(bcs, i, &nr_logs); + i--; + } + } + free(str); + + /* merge files */ + while (nr_logs) { + i = find_earliest_io(bcs, nr_logs); + bc = &bcs[i]; + /* skip over the pdu */ + ret = discard_pdu(td, bc->fifo, bc->fd, &bc->t); + if (ret < 0) { + td_verror(td, ret, "blktrace lseek"); + goto err_file; + } else if (bc->t.pdu_len != ret) { + log_err("fio: discarded %d of %d\n", ret, + bc->t.pdu_len); + goto err_file; + } + + ret = write_trace(merge_fp, &bc->t); + ret = read_trace(td, bc); + if (ret < 0) + goto err_file; + else if (!ret) + merge_finish_file(bcs, i, &nr_logs); + } + + /* set iolog file to read from the newly merged file */ + td->o.read_iolog_file = td->o.merge_blktrace_file; + ret = 0; + +err_file: + /* cleanup */ + for (i = 0; i < nr_logs; i++) { + fifo_free(bcs[i].fifo); + close(bcs[i].fd); + } +err_out_file: + fflush(merge_fp); + fclose(merge_fp); + free(merge_buf); +err_param: + free(bcs); + + return ret; } diff -Nru fio-2.1.3/blktrace.h fio-3.16/blktrace.h --- fio-2.1.3/blktrace.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/blktrace.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,45 @@ +#ifndef FIO_BLKTRACE_H +#define FIO_BLKTRACE_H + + +#ifdef FIO_HAVE_BLKTRACE + +#include + +#include "blktrace_api.h" + +struct blktrace_cursor { + struct fifo *fifo; // fifo queue for reading + int fd; // blktrace file + __u64 length; // length of trace + struct blk_io_trace t; // current io trace + int swap; // bitwise reverse required + int scalar; // scale percentage + int iter; // current iteration + int nr_iter; // number of iterations to run +}; + +bool is_blktrace(const char *, int *); +bool load_blktrace(struct thread_data *, const char *, int); +int merge_blktrace_iologs(struct thread_data *td); + +#else + +static inline bool is_blktrace(const char *fname, int *need_swap) +{ + return false; +} + +static inline bool load_blktrace(struct thread_data *td, 
const char *fname, + int need_swap) +{ + return false; +} + +static inline int merge_blktrace_iologs(struct thread_data *td) +{ + return false; +} + +#endif +#endif diff -Nru fio-2.1.3/cconv.c fio-3.16/cconv.c --- fio-2.1.3/cconv.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/cconv.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,5 +1,6 @@ #include +#include "log.h" #include "thread_options.h" static void string_to_cpu(char **dst, const uint8_t *src) @@ -10,21 +11,60 @@ *dst = strdup(__src); } -static void string_to_net(uint8_t *dst, const char *src) +static void __string_to_net(uint8_t *dst, const char *src, size_t dst_size) { if (src) - strcpy((char *) dst, src); + snprintf((char *) dst, dst_size, "%s", src); else dst[0] = '\0'; } +#define string_to_net(dst, src) __string_to_net((dst), (src), sizeof(dst)) + +static void free_thread_options_to_cpu(struct thread_options *o) +{ + int i; + + free(o->description); + free(o->name); + free(o->wait_for); + free(o->directory); + free(o->filename); + free(o->filename_format); + free(o->opendir); + free(o->ioengine); + free(o->mmapfile); + free(o->read_iolog_file); + free(o->write_iolog_file); + free(o->merge_blktrace_file); + free(o->bw_log_file); + free(o->lat_log_file); + free(o->iops_log_file); + free(o->hist_log_file); + free(o->replay_redirect); + free(o->exec_prerun); + free(o->exec_postrun); + free(o->ioscheduler); + free(o->profile); + free(o->cgroup); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + free(o->bssplit[i]); + free(o->zone_split[i]); + } +} + void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top) { int i, j; + for (i = 0; i < NR_OPTS_SZ; i++) + o->set_options[i] = le64_to_cpu(top->set_options[i]); + string_to_cpu(&o->description, top->description); string_to_cpu(&o->name, top->name); + string_to_cpu(&o->wait_for, top->wait_for); string_to_cpu(&o->directory, top->directory); string_to_cpu(&o->filename, top->filename); string_to_cpu(&o->filename_format, 
top->filename_format); @@ -33,9 +73,11 @@ string_to_cpu(&o->mmapfile, top->mmapfile); string_to_cpu(&o->read_iolog_file, top->read_iolog_file); string_to_cpu(&o->write_iolog_file, top->write_iolog_file); + string_to_cpu(&o->merge_blktrace_file, top->merge_blktrace_file); string_to_cpu(&o->bw_log_file, top->bw_log_file); string_to_cpu(&o->lat_log_file, top->lat_log_file); string_to_cpu(&o->iops_log_file, top->iops_log_file); + string_to_cpu(&o->hist_log_file, top->hist_log_file); string_to_cpu(&o->replay_redirect, top->replay_redirect); string_to_cpu(&o->exec_prerun, top->exec_prerun); string_to_cpu(&o->exec_postrun, top->exec_postrun); @@ -43,41 +85,59 @@ string_to_cpu(&o->profile, top->profile); string_to_cpu(&o->cgroup, top->cgroup); + o->allow_create = le32_to_cpu(top->allow_create); + o->allow_mounted_write = le32_to_cpu(top->allow_mounted_write); o->td_ddir = le32_to_cpu(top->td_ddir); o->rw_seq = le32_to_cpu(top->rw_seq); o->kb_base = le32_to_cpu(top->kb_base); - o->unit_base = le32_to_cpu(top->kb_base); + o->unit_base = le32_to_cpu(top->unit_base); o->ddir_seq_nr = le32_to_cpu(top->ddir_seq_nr); o->ddir_seq_add = le64_to_cpu(top->ddir_seq_add); o->iodepth = le32_to_cpu(top->iodepth); o->iodepth_low = le32_to_cpu(top->iodepth_low); o->iodepth_batch = le32_to_cpu(top->iodepth_batch); - o->iodepth_batch_complete = le32_to_cpu(top->iodepth_batch_complete); + o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min); + o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max); + o->serialize_overlap = le32_to_cpu(top->serialize_overlap); o->size = le64_to_cpu(top->size); + o->io_size = le64_to_cpu(top->io_size); o->size_percent = le32_to_cpu(top->size_percent); o->fill_device = le32_to_cpu(top->fill_device); + o->file_append = le32_to_cpu(top->file_append); o->file_size_low = le64_to_cpu(top->file_size_low); o->file_size_high = le64_to_cpu(top->file_size_high); o->start_offset = le64_to_cpu(top->start_offset); + 
o->start_offset_align = le64_to_cpu(top->start_offset_align); + o->start_offset_percent = le32_to_cpu(top->start_offset_percent); for (i = 0; i < DDIR_RWDIR_CNT; i++) { - o->bs[i] = le32_to_cpu(top->bs[i]); - o->ba[i] = le32_to_cpu(top->ba[i]); - o->min_bs[i] = le32_to_cpu(top->min_bs[i]); - o->max_bs[i] = le32_to_cpu(top->max_bs[i]); + o->bs[i] = le64_to_cpu(top->bs[i]); + o->ba[i] = le64_to_cpu(top->ba[i]); + o->min_bs[i] = le64_to_cpu(top->min_bs[i]); + o->max_bs[i] = le64_to_cpu(top->max_bs[i]); o->bssplit_nr[i] = le32_to_cpu(top->bssplit_nr[i]); if (o->bssplit_nr[i]) { o->bssplit[i] = malloc(o->bssplit_nr[i] * sizeof(struct bssplit)); for (j = 0; j < o->bssplit_nr[i]; j++) { - o->bssplit[i][j].bs = le32_to_cpu(top->bssplit[i][j].bs); + o->bssplit[i][j].bs = le64_to_cpu(top->bssplit[i][j].bs); o->bssplit[i][j].perc = le32_to_cpu(top->bssplit[i][j].perc); } } + o->zone_split_nr[i] = le32_to_cpu(top->zone_split_nr[i]); + + if (o->zone_split_nr[i]) { + o->zone_split[i] = malloc(o->zone_split_nr[i] * sizeof(struct zone_split)); + for (j = 0; j < o->zone_split_nr[i]; j++) { + o->zone_split[i][j].access_perc = top->zone_split[i][j].access_perc; + o->zone_split[i][j].size_perc = top->zone_split[i][j].size_perc; + } + } + o->rwmix[i] = le32_to_cpu(top->rwmix[i]); - o->rate[i] = le32_to_cpu(top->rate[i]); - o->ratemin[i] = le32_to_cpu(top->ratemin[i]); + o->rate[i] = le64_to_cpu(top->rate[i]); + o->ratemin[i] = le64_to_cpu(top->ratemin[i]); o->rate_iops[i] = le32_to_cpu(top->rate_iops[i]); o->rate_iops_min[i] = le32_to_cpu(top->rate_iops_min[i]); @@ -85,10 +145,13 @@ } o->ratecycle = le32_to_cpu(top->ratecycle); + o->io_submit_mode = le32_to_cpu(top->io_submit_mode); + o->unique_filename = le32_to_cpu(top->unique_filename); o->nr_files = le32_to_cpu(top->nr_files); o->open_files = le32_to_cpu(top->open_files); o->file_lock_mode = le32_to_cpu(top->file_lock_mode); o->odirect = le32_to_cpu(top->odirect); + o->oatomic = le32_to_cpu(top->oatomic); o->invalidate_cache = 
le32_to_cpu(top->invalidate_cache); o->create_serialize = le32_to_cpu(top->create_serialize); o->create_fsync = le32_to_cpu(top->create_fsync); @@ -97,15 +160,16 @@ o->end_fsync = le32_to_cpu(top->end_fsync); o->pre_read = le32_to_cpu(top->pre_read); o->sync_io = le32_to_cpu(top->sync_io); + o->write_hint = le32_to_cpu(top->write_hint); o->verify = le32_to_cpu(top->verify); o->do_verify = le32_to_cpu(top->do_verify); - o->verifysort = le32_to_cpu(top->verifysort); - o->verifysort_nr = le32_to_cpu(top->verifysort_nr); o->experimental_verify = le32_to_cpu(top->experimental_verify); + o->verify_state = le32_to_cpu(top->verify_state); o->verify_interval = le32_to_cpu(top->verify_interval); o->verify_offset = le32_to_cpu(top->verify_offset); memcpy(o->verify_pattern, top->verify_pattern, MAX_PATTERN_SIZE); + memcpy(o->buffer_pattern, top->buffer_pattern, MAX_PATTERN_SIZE); o->verify_pattern_bytes = le32_to_cpu(top->verify_pattern_bytes); o->verify_fatal = le32_to_cpu(top->verify_fatal); @@ -114,22 +178,33 @@ o->verify_batch = le32_to_cpu(top->verify_batch); o->use_thread = le32_to_cpu(top->use_thread); o->unlink = le32_to_cpu(top->unlink); + o->unlink_each_loop = le32_to_cpu(top->unlink_each_loop); o->do_disk_util = le32_to_cpu(top->do_disk_util); o->override_sync = le32_to_cpu(top->override_sync); o->rand_repeatable = le32_to_cpu(top->rand_repeatable); - o->use_os_rand = le32_to_cpu(top->use_os_rand); + o->allrand_repeatable = le32_to_cpu(top->allrand_repeatable); + o->rand_seed = le64_to_cpu(top->rand_seed); o->log_avg_msec = le32_to_cpu(top->log_avg_msec); + o->log_hist_msec = le32_to_cpu(top->log_hist_msec); + o->log_hist_coarseness = le32_to_cpu(top->log_hist_coarseness); + o->log_max = le32_to_cpu(top->log_max); + o->log_offset = le32_to_cpu(top->log_offset); + o->log_gz = le32_to_cpu(top->log_gz); + o->log_gz_store = le32_to_cpu(top->log_gz_store); + o->log_unix_epoch = le32_to_cpu(top->log_unix_epoch); o->norandommap = le32_to_cpu(top->norandommap); 
o->softrandommap = le32_to_cpu(top->softrandommap); o->bs_unaligned = le32_to_cpu(top->bs_unaligned); o->fsync_on_close = le32_to_cpu(top->fsync_on_close); o->bs_is_seq_rand = le32_to_cpu(top->bs_is_seq_rand); o->random_distribution = le32_to_cpu(top->random_distribution); + o->exitall_error = le32_to_cpu(top->exitall_error); o->zipf_theta.u.f = fio_uint64_to_double(le64_to_cpu(top->zipf_theta.u.i)); o->pareto_h.u.f = fio_uint64_to_double(le64_to_cpu(top->pareto_h.u.i)); + o->gauss_dev.u.f = fio_uint64_to_double(le64_to_cpu(top->gauss_dev.u.i)); o->random_generator = le32_to_cpu(top->random_generator); o->hugepage_size = le32_to_cpu(top->hugepage_size); - o->rw_min_bs = le32_to_cpu(top->rw_min_bs); + o->rw_min_bs = le64_to_cpu(top->rw_min_bs); o->thinktime = le32_to_cpu(top->thinktime); o->thinktime_spin = le32_to_cpu(top->thinktime_spin); o->thinktime_blocks = le32_to_cpu(top->thinktime_blocks); @@ -139,12 +214,19 @@ o->verify_backlog = le64_to_cpu(top->verify_backlog); o->start_delay = le64_to_cpu(top->start_delay); + o->start_delay_high = le64_to_cpu(top->start_delay_high); o->timeout = le64_to_cpu(top->timeout); o->ramp_time = le64_to_cpu(top->ramp_time); + o->ss_dur = le64_to_cpu(top->ss_dur); + o->ss_ramp_time = le64_to_cpu(top->ss_ramp_time); + o->ss_state = le32_to_cpu(top->ss_state); + o->ss_limit.u.f = fio_uint64_to_double(le64_to_cpu(top->ss_limit.u.i)); o->zone_range = le64_to_cpu(top->zone_range); o->zone_size = le64_to_cpu(top->zone_size); o->zone_skip = le64_to_cpu(top->zone_skip); + o->zone_mode = le32_to_cpu(top->zone_mode); o->lockmem = le64_to_cpu(top->lockmem); + o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent); o->offset_increment = le64_to_cpu(top->offset_increment); o->number_ios = le64_to_cpu(top->number_ios); @@ -154,12 +236,11 @@ o->loops = le32_to_cpu(top->loops); o->mem_type = le32_to_cpu(top->mem_type); o->mem_align = le32_to_cpu(top->mem_align); - o->max_latency = le32_to_cpu(top->max_latency); o->stonewall = 
le32_to_cpu(top->stonewall); o->new_group = le32_to_cpu(top->new_group); o->numjobs = le32_to_cpu(top->numjobs); - o->cpumask_set = le32_to_cpu(top->cpumask_set); - o->verify_cpumask_set = le32_to_cpu(top->verify_cpumask_set); + o->cpus_allowed_policy = le32_to_cpu(top->cpus_allowed_policy); + o->gpu_dev_id = le32_to_cpu(top->gpu_dev_id); o->iolog = le32_to_cpu(top->iolog); o->rwmixcycle = le32_to_cpu(top->rwmixcycle); o->nice = le32_to_cpu(top->nice); @@ -167,11 +248,13 @@ o->ioprio_class = le32_to_cpu(top->ioprio_class); o->file_service_type = le32_to_cpu(top->file_service_type); o->group_reporting = le32_to_cpu(top->group_reporting); + o->stats = le32_to_cpu(top->stats); o->fadvise_hint = le32_to_cpu(top->fadvise_hint); o->fallocate_mode = le32_to_cpu(top->fallocate_mode); o->zero_buffers = le32_to_cpu(top->zero_buffers); o->refill_buffers = le32_to_cpu(top->refill_buffers); o->scramble_buffers = le32_to_cpu(top->scramble_buffers); + o->buffer_pattern_bytes = le32_to_cpu(top->buffer_pattern_bytes); o->time_based = le32_to_cpu(top->time_based); o->disable_lat = le32_to_cpu(top->disable_lat); o->disable_clat = le32_to_cpu(top->disable_clat); @@ -180,14 +263,15 @@ o->unified_rw_rep = le32_to_cpu(top->unified_rw_rep); o->gtod_reduce = le32_to_cpu(top->gtod_reduce); o->gtod_cpu = le32_to_cpu(top->gtod_cpu); - o->gtod_offload = le32_to_cpu(top->gtod_offload); o->clocksource = le32_to_cpu(top->clocksource); o->no_stall = le32_to_cpu(top->no_stall); o->trim_percentage = le32_to_cpu(top->trim_percentage); o->trim_batch = le32_to_cpu(top->trim_batch); o->trim_zero = le32_to_cpu(top->trim_zero); o->clat_percentiles = le32_to_cpu(top->clat_percentiles); + o->lat_percentiles = le32_to_cpu(top->lat_percentiles); o->percentile_precision = le32_to_cpu(top->percentile_precision); + o->sig_figs = le32_to_cpu(top->sig_figs); o->continue_on_error = le32_to_cpu(top->continue_on_error); o->cgroup_weight = le32_to_cpu(top->cgroup_weight); o->cgroup_nodelete = 
le32_to_cpu(top->cgroup_nodelete); @@ -198,16 +282,40 @@ o->flow_watermark = __le32_to_cpu(top->flow_watermark); o->flow_sleep = le32_to_cpu(top->flow_sleep); o->sync_file_range = le32_to_cpu(top->sync_file_range); + o->latency_target = le64_to_cpu(top->latency_target); + o->latency_window = le64_to_cpu(top->latency_window); + o->max_latency = le64_to_cpu(top->max_latency); + o->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(top->latency_percentile.u.i)); o->compress_percentage = le32_to_cpu(top->compress_percentage); o->compress_chunk = le32_to_cpu(top->compress_chunk); + o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage); + o->block_error_hist = le32_to_cpu(top->block_error_hist); + o->replay_align = le32_to_cpu(top->replay_align); + o->replay_scale = le32_to_cpu(top->replay_scale); + o->replay_time_scale = le32_to_cpu(top->replay_time_scale); + o->replay_skip = le32_to_cpu(top->replay_skip); + o->per_job_logs = le32_to_cpu(top->per_job_logs); + o->write_bw_log = le32_to_cpu(top->write_bw_log); + o->write_lat_log = le32_to_cpu(top->write_lat_log); + o->write_iops_log = le32_to_cpu(top->write_iops_log); + o->write_hist_log = le32_to_cpu(top->write_hist_log); o->trim_backlog = le64_to_cpu(top->trim_backlog); + o->rate_process = le32_to_cpu(top->rate_process); + o->rate_ign_think = le32_to_cpu(top->rate_ign_think); for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) o->percentile_list[i].u.f = fio_uint64_to_double(le64_to_cpu(top->percentile_list[i].u.i)); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + o->merge_blktrace_scalars[i].u.f = fio_uint64_to_double(le64_to_cpu(top->merge_blktrace_scalars[i].u.i)); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + o->merge_blktrace_iters[i].u.f = fio_uint64_to_double(le64_to_cpu(top->merge_blktrace_iters[i].u.i)); #if 0 uint8_t cpumask[FIO_TOP_STR_MAX]; uint8_t verify_cpumask[FIO_TOP_STR_MAX]; + uint8_t log_gz_cpumask[FIO_TOP_STR_MAX]; #endif } @@ -216,8 +324,12 @@ { int i, j; + for (i = 0; i < NR_OPTS_SZ; i++) 
+ top->set_options[i] = cpu_to_le64(o->set_options[i]); + string_to_net(top->description, o->description); string_to_net(top->name, o->name); + string_to_net(top->wait_for, o->wait_for); string_to_net(top->directory, o->directory); string_to_net(top->filename, o->filename); string_to_net(top->filename_format, o->filename_format); @@ -226,9 +338,11 @@ string_to_net(top->mmapfile, o->mmapfile); string_to_net(top->read_iolog_file, o->read_iolog_file); string_to_net(top->write_iolog_file, o->write_iolog_file); + string_to_net(top->merge_blktrace_file, o->merge_blktrace_file); string_to_net(top->bw_log_file, o->bw_log_file); string_to_net(top->lat_log_file, o->lat_log_file); string_to_net(top->iops_log_file, o->iops_log_file); + string_to_net(top->hist_log_file, o->hist_log_file); string_to_net(top->replay_redirect, o->replay_redirect); string_to_net(top->exec_prerun, o->exec_prerun); string_to_net(top->exec_postrun, o->exec_postrun); @@ -236,22 +350,30 @@ string_to_net(top->profile, o->profile); string_to_net(top->cgroup, o->cgroup); + top->allow_create = cpu_to_le32(o->allow_create); + top->allow_mounted_write = cpu_to_le32(o->allow_mounted_write); top->td_ddir = cpu_to_le32(o->td_ddir); top->rw_seq = cpu_to_le32(o->rw_seq); top->kb_base = cpu_to_le32(o->kb_base); - top->unit_base = cpu_to_le32(o->kb_base); + top->unit_base = cpu_to_le32(o->unit_base); top->ddir_seq_nr = cpu_to_le32(o->ddir_seq_nr); top->iodepth = cpu_to_le32(o->iodepth); top->iodepth_low = cpu_to_le32(o->iodepth_low); top->iodepth_batch = cpu_to_le32(o->iodepth_batch); - top->iodepth_batch_complete = cpu_to_le32(o->iodepth_batch_complete); + top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min); + top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max); + top->serialize_overlap = cpu_to_le32(o->serialize_overlap); top->size_percent = cpu_to_le32(o->size_percent); top->fill_device = cpu_to_le32(o->fill_device); + top->file_append = 
cpu_to_le32(o->file_append); top->ratecycle = cpu_to_le32(o->ratecycle); + top->io_submit_mode = cpu_to_le32(o->io_submit_mode); top->nr_files = cpu_to_le32(o->nr_files); + top->unique_filename = cpu_to_le32(o->unique_filename); top->open_files = cpu_to_le32(o->open_files); top->file_lock_mode = cpu_to_le32(o->file_lock_mode); top->odirect = cpu_to_le32(o->odirect); + top->oatomic = cpu_to_le32(o->oatomic); top->invalidate_cache = cpu_to_le32(o->invalidate_cache); top->create_serialize = cpu_to_le32(o->create_serialize); top->create_fsync = cpu_to_le32(o->create_fsync); @@ -260,11 +382,11 @@ top->end_fsync = cpu_to_le32(o->end_fsync); top->pre_read = cpu_to_le32(o->pre_read); top->sync_io = cpu_to_le32(o->sync_io); + top->write_hint = cpu_to_le32(o->write_hint); top->verify = cpu_to_le32(o->verify); top->do_verify = cpu_to_le32(o->do_verify); - top->verifysort = cpu_to_le32(o->verifysort); - top->verifysort_nr = cpu_to_le32(o->verifysort_nr); top->experimental_verify = cpu_to_le32(o->experimental_verify); + top->verify_state = cpu_to_le32(o->verify_state); top->verify_interval = cpu_to_le32(o->verify_interval); top->verify_offset = cpu_to_le32(o->verify_offset); top->verify_pattern_bytes = cpu_to_le32(o->verify_pattern_bytes); @@ -274,22 +396,31 @@ top->verify_batch = cpu_to_le32(o->verify_batch); top->use_thread = cpu_to_le32(o->use_thread); top->unlink = cpu_to_le32(o->unlink); + top->unlink_each_loop = cpu_to_le32(o->unlink_each_loop); top->do_disk_util = cpu_to_le32(o->do_disk_util); top->override_sync = cpu_to_le32(o->override_sync); top->rand_repeatable = cpu_to_le32(o->rand_repeatable); - top->use_os_rand = cpu_to_le32(o->use_os_rand); + top->allrand_repeatable = cpu_to_le32(o->allrand_repeatable); + top->rand_seed = __cpu_to_le64(o->rand_seed); top->log_avg_msec = cpu_to_le32(o->log_avg_msec); + top->log_max = cpu_to_le32(o->log_max); + top->log_offset = cpu_to_le32(o->log_offset); + top->log_gz = cpu_to_le32(o->log_gz); + top->log_gz_store = 
cpu_to_le32(o->log_gz_store); + top->log_unix_epoch = cpu_to_le32(o->log_unix_epoch); top->norandommap = cpu_to_le32(o->norandommap); top->softrandommap = cpu_to_le32(o->softrandommap); top->bs_unaligned = cpu_to_le32(o->bs_unaligned); top->fsync_on_close = cpu_to_le32(o->fsync_on_close); top->bs_is_seq_rand = cpu_to_le32(o->bs_is_seq_rand); top->random_distribution = cpu_to_le32(o->random_distribution); + top->exitall_error = cpu_to_le32(o->exitall_error); top->zipf_theta.u.i = __cpu_to_le64(fio_double_to_uint64(o->zipf_theta.u.f)); top->pareto_h.u.i = __cpu_to_le64(fio_double_to_uint64(o->pareto_h.u.f)); + top->gauss_dev.u.i = __cpu_to_le64(fio_double_to_uint64(o->gauss_dev.u.f)); top->random_generator = cpu_to_le32(o->random_generator); top->hugepage_size = cpu_to_le32(o->hugepage_size); - top->rw_min_bs = cpu_to_le32(o->rw_min_bs); + top->rw_min_bs = __cpu_to_le64(o->rw_min_bs); top->thinktime = cpu_to_le32(o->thinktime); top->thinktime_spin = cpu_to_le32(o->thinktime_spin); top->thinktime_blocks = cpu_to_le32(o->thinktime_blocks); @@ -302,12 +433,11 @@ top->loops = cpu_to_le32(o->loops); top->mem_type = cpu_to_le32(o->mem_type); top->mem_align = cpu_to_le32(o->mem_align); - top->max_latency = cpu_to_le32(o->max_latency); top->stonewall = cpu_to_le32(o->stonewall); top->new_group = cpu_to_le32(o->new_group); top->numjobs = cpu_to_le32(o->numjobs); - top->cpumask_set = cpu_to_le32(o->cpumask_set); - top->verify_cpumask_set = cpu_to_le32(o->verify_cpumask_set); + top->cpus_allowed_policy = cpu_to_le32(o->cpus_allowed_policy); + top->gpu_dev_id = cpu_to_le32(o->gpu_dev_id); top->iolog = cpu_to_le32(o->iolog); top->rwmixcycle = cpu_to_le32(o->rwmixcycle); top->nice = cpu_to_le32(o->nice); @@ -315,11 +445,13 @@ top->ioprio_class = cpu_to_le32(o->ioprio_class); top->file_service_type = cpu_to_le32(o->file_service_type); top->group_reporting = cpu_to_le32(o->group_reporting); + top->stats = cpu_to_le32(o->stats); top->fadvise_hint = cpu_to_le32(o->fadvise_hint); 
top->fallocate_mode = cpu_to_le32(o->fallocate_mode); top->zero_buffers = cpu_to_le32(o->zero_buffers); top->refill_buffers = cpu_to_le32(o->refill_buffers); top->scramble_buffers = cpu_to_le32(o->scramble_buffers); + top->buffer_pattern_bytes = cpu_to_le32(o->buffer_pattern_bytes); top->time_based = cpu_to_le32(o->time_based); top->disable_lat = cpu_to_le32(o->disable_lat); top->disable_clat = cpu_to_le32(o->disable_clat); @@ -328,14 +460,15 @@ top->unified_rw_rep = cpu_to_le32(o->unified_rw_rep); top->gtod_reduce = cpu_to_le32(o->gtod_reduce); top->gtod_cpu = cpu_to_le32(o->gtod_cpu); - top->gtod_offload = cpu_to_le32(o->gtod_offload); top->clocksource = cpu_to_le32(o->clocksource); top->no_stall = cpu_to_le32(o->no_stall); top->trim_percentage = cpu_to_le32(o->trim_percentage); top->trim_batch = cpu_to_le32(o->trim_batch); top->trim_zero = cpu_to_le32(o->trim_zero); top->clat_percentiles = cpu_to_le32(o->clat_percentiles); + top->lat_percentiles = cpu_to_le32(o->lat_percentiles); top->percentile_precision = cpu_to_le32(o->percentile_precision); + top->sig_figs = cpu_to_le32(o->sig_figs); top->continue_on_error = cpu_to_le32(o->continue_on_error); top->cgroup_weight = cpu_to_le32(o->cgroup_weight); top->cgroup_nodelete = cpu_to_le32(o->cgroup_nodelete); @@ -346,14 +479,29 @@ top->flow_watermark = __cpu_to_le32(o->flow_watermark); top->flow_sleep = cpu_to_le32(o->flow_sleep); top->sync_file_range = cpu_to_le32(o->sync_file_range); + top->latency_target = __cpu_to_le64(o->latency_target); + top->latency_window = __cpu_to_le64(o->latency_window); + top->max_latency = __cpu_to_le64(o->max_latency); + top->latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(o->latency_percentile.u.f)); top->compress_percentage = cpu_to_le32(o->compress_percentage); top->compress_chunk = cpu_to_le32(o->compress_chunk); + top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage); + top->block_error_hist = cpu_to_le32(o->block_error_hist); + top->replay_align = 
cpu_to_le32(o->replay_align); + top->replay_scale = cpu_to_le32(o->replay_scale); + top->replay_time_scale = cpu_to_le32(o->replay_time_scale); + top->replay_skip = cpu_to_le32(o->replay_skip); + top->per_job_logs = cpu_to_le32(o->per_job_logs); + top->write_bw_log = cpu_to_le32(o->write_bw_log); + top->write_lat_log = cpu_to_le32(o->write_lat_log); + top->write_iops_log = cpu_to_le32(o->write_iops_log); + top->write_hist_log = cpu_to_le32(o->write_hist_log); for (i = 0; i < DDIR_RWDIR_CNT; i++) { - top->bs[i] = cpu_to_le32(o->bs[i]); - top->ba[i] = cpu_to_le32(o->ba[i]); - top->min_bs[i] = cpu_to_le32(o->min_bs[i]); - top->max_bs[i] = cpu_to_le32(o->max_bs[i]); + top->bs[i] = __cpu_to_le64(o->bs[i]); + top->ba[i] = __cpu_to_le64(o->ba[i]); + top->min_bs[i] = __cpu_to_le64(o->min_bs[i]); + top->max_bs[i] = __cpu_to_le64(o->max_bs[i]); top->bssplit_nr[i] = cpu_to_le32(o->bssplit_nr[i]); if (o->bssplit_nr[i]) { @@ -364,14 +512,29 @@ bssplit_nr = BSSPLIT_MAX; } for (j = 0; j < bssplit_nr; j++) { - top->bssplit[i][j].bs = cpu_to_le32(o->bssplit[i][j].bs); + top->bssplit[i][j].bs = cpu_to_le64(o->bssplit[i][j].bs); top->bssplit[i][j].perc = cpu_to_le32(o->bssplit[i][j].perc); } } + top->zone_split_nr[i] = cpu_to_le32(o->zone_split_nr[i]); + + if (o->zone_split_nr[i]) { + unsigned int zone_split_nr = o->zone_split_nr[i]; + + if (zone_split_nr > ZONESPLIT_MAX) { + log_err("fio: ZONESPLIT_MAX is too small\n"); + zone_split_nr = ZONESPLIT_MAX; + } + for (j = 0; j < zone_split_nr; j++) { + top->zone_split[i][j].access_perc = o->zone_split[i][j].access_perc; + top->zone_split[i][j].size_perc = o->zone_split[i][j].size_perc; + } + } + top->rwmix[i] = cpu_to_le32(o->rwmix[i]); - top->rate[i] = cpu_to_le32(o->rate[i]); - top->ratemin[i] = cpu_to_le32(o->ratemin[i]); + top->rate[i] = cpu_to_le64(o->rate[i]); + top->ratemin[i] = cpu_to_le64(o->ratemin[i]); top->rate_iops[i] = cpu_to_le32(o->rate_iops[i]); top->rate_iops_min[i] = cpu_to_le32(o->rate_iops_min[i]); @@ -379,29 +542,49 
@@ } memcpy(top->verify_pattern, o->verify_pattern, MAX_PATTERN_SIZE); + memcpy(top->buffer_pattern, o->buffer_pattern, MAX_PATTERN_SIZE); top->size = __cpu_to_le64(o->size); + top->io_size = __cpu_to_le64(o->io_size); top->verify_backlog = __cpu_to_le64(o->verify_backlog); top->start_delay = __cpu_to_le64(o->start_delay); + top->start_delay_high = __cpu_to_le64(o->start_delay_high); top->timeout = __cpu_to_le64(o->timeout); top->ramp_time = __cpu_to_le64(o->ramp_time); + top->ss_dur = __cpu_to_le64(top->ss_dur); + top->ss_ramp_time = __cpu_to_le64(top->ss_ramp_time); + top->ss_state = cpu_to_le32(top->ss_state); + top->ss_limit.u.i = __cpu_to_le64(fio_double_to_uint64(o->ss_limit.u.f)); top->zone_range = __cpu_to_le64(o->zone_range); top->zone_size = __cpu_to_le64(o->zone_size); top->zone_skip = __cpu_to_le64(o->zone_skip); + top->zone_mode = __cpu_to_le32(o->zone_mode); top->lockmem = __cpu_to_le64(o->lockmem); top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add); top->file_size_low = __cpu_to_le64(o->file_size_low); top->file_size_high = __cpu_to_le64(o->file_size_high); top->start_offset = __cpu_to_le64(o->start_offset); + top->start_offset_align = __cpu_to_le64(o->start_offset_align); + top->start_offset_percent = __cpu_to_le32(o->start_offset_percent); top->trim_backlog = __cpu_to_le64(o->trim_backlog); + top->offset_increment_percent = __cpu_to_le32(o->offset_increment_percent); top->offset_increment = __cpu_to_le64(o->offset_increment); top->number_ios = __cpu_to_le64(o->number_ios); + top->rate_process = cpu_to_le32(o->rate_process); + top->rate_ign_think = cpu_to_le32(o->rate_ign_think); for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) top->percentile_list[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->percentile_list[i].u.f)); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + top->merge_blktrace_scalars[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->merge_blktrace_scalars[i].u.f)); + + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + 
top->merge_blktrace_iters[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->merge_blktrace_iters[i].u.f)); #if 0 uint8_t cpumask[FIO_TOP_STR_MAX]; uint8_t verify_cpumask[FIO_TOP_STR_MAX]; + uint8_t log_gz_cpumask[FIO_TOP_STR_MAX]; #endif } @@ -424,5 +607,7 @@ convert_thread_options_to_cpu(&o, &top1); convert_thread_options_to_net(&top2, &o); + free_thread_options_to_cpu(&o); + return memcmp(&top1, &top2, sizeof(top1)); } diff -Nru fio-2.1.3/cgroup.c fio-3.16/cgroup.c --- fio-2.1.3/cgroup.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/cgroup.c 2019-09-20 01:01:52.000000000 +0000 @@ -5,13 +5,12 @@ #include #include #include -#include #include "fio.h" #include "flist.h" #include "cgroup.h" #include "smalloc.h" -static struct fio_mutex *lock; +static struct fio_sem *lock; struct cgroup_member { struct flist_head list; @@ -19,12 +18,13 @@ unsigned int cgroup_nodelete; }; -static char *find_cgroup_mnt(struct thread_data *td) +static struct cgroup_mnt *find_cgroup_mnt(struct thread_data *td) { - char *mntpoint = NULL; + struct cgroup_mnt *cgroup_mnt = NULL; struct mntent *mnt, dummy; char buf[256] = {0}; FILE *f; + bool cgroup2 = false; f = setmntent("/proc/mounts", "r"); if (!f) { @@ -36,15 +36,29 @@ if (!strcmp(mnt->mnt_type, "cgroup") && strstr(mnt->mnt_opts, "blkio")) break; + if (!strcmp(mnt->mnt_type, "cgroup2")) { + cgroup2 = true; + break; + } } - if (mnt) - mntpoint = smalloc_strdup(mnt->mnt_dir); - else + if (mnt) { + cgroup_mnt = smalloc(sizeof(*cgroup_mnt)); + if (cgroup_mnt) { + cgroup_mnt->path = smalloc_strdup(mnt->mnt_dir); + if (!cgroup_mnt->path) { + sfree(cgroup_mnt); + log_err("fio: could not allocate memory\n"); + } else { + cgroup_mnt->cgroup2 = cgroup2; + } + } + } else { log_err("fio: cgroup blkio does not appear to be mounted\n"); + } endmntent(f); - return mntpoint; + return cgroup_mnt; } static void add_cgroup(struct thread_data *td, const char *name, @@ -70,9 +84,9 @@ } if (td->o.cgroup_nodelete) cm->cgroup_nodelete = 1; - 
fio_mutex_down(lock); + fio_sem_down(lock); flist_add_tail(&cm->list, clist); - fio_mutex_up(lock); + fio_sem_up(lock); } void cgroup_kill(struct flist_head *clist) @@ -83,7 +97,7 @@ if (!lock) return; - fio_mutex_down(lock); + fio_sem_down(lock); flist_for_each_safe(n, tmp, clist) { cm = flist_entry(n, struct cgroup_member, list); @@ -94,17 +108,17 @@ sfree(cm); } - fio_mutex_up(lock); + fio_sem_up(lock); } -static char *get_cgroup_root(struct thread_data *td, char *mnt) +static char *get_cgroup_root(struct thread_data *td, struct cgroup_mnt *mnt) { char *str = malloc(64); if (td->o.cgroup) - sprintf(str, "%s%s%s", mnt, FIO_OS_PATH_SEPARATOR, td->o.cgroup); + sprintf(str, "%s/%s", mnt->path, td->o.cgroup); else - sprintf(str, "%s%s%s", mnt, FIO_OS_PATH_SEPARATOR, td->o.name); + sprintf(str, "%s/%s", mnt->path, td->o.name); return str; } @@ -116,7 +130,7 @@ char tmp[256]; FILE *f; - sprintf(tmp, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, filename); + sprintf(tmp, "%s/%s", path, filename); f = fopen(tmp, "w"); if (!f) { td_verror(td, errno, onerr); @@ -129,25 +143,31 @@ } -static int cgroup_write_pid(struct thread_data *td, const char *root) +static int cgroup_write_pid(struct thread_data *td, char *path, bool cgroup2) { unsigned int val = td->pid; - return write_int_to_file(td, root, "tasks", val, "cgroup write pid"); + if (cgroup2) + return write_int_to_file(td, path, "cgroup.procs", + val, "cgroup write pid"); + return write_int_to_file(td, path, "tasks", val, "cgroup write pid"); } /* * Move pid to root class */ -static int cgroup_del_pid(struct thread_data *td, char *mnt) +static int cgroup_del_pid(struct thread_data *td, struct cgroup_mnt *mnt) { - return cgroup_write_pid(td, mnt); + return cgroup_write_pid(td, mnt->path, mnt->cgroup2); } -int cgroup_setup(struct thread_data *td, struct flist_head *clist, char **mnt) +int cgroup_setup(struct thread_data *td, struct flist_head *clist, struct cgroup_mnt **mnt) { char *root; + if (!clist) + return 1; + if (!*mnt) { 
*mnt = find_cgroup_mnt(td); if (!*mnt) @@ -170,13 +190,17 @@ add_cgroup(td, root, clist); if (td->o.cgroup_weight) { + if ((*mnt)->cgroup2) { + log_err("fio: cgroup weit doesn't work with cgroup2\n"); + goto err; + } if (write_int_to_file(td, root, "blkio.weight", td->o.cgroup_weight, "cgroup open weight")) goto err; } - if (!cgroup_write_pid(td, root)) { + if (!cgroup_write_pid(td, root, (*mnt)->cgroup2)) { free(root); return 0; } @@ -186,24 +210,28 @@ return 1; } -void cgroup_shutdown(struct thread_data *td, char **mnt) +void cgroup_shutdown(struct thread_data *td, struct cgroup_mnt *mnt) { - if (*mnt == NULL) + if (mnt == NULL) return; if (!td->o.cgroup_weight && !td->o.cgroup) - return; + goto out; - cgroup_del_pid(td, *mnt); + cgroup_del_pid(td, mnt); +out: + if (mnt->path) + sfree(mnt->path); + sfree(mnt); } static void fio_init cgroup_init(void) { - lock = fio_mutex_init(FIO_MUTEX_UNLOCKED); + lock = fio_sem_init(FIO_SEM_UNLOCKED); if (!lock) log_err("fio: failed to allocate cgroup lock\n"); } static void fio_exit cgroup_exit(void) { - fio_mutex_remove(lock); + fio_sem_remove(lock); } diff -Nru fio-2.1.3/cgroup.h fio-3.16/cgroup.h --- fio-2.1.3/cgroup.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/cgroup.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,21 +3,28 @@ #ifdef FIO_HAVE_CGROUPS -int cgroup_setup(struct thread_data *, struct flist_head *, char **); -void cgroup_shutdown(struct thread_data *, char **); +struct cgroup_mnt { + char *path; + bool cgroup2; +}; + +int cgroup_setup(struct thread_data *, struct flist_head *, struct cgroup_mnt **); +void cgroup_shutdown(struct thread_data *, struct cgroup_mnt *); void cgroup_kill(struct flist_head *list); #else +struct cgroup_mnt; + static inline int cgroup_setup(struct thread_data *td, struct flist_head *list, - char **mnt) + struct cgroup_mnt **mnt) { td_verror(td, EINVAL, "cgroup_setup"); return 1; } -static inline void cgroup_shutdown(struct thread_data *td, char **mnt) +static inline void 
cgroup_shutdown(struct thread_data *td, struct cgroup_mnt *mnt) { } diff -Nru fio-2.1.3/client.c fio-3.16/client.c --- fio-2.1.3/client.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/client.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,13 +1,11 @@ #include #include #include -#include #include #include -#include +#include #include #include -#include #include #include #include @@ -23,15 +21,19 @@ #include "server.h" #include "flist.h" #include "hash.h" +#include "verify-state.h" static void handle_du(struct fio_client *client, struct fio_net_cmd *cmd); static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd); static void handle_gs(struct fio_client *client, struct fio_net_cmd *cmd); static void handle_probe(struct fio_client *client, struct fio_net_cmd *cmd); static void handle_text(struct fio_client *client, struct fio_net_cmd *cmd); -static void handle_stop(struct fio_client *client, struct fio_net_cmd *cmd); +static void handle_stop(struct fio_client *client); static void handle_start(struct fio_client *client, struct fio_net_cmd *cmd); +static void convert_text(struct fio_net_cmd *cmd); +static void client_display_thread_status(struct jobs_eta *je); + struct client_ops fio_client_ops = { .text = handle_text, .disk_util = handle_du, @@ -39,13 +41,13 @@ .group_stats = handle_gs, .stop = handle_stop, .start = handle_start, - .eta = display_thread_status, + .eta = client_display_thread_status, .probe = handle_probe, .eta_msec = FIO_CLIENT_DEF_ETA_MSEC, .client_type = FIO_CLIENT_TYPE_CLI, }; -static struct timeval eta_tv; +static struct timespec eta_ts; static FLIST_HEAD(client_list); static FLIST_HEAD(eta_list); @@ -57,13 +59,21 @@ int sum_stat_clients; static int sum_stat_nr; -static int do_output_all_clients; +static struct buf_output allclients; +static struct json_object *root = NULL; +static struct json_object *job_opt_object = NULL; +static struct json_array *clients_array = NULL; +static struct json_array *du_array = NULL; + +static int 
error_clients; #define FIO_CLIENT_HASH_BITS 7 #define FIO_CLIENT_HASH_SZ (1 << FIO_CLIENT_HASH_BITS) #define FIO_CLIENT_HASH_MASK (FIO_CLIENT_HASH_SZ - 1) static struct flist_head client_hash[FIO_CLIENT_HASH_SZ]; +static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *, bool *); + static void fio_client_add_hash(struct fio_client *client) { int bucket = hash_long(client->fd, FIO_CLIENT_HASH_BITS); @@ -86,6 +96,130 @@ INIT_FLIST_HEAD(&client_hash[i]); } +static int read_data(int fd, void *data, size_t size) +{ + ssize_t ret; + + while (size) { + ret = read(fd, data, size); + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + continue; + break; + } else if (!ret) + break; + else { + data += ret; + size -= ret; + } + } + + if (size) + return EAGAIN; + + return 0; +} + +static int read_ini_data(int fd, void *data, size_t size) +{ + char *p = data; + int ret = 0; + FILE *fp; + int dupfd; + + dupfd = dup(fd); + if (dupfd < 0) + return errno; + + fp = fdopen(dupfd, "r"); + if (!fp) { + ret = errno; + close(dupfd); + goto out; + } + + while (1) { + ssize_t len; + char buf[OPT_LEN_MAX+1], *sub; + + if (!fgets(buf, sizeof(buf), fp)) { + if (ferror(fp)) { + if (errno == EAGAIN || errno == EINTR) + continue; + ret = errno; + } + break; + } + + sub = fio_option_dup_subs(buf); + len = strlen(sub); + if (len + 1 > size) { + log_err("fio: no space left to read data\n"); + free(sub); + ret = ENOSPC; + break; + } + + memcpy(p, sub, len); + free(sub); + p += len; + *p = '\0'; + size -= len; + } + + fclose(fp); +out: + return ret; +} + +static void fio_client_json_init(void) +{ + char time_buf[32]; + time_t time_p; + + if (!(output_format & FIO_OUTPUT_JSON)) + return; + + time(&time_p); + os_ctime_r((const time_t *) &time_p, time_buf, sizeof(time_buf)); + time_buf[strlen(time_buf) - 1] = '\0'; + + root = json_create_object(); + json_object_add_value_string(root, "fio version", fio_version_string); + json_object_add_value_int(root, "timestamp", time_p); + 
json_object_add_value_string(root, "time", time_buf); + + job_opt_object = json_create_object(); + json_object_add_value_object(root, "global options", job_opt_object); + clients_array = json_create_array(); + json_object_add_value_array(root, "client_stats", clients_array); + du_array = json_create_array(); + json_object_add_value_array(root, "disk_util", du_array); +} + +static void fio_client_json_fini(void) +{ + struct buf_output out; + + if (!root) + return; + + buf_output_init(&out); + + __log_buf(&out, "\n"); + json_print_object(root, &out); + __log_buf(&out, "\n"); + log_info_buf(out.buf, out.buflen); + + buf_output_free(&out); + + json_free_object(root); + root = NULL; + job_opt_object = NULL; + clients_array = NULL; + du_array = NULL; +} + static struct fio_client *find_client_by_fd(int fd) { int bucket = hash_long(fd, FIO_CLIENT_HASH_BITS) & FIO_CLIENT_HASH_MASK; @@ -109,28 +243,70 @@ if (--client->refs) return; + log_info_buf(client->buf.buf, client->buf.buflen); + buf_output_free(&client->buf); + free(client->hostname); if (client->argv) free(client->argv); if (client->name) free(client->name); - while (client->nr_ini_file) - free(client->ini_file[--client->nr_ini_file]); - if (client->ini_file) - free(client->ini_file); + while (client->nr_files) { + struct client_file *cf = &client->files[--client->nr_files]; + + free(cf->file); + } + if (client->files) + free(client->files); + if (client->opt_lists) + free(client->opt_lists); if (!client->did_stat) - sum_stat_clients -= client->nr_stat; + sum_stat_clients--; + + if (client->error) + error_clients++; free(client); } +static int fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn) +{ + if (!--eta->pending) { + eta_fn(&eta->eta); + free(eta); + return 0; + } + + return 1; +} + +static void fio_drain_client_text(struct fio_client *client) +{ + do { + struct fio_net_cmd *cmd; + + cmd = fio_net_recv_cmd(client->fd, false); + if (!cmd) + break; + + if (cmd->opcode == FIO_NET_CMD_TEXT) { + 
convert_text(cmd); + client->ops->text(client, cmd); + } + + free(cmd); + } while (1); +} + static void remove_client(struct fio_client *client) { assert(client->refs); dprint(FD_NET, "client: removed <%s>\n", client->hostname); + fio_drain_client_text(client); + if (!flist_empty(&client->list)) flist_del_init(&client->list); @@ -188,9 +364,7 @@ } } -struct fio_client *fio_client_add_explicit(struct client_ops *ops, - const char *hostname, int type, - int port) +static struct fio_client *get_new_client(void) { struct fio_client *client; @@ -203,15 +377,28 @@ INIT_FLIST_HEAD(&client->eta_list); INIT_FLIST_HEAD(&client->cmd_list); + buf_output_init(&client->buf); + + return client; +} + +struct fio_client *fio_client_add_explicit(struct client_ops *ops, + const char *hostname, int type, + int port) +{ + struct fio_client *client; + + client = get_new_client(); + client->hostname = strdup(hostname); if (type == Fio_client_socket) - client->is_sock = 1; + client->is_sock = true; else { int ipv6; ipv6 = type == Fio_client_ipv6; - if (fio_server_parse_host(hostname, &ipv6, + if (fio_server_parse_host(hostname, ipv6, &client->addr.sin_addr, &client->addr6.sin6_addr)) goto err; @@ -235,17 +422,29 @@ return NULL; } -void fio_client_add_ini_file(void *cookie, const char *ini_file) +int fio_client_add_ini_file(void *cookie, const char *ini_file, bool remote) { struct fio_client *client = cookie; + struct client_file *cf; size_t new_size; + void *new_files; + + if (!client) + return 1; dprint(FD_NET, "client <%s>: add ini %s\n", client->hostname, ini_file); - new_size = (client->nr_ini_file + 1) * sizeof(char *); - client->ini_file = realloc(client->ini_file, new_size); - client->ini_file[client->nr_ini_file] = strdup(ini_file); - client->nr_ini_file++; + new_size = (client->nr_files + 1) * sizeof(struct client_file); + new_files = realloc(client->files, new_size); + if (!new_files) + return 1; + + client->files = new_files; + cf = &client->files[client->nr_files]; + cf->file 
= strdup(ini_file); + cf->remote = remote; + client->nr_files++; + return 0; } int fio_client_add(struct client_ops *ops, const char *hostname, void **cookie) @@ -266,14 +465,7 @@ } } - client = malloc(sizeof(*client)); - memset(client, 0, sizeof(*client)); - - INIT_FLIST_HEAD(&client->list); - INIT_FLIST_HEAD(&client->hash_list); - INIT_FLIST_HEAD(&client->arg_list); - INIT_FLIST_HEAD(&client->eta_list); - INIT_FLIST_HEAD(&client->cmd_list); + client = get_new_client(); if (fio_server_parse_string(hostname, &client->hostname, &client->is_sock, &client->port, @@ -296,10 +488,27 @@ return 0; } +static const char *server_name(struct fio_client *client, char *buf, + size_t bufsize) +{ + const char *from; + + if (client->ipv6) + from = inet_ntop(AF_INET6, (struct sockaddr *) &client->addr6.sin6_addr, buf, bufsize); + else if (client->is_sock) + from = "sock"; + else + from = inet_ntop(AF_INET, (struct sockaddr *) &client->addr.sin_addr, buf, bufsize); + + return from; +} + static void probe_client(struct fio_client *client) { struct cmd_client_probe_pdu pdu; + const char *sname; uint64_t tag; + char buf[64]; dprint(FD_NET, "client: send probe\n"); @@ -309,6 +518,10 @@ pdu.flags = 0; #endif + sname = server_name(client, buf, sizeof(buf)); + memset(pdu.server, 0, sizeof(pdu.server)); + snprintf((char *) pdu.server, sizeof(pdu.server), "%s", sname); + fio_net_send_cmd(client->fd, FIO_NET_CMD_PROBE, &pdu, sizeof(pdu), &tag, &client->cmd_list); } @@ -361,7 +574,8 @@ memset(addr, 0, sizeof(*addr)); addr->sun_family = AF_UNIX; - strcpy(addr->sun_path, client->hostname); + snprintf(addr->sun_path, sizeof(addr->sun_path), "%s", + client->hostname); fd = socket(AF_UNIX, SOCK_STREAM, 0); if (fd < 0) { @@ -412,7 +626,7 @@ return fio_net_send_quit(client->fd); } -void fio_clients_terminate(void) +static void fio_clients_terminate(void) { struct flist_head *entry; struct fio_client *client; @@ -431,11 +645,6 @@ fio_clients_terminate(); } -static void sig_show_status(int sig) -{ - 
show_running_run_stats(); -} - static void client_signal_handler(void) { struct sigaction act; @@ -557,6 +766,8 @@ dprint(FD_NET, "client: start all\n"); + fio_client_json_init(); + flist_for_each_safe(entry, tmp, &client_list) { client = flist_entry(entry, struct fio_client, list); @@ -570,11 +781,34 @@ return flist_empty(&client_list); } +static int __fio_client_send_remote_ini(struct fio_client *client, + const char *filename) +{ + struct cmd_load_file_pdu *pdu; + size_t p_size; + int ret; + + dprint(FD_NET, "send remote ini %s to %s\n", filename, client->hostname); + + p_size = sizeof(*pdu) + strlen(filename) + 1; + pdu = malloc(p_size); + memset(pdu, 0, p_size); + pdu->name_len = strlen(filename); + strcpy((char *) pdu->file, filename); + pdu->client_type = cpu_to_le16((uint16_t) client->type); + + client->sent_job = true; + ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_LOAD_FILE, pdu, p_size,NULL, NULL); + free(pdu); + return ret; +} + /* * Send file contents to server backend. We could use sendfile(), but to remain * more portable lets just read/write the darn thing. */ -static int __fio_client_send_ini(struct fio_client *client, const char *filename) +static int __fio_client_send_local_ini(struct fio_client *client, + const char *filename) { struct cmd_job_pdu *pdu; size_t p_size; @@ -588,88 +822,95 @@ fd = open(filename, O_RDONLY); if (fd < 0) { - int ret = -errno; - + ret = -errno; log_err("fio: job file <%s> open: %s\n", filename, strerror(errno)); return ret; } if (fstat(fd, &sb) < 0) { - int ret = -errno; - + ret = -errno; log_err("fio: job file stat: %s\n", strerror(errno)); close(fd); return ret; } + /* + * Add extra space for variable expansion, but doesn't guarantee. 
+ */ + sb.st_size += OPT_LEN_MAX; p_size = sb.st_size + sizeof(*pdu); pdu = malloc(p_size); buf = pdu->buf; len = sb.st_size; p = buf; - do { - ret = read(fd, p, len); - if (ret > 0) { - len -= ret; - if (!len) - break; - p += ret; - continue; - } else if (!ret) - break; - else if (errno == EAGAIN || errno == EINTR) - continue; - } while (1); - - if (len) { + if (read_ini_data(fd, p, len)) { log_err("fio: failed reading job file %s\n", filename); close(fd); - free(buf); + free(pdu); return 1; } pdu->buf_len = __cpu_to_le32(sb.st_size); pdu->client_type = cpu_to_le32(client->type); - client->sent_job = 1; + client->sent_job = true; ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_JOB, pdu, p_size, NULL, NULL); free(pdu); close(fd); return ret; } -int fio_client_send_ini(struct fio_client *client, const char *filename) +int fio_client_send_ini(struct fio_client *client, const char *filename, + bool remote) { int ret; - ret = __fio_client_send_ini(client, filename); + if (!remote) + ret = __fio_client_send_local_ini(client, filename); + else + ret = __fio_client_send_remote_ini(client, filename); + if (!ret) - client->sent_job = 1; + client->sent_job = true; return ret; } +static int fio_client_send_cf(struct fio_client *client, + struct client_file *cf) +{ + return fio_client_send_ini(client, cf->file, cf->remote); +} + int fio_clients_send_ini(const char *filename) { struct fio_client *client; struct flist_head *entry, *tmp; flist_for_each_safe(entry, tmp, &client_list) { + bool failed = false; + client = flist_entry(entry, struct fio_client, list); - if (client->nr_ini_file) { + if (client->nr_files) { int i; - for (i = 0; i < client->nr_ini_file; i++) { - const char *ini = client->ini_file[i]; + for (i = 0; i < client->nr_files; i++) { + struct client_file *cf; - if (fio_client_send_ini(client, ini)) { + cf = &client->files[i]; + + if (fio_client_send_cf(client, cf)) { + failed = true; remove_client(client); break; } } - } else if (!filename || 
fio_client_send_ini(client, filename)) + } + if (client->sent_job || failed) + continue; + if (!filename || fio_client_send_ini(client, filename, 0)) remove_client(client); } @@ -684,7 +925,7 @@ pdu.thread_number = cpu_to_le32(client->thread_number); pdu.groupid = cpu_to_le32(client->groupid); convert_thread_options_to_net(&pdu.top, o); - + return fio_net_send_cmd(client->fd, FIO_NET_CMD_UPDATE_JOB, &pdu, sizeof(pdu), tag, &client->cmd_list); } @@ -717,6 +958,7 @@ convert_io_stat(&dst->slat_stat[i], &src->slat_stat[i]); convert_io_stat(&dst->lat_stat[i], &src->lat_stat[i]); convert_io_stat(&dst->bw_stat[i], &src->bw_stat[i]); + convert_io_stat(&dst->iops_stat[i], &src->iops_stat[i]); } dst->usr_time = le64_to_cpu(src->usr_time); @@ -724,7 +966,9 @@ dst->ctx = le64_to_cpu(src->ctx); dst->minf = le64_to_cpu(src->minf); dst->majf = le64_to_cpu(src->majf); - dst->clat_percentiles = le64_to_cpu(src->clat_percentiles); + dst->clat_percentiles = le32_to_cpu(src->clat_percentiles); + dst->lat_percentiles = le32_to_cpu(src->lat_percentiles); + dst->percentile_precision = le64_to_cpu(src->percentile_precision); for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { fio_fp64_t *fps = &src->percentile_list[i]; @@ -734,27 +978,31 @@ } for (i = 0; i < FIO_IO_U_MAP_NR; i++) { - dst->io_u_map[i] = le32_to_cpu(src->io_u_map[i]); - dst->io_u_submit[i] = le32_to_cpu(src->io_u_submit[i]); - dst->io_u_complete[i] = le32_to_cpu(src->io_u_complete[i]); + dst->io_u_map[i] = le64_to_cpu(src->io_u_map[i]); + dst->io_u_submit[i] = le64_to_cpu(src->io_u_submit[i]); + dst->io_u_complete[i] = le64_to_cpu(src->io_u_complete[i]); } - for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) { - dst->io_u_lat_u[i] = le32_to_cpu(src->io_u_lat_u[i]); - dst->io_u_lat_m[i] = le32_to_cpu(src->io_u_lat_m[i]); - } + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + dst->io_u_lat_n[i] = le64_to_cpu(src->io_u_lat_n[i]); + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) + dst->io_u_lat_u[i] = le64_to_cpu(src->io_u_lat_u[i]); + for (i = 0; i < 
FIO_IO_U_LAT_M_NR; i++) + dst->io_u_lat_m[i] = le64_to_cpu(src->io_u_lat_m[i]); for (i = 0; i < DDIR_RWDIR_CNT; i++) for (j = 0; j < FIO_IO_U_PLAT_NR; j++) - dst->io_u_plat[i][j] = le32_to_cpu(src->io_u_plat[i][j]); + dst->io_u_plat[i][j] = le64_to_cpu(src->io_u_plat[i][j]); for (i = 0; i < DDIR_RWDIR_CNT; i++) { dst->total_io_u[i] = le64_to_cpu(src->total_io_u[i]); dst->short_io_u[i] = le64_to_cpu(src->short_io_u[i]); + dst->drop_io_u[i] = le64_to_cpu(src->drop_io_u[i]); } dst->total_submit = le64_to_cpu(src->total_submit); dst->total_complete = le64_to_cpu(src->total_complete); + dst->nr_zone_resets = le64_to_cpu(src->nr_zone_resets); for (i = 0; i < DDIR_RWDIR_CNT; i++) { dst->io_bytes[i] = le64_to_cpu(src->io_bytes[i]); @@ -767,6 +1015,35 @@ dst->first_error = le32_to_cpu(src->first_error); dst->kb_base = le32_to_cpu(src->kb_base); dst->unit_base = le32_to_cpu(src->unit_base); + + dst->sig_figs = le32_to_cpu(src->sig_figs); + + dst->latency_depth = le32_to_cpu(src->latency_depth); + dst->latency_target = le64_to_cpu(src->latency_target); + dst->latency_window = le64_to_cpu(src->latency_window); + dst->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(src->latency_percentile.u.i)); + + dst->nr_block_infos = le64_to_cpu(src->nr_block_infos); + for (i = 0; i < dst->nr_block_infos; i++) + dst->block_infos[i] = le32_to_cpu(src->block_infos[i]); + + dst->ss_dur = le64_to_cpu(src->ss_dur); + dst->ss_state = le32_to_cpu(src->ss_state); + dst->ss_head = le32_to_cpu(src->ss_head); + dst->ss_limit.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_limit.u.i)); + dst->ss_slope.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_slope.u.i)); + dst->ss_deviation.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_deviation.u.i)); + dst->ss_criterion.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_criterion.u.i)); + + if (dst->ss_state & FIO_SS_DATA) { + for (i = 0; i < dst->ss_dur; i++ ) { + dst->ss_iops_data[i] = le64_to_cpu(src->ss_iops_data[i]); + dst->ss_bw_data[i] = 
le64_to_cpu(src->ss_bw_data[i]); + } + } + + dst->cachehit = le64_to_cpu(src->cachehit); + dst->cachemiss = le64_to_cpu(src->cachemiss); } static void convert_gs(struct group_run_stats *dst, struct group_run_stats *src) @@ -778,37 +1055,61 @@ dst->min_run[i] = le64_to_cpu(src->min_run[i]); dst->max_bw[i] = le64_to_cpu(src->max_bw[i]); dst->min_bw[i] = le64_to_cpu(src->min_bw[i]); - dst->io_kb[i] = le64_to_cpu(src->io_kb[i]); + dst->iobytes[i] = le64_to_cpu(src->iobytes[i]); dst->agg[i] = le64_to_cpu(src->agg[i]); } dst->kb_base = le32_to_cpu(src->kb_base); dst->unit_base = le32_to_cpu(src->unit_base); + dst->sig_figs = le32_to_cpu(src->sig_figs); dst->groupid = le32_to_cpu(src->groupid); dst->unified_rw_rep = le32_to_cpu(src->unified_rw_rep); } +static void json_object_add_client_info(struct json_object *obj, + struct fio_client *client) +{ + const char *hostname = client->hostname ? client->hostname : ""; + + json_object_add_value_string(obj, "hostname", hostname); + json_object_add_value_int(obj, "port", client->port); +} + static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd) { struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload; + struct flist_head *opt_list = NULL; + struct json_object *tsobj; - show_thread_status(&p->ts, &p->rs); - client->did_stat = 1; + if (client->opt_lists && p->ts.thread_number <= client->jobs) + opt_list = &client->opt_lists[p->ts.thread_number - 1]; + + tsobj = show_thread_status(&p->ts, &p->rs, opt_list, &client->buf); + client->did_stat = true; + if (tsobj) { + json_object_add_client_info(tsobj, client); + json_array_add_value_object(clients_array, tsobj); + } - if (!do_output_all_clients) + if (sum_stat_clients <= 1) return; - sum_thread_stats(&client_ts, &p->ts, sum_stat_nr); + sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1); sum_group_stats(&client_gs, &p->rs); client_ts.members++; client_ts.thread_number = p->ts.thread_number; client_ts.groupid = p->ts.groupid; client_ts.unified_rw_rep = 
p->ts.unified_rw_rep; + client_ts.sig_figs = p->ts.sig_figs; if (++sum_stat_nr == sum_stat_clients) { strcpy(client_ts.name, "All clients"); - show_thread_status(&client_ts, &client_gs); + tsobj = show_thread_status(&client_ts, &client_gs, NULL, &allclients); + if (tsobj) { + json_object_add_client_info(tsobj, client); + json_array_add_value_object(clients_array, tsobj); + } } } @@ -816,7 +1117,41 @@ { struct group_run_stats *gs = (struct group_run_stats *) cmd->payload; - show_group_stats(gs); + if (output_format & FIO_OUTPUT_NORMAL) + show_group_stats(gs, &client->buf); +} + +static void handle_job_opt(struct fio_client *client, struct fio_net_cmd *cmd) +{ + struct cmd_job_option *pdu = (struct cmd_job_option *) cmd->payload; + struct print_option *p; + + if (!job_opt_object) + return; + + pdu->global = le16_to_cpu(pdu->global); + pdu->truncated = le16_to_cpu(pdu->truncated); + pdu->groupid = le32_to_cpu(pdu->groupid); + + p = malloc(sizeof(*p)); + p->name = strdup((char *) pdu->name); + if (pdu->value[0] != '\0') + p->value = strdup((char *) pdu->value); + else + p->value = NULL; + + if (pdu->global) { + const char *pos = ""; + + if (p->value) + pos = p->value; + + json_object_add_value_string(job_opt_object, p->name, pos); + } else if (client->opt_lists) { + struct flist_head *opt_list = &client->opt_lists[pdu->groupid]; + + flist_add_tail(&p->list, opt_list); + } } static void handle_text(struct fio_client *client, struct fio_net_cmd *cmd) @@ -825,13 +1160,17 @@ const char *buf = (const char *) pdu->buf; const char *name; int fio_unused ret; + struct buf_output out; + + buf_output_init(&out); name = client->name ? 
client->name : client->hostname; - if (!client->skip_newline) - fprintf(f_out, "<%s> ", name); - ret = fwrite(buf, pdu->buf_len, 1, f_out); - fflush(f_out); + if (!client->skip_newline && !(output_format & FIO_OUTPUT_TERSE)) + __log_buf(&out, "<%s> ", name); + __log_buf(&out, "%s", buf); + log_info_buf(out.buf, out.buflen); + buf_output_free(&out); client->skip_newline = strchr(buf, '\n') == NULL; } @@ -840,16 +1179,16 @@ int i; for (i = 0; i < 2; i++) { - agg->ios[i] = le32_to_cpu(agg->ios[i]); - agg->merges[i] = le32_to_cpu(agg->merges[i]); + agg->ios[i] = le64_to_cpu(agg->ios[i]); + agg->merges[i] = le64_to_cpu(agg->merges[i]); agg->sectors[i] = le64_to_cpu(agg->sectors[i]); - agg->ticks[i] = le32_to_cpu(agg->ticks[i]); + agg->ticks[i] = le64_to_cpu(agg->ticks[i]); } - agg->io_ticks = le32_to_cpu(agg->io_ticks); - agg->time_in_queue = le32_to_cpu(agg->time_in_queue); + agg->io_ticks = le64_to_cpu(agg->io_ticks); + agg->time_in_queue = le64_to_cpu(agg->time_in_queue); agg->slavecount = le32_to_cpu(agg->slavecount); - agg->max_util.u.f = fio_uint64_to_double(__le64_to_cpu(agg->max_util.u.i)); + agg->max_util.u.f = fio_uint64_to_double(le64_to_cpu(agg->max_util.u.i)); } static void convert_dus(struct disk_util_stat *dus) @@ -857,27 +1196,39 @@ int i; for (i = 0; i < 2; i++) { - dus->ios[i] = le32_to_cpu(dus->ios[i]); - dus->merges[i] = le32_to_cpu(dus->merges[i]); - dus->sectors[i] = le64_to_cpu(dus->sectors[i]); - dus->ticks[i] = le32_to_cpu(dus->ticks[i]); + dus->s.ios[i] = le64_to_cpu(dus->s.ios[i]); + dus->s.merges[i] = le64_to_cpu(dus->s.merges[i]); + dus->s.sectors[i] = le64_to_cpu(dus->s.sectors[i]); + dus->s.ticks[i] = le64_to_cpu(dus->s.ticks[i]); } - dus->io_ticks = le32_to_cpu(dus->io_ticks); - dus->time_in_queue = le32_to_cpu(dus->time_in_queue); - dus->msec = le64_to_cpu(dus->msec); + dus->s.io_ticks = le64_to_cpu(dus->s.io_ticks); + dus->s.time_in_queue = le64_to_cpu(dus->s.time_in_queue); + dus->s.msec = le64_to_cpu(dus->s.msec); } static void 
handle_du(struct fio_client *client, struct fio_net_cmd *cmd) { struct cmd_du_pdu *du = (struct cmd_du_pdu *) cmd->payload; - if (!client->disk_stats_shown) { - client->disk_stats_shown = 1; - log_info("\nDisk stats (read/write):\n"); - } + if (!client->disk_stats_shown) + client->disk_stats_shown = true; + + if (output_format & FIO_OUTPUT_JSON) { + struct json_object *duobj; - print_disk_util(&du->dus, &du->agg, output_format == FIO_OUTPUT_TERSE); + json_array_add_disk_util(&du->dus, &du->agg, du_array); + duobj = json_array_last_value_object(du_array); + json_object_add_client_info(duobj, client); + } + if (output_format & FIO_OUTPUT_NORMAL) { + __log_buf(&client->buf, "\nDisk stats (read/write):\n"); + print_disk_util(&du->dus, &du->agg, 0, &client->buf); + } + if (output_format & FIO_OUTPUT_TERSE && terse_version >= 3) { + print_disk_util(&du->dus, &du->agg, 1, &client->buf); + __log_buf(&client->buf, "\n"); + } } static void convert_jobs_eta(struct jobs_eta *je) @@ -891,11 +1242,11 @@ je->files_open = le32_to_cpu(je->files_open); for (i = 0; i < DDIR_RWDIR_CNT; i++) { - je->m_rate[i] = le32_to_cpu(je->m_rate[i]); - je->t_rate[i] = le32_to_cpu(je->t_rate[i]); + je->m_rate[i] = le64_to_cpu(je->m_rate[i]); + je->t_rate[i] = le64_to_cpu(je->t_rate[i]); je->m_iops[i] = le32_to_cpu(je->m_iops[i]); je->t_iops[i] = le32_to_cpu(je->t_iops[i]); - je->rate[i] = le32_to_cpu(je->rate[i]); + je->rate[i] = le64_to_cpu(je->rate[i]); je->iops[i] = le32_to_cpu(je->iops[i]); } @@ -904,6 +1255,7 @@ je->nr_threads = le32_to_cpu(je->nr_threads); je->is_pow2 = le32_to_cpu(je->is_pow2); je->unit_base = le32_to_cpu(je->unit_base); + je->sig_figs = le32_to_cpu(je->sig_figs); } void fio_client_sum_jobs_eta(struct jobs_eta *dst, struct jobs_eta *je) @@ -931,18 +1283,15 @@ dst->eta_sec = je->eta_sec; dst->nr_threads += je->nr_threads; - /* we need to handle je->run_str too ... 
*/ -} -void fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn) -{ - if (!--eta->pending) { - eta_fn(&eta->eta); - free(eta); - } + /* + * This wont be correct for multiple strings, but at least it + * works for the basic cases. + */ + strcpy((char *) dst->run_str, (char *) je->run_str); } -static void remove_reply_cmd(struct fio_client *client, struct fio_net_cmd *cmd) +static bool remove_reply_cmd(struct fio_client *client, struct fio_net_cmd *cmd) { struct fio_net_cmd_reply *reply = NULL; struct flist_head *entry; @@ -958,12 +1307,13 @@ if (!reply) { log_err("fio: client: unable to find matching tag (%llx)\n", (unsigned long long) cmd->tag); - return; + return false; } flist_del(&reply->list); cmd->tag = reply->saved_tag; free(reply); + return true; } int fio_client_wait_for_reply(struct fio_client *client, uint64_t tag) @@ -1001,6 +1351,7 @@ client->eta_in_flight = NULL; flist_del_init(&client->eta_list); + client->eta_timeouts = 0; if (client->ops->jobs_eta) client->ops->jobs_eta(client, je); @@ -1009,6 +1360,126 @@ fio_client_dec_jobs_eta(eta, client->ops->eta); } +static void client_flush_hist_samples(FILE *f, int hist_coarseness, void *samples, + uint64_t sample_size) +{ + struct io_sample *s; + int log_offset; + uint64_t i, j, nr_samples; + struct io_u_plat_entry *entry; + uint64_t *io_u_plat; + + int stride = 1 << hist_coarseness; + + if (!sample_size) + return; + + s = __get_sample(samples, 0, 0); + log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0; + + nr_samples = sample_size / __log_entry_sz(log_offset); + + for (i = 0; i < nr_samples; i++) { + + s = (struct io_sample *)((char *)__get_sample(samples, log_offset, i) + + i * sizeof(struct io_u_plat_entry)); + + entry = s->data.plat_entry; + io_u_plat = entry->io_u_plat; + + fprintf(f, "%lu, %u, %llu, ", (unsigned long) s->time, + io_sample_ddir(s), (unsigned long long) s->bs); + for (j = 0; j < FIO_IO_U_PLAT_NR - stride; j += stride) { + fprintf(f, "%llu, ", (unsigned long 
long)hist_sum(j, stride, io_u_plat, NULL)); + } + fprintf(f, "%llu\n", (unsigned long long) + hist_sum(FIO_IO_U_PLAT_NR - stride, stride, io_u_plat, NULL)); + + } +} + +static int fio_client_handle_iolog(struct fio_client *client, + struct fio_net_cmd *cmd) +{ + struct cmd_iolog_pdu *pdu = NULL; + bool store_direct; + char *log_pathname = NULL; + int ret = 0; + + pdu = convert_iolog(cmd, &store_direct); + if (!pdu) { + log_err("fio: failed converting IO log\n"); + ret = 1; + goto out; + } + + /* allocate buffer big enough for next sprintf() call */ + log_pathname = malloc(10 + strlen((char *)pdu->name) + + strlen(client->hostname)); + if (!log_pathname) { + log_err("fio: memory allocation of unique pathname failed\n"); + ret = -1; + goto out; + } + /* generate a unique pathname for the log file using hostname */ + sprintf(log_pathname, "%s.%s", pdu->name, client->hostname); + + if (store_direct) { + ssize_t wrote; + size_t sz; + int fd; + + fd = open((const char *) log_pathname, + O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + log_err("fio: open log %s: %s\n", + log_pathname, strerror(errno)); + ret = 1; + goto out; + } + + sz = cmd->pdu_len - sizeof(*pdu); + wrote = write(fd, pdu->samples, sz); + close(fd); + + if (wrote != sz) { + log_err("fio: short write on compressed log\n"); + ret = 1; + goto out; + } + + ret = 0; + } else { + FILE *f; + f = fopen((const char *) log_pathname, "w"); + if (!f) { + log_err("fio: fopen log %s : %s\n", + log_pathname, strerror(errno)); + ret = 1; + goto out; + } + + if (pdu->log_type == IO_LOG_TYPE_HIST) { + client_flush_hist_samples(f, pdu->log_hist_coarseness, pdu->samples, + pdu->nr_samples * sizeof(struct io_sample)); + } else { + flush_samples(f, pdu->samples, + pdu->nr_samples * sizeof(struct io_sample)); + } + fclose(f); + ret = 0; + } + +out: + if (pdu && pdu != (void *) cmd->payload) + free(pdu); + + if (log_pathname) + free(log_pathname); + + return ret; +} + static void handle_probe(struct fio_client *client, 
struct fio_net_cmd *cmd) { struct cmd_probe_reply_pdu *probe = (struct cmd_probe_reply_pdu *) cmd->payload; @@ -1026,9 +1497,11 @@ sprintf(bit, "%d-bit", probe->bpp * 8); probe->flags = le64_to_cpu(probe->flags); - log_info("hostname=%s, be=%u, %s, os=%s, arch=%s, fio=%s, flags=%lx\n", - probe->hostname, probe->bigendian, bit, os, arch, - probe->fio_version, (unsigned long) probe->flags); + if (output_format & FIO_OUTPUT_NORMAL) { + log_info("hostname=%s, be=%u, %s, os=%s, arch=%s, fio=%s, flags=%lx\n", + probe->hostname, probe->bigendian, bit, os, arch, + probe->fio_version, (unsigned long) probe->flags); + } if (!client->name) client->name = strdup((char *) probe->hostname); @@ -1042,13 +1515,21 @@ client->jobs = le32_to_cpu(pdu->jobs); client->nr_stat = le32_to_cpu(pdu->stat_outputs); - if (sum_stat_clients > 1) - do_output_all_clients = 1; + if (client->jobs) { + int i; + + if (client->opt_lists) + free(client->opt_lists); + + client->opt_lists = malloc(client->jobs * sizeof(struct flist_head)); + for (i = 0; i < client->jobs; i++) + INIT_FLIST_HEAD(&client->opt_lists[i]); + } sum_stat_clients += client->nr_stat; } -static void handle_stop(struct fio_client *client, struct fio_net_cmd *cmd) +static void handle_stop(struct fio_client *client) { if (client->error) log_info("client <%s>: exited with error %d\n", client->hostname, client->error); @@ -1077,9 +1558,9 @@ #ifdef CONFIG_ZLIB struct cmd_iolog_pdu *ret; z_stream stream; - uint32_t nr_samples; + uint64_t nr_samples; size_t total; - void *p; + char *p; stream.zalloc = Z_NULL; stream.zfree = Z_NULL; @@ -1093,18 +1574,22 @@ /* * Get header first, it's not compressed */ - nr_samples = le32_to_cpu(pdu->nr_samples); + nr_samples = le64_to_cpu(pdu->nr_samples); - total = nr_samples * sizeof(struct io_sample); + if (pdu->log_type == IO_LOG_TYPE_HIST) + total = nr_samples * (__log_entry_sz(le32_to_cpu(pdu->log_offset)) + + sizeof(struct io_u_plat_entry)); + else + total = nr_samples * 
__log_entry_sz(le32_to_cpu(pdu->log_offset)); ret = malloc(total + sizeof(*pdu)); ret->nr_samples = nr_samples; memcpy(ret, pdu, sizeof(*pdu)); - p = (void *) ret + sizeof(*pdu); + p = (char *) ret + sizeof(*pdu); stream.avail_in = cmd->pdu_len - sizeof(*pdu); - stream.next_in = (void *) pdu + sizeof(*pdu); + stream.next_in = (void *)((char *) pdu + sizeof(*pdu)); while (stream.avail_in) { unsigned int this_chunk = 65536; unsigned int this_len; @@ -1114,10 +1599,15 @@ this_chunk = total; stream.avail_out = this_chunk; - stream.next_out = p; + stream.next_out = (void *)p; err = inflate(&stream, Z_NO_FLUSH); /* may be Z_OK, or Z_STREAM_END */ if (err < 0) { + /* + * Z_STREAM_ERROR and Z_BUF_ERROR can safely be + * ignored */ + if (err == Z_STREAM_ERROR || err == Z_BUF_ERROR) + break; log_err("fio: inflate error %d\n", err); free(ret); ret = NULL; @@ -1141,17 +1631,23 @@ * This has been compressed on the server side, since it can be big. * Uncompress here. */ -static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd) +static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd, + bool *store_direct) { struct cmd_iolog_pdu *pdu = (struct cmd_iolog_pdu *) cmd->payload; struct cmd_iolog_pdu *ret; - int i; + uint64_t i; + int compressed; + void *samples; + + *store_direct = false; /* * Convert if compressed and we support it. If it's not * compressed, we need not do anything. 
*/ - if (le32_to_cpu(pdu->compressed)) { + compressed = le32_to_cpu(pdu->compressed); + if (compressed == XMIT_COMPRESSED) { #ifndef CONFIG_ZLIB log_err("fio: server sent compressed data by mistake\n"); return NULL; @@ -1161,51 +1657,118 @@ log_err("fio: failed decompressing log\n"); return NULL; } + } else if (compressed == STORE_COMPRESSED) { + *store_direct = true; + ret = pdu; } else ret = pdu; + ret->nr_samples = le64_to_cpu(ret->nr_samples); ret->thread_number = le32_to_cpu(ret->thread_number); - ret->nr_samples = le32_to_cpu(ret->nr_samples); ret->log_type = le32_to_cpu(ret->log_type); ret->compressed = le32_to_cpu(ret->compressed); + ret->log_offset = le32_to_cpu(ret->log_offset); + ret->log_hist_coarseness = le32_to_cpu(ret->log_hist_coarseness); + if (*store_direct) + return ret; + + samples = &ret->samples[0]; for (i = 0; i < ret->nr_samples; i++) { - struct io_sample *s = &ret->samples[i]; + struct io_sample *s; + + s = __get_sample(samples, ret->log_offset, i); + if (ret->log_type == IO_LOG_TYPE_HIST) + s = (struct io_sample *)((char *)s + sizeof(struct io_u_plat_entry) * i); - s->time = le64_to_cpu(s->time); - s->val = le64_to_cpu(s->val); - s->ddir = le32_to_cpu(s->ddir); - s->bs = le32_to_cpu(s->bs); + s->time = le64_to_cpu(s->time); + s->data.val = le64_to_cpu(s->data.val); + s->__ddir = le32_to_cpu(s->__ddir); + s->bs = le64_to_cpu(s->bs); + + if (ret->log_offset) { + struct io_sample_offset *so = (void *) s; + + so->offset = le64_to_cpu(so->offset); + } + + if (ret->log_type == IO_LOG_TYPE_HIST) { + s->data.plat_entry = (struct io_u_plat_entry *)(((char *)s) + sizeof(*s)); + s->data.plat_entry->list.next = NULL; + s->data.plat_entry->list.prev = NULL; + } } return ret; } +static void sendfile_reply(int fd, struct cmd_sendfile_reply *rep, + size_t size, uint64_t tag) +{ + rep->error = cpu_to_le32(rep->error); + fio_net_send_cmd(fd, FIO_NET_CMD_SENDFILE, rep, size, &tag, NULL); +} + +static int fio_send_file(struct fio_client *client, struct 
cmd_sendfile *pdu, + uint64_t tag) +{ + struct cmd_sendfile_reply *rep; + struct stat sb; + size_t size; + int fd; + + size = sizeof(*rep); + rep = malloc(size); + + if (stat((char *)pdu->path, &sb) < 0) { +fail: + rep->error = errno; + sendfile_reply(client->fd, rep, size, tag); + free(rep); + return 1; + } + + size += sb.st_size; + rep = realloc(rep, size); + rep->size = cpu_to_le32((uint32_t) sb.st_size); + + fd = open((char *)pdu->path, O_RDONLY); + if (fd == -1 ) + goto fail; + + rep->error = read_data(fd, &rep->data, sb.st_size); + sendfile_reply(client->fd, rep, size, tag); + free(rep); + close(fd); + return 0; +} + int fio_handle_client(struct fio_client *client) { struct client_ops *ops = client->ops; struct fio_net_cmd *cmd; + int size; dprint(FD_NET, "client: handle %s\n", client->hostname); - cmd = fio_net_recv_cmd(client->fd); + cmd = fio_net_recv_cmd(client->fd, true); if (!cmd) return 0; dprint(FD_NET, "client: got cmd op %s from %s (pdu=%u)\n", fio_server_op(cmd->opcode), client->hostname, cmd->pdu_len); + client->last_cmd = cmd->opcode; + switch (cmd->opcode) { case FIO_NET_CMD_QUIT: if (ops->quit) ops->quit(client, cmd); remove_client(client); - free(cmd); break; case FIO_NET_CMD_TEXT: convert_text(cmd); ops->text(client, cmd); - free(cmd); break; case FIO_NET_CMD_DU: { struct cmd_du_pdu *du = (struct cmd_du_pdu *) cmd->payload; @@ -1214,17 +1777,24 @@ convert_agg(&du->agg); ops->disk_util(client, cmd); - free(cmd); break; } case FIO_NET_CMD_TS: { struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload; + dprint(FD_NET, "client: ts->ss_state = %u\n", (unsigned int) le32_to_cpu(p->ts.ss_state)); + if (le32_to_cpu(p->ts.ss_state) & FIO_SS_DATA) { + dprint(FD_NET, "client: received steadystate ring buffers\n"); + + size = le64_to_cpu(p->ts.ss_dur); + p->ts.ss_iops_data = (uint64_t *) ((struct cmd_ts_pdu *)cmd->payload + 1); + p->ts.ss_bw_data = p->ts.ss_iops_data + size; + } + convert_ts(&p->ts, &p->ts); convert_gs(&p->rs, &p->rs); 
ops->thread_status(client, cmd); - free(cmd); break; } case FIO_NET_CMD_GS: { @@ -1233,35 +1803,31 @@ convert_gs(gs, gs); ops->group_stats(client, cmd); - free(cmd); break; } case FIO_NET_CMD_ETA: { struct jobs_eta *je = (struct jobs_eta *) cmd->payload; - remove_reply_cmd(client, cmd); + if (!remove_reply_cmd(client, cmd)) + break; convert_jobs_eta(je); handle_eta(client, cmd); - free(cmd); break; } case FIO_NET_CMD_PROBE: remove_reply_cmd(client, cmd); ops->probe(client, cmd); - free(cmd); break; case FIO_NET_CMD_SERVER_START: client->state = Client_running; if (ops->job_start) ops->job_start(client, cmd); - free(cmd); break; case FIO_NET_CMD_START: { struct cmd_start_pdu *pdu = (struct cmd_start_pdu *) cmd->payload; pdu->jobs = le32_to_cpu(pdu->jobs); ops->start(client, cmd); - free(cmd); break; } case FIO_NET_CMD_STOP: { @@ -1271,8 +1837,7 @@ client->state = Client_stopped; client->error = le32_to_cpu(pdu->error); client->signal = le32_to_cpu(pdu->signal); - ops->stop(client, cmd); - free(cmd); + ops->stop(client); break; } case FIO_NET_CMD_ADD_JOB: { @@ -1283,32 +1848,77 @@ if (ops->add_job) ops->add_job(client, cmd); - free(cmd); break; } case FIO_NET_CMD_IOLOG: - if (ops->iolog) { - struct cmd_iolog_pdu *pdu; - - pdu = convert_iolog(cmd); - ops->iolog(client, pdu); - } - free(cmd); + fio_client_handle_iolog(client, cmd); break; case FIO_NET_CMD_UPDATE_JOB: ops->update_job(client, cmd); remove_reply_cmd(client, cmd); - free(cmd); break; + case FIO_NET_CMD_VTRIGGER: { + struct all_io_list *pdu = (struct all_io_list *) cmd->payload; + char buf[128]; + int off = 0; + + if (aux_path) { + strcpy(buf, aux_path); + off = strlen(buf); + } + + __verify_save_state(pdu, server_name(client, &buf[off], sizeof(buf) - off)); + exec_trigger(trigger_cmd); + break; + } + case FIO_NET_CMD_SENDFILE: { + struct cmd_sendfile *pdu = (struct cmd_sendfile *) cmd->payload; + fio_send_file(client, pdu, cmd->tag); + break; + } + case FIO_NET_CMD_JOB_OPT: { + handle_job_opt(client, cmd); 
+ break; + } default: log_err("fio: unknown client op: %s\n", fio_server_op(cmd->opcode)); - free(cmd); break; } + free(cmd); return 1; } +int fio_clients_send_trigger(const char *cmd) +{ + struct flist_head *entry; + struct fio_client *client; + size_t slen; + + dprint(FD_NET, "client: send vtrigger: %s\n", cmd); + + if (!cmd) + slen = 0; + else + slen = strlen(cmd); + + flist_for_each(entry, &client_list) { + struct cmd_vtrigger_pdu *pdu; + + client = flist_entry(entry, struct fio_client, list); + + pdu = malloc(sizeof(*pdu) + slen); + pdu->len = cpu_to_le16((uint16_t) slen); + if (slen) + memcpy(pdu->cmd, cmd, slen); + fio_net_send_cmd(client->fd, FIO_NET_CMD_VTRIGGER, pdu, + sizeof(*pdu) + slen, NULL, NULL); + free(pdu); + } + + return 0; +} + static void request_client_etas(struct client_ops *ops) { struct fio_client *client; @@ -1316,10 +1926,12 @@ struct client_eta *eta; int skipped = 0; + if (eta_print == FIO_ETA_NEVER) + return; + dprint(FD_NET, "client: request eta (%d)\n", nr_clients); - eta = malloc(sizeof(*eta)); - memset(&eta->eta, 0, sizeof(eta->eta)); + eta = calloc(1, sizeof(*eta) + __THREAD_RUNSTR_SZ(REAL_MAX_JOBS)); eta->pending = nr_clients; flist_for_each(entry, &client_list) { @@ -1339,29 +1951,66 @@ (uintptr_t) eta, &client->cmd_list); } - while (skipped--) - fio_client_dec_jobs_eta(eta, ops->eta); + while (skipped--) { + if (!fio_client_dec_jobs_eta(eta, ops->eta)) + break; + } dprint(FD_NET, "client: requested eta tag %p\n", eta); } +/* + * A single SEND_ETA timeout isn't fatal. Attempt to recover. 
+ */ +static int handle_cmd_timeout(struct fio_client *client, + struct fio_net_cmd_reply *reply) +{ + uint16_t reply_opcode = reply->opcode; + + flist_del(&reply->list); + free(reply); + + if (reply_opcode != FIO_NET_CMD_SEND_ETA) + return 1; + + log_info("client <%s>: timeout on SEND_ETA\n", client->hostname); + + flist_del_init(&client->eta_list); + if (client->eta_in_flight) { + fio_client_dec_jobs_eta(client->eta_in_flight, client->ops->eta); + client->eta_in_flight = NULL; + } + + /* + * If we fail 5 in a row, give up... + */ + if (client->eta_timeouts++ > 5) + return 1; + + return 0; +} + static int client_check_cmd_timeout(struct fio_client *client, - struct timeval *now) + struct timespec *now) { struct fio_net_cmd_reply *reply; struct flist_head *entry, *tmp; int ret = 0; flist_for_each_safe(entry, tmp, &client->cmd_list) { + unsigned int op; + reply = flist_entry(entry, struct fio_net_cmd_reply, list); - if (mtime_since(&reply->tv, now) < FIO_NET_CLIENT_TIMEOUT) + if (mtime_since(&reply->ts, now) < FIO_NET_CLIENT_TIMEOUT) + continue; + + op = reply->opcode; + if (!handle_cmd_timeout(client, reply)) continue; log_err("fio: client %s, timeout on cmd %s\n", client->hostname, - fio_server_op(reply->opcode)); - flist_del(&reply->list); - free(reply); + fio_server_op(op)); ret = 1; } @@ -1372,10 +2021,10 @@ { struct fio_client *client; struct flist_head *entry, *tmp; - struct timeval tv; + struct timespec ts; int ret = 0; - fio_gettime(&tv, NULL); + fio_gettime(&ts, NULL); flist_for_each_safe(entry, tmp, &client_list) { client = flist_entry(entry, struct fio_client, list); @@ -1383,7 +2032,7 @@ if (flist_empty(&client->cmd_list)) continue; - if (!client_check_cmd_timeout(client, &tv)) + if (!client_check_cmd_timeout(client, &ts)) continue; if (client->ops->timed_out) @@ -1391,6 +2040,10 @@ else log_err("fio: client %s timed out\n", client->hostname); + if (client->last_cmd != FIO_NET_CMD_VTRIGGER) + client->error = ETIMEDOUT; + else + log_info("fio: ignoring 
timeout due to vtrigger\n"); remove_client(client); ret = 1; } @@ -1403,7 +2056,7 @@ struct pollfd *pfds; int i, ret = 0, retval = 0; - fio_gettime(&eta_tv, NULL); + fio_gettime(&eta_ts, NULL); pfds = malloc(nr_clients * sizeof(struct pollfd)); @@ -1435,18 +2088,23 @@ assert(i == nr_clients); do { - struct timeval tv; + struct timespec ts; + int timeout; - fio_gettime(&tv, NULL); - if (mtime_since(&eta_tv, &tv) >= 900) { + fio_gettime(&ts, NULL); + if (eta_time_within_slack(mtime_since(&eta_ts, &ts))) { request_client_etas(ops); - memcpy(&eta_tv, &tv, sizeof(tv)); + memcpy(&eta_ts, &ts, sizeof(ts)); if (fio_check_clients_timed_out()) break; } - ret = poll(pfds, nr_clients, ops->eta_msec); + check_trigger_file(); + + timeout = min(100u, ops->eta_msec); + + ret = poll(pfds, nr_clients, timeout); if (ret < 0) { if (errno == EINTR) continue; @@ -1476,6 +2134,17 @@ } } + log_info_buf(allclients.buf, allclients.buflen); + buf_output_free(&allclients); + + fio_client_json_fini(); + free(pfds); - return retval; + return retval || error_clients; +} + +static void client_display_thread_status(struct jobs_eta *je) +{ + if (!(output_format & FIO_OUTPUT_JSON)) + display_thread_status(je); } diff -Nru fio-2.1.3/client.h fio-3.16/client.h --- fio-2.1.3/client.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/client.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,15 +1,14 @@ #ifndef CLIENT_H #define CLIENT_H -#include #include #include #include +#include "lib/types.h" #include "stat.h" struct fio_net_cmd; -struct client_ops; enum { Client_created = 0, @@ -20,6 +19,11 @@ Client_exited = 5, }; +struct client_file { + char *file; + bool remote; +}; + struct fio_client { struct flist_head list; struct flist_head hash_list; @@ -33,21 +37,24 @@ int port; int fd; unsigned int refs; + unsigned int last_cmd; char *name; + struct flist_head *opt_lists; + int state; - int skip_newline; - int is_sock; - int disk_stats_shown; + bool skip_newline; + bool is_sock; + bool disk_stats_shown; unsigned 
int jobs; unsigned int nr_stat; int error; int signal; int ipv6; - int sent_job; - int did_stat; + bool sent_job; + bool did_stat; uint32_t type; uint32_t thread_number; @@ -55,6 +62,7 @@ struct flist_head eta_list; struct client_eta *eta_in_flight; + unsigned int eta_timeouts; struct flist_head cmd_list; @@ -64,16 +72,19 @@ struct client_ops *ops; void *client_data; - char **ini_file; - unsigned int nr_ini_file; + struct client_file *files; + unsigned int nr_files; + + struct buf_output buf; }; -struct cmd_iolog_pdu; typedef void (client_cmd_op)(struct fio_client *, struct fio_net_cmd *); +typedef void (client_op)(struct fio_client *); typedef void (client_eta_op)(struct jobs_eta *je); typedef void (client_timed_out_op)(struct fio_client *); typedef void (client_jobs_eta_op)(struct fio_client *client, struct jobs_eta *je); -typedef void (client_iolog_op)(struct fio_client *client, struct cmd_iolog_pdu *); + +extern struct client_ops fio_client_ops; struct client_ops { client_cmd_op *text; @@ -87,10 +98,9 @@ client_cmd_op *add_job; client_cmd_op *update_job; client_timed_out_op *timed_out; - client_cmd_op *stop; + client_op *stop; client_cmd_op *start; client_cmd_op *job_start; - client_iolog_op *iolog; client_timed_out_op *removed; unsigned int eta_msec; @@ -98,15 +108,12 @@ uint32_t client_type; }; -extern struct client_ops fio_client_ops; - struct client_eta { unsigned int pending; struct jobs_eta eta; }; extern int fio_handle_client(struct fio_client *); -extern void fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op fn); extern void fio_client_sum_jobs_eta(struct jobs_eta *dst, struct jobs_eta *je); enum { @@ -119,19 +126,19 @@ extern int fio_clients_connect(void); extern int fio_start_client(struct fio_client *); extern int fio_start_all_clients(void); -extern int fio_client_send_ini(struct fio_client *, const char *); extern int fio_clients_send_ini(const char *); +extern int fio_client_send_ini(struct fio_client *, const char *, bool); extern int 
fio_handle_clients(struct client_ops *); extern int fio_client_add(struct client_ops *, const char *, void **); extern struct fio_client *fio_client_add_explicit(struct client_ops *, const char *, int, int); extern void fio_client_add_cmd_option(void *, const char *); -extern void fio_client_add_ini_file(void *, const char *); +extern int fio_client_add_ini_file(void *, const char *, bool); extern int fio_client_terminate(struct fio_client *); -extern void fio_clients_terminate(void); extern struct fio_client *fio_get_client(struct fio_client *); extern void fio_put_client(struct fio_client *); extern int fio_client_update_options(struct fio_client *, struct thread_options *, uint64_t *); extern int fio_client_wait_for_reply(struct fio_client *, uint64_t); +extern int fio_clients_send_trigger(const char *); #define FIO_CLIENT_DEF_ETA_MSEC 900 @@ -140,5 +147,9 @@ FIO_CLIENT_TYPE_GUI = 2, }; +extern int sum_stat_clients; +extern struct thread_stat client_ts; +extern struct group_run_stats client_gs; + #endif diff -Nru fio-2.1.3/compiler/compiler-gcc3.h fio-3.16/compiler/compiler-gcc3.h --- fio-2.1.3/compiler/compiler-gcc3.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/compiler/compiler-gcc3.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,10 +0,0 @@ -#ifndef FIO_COMPILER_GCC3_H -#define FIO_COMPILER_GCC3_H - -#if __GNUC_MINOR__ >= 4 -#ifndef __must_check -#define __must_check __attribute__((warn_unused_result)) -#endif -#endif - -#endif diff -Nru fio-2.1.3/compiler/compiler-gcc4.h fio-3.16/compiler/compiler-gcc4.h --- fio-2.1.3/compiler/compiler-gcc4.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/compiler/compiler-gcc4.h 2019-09-20 01:01:52.000000000 +0000 @@ -5,4 +5,13 @@ #define __must_check __attribute__((warn_unused_result)) #endif +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +#if GCC_VERSION >= 40300 +#define __compiletime_warning(message) __attribute__((warning(message))) +#define __compiletime_error(message) 
__attribute__((error(message))) +#endif + #endif diff -Nru fio-2.1.3/compiler/compiler.h fio-3.16/compiler/compiler.h --- fio-2.1.3/compiler/compiler.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/compiler/compiler.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,13 +1,13 @@ #ifndef FIO_COMPILER_H #define FIO_COMPILER_H -#if __GNUC__ >= 4 +/* IWYU pragma: begin_exports */ +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1) #include "compiler-gcc4.h" -#elif __GNUC__ == 3 -#include "compiler-gcc3.h" #else -#error Compiler too old, need gcc at least gcc 3.x +#error Compiler too old, need at least gcc 4.1.0 #endif +/* IWYU pragma: end_exports */ #ifndef __must_check #define __must_check @@ -20,4 +20,57 @@ #define fio_init __attribute__((constructor)) #define fio_exit __attribute__((destructor)) +#define fio_unlikely(x) __builtin_expect(!!(x), 0) + +/* + * Check at compile time that something is of a particular type. + * Always evaluates to 1 so you may use it easily in comparisons. + */ +#define typecheck(type,x) \ +({ type __dummy; \ + __typeof__(x) __dummy2; \ + (void)(&__dummy == &__dummy2); \ + 1; \ +}) + + +#if defined(CONFIG_STATIC_ASSERT) +#define compiletime_assert(condition, msg) _Static_assert(condition, msg) + +#elif !defined(CONFIG_DISABLE_OPTIMIZATIONS) + +#ifndef __compiletime_error +#define __compiletime_error(message) +#endif + +#ifndef __compiletime_error_fallback +#define __compiletime_error_fallback(condition) do { } while (0) +#endif + +#define __compiletime_assert(condition, msg, prefix, suffix) \ + do { \ + int __cond = !(condition); \ + extern void prefix ## suffix(void) __compiletime_error(msg); \ + if (__cond) \ + prefix ## suffix(); \ + __compiletime_error_fallback(__cond); \ + } while (0) + +#define _compiletime_assert(condition, msg, prefix, suffix) \ + __compiletime_assert(condition, msg, prefix, suffix) + +#define compiletime_assert(condition, msg) \ + _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) + +#else + 
+#define compiletime_assert(condition, msg) do { } while (0) + +#endif + +#ifdef FIO_INTERNAL +#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0]))) +#define FIELD_SIZE(s, f) (sizeof(((__typeof__(s))0)->f)) +#endif + #endif diff -Nru fio-2.1.3/configure fio-3.16/configure --- fio-2.1.3/configure 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/configure 2019-09-20 01:01:52.000000000 +0000 @@ -1,7 +1,7 @@ #!/bin/sh # # Fio configure script. Heavily influenced by the manual qemu configure -# script. Sad this this is easier than autoconf and enemies. +# script. Sad this is easier than autoconf and enemies. # # set temporary file name @@ -14,12 +14,13 @@ fi TMPC="${TMPDIR1}/fio-conf-${RANDOM}-$$-${RANDOM}.c" +TMPC2="${TMPDIR1}/fio-conf-${RANDOM}-$$-${RANDOM}-2.c" TMPO="${TMPDIR1}/fio-conf-${RANDOM}-$$-${RANDOM}.o" TMPE="${TMPDIR1}/fio-conf-${RANDOM}-$$-${RANDOM}.exe" # NB: do not call "exit" in the trap handler; this is buggy with some shells; # see <1285349658-3122-1-git-send-email-loic.minier@linaro.org> -trap "rm -f $TMPC $TMPO $TMPE" EXIT INT QUIT TERM +trap "rm -f $TMPC $TMPC2 $TMPO $TMPE" EXIT INT QUIT TERM rm -rf config.log @@ -37,6 +38,11 @@ exit 1 } +# Print result for each configuration test +print_config() { + printf "%-30s%s\n" "$1" "$2" +} + # Default CFLAGS CFLAGS="-D_GNU_SOURCE -include config-host.h" BUILD_CFLAGS="" @@ -130,28 +136,81 @@ targetos="" cpu="" -cross_prefix=${cross_prefix-${CROSS_COMPILE}} -cc="${CC-${cross_prefix}gcc}" - # default options show_help="no" exit_val=0 -gfio="no" +gfio_check="no" +libhdfs="no" +pmemblk="no" +devdax="no" +pmem="no" +disable_lex="" +disable_pmem="no" +disable_native="no" +march_set="no" +libiscsi="no" +libnbd="no" +prefix=/usr/local # parse options for opt do optarg=`expr "x$opt" : 'x[^=]*=\(.*\)'` case "$opt" in + --prefix=*) prefix="$optarg" + ;; --cpu=*) cpu="$optarg" ;; + # esx is cross compiled and cannot be detect through simple uname calls + --esx) + esx="yes" + ;; --cc=*) CC="$optarg" ;; 
--extra-cflags=*) CFLAGS="$CFLAGS $optarg" ;; --build-32bit-win) build_32bit_win="yes" ;; - --enable-gfio) - gfio="yes" + --target-win-ver=*) target_win_ver="$optarg" + ;; + --build-static) build_static="yes" + ;; + --enable-gfio) gfio_check="yes" + ;; + --disable-numa) disable_numa="yes" + ;; + --disable-rdma) disable_rdma="yes" + ;; + --disable-rados) disable_rados="yes" + ;; + --disable-rbd) disable_rbd="yes" + ;; + --disable-http) disable_http="yes" + ;; + --disable-gfapi) disable_gfapi="yes" + ;; + --enable-libhdfs) libhdfs="yes" + ;; + --disable-lex) disable_lex="yes" + ;; + --enable-lex) disable_lex="no" + ;; + --disable-shm) no_shm="yes" + ;; + --disable-optimizations) disable_opt="yes" + ;; + --disable-pmem) disable_pmem="yes" + ;; + --enable-cuda) enable_cuda="yes" + ;; + --disable-native) disable_native="yes" + ;; + --with-ime=*) ime_path="$optarg" + ;; + --enable-libiscsi) libiscsi="yes" + ;; + --enable-libnbd) libnbd="yes" + ;; + --disable-tcmalloc) disable_tcmalloc="yes" ;; --help) show_help="yes" @@ -164,22 +223,65 @@ done if test "$show_help" = "yes" ; then - echo "--cpu= Specify target CPU if auto-detect fails" - echo "--cc= Specify compiler to use" - echo "--extra-cflags= Specify extra CFLAGS to pass to compiler" - echo "--build-32bit-win Enable 32-bit build on Windows" - echo "--enable-gfio Enable building of gtk gfio" + echo "--prefix= Use this directory as installation prefix" + echo "--cpu= Specify target CPU if auto-detect fails" + echo "--cc= Specify compiler to use" + echo "--extra-cflags= Specify extra CFLAGS to pass to compiler" + echo "--build-32bit-win Enable 32-bit build on Windows" + echo "--target-win-ver= Minimum version of Windows to target (XP or 7)" + echo "--build-static Build a static fio" + echo "--esx Configure build options for esx" + echo "--enable-gfio Enable building of gtk gfio" + echo "--disable-numa Disable libnuma even if found" + echo "--disable-rdma Disable RDMA support even if found" + echo "--disable-rados Disable 
Rados support even if found" + echo "--disable-rbd Disable Rados Block Device even if found" + echo "--disable-http Disable HTTP support even if found" + echo "--disable-gfapi Disable gfapi" + echo "--enable-libhdfs Enable hdfs support" + echo "--disable-lex Disable use of lex/yacc for math" + echo "--disable-pmem Disable pmem based engines even if found" + echo "--enable-lex Enable use of lex/yacc for math" + echo "--disable-shm Disable SHM support" + echo "--disable-optimizations Don't enable compiler optimizations" + echo "--enable-cuda Enable GPUDirect RDMA support" + echo "--disable-native Don't build for native host" + echo "--with-ime= Install path for DDN's Infinite Memory Engine" + echo "--enable-libiscsi Enable iscsi support" + echo "--enable-libnbd Enable libnbd (NBD engine) support" + echo "--disable-tcmalloc Disable tcmalloc support" exit $exit_val fi +cross_prefix=${cross_prefix-${CROSS_COMPILE}} +# Preferred compiler (can be overriden later after we know the platform): +# ${CC} (if set) +# ${cross_prefix}gcc (if cross-prefix specified) +# gcc if available +# clang if available +if test -z "${CC}${cross_prefix}"; then + if has gcc; then + cc=gcc + elif has clang; then + cc=clang + fi +else + cc="${CC-${cross_prefix}gcc}" +fi + if check_define __ANDROID__ ; then targetos="Android" elif check_define __linux__ ; then targetos="Linux" elif check_define __OpenBSD__ ; then targetos='OpenBSD' +elif check_define __NetBSD__ ; then + targetos='NetBSD' elif check_define __sun__ ; then targetos='SunOS' + CFLAGS="$CFLAGS -D_REENTRANT" +elif check_define _WIN32 ; then + targetos='CYGWIN' else targetos=`uname -s` fi @@ -190,11 +292,32 @@ echo >> $config_host_mak echo "CONFIG_TARGET_OS=$targetos" >> $config_host_mak +if test "$no_shm" = "yes" ; then + output_sym "CONFIG_NO_SHM" +fi + +if test "$disable_opt" = "yes" ; then + output_sym "CONFIG_FIO_NO_OPT" +fi + # Some host OSes need non-standard checks for which CPU to use. 
# Note that these checks are broken for cross-compilation: if you're # cross-compiling to one of these OSes then you'll need to specify # the correct CPU with the --cpu option. case $targetos in +AIX|OpenBSD|NetBSD) + # Unless explicitly enabled, turn off lex. + # OpenBSD will hit syntax error when enabled. + if test -z "$disable_lex" ; then + disable_lex="yes" + else + force_no_lex_o="yes" + fi + ;; +FreeBSD) + CFLAGS="$CFLAGS -I/usr/local/include" + LDFLAGS="$LDFLAGS -L/usr/local/lib" + ;; Darwin) # on Leopard most of the system is 32-bit, so we have to ask the kernel if # we can run 64-bit userspace code. @@ -204,6 +327,17 @@ if test -z "$cpu" && test "$(sysctl -n hw.optional.x86_64)" = "1"; then cpu="x86_64" fi + # Error at compile time linking of weak/partial symbols if possible... +cat > $TMPC <> $config_host_mak - echo "BUILD_CFLAGS=$CFLAGS -include config-host.h -D_GNU_SOURCE" >> $config_host_mak - exit 0 + # We now take the regular configuration path without having exit 0 here. + # Flags below are still necessary mostly for MinGW. + build_static="yes" + socklen_t="yes" + rusage_thread="yes" + fdatasync="yes" + clock_gettime="yes" # clock_monotonic probe has dependency on this + clock_monotonic="yes" + gettimeofday="yes" + sched_idle="yes" + tcp_nodelay="yes" + ipv6="yes" ;; esac +# Now we know the target platform we can have another guess at the preferred +# compiler when it wasn't explictly set +if test -z "${CC}${cross_prefix}"; then + if test "$targetos" = "FreeBSD" || test "$targetos" = "Darwin"; then + if has clang; then + cc=clang + fi + fi +fi +if test -z "$cc"; then + echo "configure: failed to find compiler" + exit 1 +fi + if test ! 
-z "$cpu" ; then # command line argument : @@ -276,6 +434,8 @@ fi elif check_define __arm__ ; then cpu="arm" +elif check_define __aarch64__ ; then + cpu="aarch64" elif check_define __hppa__ ; then cpu="hppa" else @@ -288,7 +448,7 @@ cpu="$cpu" ;; i386|i486|i586|i686|i86pc|BePC) - cpu="i386" + cpu="x86" ;; x86_64|amd64) cpu="x86_64" @@ -296,6 +456,9 @@ armv*b|armv*l|arm) cpu="arm" ;; + aarch64) + cpu="arm64" + ;; hppa|parisc|parisc64) cpu="hppa" ;; @@ -310,22 +473,12 @@ ;; esac -if test -z "$CC" ; then - if test "$targetos" = "FreeBSD"; then - if has clang; then - CC=clang - else - CC=gcc - fi - fi -fi - -cc="${CC-${cross_prefix}gcc}" - ########################################## # check cross compile -cross_compile="no" +if test "$cross_compile" != "yes" ; then + cross_compile="no" +fi cat > $TMPC < $TMPC < @@ -371,14 +526,27 @@ fi -echo "Operating system $targetos" -echo "CPU $cpu" -echo "Big endian $bigendian" -echo "Compiler $cc" -echo "Cross compile $cross_compile" +print_config "Operating system" "$targetos" +print_config "CPU" "$cpu" +print_config "Big endian" "$bigendian" +if test ! 
-z "$target_win_ver"; then + print_config "Target Windows version" "$target_win_ver" +fi +print_config "Compiler" "$cc" +print_config "Cross compile" "$cross_compile" echo ########################################## +# See if we need to build a static build +if test "$build_static" = "yes" ; then + CFLAGS="$CFLAGS -ffunction-sections -fdata-sections" + LDFLAGS="$LDFLAGS -static -Wl,--gc-sections" +else + build_static="no" +fi +print_config "Static build" "$build_static" + +########################################## # check for wordsize wordsize="0" cat > $TMPC < $TMPC < int main(void) @@ -416,12 +586,15 @@ zlib=yes LIBS="-lz $LIBS" fi -echo "zlib $zlib" +print_config "zlib" "$zlib" ########################################## # linux-aio probe -libaio="no" -cat > $TMPC < $TMPC < #include int main(void) @@ -430,21 +603,26 @@ return 0; } EOF -if compile_prog "" "-laio" "libaio" ; then - libaio=yes - LIBS="-laio $LIBS" -else - if test "$libaio" = "yes" ; then - feature_not_found "linux AIO" "libaio-dev or libaio-devel" + if compile_prog "" "-laio" "libaio" ; then + libaio=yes + LIBS="-laio $LIBS" + else + if test "$libaio" = "yes" ; then + feature_not_found "linux AIO" "libaio-dev or libaio-devel" + fi + libaio=no fi - libaio=no fi -echo "Linux AIO support $libaio" +print_config "Linux AIO support" "$libaio" ########################################## # posix aio probe -posix_aio="no" -posix_aio_lrt="no" +if test "$posix_aio" != "yes" ; then + posix_aio="no" +fi +if test "$posix_aio_lrt" != "yes" ; then + posix_aio_lrt="no" +fi cat > $TMPC < int main(void) @@ -456,17 +634,19 @@ EOF if compile_prog "" "" "posixaio" ; then posix_aio="yes" -elif compile_prog "" "-lrt" "posixaio"; then +elif compile_prog "" "-lrt" "posixaio -lrt"; then posix_aio="yes" posix_aio_lrt="yes" LIBS="-lrt $LIBS" fi -echo "POSIX AIO support $posix_aio" -echo "POSIX AIO support needs -lrt $posix_aio_lrt" +print_config "POSIX AIO support" "$posix_aio" +print_config "POSIX AIO support needs -lrt" 
"$posix_aio_lrt" ########################################## # posix aio fsync probe -posix_aio_fsync="no" +if test "$posix_aio_fsync" != "yes" ; then + posix_aio_fsync="no" +fi if test "$posix_aio" = "yes" ; then cat > $TMPC < @@ -482,11 +662,43 @@ posix_aio_fsync=yes fi fi -echo "POSIX AIO fsync $posix_aio_fsync" +print_config "POSIX AIO fsync" "$posix_aio_fsync" + +########################################## +# POSIX pshared attribute probe +if test "$posix_pshared" != "yes" ; then + posix_pshared="no" +fi +cat > $TMPC < +int main(void) +{ +#if defined(_POSIX_THREAD_PROCESS_SHARED) && ((_POSIX_THREAD_PROCESS_SHARED + 0) > 0) +# if defined(__CYGWIN__) +# error "_POSIX_THREAD_PROCESS_SHARED is buggy on Cygwin" +# elif defined(__APPLE__) +# include +# include +# if TARGET_OS_MAC && MAC_OS_X_VERSION_MIN_REQUIRED < 1070 +# error "_POSIX_THREAD_PROCESS_SHARED is buggy/unsupported prior to OSX 10.7" +# endif +# endif +#else +# error "_POSIX_THREAD_PROCESS_SHARED is unsupported" +#endif + return 0; +} +EOF +if compile_prog "" "$LIBS" "posix_pshared" ; then + posix_pshared=yes +fi +print_config "POSIX pshared support" "$posix_pshared" ########################################## # solaris aio probe -solaris_aio="no" +if test "$solaris_aio" != "yes" ; then + solaris_aio="no" +fi cat > $TMPC < #include @@ -502,20 +714,23 @@ solaris_aio=yes LIBS="-laio $LIBS" fi -echo "Solaris AIO support $solaris_aio" +print_config "Solaris AIO support" "$solaris_aio" ########################################## -# __sync_fetch_and_and test -sfaa="no" +# __sync_fetch_and_add test +if test "$sfaa" != "yes" ; then + sfaa="no" +fi cat > $TMPC << EOF -static int sfaa(int *ptr) +#include +static int sfaa(uint64_t *ptr) { return __sync_fetch_and_add(ptr, 0); } int main(int argc, char **argv) { - int val = 42; + uint64_t val = 42; sfaa(&val); return val; } @@ -523,29 +738,70 @@ if compile_prog "" "" "__sync_fetch_and_add()" ; then sfaa="yes" fi -echo "__sync_fetch_and_add $sfaa" +print_config 
"__sync_fetch_and_add" "$sfaa" + +########################################## +# __sync_synchronize() test +if test "$sync_sync" != "yes" ; then + sync_sync="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + __sync_synchronize(); + return 0; +} +EOF +if compile_prog "" "" "__sync_synchronize()" ; then + sync_sync="yes" +fi +print_config "__sync_synchronize" "$sync_sync" + +########################################## +# __sync_val_compare_and_swap() test +if test "$cmp_swap" != "yes" ; then + cmp_swap="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + int x = 0; + return __sync_val_compare_and_swap(&x, 1, 2); +} +EOF +if compile_prog "" "" "__sync_val_compare_and_swap()" ; then + cmp_swap="yes" +fi +print_config "__sync_val_compare_and_swap" "$cmp_swap" ########################################## # libverbs probe -libverbs="no" +if test "$libverbs" != "yes" ; then + libverbs="no" +fi cat > $TMPC << EOF -#include -#include +#include int main(int argc, char **argv) { struct ibv_pd *pd = ibv_alloc_pd(NULL); return 0; } EOF -if compile_prog "" "-libverbs" "libverbs" ; then +if test "$disable_rdma" != "yes" && compile_prog "" "-libverbs" "libverbs" ; then libverbs="yes" LIBS="-libverbs $LIBS" fi -echo "libverbs $libverbs" +print_config "libverbs" "$libverbs" ########################################## # rdmacm probe -rdmacm="no" +if test "$rdmacm" != "yes" ; then + rdmacm="no" +fi cat > $TMPC << EOF #include #include @@ -555,17 +811,54 @@ return 0; } EOF -if compile_prog "" "-lrdmacm" "rdma"; then +if test "$disable_rdma" != "yes" && compile_prog "" "-lrdmacm" "rdma"; then rdmacm="yes" LIBS="-lrdmacm $LIBS" fi -echo "rdmacm $rdmacm" +print_config "rdmacm" "$rdmacm" + +########################################## +# asprintf() and vasprintf() probes +if test "$have_asprintf" != "yes" ; then + have_asprintf="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + return asprintf(NULL, "%s", "str") == 
0; +} +EOF +if compile_prog "" "" "have_asprintf"; then + have_asprintf="yes" +fi +print_config "asprintf()" "$have_asprintf" + +if test "$have_vasprintf" != "yes" ; then + have_vasprintf="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + return vasprintf(NULL, "%s", NULL) == 0; +} +EOF +if compile_prog "" "" "have_vasprintf"; then + have_vasprintf="yes" +fi +print_config "vasprintf()" "$have_vasprintf" ########################################## # Linux fallocate probe -linux_fallocate="no" +if test "$linux_fallocate" != "yes" ; then + linux_fallocate="no" +fi cat > $TMPC << EOF #include +#include #include int main(int argc, char **argv) { @@ -576,11 +869,13 @@ if compile_prog "" "" "linux_fallocate"; then linux_fallocate="yes" fi -echo "Linux fallocate $linux_fallocate" +print_config "Linux fallocate" "$linux_fallocate" ########################################## # POSIX fadvise probe -posix_fadvise="no" +if test "$posix_fadvise" != "yes" ; then + posix_fadvise="no" +fi cat > $TMPC << EOF #include #include @@ -593,11 +888,13 @@ if compile_prog "" "" "posix_fadvise"; then posix_fadvise="yes" fi -echo "POSIX fadvise $posix_fadvise" +print_config "POSIX fadvise" "$posix_fadvise" ########################################## # POSIX fallocate probe -posix_fallocate="no" +if test "$posix_fallocate" != "yes" ; then + posix_fallocate="no" +fi cat > $TMPC << EOF #include #include @@ -610,12 +907,16 @@ if compile_prog "" "" "posix_fallocate"; then posix_fallocate="yes" fi -echo "POSIX fallocate $posix_fallocate" +print_config "POSIX fallocate" "$posix_fallocate" ########################################## # sched_set/getaffinity 2 or 3 argument test -linux_2arg_affinity="no" -linux_3arg_affinity="no" +if test "$linux_2arg_affinity" != "yes" ; then + linux_2arg_affinity="no" +fi +if test "$linux_3arg_affinity" != "yes" ; then + linux_3arg_affinity="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -639,12 +940,14 @@ 
linux_2arg_affinity="yes" fi fi -echo "sched_setaffinity(3 arg) $linux_3arg_affinity" -echo "sched_setaffinity(2 arg) $linux_2arg_affinity" +print_config "sched_setaffinity(3 arg)" "$linux_3arg_affinity" +print_config "sched_setaffinity(2 arg)" "$linux_2arg_affinity" ########################################## # clock_gettime probe -clock_gettime="no" +if test "$clock_gettime" != "yes" ; then + clock_gettime="no" +fi cat > $TMPC << EOF #include #include @@ -659,11 +962,13 @@ clock_gettime="yes" LIBS="-lrt $LIBS" fi -echo "clock_gettime $clock_gettime" +print_config "clock_gettime" "$clock_gettime" ########################################## # CLOCK_MONOTONIC probe -clock_monotonic="no" +if test "$clock_monotonic" != "yes" ; then + clock_monotonic="no" +fi if test "$clock_gettime" = "yes" ; then cat > $TMPC << EOF #include @@ -677,11 +982,33 @@ clock_monotonic="yes" fi fi -echo "CLOCK_MONOTONIC $clock_monotonic" +print_config "CLOCK_MONOTONIC" "$clock_monotonic" + +########################################## +# CLOCK_MONOTONIC_RAW probe +if test "$clock_monotonic_raw" != "yes" ; then + clock_monotonic_raw="no" +fi +if test "$clock_gettime" = "yes" ; then + cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return clock_gettime(CLOCK_MONOTONIC_RAW, NULL); +} +EOF + if compile_prog "" "$LIBS" "clock monotonic"; then + clock_monotonic_raw="yes" + fi +fi +print_config "CLOCK_MONOTONIC_RAW" "$clock_monotonic_raw" ########################################## # CLOCK_MONOTONIC_PRECISE probe -clock_monotonic_precise="no" +if test "$clock_monotonic_precise" != "yes" ; then + clock_monotonic_precise="no" +fi if test "$clock_gettime" = "yes" ; then cat > $TMPC << EOF #include @@ -695,11 +1022,33 @@ clock_monotonic_precise="yes" fi fi -echo "CLOCK_MONOTONIC_PRECISE $clock_monotonic_precise" +print_config "CLOCK_MONOTONIC_PRECISE" "$clock_monotonic_precise" + +########################################## +# clockid_t probe +if test "$clockid_t" != "yes" ; then 
+ clockid_t="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + volatile clockid_t cid; + memset((void*)&cid, 0, sizeof(cid)); + return 0; +} +EOF +if compile_prog "" "$LIBS" "clockid_t"; then + clockid_t="yes" +fi +print_config "clockid_t" "$clockid_t" ########################################## # gettimeofday() probe -gettimeofday="no" +if test "$gettimeofday" != "yes" ; then + gettimeofday="no" +fi cat > $TMPC << EOF #include #include @@ -712,11 +1061,13 @@ if compile_prog "" "" "gettimeofday"; then gettimeofday="yes" fi -echo "gettimeofday $gettimeofday" +print_config "gettimeofday" "$gettimeofday" ########################################## # fdatasync() probe -fdatasync="no" +if test "$fdatasync" != "yes" ; then + fdatasync="no" +fi cat > $TMPC << EOF #include #include @@ -728,11 +1079,13 @@ if compile_prog "" "" "fdatasync"; then fdatasync="yes" fi -echo "fdatasync $fdatasync" +print_config "fdatasync" "$fdatasync" ########################################## # sync_file_range() probe -sync_file_range="no" +if test "$sync_file_range" != "yes" ; then + sync_file_range="no" +fi cat > $TMPC << EOF #include #include @@ -748,11 +1101,13 @@ if compile_prog "" "" "sync_file_range"; then sync_file_range="yes" fi -echo "sync_file_range $sync_file_range" +print_config "sync_file_range" "$sync_file_range" ########################################## # ext4 move extent probe -ext4_me="no" +if test "$ext4_me" != "yes" ; then + ext4_me="no" +fi cat > $TMPC << EOF #include #include @@ -770,11 +1125,13 @@ # work. Takes a while to bubble back. 
ext4_me="yes" fi -echo "EXT4 move extent $ext4_me" +print_config "EXT4 move extent" "$ext4_me" ########################################## # splice probe -linux_splice="no" +if test "$linux_splice" != "yes" ; then + linux_splice="no" +fi cat > $TMPC << EOF #include #include @@ -786,11 +1143,13 @@ if compile_prog "" "" "linux splice"; then linux_splice="yes" fi -echo "Linux splice(2) $linux_splice" +print_config "Linux splice(2)" "$linux_splice" ########################################## # GUASI probe -guasi="no" +if test "$guasi" != "yes" ; then + guasi="no" +fi cat > $TMPC << EOF #include #include @@ -803,31 +1162,13 @@ if compile_prog "" "" "guasi"; then guasi="yes" fi -echo "GUASI $guasi" - -########################################## -# fusion-aw probe -fusion_aw="no" -cat > $TMPC << EOF -#include -int main(int argc, char **argv) -{ - nvm_version_t ver_info; - nvm_handle_t handle; - - handle = nvm_get_handle(0, &ver_info); - return nvm_atomic_write(handle, 0, 0, 0); -} -EOF -if compile_prog "" "-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -lvsl -ldl" "fusion-aw"; then - LIBS="-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -lvsl -ldl $LIBS" - fusion_aw="yes" -fi -echo "Fusion-io atomic engine $fusion_aw" +print_config "GUASI" "$guasi" ########################################## # libnuma probe -libnuma="no" +if test "$libnuma" != "yes" ; then + libnuma="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -835,14 +1176,14 @@ return numa_available(); } EOF -if compile_prog "" "-lnuma" "libnuma"; then +if test "$disable_numa" != "yes" && compile_prog "" "-lnuma" "libnuma"; then libnuma="yes" LIBS="-lnuma $LIBS" fi -echo "libnuma $libnuma" +print_config "libnuma" "$libnuma" ########################################## -# libnuma 2.x version API +# libnuma 2.x version API, initialize with "no" only if $libnuma is set to "yes" if test "$libnuma" = "yes" ; then libnuma_v2="no" cat > $TMPC << EOF @@ -850,53 +1191,81 @@ int main(int argc, char **argv) { 
struct bitmask *mask = numa_parse_nodestring(NULL); - return 0; + return mask->size == 0; } EOF if compile_prog "" "" "libnuma api"; then libnuma_v2="yes" fi -echo "libnuma v2 $libnuma_v2" +print_config "libnuma v2" "$libnuma_v2" fi ########################################## # strsep() probe -strsep="no" +if test "$strsep" != "yes" ; then + strsep="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) { - strsep(NULL, NULL); + static char *string = "This is a string"; + strsep(&string, "needle"); return 0; } EOF if compile_prog "" "" "strsep"; then strsep="yes" fi -echo "strsep $strsep" +print_config "strsep" "$strsep" ########################################## # strcasestr() probe -strcasestr="no" +if test "$strcasestr" != "yes" ; then + strcasestr="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) { - strcasestr(NULL, NULL); - return 0; + return strcasestr(argv[0], argv[1]) != NULL; } EOF if compile_prog "" "" "strcasestr"; then strcasestr="yes" fi -echo "strcasestr $strcasestr" +print_config "strcasestr" "$strcasestr" + +########################################## +# strlcat() probe +if test "$strlcat" != "yes" ; then + strlcat="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + static char dst[64]; + static char *string = "This is a string"; + memset(dst, 0, sizeof(dst)); + strlcat(dst, string, sizeof(dst)); + return 0; +} +EOF +if compile_prog "" "" "strlcat"; then + strlcat="yes" +fi +print_config "strlcat" "$strlcat" ########################################## # getopt_long_only() probe -getopt_long_only="no" +if test "$getopt_long_only" != "yes" ; then + getopt_long_only="no" +fi cat > $TMPC << EOF #include #include +#include int main(int argc, char **argv) { int c = getopt_long_only(argc, argv, NULL, NULL, NULL); @@ -906,11 +1275,13 @@ if compile_prog "" "" "getopt_long_only"; then getopt_long_only="yes" fi -echo "getopt_long_only() $getopt_long_only" +print_config "getopt_long_only()" 
"$getopt_long_only" ########################################## # inet_aton() probe -inet_aton="no" +if test "$inet_aton" != "yes" ; then + inet_aton="no" +fi cat > $TMPC << EOF #include #include @@ -924,11 +1295,13 @@ if compile_prog "" "" "inet_aton"; then inet_aton="yes" fi -echo "inet_aton $inet_aton" +print_config "inet_aton" "$inet_aton" ########################################## # socklen_t probe -socklen_t="no" +if test "$socklen_t" != "yes" ; then + socklen_t="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -940,14 +1313,16 @@ if compile_prog "" "" "socklen_t"; then socklen_t="yes" fi -echo "socklen_t $socklen_t" +print_config "socklen_t" "$socklen_t" ########################################## # Whether or not __thread is supported for TLS -tls_thread="no" +if test "$tls_thread" != "yes" ; then + tls_thread="no" +fi cat > $TMPC << EOF #include -static int __thread ret; +static __thread int ret; int main(int argc, char **argv) { return ret; @@ -956,11 +1331,14 @@ if compile_prog "" "" "__thread"; then tls_thread="yes" fi -echo "__thread $tls_thread" +print_config "__thread" "$tls_thread" ########################################## # Check if we have required gtk/glib support for gfio -if test "$gfio" = "yes" ; then +if test "$gfio" != "yes" ; then + gfio="no" +fi +if test "$gfio_check" = "yes" ; then cat > $TMPC << EOF #include #include @@ -970,39 +1348,46 @@ gdk_threads_enter(); gdk_threads_leave(); - printf("%d", GTK_CHECK_VERSION(2, 18, 0)); + return GTK_CHECK_VERSION(2, 18, 0) ? 0 : 1; /* 0 on success */ } EOF -GTK_CFLAGS=$(pkg-config --cflags gtk+-2.0 gthread-2.0) +GTK_CFLAGS=$(${cross_prefix}pkg-config --cflags gtk+-2.0 gthread-2.0) +ORG_LDFLAGS=$LDFLAGS +LDFLAGS=$(echo $LDFLAGS | sed s/"-static"//g) if test "$?" != "0" ; then echo "configure: gtk and gthread not found" exit 1 fi -GTK_LIBS=$(pkg-config --libs gtk+-2.0 gthread-2.0) +GTK_LIBS=$(${cross_prefix}pkg-config --libs gtk+-2.0 gthread-2.0) if test "$?" 
!= "0" ; then echo "configure: gtk and gthread not found" exit 1 fi -if compile_prog "$GTK_CFLAGS" "$GTK_LIBS" "gfio" ; then - r=$($TMPE) - if test "$r" != "0" ; then +if ! ${cross_prefix}pkg-config --atleast-version 2.18.0 gtk+-2.0; then + echo "GTK found, but need version 2.18 or higher" + gfio="no" +else + if compile_prog "$GTK_CFLAGS" "$GTK_LIBS" "gfio" ; then gfio="yes" - LIBS="$LIBS $GTK_LIBS" + GFIO_LIBS="$LIBS $GTK_LIBS" CFLAGS="$CFLAGS $GTK_CFLAGS" else - echo "GTK found, but need version 2.18 or higher" + echo "Please install gtk and gdk libraries" gfio="no" fi -else - echo "Please install gtk and gdk libraries" - gfio="no" fi +LDFLAGS=$ORG_LDFLAGS fi -echo "gtk 2.18 or higher $gfio" +if test "$gfio_check" = "yes" ; then + print_config "gtk 2.18 or higher" "$gfio" +fi +########################################## # Check whether we have getrusage(RUSAGE_THREAD) -rusage_thread="no" +if test "$rusage_thread" != "yes" ; then + rusage_thread="no" +fi cat > $TMPC << EOF #include #include @@ -1016,11 +1401,13 @@ if compile_prog "" "" "RUSAGE_THREAD"; then rusage_thread="yes" fi -echo "RUSAGE_THREAD $rusage_thread" +print_config "RUSAGE_THREAD" "$rusage_thread" ########################################## # Check whether we have SCHED_IDLE -sched_idle="no" +if test "$sched_idle" != "yes" ; then + sched_idle="no" +fi cat > $TMPC << EOF #include int main(int argc, char **argv) @@ -1032,11 +1419,13 @@ if compile_prog "" "" "SCHED_IDLE"; then sched_idle="yes" fi -echo "SCHED_IDLE $sched_idle" +print_config "SCHED_IDLE" "$sched_idle" ########################################## # Check whether we have TCP_NODELAY -tcp_nodelay="no" +if test "$tcp_nodelay" != "yes" ; then + tcp_nodelay="no" +fi cat > $TMPC << EOF #include #include @@ -1050,41 +1439,996 @@ if compile_prog "" "" "TCP_NODELAY"; then tcp_nodelay="yes" fi -echo "TCP_NODELAY $tcp_nodelay" +print_config "TCP_NODELAY" "$tcp_nodelay" ########################################## -# Check whether we have RLIMIT_MEMLOCK 
-rlimit_memlock="no" +# Check whether we have SO_SNDBUF +if test "$window_size" != "yes" ; then + window_size="no" +fi cat > $TMPC << EOF -#include -#include +#include +#include +#include +#include int main(int argc, char **argv) { - struct rlimit rl; - return getrlimit(RLIMIT_MEMLOCK, &rl); + setsockopt(0, SOL_SOCKET, SO_SNDBUF, NULL, 0); + setsockopt(0, SOL_SOCKET, SO_RCVBUF, NULL, 0); } EOF -if compile_prog "" "" "RLIMIT_MEMLOCK"; then - rlimit_memlock="yes" +if compile_prog "" "" "SO_SNDBUF"; then + window_size="yes" fi -echo "RLIMIT_MEMLOCK $rlimit_memlock" +print_config "Net engine window_size" "$window_size" ########################################## -# Check whether we have pwritev/preadv -pwritev="no" +# Check whether we have TCP_MAXSEG +if test "$mss" != "yes" ; then + mss="no" +fi cat > $TMPC << EOF #include -#include -int main(int argc, char **argv) +#include +#include +#include +#include +#include +int main(int argc, char **argv) +{ + return setsockopt(0, IPPROTO_TCP, TCP_MAXSEG, NULL, 0); +} +EOF +if compile_prog "" "" "TCP_MAXSEG"; then + mss="yes" +fi +print_config "TCP_MAXSEG" "$mss" + +########################################## +# Check whether we have RLIMIT_MEMLOCK +if test "$rlimit_memlock" != "yes" ; then + rlimit_memlock="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + struct rlimit rl; + return getrlimit(RLIMIT_MEMLOCK, &rl); +} +EOF +if compile_prog "" "" "RLIMIT_MEMLOCK"; then + rlimit_memlock="yes" +fi +print_config "RLIMIT_MEMLOCK" "$rlimit_memlock" + +########################################## +# Check whether we have pwritev/preadv +if test "$pwritev" != "yes" ; then + pwritev="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return pwritev(0, NULL, 1, 0) + preadv(0, NULL, 1, 0); +} +EOF +if compile_prog "" "" "pwritev"; then + pwritev="yes" +fi +print_config "pwritev/preadv" "$pwritev" + +########################################## +# Check whether we have 
pwritev2/preadv2 +if test "$pwritev2" != "yes" ; then + pwritev2="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return pwritev2(0, NULL, 1, 0, 0) + preadv2(0, NULL, 1, 0, 0); +} +EOF +if compile_prog "" "" "pwritev2"; then + pwritev2="yes" +fi +print_config "pwritev2/preadv2" "$pwritev2" + +########################################## +# Check whether we have the required functions for ipv6 +if test "$ipv6" != "yes" ; then + ipv6="no" +fi +cat > $TMPC << EOF +#include +#include +#include +#include +#include +int main(int argc, char **argv) +{ + struct addrinfo hints; + struct in6_addr addr; + int ret; + + ret = getaddrinfo(NULL, NULL, &hints, NULL); + freeaddrinfo(NULL); + printf("%s\n", gai_strerror(ret)); + addr = in6addr_any; + return 0; +} +EOF +if compile_prog "" "" "ipv6"; then + ipv6="yes" +fi +print_config "IPv6 helpers" "$ipv6" + +########################################## +# check for http +if test "$http" != "yes" ; then + http="no" +fi +# check for openssl >= 1.1.0, which uses an opaque HMAC_CTX pointer +cat > $TMPC << EOF +#include +#include + +int main(int argc, char **argv) +{ + CURL *curl; + HMAC_CTX *ctx; + + curl = curl_easy_init(); + curl_easy_cleanup(curl); + + ctx = HMAC_CTX_new(); + HMAC_CTX_reset(ctx); + HMAC_CTX_free(ctx); + return 0; +} +EOF +# openssl < 1.1.0 uses the HMAC_CTX type directly +cat > $TMPC2 << EOF +#include +#include + +int main(int argc, char **argv) +{ + CURL *curl; + HMAC_CTX ctx; + + curl = curl_easy_init(); + curl_easy_cleanup(curl); + + HMAC_CTX_init(&ctx); + HMAC_CTX_cleanup(&ctx); + return 0; +} +EOF +if test "$disable_http" != "yes"; then + HTTP_LIBS="-lcurl -lssl -lcrypto" + if compile_prog "" "$HTTP_LIBS" "curl-new-ssl"; then + output_sym "CONFIG_HAVE_OPAQUE_HMAC_CTX" + http="yes" + LIBS="$HTTP_LIBS $LIBS" + elif mv $TMPC2 $TMPC && compile_prog "" "$HTTP_LIBS" "curl-old-ssl"; then + http="yes" + LIBS="$HTTP_LIBS $LIBS" + fi +fi +print_config "http engine" "$http" + 
+########################################## +# check for rados +if test "$rados" != "yes" ; then + rados="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + rados_t cluster; + rados_ioctx_t io_ctx; + const char cluster_name[] = "ceph"; + const char user_name[] = "client.admin"; + const char pool[] = "rados"; + + /* The rados_create2 signature required was only introduced in ceph 0.65 */ + rados_create2(&cluster, cluster_name, user_name, 0); + rados_ioctx_create(cluster, pool, &io_ctx); + + return 0; +} +EOF +if test "$disable_rados" != "yes" && compile_prog "" "-lrados" "rados"; then + LIBS="-lrados $LIBS" + rados="yes" +fi +print_config "Rados engine" "$rados" + +########################################## +# check for rbd +if test "$rbd" != "yes" ; then + rbd="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + rados_t cluster; + rados_ioctx_t io_ctx; + const char cluster_name[] = "ceph"; + const char user_name[] = "client.admin"; + const char pool[] = "rbd"; + int major, minor, extra; + + rbd_version(&major, &minor, &extra); + /* The rados_create2 signature required was only introduced in ceph 0.65 */ + rados_create2(&cluster, cluster_name, user_name, 0); + rados_ioctx_create(cluster, pool, &io_ctx); + + return 0; +} +EOF +if test "$disable_rbd" != "yes" && compile_prog "" "-lrbd -lrados" "rbd"; then + LIBS="-lrbd -lrados $LIBS" + rbd="yes" +fi +print_config "Rados Block Device engine" "$rbd" + +########################################## +# check for rbd_poll +if test "$rbd_poll" != "yes" ; then + rbd_poll="no" +fi +if test "$rbd" = "yes"; then +cat > $TMPC << EOF +#include +#include + +int main(int argc, char **argv) +{ + rbd_image_t image; + rbd_completion_t comp; + + int fd = eventfd(0, EFD_NONBLOCK); + rbd_set_image_notification(image, fd, EVENT_TYPE_EVENTFD); + rbd_poll_io_events(image, comp, 1); + + return 0; +} +EOF +if compile_prog "" "-lrbd -lrados" "rbd"; then + rbd_poll="yes" +fi +print_config 
"rbd_poll" "$rbd_poll" +fi + +########################################## +# check for rbd_invalidate_cache() +if test "$rbd_inval" != "yes" ; then + rbd_inval="no" +fi +if test "$rbd" = "yes"; then +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + rbd_image_t image; + + return rbd_invalidate_cache(image); +} +EOF +if compile_prog "" "-lrbd -lrados" "rbd"; then + rbd_inval="yes" +fi +print_config "rbd_invalidate_cache" "$rbd_inval" +fi + +########################################## +# Check whether we have setvbuf +if test "$setvbuf" != "yes" ; then + setvbuf="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + FILE *f = NULL; + char buf[80]; + setvbuf(f, buf, _IOFBF, sizeof(buf)); + return 0; +} +EOF +if compile_prog "" "" "setvbuf"; then + setvbuf="yes" +fi +print_config "setvbuf" "$setvbuf" + +########################################## +# check for gfapi +if test "$gfapi" != "yes" ; then + gfapi="no" +fi +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + glfs_t *g = glfs_new("foo"); + + return 0; +} +EOF +if test "$disable_gfapi" != "yes" && compile_prog "" "-lgfapi -lglusterfs" "gfapi"; then + LIBS="-lgfapi -lglusterfs $LIBS" + gfapi="yes" +fi +print_config "Gluster API engine" "$gfapi" + +########################################## +# check for gfapi fadvise support, initialize with "no" only if $gfapi is set to "yes" +if test "$gfapi" = "yes" ; then +gf_fadvise="no" +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + struct glfs_fd *fd; + int ret = glfs_fadvise(fd, 0, 0, 1); + + return 0; +} +EOF +if compile_prog "" "-lgfapi -lglusterfs" "gfapi"; then + gf_fadvise="yes" +fi +print_config "Gluster API use fadvise" "$gf_fadvise" +fi + +########################################## +# check for newer gfapi +if test "$gfapi" = "yes" ; then +gf_new="no" +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + return glfs_fsync(NULL, NULL, NULL) && glfs_ftruncate(NULL, 0, NULL, 
NULL); +} +EOF +if compile_prog "" "-lgfapi -lglusterfs" "gf new api"; then + gf_new="yes" +fi +print_config "Gluster new API" "$gf_new" +fi + +########################################## +# check for gfapi trim support +if test "$gf_trim" != "yes" ; then + gf_trim="no" +fi +if test "$gfapi" = "yes" ; then +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + return glfs_discard_async(NULL, 0, 0); +} +EOF +if compile_prog "" "-lgfapi -lglusterfs" "gf trim"; then + gf_trim="yes" +fi +print_config "Gluster API trim support" "$gf_trim" +fi + +########################################## +# Check if we support stckf on s390 +if test "$s390_z196_facilities" != "yes" ; then + s390_z196_facilities="no" +fi +cat > $TMPC << EOF +#define STFLE_BITS_Z196 45 /* various z196 facilities ... */ +int main(int argc, char **argv) +{ + /* We want just 1 double word to be returned. */ + register unsigned long reg0 asm("0") = 0; + unsigned long stfle_bits; + asm volatile(".machine push" "\n\t" + ".machine \"z9-109\"" "\n\t" + "stfle %0" "\n\t" + ".machine pop" "\n" + : "=QS" (stfle_bits), "+d" (reg0) + : : "cc"); + + if ((stfle_bits & (1UL << (63 - STFLE_BITS_Z196))) != 0) + return 0; + else + return -1; +} +EOF +if compile_prog "" "" "s390_z196_facilities"; then + $TMPE + if [ $? 
-eq 0 ]; then + s390_z196_facilities="yes" + fi +fi +print_config "s390_z196_facilities" "$s390_z196_facilities" + +########################################## +# Check if we have required environment variables configured for libhdfs +if test "$libhdfs" = "yes" ; then + hdfs_conf_error=0 + if test "$JAVA_HOME" = "" ; then + echo "configure: JAVA_HOME should be defined to jdk/jvm path" + hdfs_conf_error=1 + fi + if test "$FIO_LIBHDFS_INCLUDE" = "" ; then + echo "configure: FIO_LIBHDFS_INCLUDE should be defined to libhdfs inlude path" + hdfs_conf_error=1 + fi + if test "$FIO_LIBHDFS_LIB" = "" ; then + echo "configure: FIO_LIBHDFS_LIB should be defined to libhdfs library path" + hdfs_conf_error=1 + fi + if test "$hdfs_conf_error" = "1" ; then + exit 1 + fi + FIO_HDFS_CPU=$cpu + if test "$FIO_HDFS_CPU" = "x86_64" ; then + FIO_HDFS_CPU="amd64" + fi +fi +print_config "HDFS engine" "$libhdfs" + +########################################## +# Check whether we have MTD +if test "$mtd" != "yes" ; then + mtd="no" +fi +cat > $TMPC << EOF +#include +#include +#include +int main(int argc, char **argv) +{ + struct mtd_write_req ops; + struct mtd_info_user info; + memset(&ops, 0, sizeof(ops)); + info.type = MTD_MLCNANDFLASH; + return ioctl(0, MEMGETINFO, &info); +} +EOF +if compile_prog "" "" "mtd"; then + mtd="yes" +fi +print_config "MTD" "$mtd" + +########################################## +# Check whether we have libpmem +if test "$libpmem" != "yes" ; then + libpmem="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + int rc; + rc = pmem_is_pmem(0, 0); + return 0; +} +EOF +if compile_prog "" "-lpmem" "libpmem"; then + libpmem="yes" + LIBS="-lpmem $LIBS" +fi +print_config "libpmem" "$libpmem" + +########################################## +# Check whether we have libpmemblk +# libpmem is a prerequisite +if test "$libpmemblk" != "yes" ; then + libpmemblk="no" +fi +if test "$libpmem" = "yes"; then + cat > $TMPC << EOF +#include +int main(int argc, char **argv) 
+{ + PMEMblkpool *pbp; + pbp = pmemblk_open("", 0); + return 0; +} +EOF + if compile_prog "" "-lpmemblk" "libpmemblk"; then + libpmemblk="yes" + LIBS="-lpmemblk $LIBS" + fi +fi +print_config "libpmemblk" "$libpmemblk" + +# Choose the ioengines +if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then + pmem="yes" + devdax="yes" + if test "$libpmemblk" = "yes"; then + pmemblk="yes" + fi +fi + +########################################## +# Report whether pmemblk engine is enabled +print_config "PMDK pmemblk engine" "$pmemblk" + +########################################## +# Report whether dev-dax engine is enabled +print_config "PMDK dev-dax engine" "$devdax" + +########################################## +# Report whether libpmem engine is enabled +print_config "PMDK libpmem engine" "$pmem" + +########################################## +# Check whether we support DDN's IME +if test "$libime" != "yes" ; then + libime="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + int rc; + ime_native_init(); + rc = ime_native_finalize(); + return 0; +} +EOF +if compile_prog "-I${ime_path}/include" "-L${ime_path}/lib -lim_client" "libime"; then + libime="yes" + CFLAGS="-I${ime_path}/include $CFLAGS" + LDFLAGS="-Wl,-rpath ${ime_path}/lib -L${ime_path}/lib $LDFLAGS" + LIBS="-lim_client $LIBS" +fi +print_config "DDN's Infinite Memory Engine" "$libime" + +########################################## +# Check if we have required environment variables configured for libiscsi +if test "$libiscsi" = "yes" ; then + if $(pkg-config --atleast-version=1.9.0 libiscsi); then + libiscsi="yes" + libiscsi_cflags=$(pkg-config --cflags libiscsi) + libiscsi_libs=$(pkg-config --libs libiscsi) + else + if test "$libiscsi" = "yes" ; then + echo "libiscsi" "Install libiscsi >= 1.9.0" + fi + libiscsi="no" + fi +fi +print_config "iscsi engine" "$libiscsi" + +########################################## +# Check if we have libnbd (for NBD support). 
+minimum_libnbd=0.9.8 +if test "$libnbd" = "yes" ; then + if $(pkg-config --atleast-version=$minimum_libnbd libnbd); then + libnbd="yes" + libnbd_cflags=$(pkg-config --cflags libnbd) + libnbd_libs=$(pkg-config --libs libnbd) + else + if test "$libnbd" = "yes" ; then + echo "libnbd" "Install libnbd >= $minimum_libnbd" + fi + libnbd="no" + fi +fi +print_config "NBD engine" "$libnbd" + +########################################## +# Check if we have lex/yacc available +yacc="no" +yacc_is_bison="no" +lex="no" +arith="no" +if test "$disable_lex" = "no" || test -z "$disable_lex" ; then +if test "$targetos" != "SunOS" ; then +LEX=$(which lex 2> /dev/null) +if test -x "$LEX" ; then + lex="yes" +fi +YACC=$(which bison 2> /dev/null) +if test -x "$YACC" ; then + yacc="yes" + yacc_is_bison="yes" +else + YACC=$(which yacc 2> /dev/null) + if test -x "$YACC" ; then + yacc="yes" + fi +fi +if test "$yacc" = "yes" && test "$lex" = "yes" ; then + arith="yes" +fi + +if test "$arith" = "yes" ; then +cat > $TMPC << EOF +extern int yywrap(void); + +int main(int argc, char **argv) { - return pwritev(0, NULL, 1, 0) + preadv(0, NULL, 1, 0); + yywrap(); + return 0; } EOF -if compile_prog "" "" "pwritev"; then - pwritev="yes" +if compile_prog "" "-ll" "lex"; then + LIBS="-ll $LIBS" +else + arith="no" +fi +fi +fi +fi + +# Check if lex fails using -o +if test "$arith" = "yes" ; then +if test "$force_no_lex_o" = "yes" ; then + lex_use_o="no" +else +$LEX -o lex.yy.c exp/expression-parser.l 2> /dev/null +if test "$?" 
= "0" ; then + lex_use_o="yes" +else + lex_use_o="no" +fi +fi +fi + +print_config "lex/yacc for arithmetic" "$arith" + +########################################## +# Check whether we have setmntent/getmntent +if test "$getmntent" != "yes" ; then + getmntent="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + FILE *mtab = setmntent(NULL, "r"); + struct mntent *mnt = getmntent(mtab); + endmntent(mtab); + return 0; +} +EOF +if compile_prog "" "" "getmntent"; then + getmntent="yes" +fi +print_config "getmntent" "$getmntent" + +########################################## +# Check whether we have getmntinfo +# These are originally added for BSDs, but may also work +# on other operating systems with getmntinfo(3). + +# getmntinfo(3) for FreeBSD/DragonFlyBSD/OpenBSD. +# Note that NetBSD needs -Werror to catch warning as error. +if test "$getmntinfo" != "yes" ; then + getmntinfo="no" +fi +cat > $TMPC << EOF +#include +#include +#include +int main(int argc, char **argv) +{ + struct statfs *st; + return getmntinfo(&st, MNT_NOWAIT); +} +EOF +if compile_prog "-Werror" "" "getmntinfo"; then + getmntinfo="yes" +fi +print_config "getmntinfo" "$getmntinfo" + +# getmntinfo(3) for NetBSD. +if test "$getmntinfo_statvfs" != "yes" ; then + getmntinfo_statvfs="no" +fi +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + struct statvfs *st; + return getmntinfo(&st, MNT_NOWAIT); +} +EOF +# Skip the test if the one with statfs arg is detected. 
+if test "$getmntinfo" != "yes" && compile_prog "-Werror" "" "getmntinfo_statvfs"; then + getmntinfo_statvfs="yes" + print_config "getmntinfo_statvfs" "$getmntinfo_statvfs" +fi + +########################################## +# Check whether we have _Static_assert +if test "$static_assert" != "yes" ; then + static_assert="no" +fi +cat > $TMPC << EOF +#include +#include +#include + +struct foo { + int a, b; +}; + +int main(int argc, char **argv) +{ + _Static_assert(offsetof(struct foo, a) == 0 , "Check"); + return 0 ; +} +EOF +if compile_prog "" "" "static_assert"; then + static_assert="yes" +fi +print_config "Static Assert" "$static_assert" + +########################################## +# Check whether we have bool / stdbool.h +if test "$have_bool" != "yes" ; then + have_bool="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + bool var = true; + return var != false; +} +EOF +if compile_prog "" "" "bool"; then + have_bool="yes" +fi +print_config "bool" "$have_bool" + +########################################## +# Check whether we have strndup() +strndup="no" +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + char *res = strndup("test string", 8); + + free(res); + return 0; +} +EOF +if compile_prog "" "" "strndup"; then + strndup="yes" +fi +print_config "strndup" "$strndup" + +########################################## +# probe +# Note: presence of implies that is +# also available but not the other way around. 
+if test "$valgrind_dev" != "yes" ; then + valgrind_dev="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return 0; +} +EOF +if compile_prog "" "" "valgrind_dev"; then + valgrind_dev="yes" +fi +print_config "Valgrind headers" "$valgrind_dev" + +########################################## +# probe +if test "$linux_blkzoned" != "yes" ; then + linux_blkzoned="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return 0; +} +EOF +if compile_prog "" "" "linux_blkzoned"; then + linux_blkzoned="yes" +fi +print_config "Zoned block device support" "$linux_blkzoned" + +########################################## +# check march=armv8-a+crc+crypto +if test "$march_armv8_a_crc_crypto" != "yes" ; then + march_armv8_a_crc_crypto="no" +fi +if test "$cpu" = "arm64" ; then + cat > $TMPC < +#include +#include + +int main(void) +{ + /* Can we also do a runtime probe? */ +#if __linux__ + return getauxval(AT_HWCAP); +#else +# error "Don't know how to do runtime probe for ARM CRC32c" +#endif +} +EOF + if compile_prog "-march=armv8-a+crc+crypto" "" "ARM CRC32c"; then + march_armv8_a_crc_crypto="yes" + CFLAGS="$CFLAGS -march=armv8-a+crc+crypto" + march_set="yes" + fi +fi +print_config "march_armv8_a_crc_crypto" "$march_armv8_a_crc_crypto" + +########################################## +# cuda probe +if test "$cuda" != "yes" ; then + cuda="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return cuInit(0); +} +EOF +if test "$enable_cuda" = "yes" && compile_prog "" "-lcuda" "cuda"; then + cuda="yes" + LIBS="-lcuda $LIBS" +fi +print_config "cuda" "$cuda" + +########################################## +# mkdir() probe. 
mingw apparently has a one-argument mkdir :/ +mkdir_two="no" +cat > $TMPC << EOF +#include +#include +int main(int argc, char **argv) +{ + return mkdir("/tmp/bla", 0600); +} +EOF +if compile_prog "" "" "mkdir(a, b)"; then + mkdir_two="yes" +fi +print_config "mkdir(a, b)" "$mkdir_two" + +########################################## +# check for cc -march=native +build_native="no" +cat > $TMPC << EOF +int main(int argc, char **argv) +{ + return 0; +} +EOF +if test "$disable_native" = "no" && test "$disable_opt" != "yes" && \ + compile_prog "-march=native" "" "march=native"; then + build_native="yes" +fi +print_config "Build march=native" "$build_native" + +########################################## +# check for -lcunit +if test "$cunit" != "yes" ; then + cunit="no" +fi +cat > $TMPC << EOF +#include +#include +int main(void) +{ + if (CU_initialize_registry() != CUE_SUCCESS) + return CU_get_error(); + CU_basic_set_mode(CU_BRM_VERBOSE); + CU_basic_run_tests(); + CU_cleanup_registry(); + return CU_get_error(); +} +EOF +if compile_prog "" "-lcunit" "CUnit"; then + cunit="yes" +fi +print_config "CUnit" "$cunit" + +########################################## +# check for __kernel_rwf_t +__kernel_rwf_t="no" +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + __kernel_rwf_t x; + x = 0; + return x; +} +EOF +if compile_prog "" "" "__kernel_rwf_t"; then + __kernel_rwf_t="yes" +fi +print_config "__kernel_rwf_t" "$__kernel_rwf_t" + +########################################## +# check if gcc has -Wimplicit-fallthrough +fallthrough="no" +cat > $TMPC << EOF +int main(int argc, char **argv) +{ + return 0; +} +EOF +if compile_prog "-Wimplicit-fallthrough" "" "-Wimplicit-fallthrough"; then + fallthrough="yes" +fi +print_config "-Wimplicit-fallthrough" "$fallthrough" + +########################################## +# check for MADV_HUGEPAGE support +if test "$thp" != "yes" ; then + thp="no" +fi +if test "$esx" != "yes" ; then + cat > $TMPC < +int main(void) +{ + return 
madvise(0, 0x1000, MADV_HUGEPAGE); +} +EOF + if compile_prog "" "" "thp" ; then + thp=yes + else + if test "$thp" = "yes" ; then + feature_not_found "Transparent Huge Page" "" + fi + thp=no + fi fi -echo "pwritev/preadv $pwritev" +print_config "MADV_HUGEPAGE" "$thp" +########################################## +# check for gettid() +gettid="no" +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return gettid(); +} +EOF +if compile_prog "" "" "gettid"; then + gettid="yes" +fi +print_config "gettid" "$gettid" ############################################################################# @@ -1112,6 +2456,15 @@ if test "$posix_aio_fsync" = "yes" ; then output_sym "CONFIG_POSIXAIO_FSYNC" fi +if test "$posix_pshared" = "yes" ; then + output_sym "CONFIG_PSHARED" +fi +if test "$have_asprintf" = "yes" ; then + output_sym "CONFIG_HAVE_ASPRINTF" +fi +if test "$have_vasprintf" = "yes" ; then + output_sym "CONFIG_HAVE_VASPRINTF" +fi if test "$linux_fallocate" = "yes" ; then output_sym "CONFIG_LINUX_FALLOCATE" fi @@ -1127,6 +2480,12 @@ if test "$sfaa" = "yes" ; then output_sym "CONFIG_SFAA" fi +if test "$sync_sync" = "yes" ; then + output_sym "CONFIG_SYNC_SYNC" +fi +if test "$cmp_swap" = "yes" ; then + output_sym "CONFIG_CMP_SWAP" +fi if test "$libverbs" = "yes" -a "$rdmacm" = "yes" ; then output_sym "CONFIG_RDMA" fi @@ -1136,9 +2495,15 @@ if test "$clock_monotonic" = "yes" ; then output_sym "CONFIG_CLOCK_MONOTONIC" fi +if test "$clock_monotonic_raw" = "yes" ; then + output_sym "CONFIG_CLOCK_MONOTONIC_RAW" +fi if test "$clock_monotonic_precise" = "yes" ; then output_sym "CONFIG_CLOCK_MONOTONIC_PRECISE" fi +if test "$clockid_t" = "yes"; then + output_sym "CONFIG_CLOCKID_T" +fi if test "$gettimeofday" = "yes" ; then output_sym "CONFIG_GETTIMEOFDAY" fi @@ -1156,6 +2521,9 @@ if test "$strcasestr" = "yes" ; then output_sym "CONFIG_STRCASESTR" fi +if test "$strlcat" = "yes" ; then + output_sym "CONFIG_STRLCAT" +fi if test "$getopt_long_only" = "yes" ; then output_sym 
"CONFIG_GETOPT_LONG_ONLY" fi @@ -1174,9 +2542,6 @@ if test "$guasi" = "yes" ; then output_sym "CONFIG_GUASI" fi -if test "$fusion_aw" = "yes" ; then - output_sym "CONFIG_FUSION_AW" -fi if test "$libnuma_v2" = "yes" ; then output_sym "CONFIG_LIBNUMA" fi @@ -1190,7 +2555,11 @@ output_sym "CONFIG_RUSAGE_THREAD" fi if test "$gfio" = "yes" ; then - echo "CONFIG_GFIO=y" >> $config_host_mak + output_sym "CONFIG_GFIO" +fi +if test "$esx" = "yes" ; then + output_sym "CONFIG_ESX" + output_sym "CONFIG_NO_SHM" fi if test "$sched_idle" = "yes" ; then output_sym "CONFIG_SCHED_IDLE" @@ -1198,14 +2567,189 @@ if test "$tcp_nodelay" = "yes" ; then output_sym "CONFIG_TCP_NODELAY" fi +if test "$window_size" = "yes" ; then + output_sym "CONFIG_NET_WINDOWSIZE" +fi +if test "$mss" = "yes" ; then + output_sym "CONFIG_NET_MSS" +fi if test "$rlimit_memlock" = "yes" ; then output_sym "CONFIG_RLIMIT_MEMLOCK" fi if test "$pwritev" = "yes" ; then output_sym "CONFIG_PWRITEV" fi +if test "$pwritev2" = "yes" ; then + output_sym "CONFIG_PWRITEV2" +fi +if test "$ipv6" = "yes" ; then + output_sym "CONFIG_IPV6" +fi +if test "$http" = "yes" ; then + output_sym "CONFIG_HTTP" +fi +if test "$rados" = "yes" ; then + output_sym "CONFIG_RADOS" +fi +if test "$rbd" = "yes" ; then + output_sym "CONFIG_RBD" +fi +if test "$rbd_poll" = "yes" ; then + output_sym "CONFIG_RBD_POLL" +fi +if test "$rbd_inval" = "yes" ; then + output_sym "CONFIG_RBD_INVAL" +fi +if test "$setvbuf" = "yes" ; then + output_sym "CONFIG_SETVBUF" +fi +if test "$s390_z196_facilities" = "yes" ; then + output_sym "CONFIG_S390_Z196_FACILITIES" + CFLAGS="$CFLAGS -march=z9-109" + march_set="yes" +fi +if test "$gfapi" = "yes" ; then + output_sym "CONFIG_GFAPI" +fi +if test "$gf_fadvise" = "yes" ; then + output_sym "CONFIG_GF_FADVISE" +fi +if test "$gf_trim" = "yes" ; then + output_sym "CONFIG_GF_TRIM" +fi +if test "$gf_new" = "yes" ; then + output_sym "CONFIG_GF_NEW_API" +fi +if test "$libhdfs" = "yes" ; then + output_sym "CONFIG_LIBHDFS" + echo 
"FIO_HDFS_CPU=$FIO_HDFS_CPU" >> $config_host_mak + echo "JAVA_HOME=$JAVA_HOME" >> $config_host_mak + echo "FIO_LIBHDFS_INCLUDE=$FIO_LIBHDFS_INCLUDE" >> $config_host_mak + echo "FIO_LIBHDFS_LIB=$FIO_LIBHDFS_LIB" >> $config_host_mak +fi +if test "$mtd" = "yes" ; then + output_sym "CONFIG_MTD" +fi +if test "$pmemblk" = "yes" ; then + output_sym "CONFIG_PMEMBLK" +fi +if test "$devdax" = "yes" ; then + output_sym "CONFIG_LINUX_DEVDAX" +fi +if test "$pmem" = "yes" ; then + output_sym "CONFIG_LIBPMEM" +fi +if test "$libime" = "yes" ; then + output_sym "CONFIG_IME" +fi +if test "$arith" = "yes" ; then + output_sym "CONFIG_ARITHMETIC" + if test "$yacc_is_bison" = "yes" ; then + echo "YACC=$YACC -y" >> $config_host_mak + else + echo "YACC=$YACC" >> $config_host_mak + fi + if test "$lex_use_o" = "yes" ; then + echo "CONFIG_LEX_USE_O=y" >> $config_host_mak + fi +fi +if test "$getmntent" = "yes" ; then + output_sym "CONFIG_GETMNTENT" +fi +if test "$getmntinfo" = "yes" ; then + output_sym "CONFIG_GETMNTINFO" +fi +if test "$getmntinfo_statvfs" = "yes" ; then + output_sym "CONFIG_GETMNTINFO_STATVFS" +fi +if test "$static_assert" = "yes" ; then + output_sym "CONFIG_STATIC_ASSERT" +fi +if test "$have_bool" = "yes" ; then + output_sym "CONFIG_HAVE_BOOL" +fi +if test "$strndup" = "yes" ; then + output_sym "CONFIG_HAVE_STRNDUP" +fi +if test "$disable_opt" = "yes" ; then + output_sym "CONFIG_DISABLE_OPTIMIZATIONS" +fi +if test "$valgrind_dev" = "yes"; then + output_sym "CONFIG_VALGRIND_DEV" +fi +if test "$linux_blkzoned" = "yes" ; then + output_sym "CONFIG_LINUX_BLKZONED" +fi +if test "$zlib" = "no" ; then + echo "Consider installing zlib-dev (zlib-devel, some fio features depend on it." + if test "$build_static" = "yes"; then + echo "Note that some distros have separate packages for static libraries." 
+ fi +fi +if test "$march_armv8_a_crc_crypto" = "yes" ; then + output_sym "ARCH_HAVE_CRC_CRYPTO" +fi +if test "$cuda" = "yes" ; then + output_sym "CONFIG_CUDA" +fi +if test "$mkdir_two" = "yes" ; then + output_sym "CONFIG_HAVE_MKDIR_TWO" +fi +if test "$march_set" = "no" && test "$build_native" = "yes" ; then + output_sym "CONFIG_BUILD_NATIVE" +fi +if test "$cunit" = "yes" ; then + output_sym "CONFIG_HAVE_CUNIT" +fi +if test "$__kernel_rwf_t" = "yes"; then + output_sym "CONFIG_HAVE_KERNEL_RWF_T" +fi +if test "$gettid" = "yes"; then + output_sym "CONFIG_HAVE_GETTID" +fi +if test "$fallthrough" = "yes"; then + CFLAGS="$CFLAGS -Wimplicit-fallthrough" +fi +if test "$thp" = "yes" ; then + output_sym "CONFIG_HAVE_THP" +fi +if test "$libiscsi" = "yes" ; then + output_sym "CONFIG_LIBISCSI" + echo "CONFIG_LIBISCSI=m" >> $config_host_mak + echo "LIBISCSI_CFLAGS=$libiscsi_cflags" >> $config_host_mak + echo "LIBISCSI_LIBS=$libiscsi_libs" >> $config_host_mak +fi +if test "$libnbd" = "yes" ; then + output_sym "CONFIG_LIBNBD" + echo "CONFIG_LIBNBD=m" >> $config_host_mak + echo "LIBNBD_CFLAGS=$libnbd_cflags" >> $config_host_mak + echo "LIBNBD_LIBS=$libnbd_libs" >> $config_host_mak +fi +cat > $TMPC << EOF +int main(int argc, char **argv) +{ + return 0; +} +EOF +if test "$disable_tcmalloc" != "yes" && compile_prog "" "-ltcmalloc" "tcmalloc"; then + LIBS="-ltcmalloc $LIBS" + tcmalloc="yes" +else + tcmalloc="no" +fi +print_config "TCMalloc support" "$tcmalloc" echo "LIBS+=$LIBS" >> $config_host_mak +echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak echo "CFLAGS+=$CFLAGS" >> $config_host_mak +echo "LDFLAGS+=$LDFLAGS" >> $config_host_mak echo "CC=$cc" >> $config_host_mak echo "BUILD_CFLAGS=$BUILD_CFLAGS $CFLAGS" >> $config_host_mak +echo "INSTALL_PREFIX=$prefix" >> $config_host_mak + +if [ `dirname $0` != "." -a ! 
-e Makefile ]; then + cat > Makefile < #include "crc32.h" static const uint32_t crctab[256] = { diff -Nru fio-2.1.3/crc/crc32c-arm64.c fio-3.16/crc/crc32c-arm64.c --- fio-2.1.3/crc/crc32c-arm64.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/crc32c-arm64.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,107 @@ +#include "crc32c.h" +#include "../os/os.h" + +bool crc32c_arm64_available = false; + +#ifdef ARCH_HAVE_CRC_CRYPTO + +#define CRC32C3X8(ITR) \ + crc1 = __crc32cd(crc1, *((const uint64_t *)data + 42*1 + (ITR)));\ + crc2 = __crc32cd(crc2, *((const uint64_t *)data + 42*2 + (ITR)));\ + crc0 = __crc32cd(crc0, *((const uint64_t *)data + 42*0 + (ITR))); + +#define CRC32C7X3X8(ITR) do {\ + CRC32C3X8((ITR)*7+0) \ + CRC32C3X8((ITR)*7+1) \ + CRC32C3X8((ITR)*7+2) \ + CRC32C3X8((ITR)*7+3) \ + CRC32C3X8((ITR)*7+4) \ + CRC32C3X8((ITR)*7+5) \ + CRC32C3X8((ITR)*7+6) \ + } while(0) + +#include +#include + +static bool crc32c_probed; + +/* + * Function to calculate reflected crc with PMULL Instruction + * crc done "by 3" for fixed input block size of 1024 bytes + */ +uint32_t crc32c_arm64(unsigned char const *data, unsigned long length) +{ + signed long len = length; + uint32_t crc = ~0; + uint32_t crc0, crc1, crc2; + + /* Load two consts: K1 and K2 */ + const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014; + uint64_t t0, t1; + + while ((len -= 1024) >= 0) { + /* Do first 8 bytes here for better pipelining */ + crc0 = __crc32cd(crc, *(const uint64_t *)data); + crc1 = 0; + crc2 = 0; + data += sizeof(uint64_t); + + /* Process block inline + Process crc0 last to avoid dependency with above */ + CRC32C7X3X8(0); + CRC32C7X3X8(1); + CRC32C7X3X8(2); + CRC32C7X3X8(3); + CRC32C7X3X8(4); + CRC32C7X3X8(5); + + data += 42*3*sizeof(uint64_t); + + /* Merge crc0 and crc1 into crc2 + crc1 multiply by K2 + crc0 multiply by K1 */ + + t1 = (uint64_t)vmull_p64(crc1, k2); + t0 = (uint64_t)vmull_p64(crc0, k1); + crc = __crc32cd(crc2, *(const uint64_t *)data); + crc1 = __crc32cd(0, t1); + crc ^= 
crc1; + crc0 = __crc32cd(0, t0); + crc ^= crc0; + + data += sizeof(uint64_t); + } + + if (!(len += 1024)) + return crc; + + while ((len -= sizeof(uint64_t)) >= 0) { + crc = __crc32cd(crc, *(const uint64_t *)data); + data += sizeof(uint64_t); + } + + /* The following is more efficient than the straight loop */ + if (len & sizeof(uint32_t)) { + crc = __crc32cw(crc, *(const uint32_t *)data); + data += sizeof(uint32_t); + } + if (len & sizeof(uint16_t)) { + crc = __crc32ch(crc, *(const uint16_t *)data); + data += sizeof(uint16_t); + } + if (len & sizeof(uint8_t)) { + crc = __crc32cb(crc, *(const uint8_t *)data); + } + + return crc; +} + +void crc32c_arm64_probe(void) +{ + if (!crc32c_probed) { + crc32c_arm64_available = os_cpu_has(CPU_ARM64_CRC32C); + crc32c_probed = true; + } +} + +#endif /* ARCH_HAVE_CRC_CRYPTO */ diff -Nru fio-2.1.3/crc/crc32c.c fio-3.16/crc/crc32c.c --- fio-2.1.3/crc/crc32c.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/crc32c.c 2019-09-20 01:01:52.000000000 +0000 @@ -30,7 +30,7 @@ * any later version. * */ -#include +#include "crc32c.h" /* * This is the CRC-32C table diff -Nru fio-2.1.3/crc/crc32c.h fio-3.16/crc/crc32c.h --- fio-2.1.3/crc/crc32c.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/crc32c.h 2019-09-20 01:01:52.000000000 +0000 @@ -13,15 +13,29 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, - Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #ifndef CRC32C_H #define CRC32C_H +#include + #include "../arch/arch.h" +#include "../lib/types.h" extern uint32_t crc32c_sw(unsigned char const *, unsigned long); -extern int crc32c_intel_available; +extern bool crc32c_arm64_available; +extern bool crc32c_intel_available; + +#ifdef ARCH_HAVE_CRC_CRYPTO +extern uint32_t crc32c_arm64(unsigned char const *, unsigned long); +extern void crc32c_arm64_probe(void); +#else +#define crc32c_arm64 crc32c_sw +static inline void crc32c_arm64_probe(void) +{ +} +#endif /* ARCH_HAVE_CRC_CRYPTO */ #ifdef ARCH_HAVE_SSE4_2 extern uint32_t crc32c_intel(unsigned char const *, unsigned long); @@ -31,10 +45,13 @@ static inline void crc32c_intel_probe(void) { } -#endif +#endif /* ARCH_HAVE_SSE4_2 */ static inline uint32_t fio_crc32c(unsigned char const *buf, unsigned long len) { + if (crc32c_arm64_available) + return crc32c_arm64(buf, len); + if (crc32c_intel_available) return crc32c_intel(buf, len); diff -Nru fio-2.1.3/crc/crc32c-intel.c fio-3.16/crc/crc32c-intel.c --- fio-2.1.3/crc/crc32c-intel.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/crc32c-intel.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,10 +1,3 @@ -#include -#include -#include -#include -#include -#include -#include #include "crc32c.h" /* @@ -18,7 +11,7 @@ * Volume 2A: Instruction Set Reference, A-M */ -int crc32c_intel_available = 0; +bool crc32c_intel_available = false; #ifdef ARCH_HAVE_SSE4_2 @@ -30,7 +23,7 @@ #define SCALE_F 4 #endif -static int crc32c_probed; +static bool crc32c_probed; static uint32_t crc32c_intel_le_hw_byte(uint32_t crc, unsigned char const *data, unsigned long length) @@ -87,8 +80,8 @@ do_cpuid(&eax, &ebx, &ecx, &edx); crc32c_intel_available = (ecx & (1 << 20)) != 0; - crc32c_probed = 1; + crc32c_probed = true; } } -#endif /* ARCH_HAVE_SSE */ +#endif /* ARCH_HAVE_SSE4_2 */ diff -Nru fio-2.1.3/crc/crc32.h fio-3.16/crc/crc32.h --- fio-2.1.3/crc/crc32.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/crc32.h 2019-09-20 01:01:52.000000000 
+0000 @@ -13,11 +13,13 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, - Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef CRC32_H #define CRC32_H +#include + extern uint32_t fio_crc32(const void * const, unsigned long); #endif diff -Nru fio-2.1.3/crc/fnv.c fio-3.16/crc/fnv.c --- fio-2.1.3/crc/fnv.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/fnv.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,34 @@ +#include "fnv.h" + +#define FNV_PRIME 0x100000001b3ULL + +/* + * 64-bit fnv, but don't require 64-bit multiples of data. Use bytes + * for the last unaligned chunk. + */ +uint64_t fnv(const void *buf, uint32_t len, uint64_t hval) +{ + const uint64_t *ptr = buf; + + while (len) { + hval *= FNV_PRIME; + if (len >= sizeof(uint64_t)) { + hval ^= (uint64_t) *ptr++; + len -= sizeof(uint64_t); + continue; + } else { + const uint8_t *ptr8 = (const uint8_t *) ptr; + uint64_t val = 0; + int i; + + for (i = 0; i < len; i++) { + val <<= 8; + val |= (uint8_t) *ptr8++; + } + hval ^= val; + break; + } + } + + return hval; +} diff -Nru fio-2.1.3/crc/fnv.h fio-3.16/crc/fnv.h --- fio-2.1.3/crc/fnv.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/fnv.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,8 @@ +#ifndef FIO_FNV_H +#define FIO_FNV_H + +#include + +uint64_t fnv(const void *, uint32_t, uint64_t); + +#endif diff -Nru fio-2.1.3/crc/md5.c fio-3.16/crc/md5.c --- fio-2.1.3/crc/md5.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/md5.c 2019-09-20 01:01:52.000000000 +0000 @@ -2,7 +2,6 @@ * Shamelessly lifted from the 2.6 kernel (crypto/md5.c) */ #include -#include #include "md5.h" static void md5_transform(uint32_t *hash, uint32_t const *in) @@ -125,3 +124,23 @@ memcpy(mctx->block, data, len); } + +void fio_md5_final(struct fio_md5_ctx *mctx) +{ + const unsigned int offset = 
mctx->byte_count & 0x3f; + char *p = (char *)mctx->block + offset; + int padding = 56 - (offset + 1); + + *p++ = 0x80; + if (padding < 0) { + memset(p, 0x00, padding + sizeof (uint64_t)); + md5_transform(mctx->hash, mctx->block); + p = (char *)mctx->block; + padding = 56; + } + + memset(p, 0, padding); + mctx->block[14] = mctx->byte_count << 3; + mctx->block[15] = mctx->byte_count >> 29; + md5_transform(mctx->hash, mctx->block); +} diff -Nru fio-2.1.3/crc/md5.h fio-3.16/crc/md5.h --- fio-2.1.3/crc/md5.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/md5.h 2019-09-20 01:01:52.000000000 +0000 @@ -23,6 +23,7 @@ }; extern void fio_md5_update(struct fio_md5_ctx *, const uint8_t *, unsigned int); +extern void fio_md5_final(struct fio_md5_ctx *); extern void fio_md5_init(struct fio_md5_ctx *); #endif diff -Nru fio-2.1.3/crc/murmur3.c fio-3.16/crc/murmur3.c --- fio-2.1.3/crc/murmur3.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/murmur3.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,70 @@ +#include "murmur3.h" + +static inline uint32_t rotl32(uint32_t x, int8_t r) +{ + return (x << r) | (x >> (32 - r)); +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +static inline uint32_t fmix32(uint32_t h) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +static uint32_t murmur3_tail(const uint8_t *data, const int nblocks, + uint32_t len, const uint32_t c1, + const uint32_t c2, uint32_t h1) +{ + const uint8_t *tail = (const uint8_t *)(data + nblocks * 4); + + uint32_t k1 = 0; + switch (len & 3) { + case 3: + k1 ^= tail[2] << 16; + /* fall through */ + case 2: + k1 ^= tail[1] << 8; + /* fall through */ + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; + h1 ^= k1; + }; + + return fmix32(h1 ^ len); +} + +uint32_t murmurhash3(const void *key, uint32_t len, uint32_t seed) +{ + const 
uint8_t *data = (const uint8_t *)key; + const int nblocks = len / 4; + uint32_t h1 = seed; + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + const uint32_t *blocks = (const uint32_t *)(data + nblocks * 4); + int i; + + for (i = -nblocks; i; i++) { + uint32_t k1 = blocks[i]; + + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + + return murmur3_tail(data, nblocks, len, c1, c2, h1); +} diff -Nru fio-2.1.3/crc/murmur3.h fio-3.16/crc/murmur3.h --- fio-2.1.3/crc/murmur3.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/murmur3.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,8 @@ +#ifndef FIO_MURMUR3_H +#define FIO_MURMUR3_H + +#include + +uint32_t murmurhash3(const void *key, uint32_t len, uint32_t seed); + +#endif diff -Nru fio-2.1.3/crc/sha1.c fio-3.16/crc/sha1.c --- fio-2.1.3/crc/sha1.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/sha1.c 2019-09-20 01:01:52.000000000 +0000 @@ -55,7 +55,7 @@ memcpy(ctx->W, data, len); } -void fio_sha1_final(unsigned char hashout[20], struct fio_sha1_ctx *ctx) +void fio_sha1_final(struct fio_sha1_ctx *ctx) { static const unsigned char pad[64] = { 0x80 }; unsigned int padlen[2]; @@ -69,11 +69,6 @@ i = ctx->size & 63; fio_sha1_update(ctx, pad, 1+ (63 & (55 - i))); fio_sha1_update(ctx, padlen, 8); - - /* Output hash - */ - for (i = 0; i < 5; i++) - ((unsigned int *)hashout)[i] = htonl(ctx->H[i]); } #if defined(__i386__) || defined(__x86_64__) diff -Nru fio-2.1.3/crc/sha1.h fio-3.16/crc/sha1.h --- fio-2.1.3/crc/sha1.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/sha1.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,6 +1,8 @@ #ifndef FIO_SHA1 #define FIO_SHA1 +#include + /* * Based on the Mozilla SHA1 (see mozilla-sha1/sha1.h), * optimized to do word accesses rather than byte accesses, @@ -15,6 +17,6 @@ void fio_sha1_init(struct fio_sha1_ctx *); void fio_sha1_update(struct fio_sha1_ctx *, const void *dataIn, unsigned long len); -void 
fio_sha1_final(unsigned char hashout[20], struct fio_sha1_ctx *); +void fio_sha1_final(struct fio_sha1_ctx *); #endif diff -Nru fio-2.1.3/crc/sha256.c fio-3.16/crc/sha256.c --- fio-2.1.3/crc/sha256.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/sha256.c 2019-09-20 01:01:52.000000000 +0000 @@ -17,7 +17,6 @@ * */ #include -#include #include "../lib/bswap.h" #include "sha256.h" @@ -237,37 +236,57 @@ sctx->state[5] = H5; sctx->state[6] = H6; sctx->state[7] = H7; - sctx->count[0] = sctx->count[1] = 0; + sctx->count = 0; } void fio_sha256_update(struct fio_sha256_ctx *sctx, const uint8_t *data, unsigned int len) { - unsigned int i, idx, part_len; + unsigned int partial, done; + const uint8_t *src; - /* Compute number of bytes mod 128 */ - idx = (unsigned int)((sctx->count[0] >> 3) & 0x3f); + partial = sctx->count & 0x3f; + sctx->count += len; + done = 0; + src = data; + + if ((partial + len) > 63) { + if (partial) { + done = -partial; + memcpy(sctx->buf + partial, data, done + 64); + src = sctx->buf; + } + + do { + sha256_transform(sctx->state, src); + done += 64; + src = data + done; + } while (done + 63 < len); - /* Update number of bits */ - if ((sctx->count[0] += (len << 3)) < (len << 3)) { - sctx->count[1]++; - sctx->count[1] += (len >> 29); + partial = 0; } + memcpy(sctx->buf + partial, src, len - done); +} - part_len = 64 - idx; +void fio_sha256_final(struct fio_sha256_ctx *sctx) +{ + uint64_t bits; + unsigned int index, pad_len; + int i; + static const uint8_t padding[64] = { 0x80, }; - /* Transform as many times as possible. */ - if (len >= part_len) { - memcpy(&sctx->buf[idx], data, part_len); - sha256_transform(sctx->state, sctx->buf); - - for (i = part_len; i + 63 < len; i += 64) - sha256_transform(sctx->state, &data[i]); - idx = 0; - } else { - i = 0; - } - - /* Buffer remaining input */ - memcpy(&sctx->buf[idx], &data[i], len-i); + /* Save number of bits */ + bits = (uint64_t) sctx->count << 3; + + /* Pad out to 56 mod 64. 
*/ + index = sctx->count & 0x3f; + pad_len = (index < 56) ? (56 - index) : ((64+56) - index); + fio_sha256_update(sctx, padding, pad_len); + + /* Append length (before padding) */ + fio_sha256_update(sctx, (const uint8_t *)&bits, sizeof(bits)); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + sctx->buf[i] = sctx->state[i]; } diff -Nru fio-2.1.3/crc/sha256.h fio-3.16/crc/sha256.h --- fio-2.1.3/crc/sha256.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/sha256.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,13 +1,19 @@ #ifndef FIO_SHA256_H #define FIO_SHA256_H +#include + +#define SHA256_DIGEST_SIZE 32 +#define SHA256_BLOCK_SIZE 64 + struct fio_sha256_ctx { - uint32_t count[2]; - uint32_t state[8]; + uint32_t count; + uint32_t state[SHA256_DIGEST_SIZE / 4]; uint8_t *buf; }; void fio_sha256_init(struct fio_sha256_ctx *); void fio_sha256_update(struct fio_sha256_ctx *, const uint8_t *, unsigned int); +void fio_sha256_final(struct fio_sha256_ctx *); #endif diff -Nru fio-2.1.3/crc/sha3.c fio-3.16/crc/sha3.c --- fio-2.1.3/crc/sha3.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/sha3.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,172 @@ +/* + * Cryptographic API. + * + * SHA-3, as specified in + * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf + * + * SHA-3 code by Jeff Garzik + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option)• + * any later version. 
+ * + */ +#include + +#include "../os/os.h" + +#include "sha3.h" + +#define KECCAK_ROUNDS 24 + +#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y)))) + +static const uint64_t keccakf_rndc[24] = { + 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, + 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL, + 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL, + 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, + 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL, + 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL +}; + +static const int keccakf_rotc[24] = { + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 +}; + +static const int keccakf_piln[24] = { + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 +}; + +/* update the state with given number of rounds */ + +static void keccakf(uint64_t st[25]) +{ + int i, j, round; + uint64_t t, bc[5]; + + for (round = 0; round < KECCAK_ROUNDS; round++) { + + /* Theta */ + for (i = 0; i < 5; i++) + bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] + ^ st[i + 20]; + + for (i = 0; i < 5; i++) { + t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); + for (j = 0; j < 25; j += 5) + st[j + i] ^= t; + } + + /* Rho Pi */ + t = st[1]; + for (i = 0; i < 24; i++) { + j = keccakf_piln[i]; + bc[0] = st[j]; + st[j] = ROTL64(t, keccakf_rotc[i]); + t = bc[0]; + } + + /* Chi */ + for (j = 0; j < 25; j += 5) { + for (i = 0; i < 5; i++) + bc[i] = st[j + i]; + for (i = 0; i < 5; i++) + st[j + i] ^= (~bc[(i + 1) % 5]) & + bc[(i + 2) % 5]; + } + + /* Iota */ + st[0] ^= keccakf_rndc[round]; + } +} + +static void fio_sha3_init(struct fio_sha3_ctx *sctx, unsigned int digest_sz) +{ + memset(sctx->st, 0, sizeof(sctx->st)); + 
sctx->md_len = digest_sz; + sctx->rsiz = 200 - 2 * digest_sz; + sctx->rsizw = sctx->rsiz / 8; + sctx->partial = 0; + memset(sctx->buf, 0, sizeof(sctx->buf)); +} + +void fio_sha3_224_init(struct fio_sha3_ctx *sctx) +{ + fio_sha3_init(sctx, SHA3_224_DIGEST_SIZE); +} + +void fio_sha3_256_init(struct fio_sha3_ctx *sctx) +{ + fio_sha3_init(sctx, SHA3_256_DIGEST_SIZE); +} + +void fio_sha3_384_init(struct fio_sha3_ctx *sctx) +{ + fio_sha3_init(sctx, SHA3_384_DIGEST_SIZE); +} + +void fio_sha3_512_init(struct fio_sha3_ctx *sctx) +{ + fio_sha3_init(sctx, SHA3_512_DIGEST_SIZE); +} + +int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data, + unsigned int len) +{ + unsigned int done; + const uint8_t *src; + + done = 0; + src = data; + + if ((sctx->partial + len) > (sctx->rsiz - 1)) { + if (sctx->partial) { + done = -sctx->partial; + memcpy(sctx->buf + sctx->partial, data, + done + sctx->rsiz); + src = sctx->buf; + } + + do { + unsigned int i; + + for (i = 0; i < sctx->rsizw; i++) + sctx->st[i] ^= ((uint64_t *) src)[i]; + keccakf(sctx->st); + + done += sctx->rsiz; + src = data + done; + } while (done + (sctx->rsiz - 1) < len); + + sctx->partial = 0; + } + memcpy(sctx->buf + sctx->partial, src, len - done); + sctx->partial += (len - done); + + return 0; +} + +void fio_sha3_final(struct fio_sha3_ctx *sctx) +{ + unsigned int i, inlen = sctx->partial; + + sctx->buf[inlen++] = 0x06; + memset(sctx->buf + inlen, 0, sctx->rsiz - inlen); + sctx->buf[sctx->rsiz - 1] |= 0x80; + + for (i = 0; i < sctx->rsizw; i++) + sctx->st[i] ^= ((uint64_t *) sctx->buf)[i]; + + keccakf(sctx->st); + + for (i = 0; i < sctx->rsizw; i++) + sctx->st[i] = cpu_to_le64(sctx->st[i]); + + memcpy(sctx->sha, sctx->st, sctx->md_len); +} diff -Nru fio-2.1.3/crc/sha3.h fio-3.16/crc/sha3.h --- fio-2.1.3/crc/sha3.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/sha3.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,42 @@ +/* + * Common values for SHA-3 algorithms + */ +#ifndef __CRYPTO_SHA3_H__ +#define 
__CRYPTO_SHA3_H__ + +#include + +#define SHA3_224_DIGEST_SIZE (224 / 8) +#define SHA3_224_BLOCK_SIZE (200 - 2 * SHA3_224_DIGEST_SIZE) + +#define SHA3_256_DIGEST_SIZE (256 / 8) +#define SHA3_256_BLOCK_SIZE (200 - 2 * SHA3_256_DIGEST_SIZE) + +#define SHA3_384_DIGEST_SIZE (384 / 8) +#define SHA3_384_BLOCK_SIZE (200 - 2 * SHA3_384_DIGEST_SIZE) + +#define SHA3_512_DIGEST_SIZE (512 / 8) +#define SHA3_512_BLOCK_SIZE (200 - 2 * SHA3_512_DIGEST_SIZE) + +struct fio_sha3_ctx { + uint64_t st[25]; + unsigned int md_len; + unsigned int rsiz; + unsigned int rsizw; + + unsigned int partial; + uint8_t buf[SHA3_224_BLOCK_SIZE]; + + uint8_t *sha; +}; + +void fio_sha3_224_init(struct fio_sha3_ctx *sctx); +void fio_sha3_256_init(struct fio_sha3_ctx *sctx); +void fio_sha3_384_init(struct fio_sha3_ctx *sctx); +void fio_sha3_512_init(struct fio_sha3_ctx *sctx); + +int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data, + unsigned int len); +void fio_sha3_final(struct fio_sha3_ctx *sctx); + +#endif diff -Nru fio-2.1.3/crc/sha512.c fio-3.16/crc/sha512.c --- fio-2.1.3/crc/sha512.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/sha512.c 2019-09-20 01:01:52.000000000 +0000 @@ -12,7 +12,6 @@ */ #include -#include #include "../lib/bswap.h" #include "sha512.h" diff -Nru fio-2.1.3/crc/sha512.h fio-3.16/crc/sha512.h --- fio-2.1.3/crc/sha512.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/crc/sha512.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,6 +1,8 @@ #ifndef FIO_SHA512_H #define FIO_SHA512_H +#include + struct fio_sha512_ctx { uint64_t state[8]; uint32_t count[4]; diff -Nru fio-2.1.3/crc/test.c fio-3.16/crc/test.c --- fio-2.1.3/crc/test.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/test.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,432 @@ +#include +#include +#include +#include + +#include "../gettime.h" +#include "../fio_time.h" +#include "../lib/rand.h" +#include "../os/os.h" + +#include "../crc/md5.h" +#include "../crc/crc64.h" +#include "../crc/crc32.h" 
+#include "../crc/crc32c.h" +#include "../crc/crc16.h" +#include "../crc/crc7.h" +#include "../crc/sha1.h" +#include "../crc/sha256.h" +#include "../crc/sha512.h" +#include "../crc/sha3.h" +#include "../crc/xxhash.h" +#include "../crc/murmur3.h" +#include "../crc/fnv.h" +#include "../hash.h" + +#include "test.h" + +#define CHUNK 131072U +#define NR_CHUNKS 2048U + +struct test_type { + const char *name; + unsigned int mask; + void (*fn)(struct test_type *, void *, size_t); + uint32_t output; +}; + +enum { + T_MD5 = 1U << 0, + T_CRC64 = 1U << 1, + T_CRC32 = 1U << 2, + T_CRC32C = 1U << 3, + T_CRC16 = 1U << 4, + T_CRC7 = 1U << 5, + T_SHA1 = 1U << 6, + T_SHA256 = 1U << 7, + T_SHA512 = 1U << 8, + T_XXHASH = 1U << 9, + T_MURMUR3 = 1U << 10, + T_JHASH = 1U << 11, + T_FNV = 1U << 12, + T_SHA3_224 = 1U << 13, + T_SHA3_256 = 1U << 14, + T_SHA3_384 = 1U << 15, + T_SHA3_512 = 1U << 16, +}; + +static void t_md5(struct test_type *t, void *buf, size_t size) +{ + uint32_t digest[4]; + struct fio_md5_ctx ctx = { .hash = digest }; + int i; + + fio_md5_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_md5_update(&ctx, buf, size); + fio_md5_final(&ctx); + } +} + +static void t_crc64(struct test_type *t, void *buf, size_t size) +{ + int i; + + for (i = 0; i < NR_CHUNKS; i++) + t->output += fio_crc64(buf, size); +} + +static void t_crc32(struct test_type *t, void *buf, size_t size) +{ + int i; + + for (i = 0; i < NR_CHUNKS; i++) + t->output += fio_crc32(buf, size); +} + +static void t_crc32c(struct test_type *t, void *buf, size_t size) +{ + int i; + + for (i = 0; i < NR_CHUNKS; i++) + t->output += fio_crc32c(buf, size); +} + +static void t_crc16(struct test_type *t, void *buf, size_t size) +{ + int i; + + for (i = 0; i < NR_CHUNKS; i++) + t->output += fio_crc16(buf, size); +} + +static void t_crc7(struct test_type *t, void *buf, size_t size) +{ + int i; + + for (i = 0; i < NR_CHUNKS; i++) + t->output += fio_crc7(buf, size); +} + +static void t_sha1(struct test_type *t, void *buf, 
size_t size) +{ + uint32_t sha[5]; + struct fio_sha1_ctx ctx = { .H = sha }; + int i; + + fio_sha1_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha1_update(&ctx, buf, size); + fio_sha1_final(&ctx); + } +} + +static void t_sha256(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[64]; + struct fio_sha256_ctx ctx = { .buf = sha }; + int i; + + fio_sha256_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha256_update(&ctx, buf, size); + fio_sha256_final(&ctx); + } +} + +static void t_sha512(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[128]; + struct fio_sha512_ctx ctx = { .buf = sha }; + int i; + + fio_sha512_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) + fio_sha512_update(&ctx, buf, size); +} + +static void t_sha3_224(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[SHA3_224_DIGEST_SIZE]; + struct fio_sha3_ctx ctx = { .sha = sha }; + int i; + + fio_sha3_224_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha3_update(&ctx, buf, size); + fio_sha3_final(&ctx); + } +} + +static void t_sha3_256(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[SHA3_256_DIGEST_SIZE]; + struct fio_sha3_ctx ctx = { .sha = sha }; + int i; + + fio_sha3_256_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha3_update(&ctx, buf, size); + fio_sha3_final(&ctx); + } +} + +static void t_sha3_384(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[SHA3_384_DIGEST_SIZE]; + struct fio_sha3_ctx ctx = { .sha = sha }; + int i; + + fio_sha3_384_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha3_update(&ctx, buf, size); + fio_sha3_final(&ctx); + } +} + +static void t_sha3_512(struct test_type *t, void *buf, size_t size) +{ + uint8_t sha[SHA3_512_DIGEST_SIZE]; + struct fio_sha3_ctx ctx = { .sha = sha }; + int i; + + fio_sha3_512_init(&ctx); + + for (i = 0; i < NR_CHUNKS; i++) { + fio_sha3_update(&ctx, buf, size); + fio_sha3_final(&ctx); + } +} + +static void t_murmur3(struct test_type *t, void *buf, 
size_t size) +{ + int i; + + for (i = 0; i < NR_CHUNKS; i++) + t->output += murmurhash3(buf, size, 0x8989); +} + +static void t_jhash(struct test_type *t, void *buf, size_t size) +{ + int i; + + for (i = 0; i < NR_CHUNKS; i++) + t->output += jhash(buf, size, 0x8989); +} + +static void t_fnv(struct test_type *t, void *buf, size_t size) +{ + int i; + + for (i = 0; i < NR_CHUNKS; i++) + t->output += fnv(buf, size, 0x8989); +} + +static void t_xxhash(struct test_type *t, void *buf, size_t size) +{ + void *state; + int i; + + state = XXH32_init(0x8989); + + for (i = 0; i < NR_CHUNKS; i++) + XXH32_update(state, buf, size); + + t->output = XXH32_digest(state); +} + +static struct test_type t[] = { + { + .name = "md5", + .mask = T_MD5, + .fn = t_md5, + }, + { + .name = "crc64", + .mask = T_CRC64, + .fn = t_crc64, + }, + { + .name = "crc32", + .mask = T_CRC32, + .fn = t_crc32, + }, + { + .name = "crc32c", + .mask = T_CRC32C, + .fn = t_crc32c, + }, + { + .name = "crc16", + .mask = T_CRC16, + .fn = t_crc16, + }, + { + .name = "crc7", + .mask = T_CRC7, + .fn = t_crc7, + }, + { + .name = "sha1", + .mask = T_SHA1, + .fn = t_sha1, + }, + { + .name = "sha256", + .mask = T_SHA256, + .fn = t_sha256, + }, + { + .name = "sha512", + .mask = T_SHA512, + .fn = t_sha512, + }, + { + .name = "xxhash", + .mask = T_XXHASH, + .fn = t_xxhash, + }, + { + .name = "murmur3", + .mask = T_MURMUR3, + .fn = t_murmur3, + }, + { + .name = "jhash", + .mask = T_JHASH, + .fn = t_jhash, + }, + { + .name = "fnv", + .mask = T_FNV, + .fn = t_fnv, + }, + { + .name = "sha3-224", + .mask = T_SHA3_224, + .fn = t_sha3_224, + }, + { + .name = "sha3-256", + .mask = T_SHA3_256, + .fn = t_sha3_256, + }, + { + .name = "sha3-384", + .mask = T_SHA3_384, + .fn = t_sha3_384, + }, + { + .name = "sha3-512", + .mask = T_SHA3_512, + .fn = t_sha3_512, + }, + { + .name = NULL, + }, +}; + +static unsigned int get_test_mask(const char *type) +{ + char *ostr, *str = strdup(type); + unsigned int mask; + char *name; + int i; + + ostr 
= str; + mask = 0; + while ((name = strsep(&str, ",")) != NULL) { + for (i = 0; t[i].name; i++) { + if (!strcmp(t[i].name, name)) { + mask |= t[i].mask; + break; + } + } + } + + free(ostr); + return mask; +} + +static int list_types(void) +{ + int i; + + for (i = 0; t[i].name; i++) + printf("%s\n", t[i].name); + + return 1; +} + +int fio_crctest(const char *type) +{ + unsigned int test_mask = 0; + uint64_t mb = CHUNK * NR_CHUNKS; + struct frand_state state; + int i, first = 1; + void *buf; + + crc32c_arm64_probe(); + crc32c_intel_probe(); + + if (!type) + test_mask = ~0U; + else if (!strcmp(type, "help") || !strcmp(type, "list")) + return list_types(); + else + test_mask = get_test_mask(type); + + if (!test_mask) { + fprintf(stderr, "fio: unknown hash `%s`. Available:\n", type); + return list_types(); + } + + buf = malloc(CHUNK); + init_rand_seed(&state, 0x8989, 0); + fill_random_buf(&state, buf, CHUNK); + + for (i = 0; t[i].name; i++) { + struct timespec ts; + double mb_sec; + uint64_t usec; + char pre[3]; + + if (!(t[i].mask & test_mask)) + continue; + + /* + * For first run, make sure CPUs are spun up and that + * we've touched the data. 
+ */ + if (first) { + usec_spin(100000); + t[i].fn(&t[i], buf, CHUNK); + } + + fio_gettime(&ts, NULL); + t[i].fn(&t[i], buf, CHUNK); + usec = utime_since_now(&ts); + + if (usec) { + mb_sec = (double) mb / (double) usec; + mb_sec /= (1.024 * 1.024); + if (strlen(t[i].name) >= 7) + sprintf(pre, "\t"); + else + sprintf(pre, "\t\t"); + printf("%s:%s%8.2f MiB/sec\n", t[i].name, pre, mb_sec); + } else + printf("%s:inf MiB/sec\n", t[i].name); + first = 0; + } + + free(buf); + return 0; +} diff -Nru fio-2.1.3/crc/test.h fio-3.16/crc/test.h --- fio-2.1.3/crc/test.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/test.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,6 @@ +#ifndef FIO_CRC_TEST_H +#define FIO_CRC_TEST_H + +int fio_crctest(const char *type); + +#endif diff -Nru fio-2.1.3/crc/xxhash.c fio-3.16/crc/xxhash.c --- fio-2.1.3/crc/xxhash.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/xxhash.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,421 @@ +/* +xxHash - Fast Hash algorithm +Copyright (C) 2012-2014, Yann Collet. +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You can contact the author at : +- xxHash source repository : http://code.google.com/p/xxhash/ +*/ + + +//************************************** +// Tuning parameters +//************************************** +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. +// For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected. +// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance. +// You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for uint32_t). +#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_USE_UNALIGNED_ACCESS 1 +#endif + +// XXH_ACCEPT_NULL_INPUT_POINTER : +// If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. +// When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. +// This option has a very small performance cost (only measurable on small inputs). +// By default, this option is disabled. To enable it, uncomment below define : +//#define XXH_ACCEPT_NULL_INPUT_POINTER 1 + +// XXH_FORCE_NATIVE_FORMAT : +// By default, xxHash library provides endian-independant Hash values, based on little-endian convention. 
+// Results are therefore identical for little-endian and big-endian CPU. +// This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. +// Should endian-independance be of no importance for your application, you may set the #define below to 1. +// It will improve speed for Big-endian CPU. +// This option has no impact on Little_Endian CPU. +#define XXH_FORCE_NATIVE_FORMAT 0 + + +//************************************** +// Includes & Memory related functions +//************************************** +#include "xxhash.h" +#include +#include + + +#if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# ifdef __IBMC__ +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct _uint32_t_S { uint32_t v; } _PACKED uint32_t_S; + +#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# pragma pack(pop) +#endif + +#define A32(x) (((uint32_t_S *)(x))->v) + + +//*************************************** +// Compiler-specific Functions and Macros +//*************************************** +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// Note : although _rotl exists for minGW (GCC under windows), performance seems poor +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#endif + +#if defined(_MSC_VER) // Visual Studio +# define XXH_swap32 _byteswap_ulong +#elif GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static inline uint32_t XXH_swap32 (uint32_t x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +//************************************** +// Constants +//************************************** +#define PRIME32_1 2654435761U +#define PRIME32_2 
2246822519U +#define PRIME32_3 3266489917U +#define PRIME32_4 668265263U +#define PRIME32_5 374761393U + + +//************************************** +// Architecture Macros +//************************************** +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; +#ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch + static const int one = 1; +# define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) +#endif + + +//************************************** +// Macros +//************************************** +#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations + + +//**************************** +// Memory reads +//**************************** +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +static uint32_t XXH_readLE32_align(const uint32_t* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr)); + else + return endian==XXH_littleEndian ? 
*ptr : XXH_swap32(*ptr); +} + +static uint32_t XXH_readLE32(const uint32_t* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); } + + +//**************************** +// Simple Hash Functions +//**************************** +static uint32_t XXH32_endian_align(const void* input, int len, uint32_t seed, XXH_endianess endian, XXH_alignment align) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t * const bEnd = p + len; + uint32_t h32; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) { len=0; p=(const uint8_t *)(size_t)16; } +#endif + + if (len>=16) + { + const uint8_t * const limit = bEnd - 16; + uint32_t v1 = seed + PRIME32_1 + PRIME32_2; + uint32_t v2 = seed + PRIME32_2; + uint32_t v3 = seed + 0; + uint32_t v4 = seed - PRIME32_1; + + do + { + v1 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; + v2 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; + v3 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; + v4 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; + } while (p<=limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } + else + { + h32 = seed + PRIME32_5; + } + + h32 += (uint32_t) len; + + while (p<=bEnd-4) + { + h32 += XXH_readLE32_align((const uint32_t*)p, endian, align) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + p+=4; + } + + while (p> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +uint32_t XXH32(const void* input, uint32_t len, uint32_t seed) +{ +#if 0 + // Simple version, good for code maintenance, but unfortunately slow for small inputs + void* state = XXH32_init(seed); + XXH32_update(state, input, len); + return 
XXH32_digest(state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + +# if !defined(XXH_USE_UNALIGNED_ACCESS) + if ((((size_t)input) & 3)) // Input is aligned, let's leverage the speed advantage + { + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } +# endif + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + + +//**************************** +// Advanced Hash Functions +//**************************** + +int XXH32_sizeofState(void) +{ + XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t)); // A compilation error here means XXH32_SIZEOFSTATE is not large enough + return sizeof(struct XXH_state32_t); +} + + +XXH_errorcode XXH32_resetState(void* state_in, uint32_t seed) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; + state->seed = seed; + state->v1 = seed + PRIME32_1 + PRIME32_2; + state->v2 = seed + PRIME32_2; + state->v3 = seed + 0; + state->v4 = seed - PRIME32_1; + state->total_len = 0; + state->memsize = 0; + return XXH_OK; +} + + +void* XXH32_init (uint32_t seed) +{ + void *state = malloc (sizeof(struct XXH_state32_t)); + XXH32_resetState(state, seed); + return state; +} + + +static XXH_errorcode XXH32_update_endian (void* state_in, const void* input, int len, XXH_endianess endian) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; + const uint8_t *p = (const uint8_t *)input; + const uint8_t * const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 16) // fill in tmp buffer + { + 
memcpy(state->memory + state->memsize, input, len); + state->memsize += len; + return XXH_OK; + } + + if (state->memsize) // some data left from previous update + { + memcpy(state->memory + state->memsize, input, 16-state->memsize); + { + const uint32_t* p32 = (const uint32_t*)state->memory; + state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++; + state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; + state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++; + state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++; + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) + { + const uint8_t * const limit = bEnd - 16; + uint32_t v1 = state->v1; + uint32_t v2 = state->v2; + uint32_t v3 = state->v3; + uint32_t v4 = state->v4; + + do + { + v1 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; + v2 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; + v3 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; + v4 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) + { + memcpy(state->memory, p, bEnd-p); + state->memsize = (int)(bEnd-p); + } + + return XXH_OK; +} + +XXH_errorcode XXH32_update (void* state_in, const void* input, int len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return 
XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +static uint32_t XXH32_intermediateDigest_endian (void* state_in, XXH_endianess endian) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; + const uint8_t *p = (const uint8_t *)state->memory; + uint8_t * bEnd = (uint8_t *)state->memory + state->memsize; + uint32_t h32; + + if (state->total_len >= 16) + { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); + } + else + { + h32 = state->seed + PRIME32_5; + } + + h32 += (uint32_t) state->total_len; + + while (p<=bEnd-4) + { + h32 += XXH_readLE32((const uint32_t*)p, endian) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4; + p+=4; + } + + while (p> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +uint32_t XXH32_intermediateDigest (void* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_intermediateDigest_endian(state_in, XXH_littleEndian); + else + return XXH32_intermediateDigest_endian(state_in, XXH_bigEndian); +} + + +uint32_t XXH32_digest (void* state_in) +{ + uint32_t h32 = XXH32_intermediateDigest(state_in); + + free(state_in); + + return h32; +} diff -Nru fio-2.1.3/crc/xxhash.h fio-3.16/crc/xxhash.h --- fio-2.1.3/crc/xxhash.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/crc/xxhash.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,177 @@ +/* + xxHash - Fast Hash algorithm + Header File + Copyright (C) 2012-2014, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - xxHash source repository : http://code.google.com/p/xxhash/ +*/ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. 
+*/ + +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + +#include + +struct XXH_state32_t +{ + uint64_t total_len; + uint32_t seed; + uint32_t v1; + uint32_t v2; + uint32_t v3; + uint32_t v4; + int memsize; + char memory[16]; +}; + +//**************************** +// Type +//**************************** +typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + + + +//**************************** +// Simple Hash Functions +//**************************** + +uint32_t XXH32 (const void* input, uint32_t len, uint32_t seed); + +/* +XXH32() : + Calculate the 32-bits hash of sequence of length "len" stored at memory address "input". + The memory between input & input+len must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + This function successfully passes all SMHasher tests. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s + Note that "len" is type "int", which means it is limited to 2^31-1. + If your data is larger, use the advanced functions below. +*/ + + + +//**************************** +// Advanced Hash Functions +//**************************** + +void* XXH32_init (uint32_t seed); +XXH_errorcode XXH32_update (void* state, const void* input, int len); +uint32_t XXH32_digest (void* state); + +/* +These functions calculate the xxhash of an input provided in several small packets, +as opposed to an input provided as a single block. + +It must be started with : +void* XXH32_init() +The function returns a pointer which holds the state of calculation. + +This pointer must be provided as "void* state" parameter for XXH32_update(). +XXH32_update() can be called as many times as necessary. +The user must provide a valid (allocated) input. +The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. +Note that "len" is type "int", which means it is limited to 2^31-1. 
+If your data is larger, it is recommended to chunk your data into blocks +of size for example 2^30 (1GB) to avoid any "int" overflow issue. + +Finally, you can end the calculation anytime, by using XXH32_digest(). +This function returns the final 32-bits hash. +You must provide the same "void* state" parameter created by XXH32_init(). +Memory will be freed by XXH32_digest(). +*/ + + +int XXH32_sizeofState(void); +XXH_errorcode XXH32_resetState(void* state, uint32_t seed); + +#define XXH32_SIZEOFSTATE 48 +typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t; +/* +These functions allow user application to make its own allocation for state. + +XXH32_sizeofState() is used to know how much space must be allocated for the xxHash 32-bits state. +Note that the state must be aligned to access 'long long' fields. Memory must be allocated and referenced by a pointer. +This pointer must then be provided as 'state' into XXH32_resetState(), which initializes the state. + +For static allocation purposes (such as allocation on stack, or freestanding systems without malloc()), +use the structure XXH32_stateSpace_t, which will ensure that memory space is large enough and correctly aligned to access 'long long' fields. +*/ + + +uint32_t XXH32_intermediateDigest (void* state); +/* +This function does the same as XXH32_digest(), generating a 32-bit hash, +but preserve memory context. +This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXH32_update(). +To free memory context, use XXH32_digest(), or free(). 
+*/ + + + +//**************************** +// Deprecated function names +//**************************** +// The following translations are provided to ease code transition +// You are encouraged to no longer this function names +#define XXH32_feed XXH32_update +#define XXH32_result XXH32_digest +#define XXH32_getIntermediateResult XXH32_intermediateDigest + + + +#if defined (__cplusplus) +} +#endif diff -Nru fio-2.1.3/debian/changelog fio-3.16/debian/changelog --- fio-2.1.3/debian/changelog 2013-10-07 10:01:58.000000000 +0000 +++ fio-3.16/debian/changelog 2019-12-21 23:54:45.000000000 +0000 @@ -1,3 +1,289 @@ +fio (3.16-1sergeyd1~trusty1) trusty; urgency=medium + + * Rebuild for SergeyD + * Disabled support: glusterfs, rdb, rdma + * Diable gfio + + -- Sergey Dryabzhinsky Sat, 21 Dec 2019 20:06:06 +0300 + +fio (3.16-1) unstable; urgency=medium + + [ Sven Hoexter ] + * Import new upstream version 3.16. Closes: #942537 + * Add debian/patches/fix-compilation-3.16 to fix the compilation of + 3.16. Picked from upstream master 5b215853ed4b438b5b2d4ac3e56d5f0d19e145d9. + + -- Sven Hoexter Mon, 21 Oct 2019 09:50:07 +0200 + +fio (3.15-1) unstable; urgency=medium + + [ Martin Steigerwald ] + * Imported new upstream version 3.15: + * New engine 'io_uring' for new IO uring asynchronous I/O interface + of Linux. + * New engine 'libiscsi' for iSCSI targets. + * Addresses bug #929579 for upstream related changes to support cross + building (see messages 15 and 20 of this bug report). + * patches: Refreshed. + * control: Bumped standards version to 4.4.0.1, no changes needed. + + [ Sven Hoexter ] + * Bump Standards-Version to 4.4.1, no changes required. + * Export CROSS_COMPILE when required to allow cross compilation. + Patch provided by Helmut Grohne. Closes: #929579 + + -- Sven Hoexter Sun, 20 Oct 2019 19:48:57 +0200 + +fio (3.12-2) unstable; urgency=medium + + * control: Depend on libglusterfs-dev instead of glusterfs-common + (Closes: #919670). 
+ * control: Bumped standards version to 4.3.0, no changes needed. + + -- Martin Steigerwald Mon, 21 Jan 2019 09:29:38 +0100 + +fio (3.12-1) unstable; urgency=medium + + * Imported upstream version 3.12. + * patches: Refreshed. + * copyright: Removed engines/fusion-aw.c, removed by upstream. + * control: Bumped standards version to 4.2.1, no changes needed. + + -- Martin Steigerwald Wed, 28 Nov 2018 16:47:49 +0100 + +fio (3.8-1) unstable; urgency=medium + + * Imported upstream version 3.8. + * rules: Use '--disable-native' to disable build system local CPU + optimizations (Closes: #898473). Thank you, Sitsofe. + * control: Bumped standards version to 4.2.0, no changes needed. + + -- Martin Steigerwald Tue, 21 Aug 2018 16:19:28 +0200 + +fio (3.6-4) unstable; urgency=medium + + * Contrary to what I stated in 3.5-1 changelog entry libnuma1-dev appears + to be available in buster/sid. As it is only available for Linux platforms + restrict the build dependency to those instead. + + -- Martin Steigerwald Wed, 16 May 2018 11:24:41 +0200 + +fio (3.6-3) unstable; urgency=medium + + * control: Also build libibverbs-dev only on Linux platforms as it is only + available for those. + + -- Martin Steigerwald Tue, 15 May 2018 14:04:56 +0200 + +fio (3.6-2) unstable; urgency=medium + + * control: Build with librdmacm-dev and librbd-dev only on Linux platforms as + they are only available for those. + + -- Martin Steigerwald Tue, 15 May 2018 10:21:30 +0200 + +fio (3.6-1) unstable; urgency=medium + + * Imported upstream version 3.6. + * patches: Refreshed. + * control: Build with libaio-dev only on Linux platforms instead of all + other platforms than Linux. Thanks for Sven Hoexter for finding the + mistake in the build depends. Thanks Hillel Lubman for the report + (Closes: #898210). + + -- Martin Steigerwald Wed, 09 May 2018 08:55:08 +0200 + +fio (3.5-1) unstable; urgency=medium + + * Imported upstream version 3.5. 
+ * Install systemd service file for running fio as a service, disabled + by default (also see #893778). + * Install sysvinit script for running fio as a service, disabled by + default. Thanks to KatolaZ from Devuan project for helpful hint. + Does not install any symlinks currently (see #894084). + * patches: + - Refreshed. + - fix-ftbfs-with-libmtd.h: Removed. Implemented upstream (also see + #893778). + - makefile-hardening: Allow adding hardening options to CPPFLAGS as + well. + - reproducible-build: Removed. Implemented upstream (also see #893778 + and upstream commit 785e49c659023df1735bff195ad4ba133ebd23a7). + - genfio-interpreter: Keep /bin/bash as interpreter, no usrmerge (yet). + * changelog, rules: Removed trailing white spaces as lintian suggests. + * compat: Switched debhelper compat level from 9 to 11 as lintian and + debhelper manpage recommend. + * control: + - Switched Vcs URLs from Alioth to Salsa as I migrated the repo. + - Bumped standards version to 4.1.4. No changes needed. + - Updated build dependency to debhelper version 11 or later. + - Added build dependency for libnuma1 for all architectures, except + armhf and armel, where this library is not available (Closes: #861554, + also see #893778). + - Added build dependencies bison and flex to enable arithmetic + calculations in fio jobs (see #893778, also see fio manpage, + JOB FILE PARAMETERS). + - Added build dependency for the fio GlusterFS engine (see #893778). + - Disable build dependency for libaio for all platforms except Linux, + to enable build on non-Linux platforms like Hurd (see #893778). + - Change binary dependency to python2.7 as all the python scripts use + this version. + * rules: + - Enable verbose build to allow tools like blhc to check for + hardening flags for compiler and linker (see #893778). + - Enable all hardening build flags, not just bindnow. + * docs: Removed tools/fio_latency2csv.py, not in upstream source anymore. 
+ * copyright: Switched copyright format URL to https as Lintian suggests. + * Implemented all suggestions from sitsofe´s Debian patch comments I + decided to implement (Closes: #893778). + * README.Debian: Added a note about fio´s weird usage of units in job + files and parseable output (Closes: #872321). + + -- Martin Steigerwald Mon, 09 Apr 2018 13:45:59 +0200 + +fio (3.1-1) unstable; urgency=medium + + * Imported upstream versions 3.0 and 3.1 (Closes: #869686). + * Adapted my mail address to new company domain. + * patches: + - Refreshed. + - Dropped patch spelling-errors. Applied upstream. + - Changed author in all of my patches to new mail address. + * control: Bumped standards version to 4.1.1.0. + * watch: Changed to pgpsigurlmangle as recommended by policy 4.1.0.0, + section 4.11. + + -- Martin Steigerwald Tue, 24 Oct 2017 09:47:45 +0200 + +fio (2.17-1) unstable; urgency=medium + + * Imported upstream version 2.17. + * debian/patches: Refreshed. + * debian/copyright: Updated, added some files, all GPL-2. + * debian/docs: Added fio_latency2csv.py, example systemd fio.service file, and + logparser histogram scripts from tools/hist. + + -- Martin Steigerwald Mon, 23 Jan 2017 11:10:42 +0100 + +fio (2.16-1) unstable; urgency=medium + + * Imported upstream version 2.16. + * debian/patches: Refreshed. + + -- Martin Steigerwald Fri, 23 Dec 2016 17:09:43 +0100 + +fio (2.15-1) unstable; urgency=medium + + * Imported upstream version 2.15. + * debian/patches: + - Refreshed. + - spelling-errors: Fix some spelling errors in HOWTO, fio binary + and manpage. + - reproducible-build: Sort object files for deterministic linking order + to enable reproducible builds. Thank to Rainer Herrmann for the bug + report and patch (Closes: #828791). + * debian/watch: Check for GPG signature of upstream tarball. + + -- Martin Steigerwald Mon, 19 Dec 2016 16:02:52 +0100 + +fio (2.10-2) unstable; urgency=medium + + * copyright: Added fiologparser.py copyrights as per git blame. 
+ * control: Added python-scipy as suggests for fiologparser.py. + * control: Removed Sven Hoexter as uploader again as per his request. + + -- Martin Steigerwald Tue, 24 May 2016 12:33:25 +0200 + +fio (2.10-1) unstable; urgency=medium + + * Imported Upstream version 2.10. + * patches: Refreshed, no manual changes needed. + * debian/rules: Enabled using the "bindnow" linker flag to disable more + insecure late binding. + * docs: Added file tools/fiologparser.py. + * Imported Upstream version 2.9. + * patches: Refreshed, no manual changes needed. + * control: + * Upgraded to standards version 3.9.8.0, no changes needed. + * Removed Michael Prokop as uploader as per his request. Thanks + for sponsoring the package initially. + * Added Sven Hoexter as uploader as he sponsored the last versions. + * docs: Add file MORAL-LICENSE. + + -- Martin Steigerwald Mon, 23 May 2016 16:28:16 +0200 + +fio (2.6-1) unstable; urgency=medium + + * Imported Upstream version 2.6. + * patches/fix-ftbfs-with-libmtd.h: Fix FTBFS: libmtd.h:288:8: error: + unknown type name 'uint8_t'. Thanks Chris. (Closes: #815735) + * control: + - Updated to standards version 3.9.7. No changes needed. + - Made Vcs-Browser and Vcs-Git URLs secure. + - copyright: + - Adapted path for library files that are now in oslib. + - fio.1: Fixed license to GPL-2. Don´t know how I came to the + conclusion that it would be GPL-2+, if the rest without any + license specified is GPL-2. + - rate_submit.c: Added copyright. + + -- Martin Steigerwald Wed, 24 Feb 2016 10:47:40 +0100 + +fio (2.2.10-1) unstable; urgency=medium + + * Imported Upstream version 2.2.10 (Closes: #803991). + * Fixes fio: calc_clat_percentiles() create unaligned access on armhf. + (Closes: #779336). Thanks to Andreas Bießmann. + * Fixes ssd-test 1g file size too small (Closes: #776701). Thanks to + Daniel Pocock. + * patches: + * Removed makefile patches as makefile is now generated via autoconf. 
+ * configure-no-configlog: Remove config.log at end of configure process + to avoid error about uncommitted change from dpkg-source. + * rules: + * Switched to dh_$@ wildcard version of rules file. + * Added override for dh_auto_configure cause upstream configure file + does not understand many options dh_auto_configure passed to it. + * Enabled build of graphical fio frontend gfio. + * Made sure that is only links in needed libraries for gfio to avoid + unnecessary dependencies. + * control: + * Updated to standards version 3.9.6. No changes needed. + * Point to github page as new homepage as it appears to be closest to a + homepage of any of the URLs mentioned in the README. + * Added new binary package gfio for graphical frontend of fio. + * Mentioned relationship between fio and gfio in package descriptions + and added suggests for each other. + * copyright: + * Fixed lintian messages about changed/wrong paths to + referenced files. + * Reviewed and updated by rechecking against all source files. + * Switched to debhelper 9 in order to have dpkg-buildflags and thus + hardening by default. + + -- Martin Steigerwald Tue, 10 Nov 2015 10:00:35 +0100 + +fio (2.1.11-2) unstable; urgency=medium + + * debian/control: Added RBD support by adding librbd-dev to build + dependencies. Thanks to Benjamin Drung for report and patch. + Closes: #760266. + + -- Martin Steigerwald Tue, 02 Sep 2014 15:37:28 +0200 + +fio (2.1.11-1) unstable; urgency=medium + + * dh_builddeb without -z9, so use xz, but do not set default level. + * Imported Upstream version 2.1.11. + * debian/patches: + - makefile-hardening: Adapted to new C++ build option. + - makefile-clean-delete-config-log-and-d-files: Deleted. Fixed upstream. + - makefile-rm-configlog: Added. + * debian/control: Updated to standards version 3.9.5. + No changes needed. + + -- Martin Steigerwald Mon, 21 Jul 2014 15:53:36 +0200 + fio (2.1.3-1) unstable; urgency=low * Imported Upstream version 2.1.2. 
@@ -32,7 +318,7 @@ * debian/control: Added build dependency zlib1g-dev as fio can use zlib to compress client/server communication. * Switched to xz compression. Saved almost 37 KiB on binary package. - * Updated to Standards-Version to 3.9.4, no changes needed. + * Updated to Standards-Version to 3.9.4, no changes needed. * debian/patches/fio_generate_plots-gnuplot: Make test for gnuplot work with empty strings (Closes: #700580). Thanks to Hervé Werner for providing bug report and fix. @@ -41,7 +327,7 @@ fio (2.0.8-2) unstable; urgency=low - * debian/patches/ftbfs-fix-for-mips-architecture: + * debian/patches/ftbfs-fix-for-mips-architecture: Fixed build failure on mips and mipsel architectures. Sven Hoexter suggested the fix and Jens Axboe confirmed it. Thanks. @@ -212,4 +498,3 @@ * Created manpage for bash shell script fio_generate_plots. -- Martin Steigerwald Tue, 19 May 2009 17:12:02 +0200 - diff -Nru fio-2.1.3/debian/compat fio-3.16/debian/compat --- fio-2.1.3/debian/compat 2013-10-07 08:31:01.000000000 +0000 +++ fio-3.16/debian/compat 2019-12-21 17:05:51.000000000 +0000 @@ -1 +1 @@ -7 +9 diff -Nru fio-2.1.3/debian/control fio-3.16/debian/control --- fio-2.1.3/debian/control 2013-10-07 08:31:01.000000000 +0000 +++ fio-3.16/debian/control 2019-12-21 17:32:39.000000000 +0000 @@ -1,18 +1,20 @@ Source: fio Section: utils Priority: optional -Maintainer: Martin Steigerwald -Uploaders: Michael Prokop -Build-Depends: debhelper (>= 7), dpkg-dev (>= 1.16.1~), libaio-dev, zlib1g-dev, librdmacm-dev, libibverbs-dev -Standards-Version: 3.9.4 -Homepage: http://freshmeat.net/projects/fio -Vcs-Git: git://anonscm.debian.org/collab-maint/fio.git -Vcs-Browser: http://anonscm.debian.org/gitweb/?p=collab-maint/fio.git +Maintainer: Martin Steigerwald +Build-Depends: debhelper (>= 9), dpkg-dev (>= 1.16.1~), libaio-dev [linux-any], zlib1g-dev, +# librdmacm-dev [linux-any], libibverbs-dev [linux-any], librbd-dev [linux-any], + libnuma-dev [amd64 i386], flex, bison, +# libglusterfs-dev 
+Standards-Version: 4.4.1 +Homepage: https://github.com/axboe/fio +Vcs-Git: https://salsa.debian.org/debian/fio.git +Vcs-Browser: https://salsa.debian.org/debian/fio Package: fio Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends}, python -Suggests: gnuplot +Depends: ${shlibs:Depends}, ${misc:Depends}, python2.7 +Suggests: gnuplot, gfio, python-scipy Description: flexible I/O tester fio is a tool that will spawn a number of threads or processes doing a particular type of I/O action as specified by the user. fio takes a @@ -20,3 +22,7 @@ otherwise parameters given to them overriding that setting is given. The typical use of fio is to write a job file matching the I/O load one wants to simulate. + . + This package contains the command line version of fio and all additional + command line tools. The package gfio contains the GTK+ based gui frontend + for fio. diff -Nru fio-2.1.3/debian/copyright fio-3.16/debian/copyright --- fio-2.1.3/debian/copyright 2013-10-07 08:31:01.000000000 +0000 +++ fio-3.16/debian/copyright 2019-10-21 06:18:14.000000000 +0000 @@ -1,15 +1,16 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: fio Upstream-Contact: Jens Axboe Source: http://brick.kernel.dk/snaps/ Files: * Copyright: 2005 Jens Axboe - 2006-2012 Jens Axboe + 2006-2017 Jens Axboe License: GPL-2 Files: debian/* -Copyright: 2009-2012 Martin Steigerwald +Copyright: 2009-2017 Martin Steigerwald Comment: This package was debianized by Martin Steigerwald on Tue, 19 May 2009 15:04:02 +0200. @@ -24,22 +25,32 @@ based on documentation by Jens Axboe License: GPL-2 -Files: fio_generate_plots.1 -Copyright: Written by Martin Steigerwald +Files: gfio.c +Copyright: 2012 Stephen M. Cameron + 2012 Jens Axboe +License: GPL-2 + +Files: graph.c +Copyright: 2012 Stephen M. 
Cameron License: GPL-2 Files: hash.h Copyright: 2002 William Lee Irwin III, IBM +License: GPL-2+ + +Files: rate-submit.c +Copyright: 2015 Jens Axboe License: GPL-2 -Files: rbtree.c -Copyright: 1999 Andrea Arcangeli - 2002 David Woodhouse +Files: tickmarks.c +Copyright: adapted from Paul Heckbert's algorithm on p 657-659 o + Andrew S. Glassner's book, "Graphics Gems" + ISBN 0-12-286166-3 License: GPL-2+ -Files: rbtree.h -Copyright: 1999 Andrea Arcangeli -License: GPL-2+ +Files: workqueue.c +Copyright: 2015 Jens Axboe +License: GPL-2 Files: crc/crc16.h Copyright: 2005 Ben Gardner @@ -49,15 +60,15 @@ Copyright: 1992, 1995-1999 Free Software Foundation, Inc. License: GPL-2+ -Files: crc/crc32c.c: +Files: crc/crc32c.c Copyright: 2004 Cisco Systems, Inc. License: GPL-2+ Files: crc/crc32c-intel.c -Copyright: Based on a posting to lkml by Austin Zhang -License: GPL-2 +Copyright: Based on a posting to LKML by Austin Zhang +License: GPL-2+ -Files: crc/md5.c: +Files: crc/md5.c Copyright: Shamelessly lifted from the 2.6 kernel (crypto/md5.c) License: GPL-2 @@ -77,18 +88,134 @@ 2003 Kyle McMartin License: GPL-2+ -Files: engines/windowsaio.c -Copyright: 2011-2012 Bruce Cran +Files: crc/xxhash.c crc/xxhash.h +Copyright: 2012-2014 Yann Collet +License: BSD-2-clause + +Files: engines/dev-dax.c +Copyright: 2016 Intel Corp License: GPL-2 -Files: lib/getopt_long.c -Copyright: This file was imported from the klibc library from hpa +Files: engines/pmemblk.c +Copyright: 2016 Hewlett Packard Enterprise Development LP +License: GPL-2 + +Files: examples/gfapi.fio +Copyright: Originally authored by Castor Fu +License: GPL-2 + +Files: examples/jesd219.fio +Copyright: Based on a posting from Jeff Furlong +License: GPL-2 + +Files: exp/expression-parser.l exp/expression-parser.y exp/test-expression-parser.c +Copyright: 2014, Stephen M. 
Cameron +License: GPL-2 + +Files: lib/ieee754.c +Copyright: Shamelessly lifted from Beej's Guide to Network Programming, found here: + http://beej.us/guide/bgnet/output/html/singlepage/bgnet.html#serialization + Brian "Beej Jorgensen" Hall +License: public-domain + +Files: lib/lfsr.c +Copyright: LFSR taps retrieved from: + http://home1.gte.net/res0658s/electronics/LFSRtaps.html +License: GPL-2+ + +Files: lib/prio_tree.c +Copyright: 2004 Rajesh Venkatasubramanian License: GPL-2 Files: lib/rand.c Copyright: Based on code from GNU Scientific Library 1.5 (30 Jun 2004) License: GPL-2 +Files: lib/rbtree.c +Copyright: 1999 Andrea Arcangeli + 2002 David Woodhouse +License: GPL-2+ + +Files: lib/rbtree.h +Copyright: 1999 Andrea Arcangeli +License: GPL-2+ + +Files: oslib/getopt_long.c +Copyright: This file was imported from the klibc library from hpa +License: GPL-2 + +Files: oslib/libmtd.c oslib/libmtd_int.h +Copyright: 2006 International Business Machines Corp. + 2009 Nokia Corporation +License: GPL-2+ + +Files: oslib/libmtd.h +Copyright: 2008, 2009 Nokia Corporation +License: GPL-2+ + +Files: oslib/libmtd_common.h +Copyright: 2007, 2008 Artem Bityutskiy +License: GPL-2+ + +Files: oslib/libmtd_legacy.c +Copyright: 2009 Nokia Corporation +License: GPL-2+ + +Files: oslib/libmtd_xalloc.h +Copyright: 2001, 2002 Red Hat, Inc. + 2001 David A. 
Schleef + 2002 Axis Communications AB + 2001, 2002 Erik Andersen + 2004 University of Szeged, Hungary + 2006 KaiGai Kohei +License: GPL-2+ + +Files: t/read-to-pipe-async.c +Copyright: 2016 Jens Axboe +License: GPL-2+ + +Files: tools/fio_generate_plots.1 +Copyright: Written by Martin Steigerwald +License: GPL-2 + +Files: tools/fiologparser.py +Copyright: 2016 Mark Nelson + 2016 Ben England +License: GPL-2 + +Files: tools/hist/* +Copyright: Karl Cronburg +License: GPL-2 + +Files: tools/genfio tools/plot/fio2gnuplot tools/plot/fio2gnuplot.1 tools/plot/fio2gnuplot.manpage tools/plot/graph2D.gpm tools/plot/graph3D.gpm tools/plot/math.gpm +Copyright: 2013 eNovance SAS + Erwan Velu +License: GPL-2 + +License: BSD-2-clause + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + . + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + . + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + License: GPL-2 This package is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -122,3 +249,8 @@ On Debian systems, the complete text of the GNU General Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". +License: public-domain + The C source code presented in this document is hereby granted to the public domain, and is completely free of any license restriction. +Comment: + http://beej.us/guide/bgnet/output/html/singlepage/bgnet.html#copyright + diff -Nru fio-2.1.3/debian/docs fio-3.16/debian/docs --- fio-2.1.3/debian/docs 2013-10-07 08:31:01.000000000 +0000 +++ fio-3.16/debian/docs 2019-10-21 06:18:14.000000000 +0000 @@ -1,4 +1,9 @@ README HOWTO +MORAL-LICENSE REPORTING-BUGS examples/ +tools/hist/fiologparser_hist.py +tools/hist/fiologparser_hist.py.1 +tools/hist/half-bins.py +tools/fiologparser.py diff -Nru fio-2.1.3/debian/fio.init fio-3.16/debian/fio.init --- fio-2.1.3/debian/fio.init 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/debian/fio.init 2019-10-21 06:18:14.000000000 +0000 @@ -0,0 +1,22 @@ +#!/bin/sh +# kFreeBSD do not accept scripts as interpreters, using #!/bin/sh and sourcing. +if [ true != "$INIT_D_SCRIPT_SOURCED" ] ; then + set "$0" "$@"; INIT_D_SCRIPT_SOURCED=true . 
/lib/init/init-d-script +fi +### BEGIN INIT INFO +# Provides: fio +# Required-Start: $syslog +# Required-Stop: $syslog +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Flexible I/O Tester as service +# Description: Runs fio as a service so that fio clients can connect +# to it +### END INIT INFO + +# Author: Martin Steigerwald + +DESC="Flexible I/O Tester as service" +DAEMON=/usr/bin/fio +DAEMON_ARGS='--server --daemonize /run/fio.pid' +PIDFILE='/run/fio.pid' diff -Nru fio-2.1.3/debian/fio.install fio-3.16/debian/fio.install --- fio-2.1.3/debian/fio.install 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/debian/fio.install 2019-10-21 06:18:14.000000000 +0000 @@ -0,0 +1,8 @@ +fio usr/bin +t/fio-genzipf usr/bin +t/fio-btrace2fio usr/bin +t/fio-dedupe usr/bin +tools/fio_generate_plots usr/bin +tools/plot/fio2gnuplot usr/bin +tools/genfio usr/bin +tools/plot/*.gpm usr/share/fio diff -Nru fio-2.1.3/debian/fio.manpages fio-3.16/debian/fio.manpages --- fio-2.1.3/debian/fio.manpages 2013-10-07 08:31:01.000000000 +0000 +++ fio-3.16/debian/fio.manpages 2019-10-21 06:18:14.000000000 +0000 @@ -1,2 +1,3 @@ fio.1 tools/fio_generate_plots.1 +tools/plot/fio2gnuplot.1 diff -Nru fio-2.1.3/debian/fio.service fio-3.16/debian/fio.service --- fio-2.1.3/debian/fio.service 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/debian/fio.service 2019-10-21 06:18:14.000000000 +0000 @@ -0,0 +1,12 @@ +[Unit] +Description=Flexible I/O Tester as service +After=network.target +Documentation=man:fio(1) + +[Service] +Type=forking +PIDFile=/run/fio.pid +ExecStart=/usr/bin/fio --server --daemonize /run/fio.pid + +[Install] +WantedBy=multi-user.target diff -Nru fio-2.1.3/debian/gbp.conf fio-3.16/debian/gbp.conf --- fio-2.1.3/debian/gbp.conf 2013-10-07 08:31:01.000000000 +0000 +++ fio-3.16/debian/gbp.conf 2019-10-21 06:18:14.000000000 +0000 @@ -1,4 +1,3 @@ [DEFAULT] pristine-tar = True -postimport = git-dch -N%(version)s -S -a ignore-new = True diff -Nru 
fio-2.1.3/debian/patches/configure-no-configlog fio-3.16/debian/patches/configure-no-configlog --- fio-2.1.3/debian/patches/configure-no-configlog 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/debian/patches/configure-no-configlog 2019-10-21 06:18:14.000000000 +0000 @@ -0,0 +1,11 @@ +Description: Remove config.log to fix dpkg-source error about changed files. +Author: Martin Steigerwald + +--- a/configure ++++ b/configure +@@ -2726,3 +2726,5 @@ + include \$(SRCDIR)/Makefile + EOF + fi ++ ++rm -f config.log diff -Nru fio-2.1.3/debian/patches/fio2gnuplot-manpage fio-3.16/debian/patches/fio2gnuplot-manpage --- fio-2.1.3/debian/patches/fio2gnuplot-manpage 2013-10-07 09:00:29.000000000 +0000 +++ fio-3.16/debian/patches/fio2gnuplot-manpage 2019-10-21 06:18:14.000000000 +0000 @@ -1,11 +1,11 @@ Description: Fix tag lintian manpage-section-mismatch. -Author: Martin Steigerwald +Author: Martin Steigerwald --- a/tools/plot/fio2gnuplot.1 +++ b/tools/plot/fio2gnuplot.1 @@ -1,5 +1,5 @@ .\" Text automatically generated by txt2man --.TH fio2gnuplot "07 août 2013" "" "" +-.TH fio2gnuplot 1 "August 2013" +.TH fio2gnuplot 1 "07 August 2013" "User Manual" .SH NAME \fBfio2gnuplot \fP- Render fio's output files with gnuplot diff -Nru fio-2.1.3/debian/patches/fio_generate_plots-gnuplot fio-3.16/debian/patches/fio_generate_plots-gnuplot --- fio-2.1.3/debian/patches/fio_generate_plots-gnuplot 2013-10-07 08:31:01.000000000 +0000 +++ fio-3.16/debian/patches/fio_generate_plots-gnuplot 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -Description: Make test for gnuplot work with empty strings - This way it also works if gnuplot is not installed. Fixes: - /usr/bin/fio_generate_plots: -: not found - http://bugs.debian.org/700580 -Author: Martin Steigerwald - ---- a/fio_generate_plots -+++ b/fio_generate_plots -@@ -22,7 +22,7 @@ - fi - - GNUPLOT=$(which gnuplot) --if [ ! -x $GNUPLOT ] -+if [ ! 
-x "$GNUPLOT" ] - then - echo You need gnuplot installed to generate graphs - exit 1 diff -Nru fio-2.1.3/debian/patches/fix-compilation-3.16 fio-3.16/debian/patches/fix-compilation-3.16 --- fio-2.1.3/debian/patches/fix-compilation-3.16 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/debian/patches/fix-compilation-3.16 2019-10-21 07:42:52.000000000 +0000 @@ -0,0 +1,16 @@ +Fix compilation of 3.16, picked from +https://github.com/axboe/fio/commit/5b215853ed4b438b5b2d4ac3e56d5f0d19e145d9 +Can be removed once 3.17+ is released. +Index: fio/gclient.c +=================================================================== +--- fio.orig/gclient.c ++++ fio/gclient.c +@@ -330,7 +330,7 @@ static void gfio_update_thread_status_al + static char message[100]; + const char *m = message; + +- strncpy(message, sizeof(message), "%s", status_message); ++ snprintf(message, sizeof(message), "%s", status_message); + gtk_progress_bar_set_text(GTK_PROGRESS_BAR(ui->thread_status_pb), m); + gtk_progress_bar_set_fraction(GTK_PROGRESS_BAR(ui->thread_status_pb), perc / 100.0); + gtk_widget_queue_draw(ui->window); diff -Nru fio-2.1.3/debian/patches/genfio-interpreter fio-3.16/debian/patches/genfio-interpreter --- fio-2.1.3/debian/patches/genfio-interpreter 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/debian/patches/genfio-interpreter 2019-10-21 06:18:14.000000000 +0000 @@ -0,0 +1,11 @@ +Description: Keep /bin/bash as interpreter, no usrmerge (yet). +Author: Martin Steigerwald + +--- a/tools/genfio ++++ b/tools/genfio +@@ -1,4 +1,4 @@ +-#!/usr/bin/bash ++#!/bin/bash + # + # Copyright (C) 2013 eNovance SAS + # Author: Erwan Velu diff -Nru fio-2.1.3/debian/patches/makefile fio-3.16/debian/patches/makefile --- fio-2.1.3/debian/patches/makefile 2013-10-07 08:31:01.000000000 +0000 +++ fio-3.16/debian/patches/makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -Description: Change makefile to build in correct locations. 
- This patch changes the makefile to build in /usr instead of - /usr/local. Also change manpage build dir to /usr/share/man - instead of /usr/man. -Author: Martin Steigerwald - ---- ---- a/Makefile -+++ b/Makefile -@@ -184,14 +184,14 @@ - else - INSTALL = install - endif --prefix = /usr/local -+prefix = /usr - bindir = $(prefix)/bin - - ifeq ($(CONFIG_TARGET_OS), Darwin) - mandir = /usr/share/man - sharedir = /usr/share/fio - else --mandir = $(prefix)/man -+mandir = $(prefix)/share/man - sharedir = $(prefix)/share/fio - endif - diff -Nru fio-2.1.3/debian/patches/makefile-clean-delete-config-log-and-d-files fio-3.16/debian/patches/makefile-clean-delete-config-log-and-d-files --- fio-2.1.3/debian/patches/makefile-clean-delete-config-log-and-d-files 2013-10-07 08:33:53.000000000 +0000 +++ fio-3.16/debian/patches/makefile-clean-delete-config-log-and-d-files 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -Description: Delete config log and .d-files on clean. - Makefile of fio 2.1.1 leaves them which prevents repeated builds - with dpkg-buildpackage due to: - dpkg-source: info: local changes detected, … -Author: Martin Steigerwald - ---- ---- a/Makefile -+++ b/Makefile -@@ -267,7 +267,7 @@ - $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_LFSR_TEST_OBJS) $(LIBS) - - clean: FORCE -- -rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) core.* core gfio FIO-VERSION-FILE *.d config-host.mak config-host.h -+ -rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) core.* core gfio FIO-VERSION-FILE *.d */*.d config.log config-host.mak config-host.h - - distclean: clean FORCE - @rm -f cscope.out diff -Nru fio-2.1.3/debian/patches/makefile-hardening fio-3.16/debian/patches/makefile-hardening --- fio-2.1.3/debian/patches/makefile-hardening 2013-10-07 08:33:08.000000000 +0000 +++ fio-3.16/debian/patches/makefile-hardening 2019-10-21 06:18:14.000000000 +0000 @@ -1,19 +1,17 @@ -Description: Change makefile to build with hardening 
flags. - This patch changes the makefile with hardening flags. It fixes - lintian warning: W: fio: hardening-no-relro usr/bin/fio. -Author: Martin Steigerwald +Description: Keep hardening build flags. +Author: Martin Steigerwald --- a/Makefile +++ b/Makefile -@@ -15,9 +15,9 @@ +@@ -20,9 +20,9 @@ endif - DEBUGFLAGS = -D_FORTIFY_SOURCE=2 -DFIO_INC_DEBUG --CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 $(DEBUGFLAGS) -+CPPFLAGS+= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 $(DEBUGFLAGS) - OPTFLAGS= -O3 -g -ffast-math --CFLAGS = -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -+CFLAGS += -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) + DEBUGFLAGS = -DFIO_INC_DEBUG +-CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS) ++CPPFLAGS+= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS) + OPTFLAGS= -g -ffast-math +-CFLAGS = -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR) ++CFLAGS += -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR) LIBS += -lm $(EXTLIBS) PROGS = fio - SCRIPTS = tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio + SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/fio_jsonplus_clat2csv) diff -Nru fio-2.1.3/debian/patches/makefile-manpagepath fio-3.16/debian/patches/makefile-manpagepath --- fio-2.1.3/debian/patches/makefile-manpagepath 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/debian/patches/makefile-manpagepath 2019-10-21 06:18:14.000000000 +0000 @@ -0,0 +1,14 @@ +Description: Adapt manpage path to Debian. 
+Author: Martin Steigerwald + +--- a/Makefile ++++ b/Makefile +@@ -349,7 +349,7 @@ + mandir = /usr/share/man + sharedir = /usr/share/fio + else +-mandir = $(prefix)/man ++mandir = $(prefix)/share/man + sharedir = $(prefix)/share/fio + endif + diff -Nru fio-2.1.3/debian/patches/series fio-3.16/debian/patches/series --- fio-2.1.3/debian/patches/series 2013-10-07 08:48:25.000000000 +0000 +++ fio-3.16/debian/patches/series 2019-10-21 06:27:56.000000000 +0000 @@ -1,4 +1,6 @@ -makefile +fix-compilation-3.16 +makefile-manpagepath makefile-hardening -makefile-clean-delete-config-log-and-d-files fio2gnuplot-manpage +configure-no-configlog +genfio-interpreter diff -Nru fio-2.1.3/debian/README.Debian fio-3.16/debian/README.Debian --- fio-2.1.3/debian/README.Debian 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/debian/README.Debian 2019-10-21 06:18:14.000000000 +0000 @@ -0,0 +1,37 @@ +How flexible I/O tester uses measurement units +---------------------------------------------- + +fio uses the opposite symbol for kibibytes/kilobytes (Kb/KiB) than +ISO 80000-1 by default in *jobfiles*. The user readable output however +adheres to the standard, while the parseable output for scripts also +uses the opposite symbol. 
+ +This leads to weird scenarios like: + +% fio --name=rand-read --bs=4k --size=1MiB --iodepth=64 --runtime=10 --rw=randread +rand-read: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=psync, iodepth=64 +[…] +% ls -l rand-read.0.0 +-rw-r--r-- 1 root root 1000000 Mär 26 13:10 rand-read.0.0 + +In case you decide to write fio jobs in a way that adheres to the ISO +standard, you can use: + +% fio --name=rand-read --kb_base=1000 --bs=4KiB --size=1MiB --iodepth=64 --runtime=10 --rw=randread +rand-read: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=psync, iodepth=64 +[…] +% ls -l rand-read.0.0 +-rw-r--r-- 1 root root 1048576 Mär 26 13:19 rand-read.0.0 + +Note: In this case the block size needs to be in "KiB" as well, otherwise +it would be just 4000 bytes in this example. + +I received no feedback from upstream developers regarding this issue. As I +decided to keep upstream behavior rather than diverting from it, using +'--kib_base=1000' is your option in case you decide to adhere to the +standard. For further discussion see: + +fio: uses the opposite symbol for kibibytes/kilobytes (Kb/KiB) than ISO 80000-1 +https://bugs.debian.org/872321 + +-- Martin Steigerwald , Mon, Mar 26 13:29:45 +0200 diff -Nru fio-2.1.3/debian/rules fio-3.16/debian/rules --- fio-2.1.3/debian/rules 2013-10-07 10:01:43.000000000 +0000 +++ fio-3.16/debian/rules 2019-12-21 17:43:55.000000000 +0000 @@ -9,72 +9,32 @@ # Uncomment this to turn on verbose mode. #export DH_VERBOSE=1 -# For hardening build flags -DPKG_EXPORT_BUILDFLAGS = 1 -include /usr/share/dpkg/buildflags.mk +include /usr/share/dpkg/architecture.mk -build: build-arch build-indep +export DEB_LDFLAGS_MAINT_PREPEND := -Wl,-z,defs -Wl,--as-needed -build-arch: build-stamp +export DEB_BUILD_MAINT_OPTIONS ?= hardening=-pie -build-indep: -# Since there are no architecture independent packages we -# have nothing to build by default. 
+export V = 1 -build-stamp: - dh_testdir +ifneq ($(DEB_BUILD_ARCH),$(DEB_HOST_ARCH)) +export CROSS_COMPILE=$(DEB_HOST_GNU_TYPE)- +endif - # Build inclusive I/O engine rdma. - EXTFLAGS="-DFIO_HAVE_RDMA" EXTLIBS="-libverbs -lrdmacm" $(MAKE) +%: + dh $@ - touch $@ - -clean: - dh_testdir - dh_testroot - rm -f build-stamp - - # Add here commands to clean up after the build process. - $(MAKE) clean +override_dh_auto_configure: + ./configure --disable-native --prefix=/usr +override_dh_clean: dh_clean + rm -f config.log -install-indep: -# Since there are no architecture independent packages we have -# nothing to install by default. - -install-arch: build-arch - dh_testdir -a - dh_testroot -a - dh_prep -a - dh_installdirs -a - - # Add here commands to install the package into debian/fio. - $(MAKE) DESTDIR=$(CURDIR)/debian/fio install - -install: install-arch install-indep - -# Build architecture-independent files here. -binary-indep: build-indep install-indep -# Since there are no architecture independent packages We have -# nothing to do by default. - -# Build architecture-dependent files here. -binary-arch: build-arch install-arch - dh_testdir -a - dh_testroot -a - dh_installchangelogs -a - dh_installdocs -a - dh_installman -a - dh_link -a - dh_strip -a - dh_compress -a - dh_fixperms -a - dh_installdeb -a - dh_shlibdeps -a - dh_gencontrol -a - dh_md5sums -a - dh_builddeb -a -- -Zxz +override_dh_installsystemd: + dh_installsystemd --no-start --no-enable -binary: binary-indep binary-arch -.PHONY: build clean binary-indep binary-arch binary install +# dh_installinit --no-start --no-enable gives update-rc.d: error: no runlevel symlinks to modify, aborting! 
+# https://bugs.debian.org/894084 +override_dh_installinit: + dh_installinit --no-start diff -Nru fio-2.1.3/debian/upstream/signing-key.asc fio-3.16/debian/upstream/signing-key.asc --- fio-2.1.3/debian/upstream/signing-key.asc 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/debian/upstream/signing-key.asc 2019-10-21 06:18:14.000000000 +0000 @@ -0,0 +1,171 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBE6byYYBEADdvefbg3TpCkasOnrc69r8neDjASq9/0l3kL6tkpGVZZN/NF73 +CAEeqnafQM7Dt89w1/5k/dnIqnZ7FsBdPz7TMnjomOZUuMurN5j4Cv05R1gBriwa ++Ayz2/lQn2Bdl/X0qz6A0g66JzQxdPGKKqNZizZFZCPknpMk6TSavac8RmJRJGM+ +Uj16qP8beabAAcN1aS45DOlksHfYheJ0/GLhb9/zuDl0uzblmRvQVncIjN/XYwQy +pOFP8Y6UAwRHcGY1XZUhHbPp04lvmo1YdYnMCSJDmziTSUD3fx1HyeepeNr01fZ8 +s3rWps0S+E4B+zijZH9sqcmeYhmnvqvzv7Vndee+u5zjOJFi5KzDr1LRajnmBPPO +uNyb29pVHO8B7sV+sqVyOGyE4zW6gvwda7IjU9g+RC5HoRDYMyKzOO7NhG7TAX5R +ls1KzaWTIae43YU01Pr+Ewm+wVAOQr/xbw/5KCo9N6XsLpZXNqabiIsgyBrXiGG6 +b2FD6i+MC6H6BtWHnfzldXXtje5ZxV74I15jTPSoTwrljc6tAnrtw6Ty3pWgtReT +HESWFta+HBBlfSfj/vZfXfEmsGI303j3X61qUm07orYsYAM0tdFcNRpcJtm0odwD +PPBnIU+YwR26t6eDroGpx4TWyQlzcFQKhbyxePAfPimpgq/5QN6PyCDtRQARAQAB +tBxKZW5zIEF4Ym9lIDxheGJvZUBrZXJuZWwuZGs+iQI6BBMBCAAkAhsDAh4BAheA +BQsJCAcDBRUKCQgLBRYCAwEABQJOm8ubAhkBAAoJEPfTWPspceCmlWEQAMLghzhu +4sDkjH0OC2IrsZc3HX+2P7ZMiQF+kfLemuXk/bAzmQZqT56C+NHpiS7B6qkU1B6s +0vxkXWqm0G+qb4FTV6wuQmf/sOaYKpLGN+LcIGzUp1GocIfLCPBHuNahCUXOKAhQ +vEOBoZSToS18p7R6lJvno8+9O98QUTX0WGSVT0Tgmkn6Oqd80+vSQs6icVXLwuk5 +gERfdSKNt6QsS70FW1unZuM25aywKKxC2Z86OSYF3Bt+S+7iNJXpHkqOyhx+Hwgj +F/pu++prOmgcBuEumcyP4x3e6xswd/NrdpRxF0RZHMLObkmk56Sk6Dfe9GA/1qXw +6hEqLiWqiAywH5N6LjscLkiASsmwedKGOFlzrRpyLTGKHfa7EyG5tSnNEuyMr1Ub +k0n2QYCnrQDpSufa+hSkzC4gUuq85pMtiFUJJwX6EV1/B33gMRDbET72rbrkKTn8 +JqREvZ9IBDin8HuCPBVVPApng6oseGV3FlmUkWsqP/x53jGBQYKytOpzAzP8S3vy +X1fJxiERfYZFueymvtl5alsjixmYNJ0Y69Xz/XsT8158ZV7bOO4FyDXAUlQ7rDFl +W8UjGNDDJIFysJxMqFv0kljQMH1d6tK19NnXCV9uUB/9K7+2p41+S+Jzq/A/PI45 +ZGWGITkTbjp+Ecas0cd0IeRaDLSiMUDezeI/0dOH04UBEAABAQAAAAAAAAAAAAAA 
+AP/Y/+AAEEpGSUYAAQEAAAEAAQAA/9sAQwAIBgYHBgUIBwcHCQkICgwUDQwLCwwZ +EhMPFB0aHx4dGhwcICQuJyAiLCMcHCg3KSwwMTQ0NB8nOT04MjwuMzQy/9sAQwEJ +CQkMCwwYDQ0YMiEcITIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIy +MjIyMjIyMjIyMjIyMjIy/8AAEQgAlgBwAwEiAAIRAQMRAf/EAB8AAAEFAQEBAQEB +AAAAAAAAAAABAgMEBQYHCAkKC//EALUQAAIBAwMCBAMFBQQEAAABfQECAwAEEQUS +ITFBBhNRYQcicRQygZGhCCNCscEVUtHwJDNicoIJChYXGBkaJSYnKCkqNDU2Nzg5 +OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6g4SFhoeIiYqSk5SVlpeY +mZqio6Slpqeoqaqys7S1tre4ubrCw8TFxsfIycrS09TV1tfY2drh4uPk5ebn6Onq +8fLz9PX29/j5+v/EAB8BAAMBAQEBAQEBAQEAAAAAAAABAgMEBQYHCAkKC//EALUR +AAIBAgQEAwQHBQQEAAECdwABAgMRBAUhMQYSQVEHYXETIjKBCBRCkaGxwQkjM1Lw +FWJy0QoWJDThJfEXGBkaJicoKSo1Njc4OTpDREVGR0hJSlNUVVZXWFlaY2RlZmdo +aWpzdHV2d3h5eoKDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLD +xMXGx8jJytLT1NXW19jZ2uLj5OXm5+jp6vLz9PX29/j5+v/aAAwDAQACEQMRAD8A +5LW7p7uZppGJZzk1z0h5rXvzheTWM5rSbKgMzULDJqaom6msTQgX5HK9m6VNCMnF +MkTcPftUqJJEscroyq5wCRjP0pM2oys7MvRR8Vchi5HFFsgZARWlaWM9w4WCF5GP +ZVzXPJns04WVz3f4ZzQyeCrWOIjdEzLIB65z/Iiuwrwfw9qt14RnaSXUra3Df6y2 +Y+azemVXp9citfVvjDPJC0Wm2KoSMGWY/wAlB/rVRnaNmeVXwU51W4apmR8RpP7V +8ZXK2UbTeUqxsUGeQOelcRcwJZ5+0zxRt/cB3N+Q6VLqOuahqAZZLlhGTny4xsX8 +hWV9keTouAe5qEurPTgpwgoLobljpcV9PbQC6ihafnzJ32ooAycmtC38Mre+JYNG +sbqO7LsFaaNTtXux564FYCwkpGrHcUG0V7f8MvDYsYJ9XnTEs4EcOR0QYyfxP8qc +VdixVZ0o839XPCr2QMgAbd+FZLGr9xt8vIcE+mKz2PNdsmfPxQlRt96njrTX+9WZ +oJWxPpd2ulWjXBJS4iM1q2eOCQV+vFZABPQV7DLa6Rd/CfSre4vYbXUIoTLD5mQd +4ZsY45z0/KpbsF2tjy+1v3tIEeOON5FfDCQbhjtx9c/mK0n1fUbyPZJdSeX/AM84 +/kX8lxn8aypEjZi4YAkBnjH8JzzVqG/skO0yYI9qzloexh6kJJOTJ0iYDAAUVKIf +U5qxatBcqTFKjY689KseQOwzWTkelGKaumUAgHRQKciAnLZNWGjAPJFAQdcUrhy2 +Oh8G+E5fE+psgcRW0ADSvjPXoB7nmvf4IUt7eOGMYjjUKo9AK4z4Y6K+meH2u5si +S9IcKeyDOD+OSfpiu2JwM1vTVlc+bx1Z1KritkfHsw61Sfg1em5zVF+tbnMhAeaa +/wB40q9aG+9UlmjpFv51wPmAI9RnHvW3q5ieBAbhrhI1Cgqxwh9MdhVbwrEbi9MO +BsCl3J7+gr1HQ/h9b+IrU3shEMP3IzGoG/HU+4o6D06niM5BlSZRgk/MAcg1SuTi +Q+o617ZefCfT7fz1F3JuXlTjj6VwWpeC5YWOZQV6AgdqzclHc3VCcleJx0N2Y5N2 
+SPcGuhg8QPdGKKZhhF2AKAM+596hHhtY2+diazb6xNjJuUZU96jmhJ2RrFVqPvHW +LPbjYQHk3EAlRwK6vw74ei1vW7S1hlE0BUSzsisAi9wcgc9q5TQ0iuLK3VPnllIU +D/aJxivoDwb4ZPh7TMThDeS/6wryAAeAD+NRGN2d+Jxfs6Kd9WdGiJDEqIAqIAAB +2AqKQknJPHpT5JFHU1karqMtsgECAs38bn5V+vcmtmzwIpvU+V5SeR2qnJ1q3L1O +Bx61UfrWwkNUc0pHzUJ1pf4zUlo6vwZpv9o3Jt95UTSLE4HUr1IB/n9K+nba3i0+ +xitoECxQoERRxgCvnL4XIG8Vw7um9B19Wr37VdcttPfy5Qxz1ZVyBUt2HyOTSRha +zMHaVVxlutcheRIEOVB781u32qWd5ue1cOe4HFYt1MiwSPJtXjBzXLUXMz3qELRR +xOoFfMYKMVzupR77dwRz1roNQv7QyMFdSfasSdkmRtpByKzUXF3FVcZJoz9Lumhg +OwsHR8rt6g179P8AEyxttKso7dmvblokR5V4UyYG4Envn2rh/gcbD+0dYiuhA0zL +F5QlAJPLZxn6iuhudJaXVtZuZyEubW4VFMaBV5jU5xj361vOappyex5MpOo402ti +vL8S726tJwunSTMpKutsTkD2I9iP1rBbxRJMRJLoOqkKOF2HC/QYq94fmubDU9S1 +VJG/cXX2fCDAbK5LN2wM11Hh261PWdG1OcXFtLGjSsHdW9+hDe1OLjJ6ESTieEyJ +NDAnmQghxlWPXFUZVxyDkVpb4WTeWV3Tqj5w34isy6nEshKoqD+6vaus5kxqnmnA +/MagD4pVYnOOT6CpLTPYfgrbC6vLwlIyIZYpd3G8YWQflkivT/Fi3iafNLaRqzqh +ZQxwDjt9a8p+DesWFjrL6ZcRSQXV6UaKWQDkgH5Poc5Fe43oWeNoWXK9zWco3NIS +amrHlHha7vtSvmhvrROmS4GMc9PesTx5cPp2ptbR8qeor1/S7WxFwyxIoYc8D9TX +jnxGZZfEs390HGaxnHlVz1qNRzk4dkc2b24EEZitkw/Jx2+tVCTMjSFNrA81q2qD +yMMOKq3ixxKQhyT1rOUk9hqlJbsy9NupNFu4b2NmQpcK5K9QA3avZ5NQjmXX7qGO +YQXN0jozLgEeVH6++a8Rnha61CztHnjgSaYZllOETPAJPavUfEN9q+iyHRZ9Pmmt +5CqRXCjIVRwFGOOP60qsHOnp1OSaSqK3Qn0GaNPCHie5MUUkwvpfKEpwhcRxjB/7 +6qX4Ya7c6n4Z1PSotKSG5jTYtyIyIpSxwQx6ZUHOM1588mrSyjR491wklzJIkSI+ +5CR0OQAemeM17lp97ofhnwxaaZJeQ2pWEHEzBGdyMtwe+TXXTp2gmcdR628z5hlm +ODgAfSqjNXTz+EL15XMVxbMmTty+Diqz+D9RH/LS2/7+10WZmqcuxgKC7YH41YWI +AYyxP5V1Fr4Ma00yS+1W5NvFuUKYozJ8pONxx29Kv+G7KOcwutsu24XNmjHH2vHy +yIJOqvkBl9PxrSNK+5lKVjG0K11W1vrbULWCUNZ7LlC5wSgcAFQeWGSBxmvqTUp1 +Fh5yggOu7kYPNfPGo+IJZNP1G1t0c2yp5YUsY22vIvzbR8odXXa20ANu59a9V8Le +IRrPw4sLlXEs8EYgcE9WTjn6jBrLEJRWhrhrymrljT5b20jnuBKsckzA/Mu75Qem +PevLvF95HcasZ0xJBJuUDuGBxXoV9LqlvbGYRWhzzuJLEfpXmes3EjKqNp6iNWZh +t4O49a43qj3oxes0TW3lvpuV+8owwrB1FxDE7seACa0bFswSMishC/MjmuZ8R3u2 +FYlOGc9PQVnGN5WIrVeWFys919stR91XyCRuAJPYAde9a6eL9bW2uYXnc73DI2/e 
+I9nGB1/E965aIb0Mu8q20uMD+IdfyHP4Vesg6pGqFlDSGLHJYsOwH3R+IrsUUlZH +le2bd5DbzVb28uRLNO7ODkEcc/QVqyvc2Gk2737wXxmm8wRTOzlQVAwTn2H0rLaN +Lh1jZY0d1yqoclR/tHPU8fl0pj21zPCFRd6oxAJwPwOTTWhE3zPQ+gri00IDc+lW +ZJ9YhVK507RIIllGkW4mdgIkSLcxPYBR94+34kgU6yjYwx3MylyzbYIs8yN/gOee +2D6Vh6nqUckMtxJNN9kZzAZYhia+fPMMA/gjHdupzzknFehFqUb2Mq8uR8sWZviN +5dUtry3t0kuLmLEpaJgsNuV5Jkl6SMAMYGFHQVnaHcWU1olpGzi3bVUnjhY58iNE +Ls/sW4HHoaj1SdTeWkF7CkrRgN/ZcEnlwWUXcO3dyOuemefSshIrqx1UWphW1ub9 +mtzGP+WW7t+AcCok9TBLQ1PDUVrrsviGW8lMUX2OS5DZweJFlf6YAUVc+E17csNQ +0+GTJeHzkhY4DFeD+OCKyr8xaN4R1V7MEi/v209JD2hjCu//AH0dg+imsPwjrbaB +4gsL7JEayFJcd0bg/wCP4VhUV4tG9KXLNM+gXexvtLLSeYGwd8ZbBVh1BrzjUiLN +m8tmaMnoxzXdatpqahELyzmMcjLyyHhh2yO9efa3Z3cP7uV+F9B1rgSTPYVVxRjy +6osZYKOTxgVyGsytLe/MeQoz7V0DQrEzO/RRkmuUuJTNcPIf4iTWlKKvoceJm3HU +t2cQn0q/X/lpAFnX/dzsb/0JfyrSilj/ALSAHRp1mHtmIk/qaqeHB5uri1PS7ikt +vxZSF/8AHttIA0aR3MSFiIi7D0AQLn9a36nFcmRo10+Mgo+IiJoSuWwXYhx7jI/O +p1hiVgtwDIgXcTE23eD0dTjpnqOx9M1YkF1NqbwoI0vbJC9sqKCJo+uz3+XOPxFJ +sjdIDbkC2uSWtS3SGb+KI/7LdPxHvT5RqVtT1nWb9ZTdJPOLeG3TbezIcm3iPAhQ +95Xx83oMDoK5W8v7tdThuY0CapeQf6FEzfLplvyNx9G2jI9Ac9TTtQv7WCV3Z1uN +LsmzGoU41C8wNzHPUZ5+mPWufvjM3m2lzJGb+6xc3tyz4MUeOIs9uoyPXArrlKyM +IxK19NaraT2VtcBtPjk3KxUiS/k6Hn+6P881Ru1eGVr9bjzRFIAJM9ZmG9sZ5IB4 +z7D1pbjUHlEMnniEKhhihWHeYYT3B9Tn6+9RPEjRRTMphtTLsiiJy2FA3ufc8fif +asW7mliXxG80KWmmCRtkcSSSR7uPOkBZjj1+bH4VkfZ5Y8I0bB1YjbjnPpVrUrpr +3Upbt+GlnDkenX+ldLGLa+Fxp92ZIYzc/aEuoYw7w4h3kgEjIIHTI6Ubti2sdJ4L +8VtLo4s5nPmW42gnuvYH6UurXyXJJPWuPT7Rpmpw3MrJLHcL8l1F9y5Xs3s3qDz6 +jPXokgF0A6nINcFSDUj1qE+aBy2tTBYzCm5mfqF5NcwEJbGOc16IbeBpLqKAiPau +281F1ylrGeqoO8jdP09SOUvRbziSWztRBb2hRFzy7/Njc57sSG9h0FdMIcsTgrz5 +psq6Us0GrWU6plo7gMoPGWQhse1aaxtHax3G4RxTxGNyeQodWI/VcVZ0mEeXp9xJ +j5dVQOfZ2Of/AEE1XvS1rp1lDLG0sDgwyIpwd8UjE4PY4etLdTG5Yt7e6Fu9sr/8 +THTf9LtHHPmxdWUHvjqPoRUrx2kzoQRFpms/MrDkW1yP6ZOPoafEWtLSOWGUC50o +rc2zvx51rJglT64LDj/aYVNJp/2iTVdHtspHIo1LTlHR8DLAE8/dJx/u1aRLZPe3 +kYi3xgw2unZSyidOZpj95mHrxn2wBWHM6iN4ZG3KGEl1KRlpJOyD2/8ArmtKfUZL 
+x1W+LrqVqpSLecBj6t/te/esaUMr7MlmRjyf4pD1Y/SocrmqiV5BcXEhRQzSv95U +OPoMD04q7/ZM4mitZFZZip+Q9UQMQPxZsmtCxsks08xuZSPyFatwrW1hfa1I++a4 +TKn3G1FH/fRJ/wCA1MJJuxpUoyhBSZxksayXrpHygcqv0VK2LOfbqQH/AD2jjjH1 +eFox/MVU022y6senl3bjP+zCf6impIVuYJQeYzbH8mNXHTUwZpaCjGCXTJIw8V3H +HOoPGCpKEqex6/XGK09OvU0uzu5bvzJoYTGqCM483zM7eewwCfwqpCot9T0p0BIW +e5teP9l9w/R6t3Ni8+o/2DbTLJBYs13cOAMBmbCLnuE8wfm3rTnBMqnVcbpGPrGo +XGpXS2oRYbKGRGjt4uEXqxY+rY6k81n6lEbSWeyH8UMEz/UqHP6ua2YrdG0a9v2X +aWilnjHcLlY0/Td+VZgYX+pTXEn3Tavkj0jXH8lFHLYi9zUeBotI1GJOGREu0+sc +xU/pJWd4huj/AGtLFGqmKS6F9CD0HmKGI+nOD9K6bXfs9rHbNGu1JBcWco35yWjD +jtxgsp5rgDcu9xBK+S0Soo9wOgok0KJ2lrEI2t1lG6zik+wXUMuCRDMzOjK/dcAE +H19jSMLmy0uG4jDDUfDd55T7uCYWY7SfbOR/wKrFpbxahapaqA0V5ataq44ZZUJl +gP4rtX8D6VcsSuoanpssn+r12wbT7g54WdflDEeoIQ/jWqRLZpa74FS5Uutzh1GF +Zhkj2965QaO2nEieRZpFOAcdPpRRXBBvlPXUI+02I5JDXQaLZR654durGZmX7POJ +lb1GOR+aiiitaS94eL/hnMIfs1pCoAz/AGbckn3dnU/pWcEwjn+6ID/Oiiuh9Dyk +dBbA/a4cnmPV5sfioz/6CK3rMRxeJvFgjQAG1DDjodhP86KK1tsZs53V5vs8KWij +5JNNtU+gLFj+pNVdFtV/s37QeT5Fwh+hZF/9mNFFTb3ivsh4pvHGoX1j/BHfNKp9 +/LA/9lFcxjCk+lFFYVNzSGx3fgGzl1m3nt4pFjezC3BZ8kYWRWGAO+PMHb73vUl/ +4osrPfb6RYmRkupJ4rq+VWeJ2OTsUcDp1JY/Siito7IzktT/2YkCNwQTAQgAIQUC +Tpv+pAIbAwULCQgHAwUVCgkICwUWAgMBAAIeAQIXgAAKCRD301j7KXHgpvO8D/4j +EC5Mrje4w0fMAaLNK+M+3XUDMyChrnNw7ZdApmhjZtU7R87+SErbzH+D3i9L8uot +JldTjcxareBfxkX5SbAey3IFrCce5pam8kDV/9lT9rm44uy/aTOoKC7RKDRx2l99 +4z5zL4OyHNGz+f3GP92Igx5IfzOvkXwjxWhltkc9eIn+bFMNLN/APZGTa2bLbdsG +8tSzmJw6dm9g/0tApTsA59v/eiIGUcvdiMLNAotm10AEOKF6WFrewdO6DPX93evp +NhF/bHZuSgr/gvGuwuWtgvt5HbgT767a3Hhl+XxQwlHDlh8/VWLC/lxxiEl6Aq7i +GhecSXH5YLTUdFQ4akk1C3N3fN5qQ00F/fiGIhLUZM1XbY4hKwGfU6fnPTmwR6kp +nOlSiWEMhuYD6xmwKBRRxihcYglp1osLZ3SmZ4LWjb2c5pfZKnWVTDHKSBuWK8tk +IXBmbb1K1K8r5OkSjBNvXwmZu1iM4VUmDFSiIK/fVro/VabTGkaPXGg0DGFyt8ka +wKIz5Hn48sCI5AF2tux0hHVm9XmEGvnEIyI8zXjKrOLGkR7Ml5ox3cxaynBWwBUg +EX6iHTz5NZ1xe+nRpYOP4wl+K+D5v7tSQygvD2m3n/oqfwRnob7BB30vdj1VyT8j +SJ4/tXi+PWfL/AxJw5pE3to9/LtXzmRYbJDpBhJHkbkCDQROm8ukARAA2qnAQb9B 
+YxDHVIghh9dS01Fc/Yoq6zPWsTYLgoxXzQMYpEma3VN0Y+/xmYOoTdpg/wGv1ZfK +YJZYOJPUyQ3PMLNs+JsSgXOjqNfWKWBykYfYc5XEzo8nKRNvCkflMvx8gFt0B6fO +ahU4WSTOmuLGz9HA7xQeavG0XUKDxceRyGbUaPrNDkmcyJIQwxDCY0FQBZfX6Hd8 +KI9P0nJYCob4Q+wPS9WZyPgVYSlbsSIn6D0Ef7waCSkY9URX+gciUB9noRWgXB8M +F/Shn0OYSa4gGLYRdkPm57ZZBBJhzF9QjipMBEZ+M/to4g1rYv6kMJFGlc0UPehS +vVn2M+H8oB4508OqH7V7mV4lIJkQiMalszAoq4rwXoxIAW3UVw0nyWsgdomQ3DIV +yvL6cl+qAVHCoaLcyMHXfg7avGvnn711HOoJhP3pbKckjubY+ZwJCYc/LNY1BxL0 +GUUJ1nPOJZbW9vAzyx+H174ETYTmOZ2XDl+Oby980xwKH4ApUrilrHdzRfdbfzfZ +gOGPy1cProM94yaoC2EEnCp5klgAhyBZvfxmSxXyOBGyY12K2BhuhI8axDsaJjlm +kjuspw/FFAUcWKf5wAFybzQs2yziObS6AIDl+NPg2SxjgmP+0YbaqDRwPA6AOivu +lfb1byb7UsbUxHeLBMkQjflGcAE6EyVaO3MAEQEAAYkCHwQYAQgACQUCTpvLpAIb +DAAKCRD301j7KXHgpuD8EACeFpgkKP7hUPb1PtzrVUMoNRarsKVfaPfl0Ln/2lkS +dBhobNitIy13MJjYzkc3NVcm4F3+HW8lV0NZp80PMFUKY0TOtKan+ePmMnTSggia +6bZGSYtgF2PDZgUKhCLiVB5D131aiTtL7KHuGHO6YxCnxHe9kEUCmQ8vxB4WUy+P +UYTqvytRiqXuOTMiG3QNsfloxO+vPZfg36LA0OlWvIQI0zEwVwSvK95lkfxIxHJd +zfueynd4vQWh7zNFYZrh9E7E8DSvWeU9AzBvoLBS8lw5GP+mN1h7kdWzXRpIH5+x +rOKchbXs2GpykNKpNzFYrDdcLPtQ9I6IMkm571uZT3QFeIVZH8FqsUaCNufHiy9E +cv+95jsqepmrkvjh5LVQ7NOc+UOp6rtth7yKIcaF8cVsZFIKGp9X4cEilQGG8aTF +ASAWM+pVfJccLR1nCPK5bHPV/2zWH2IslMSHe5V8H9P+TqaqShf1vgbYE5wRWhg7 +AZNSiaACZuVjXvqNhs6o/7P0pDqHcdtwND6a/J9wC87Gug+oQtDKV+tT6smUW048 +GjUMc+5iYOwiGNCcaQVeFV06wq1GRjwY56Ueq0xebRI3hDFagCda/dHZdN6CUSB7 +X7BaWQ1JZbJqtpQc/1yRQ/Ht/LGdeR622xbhCbx05PB9C2hcIP+jmRRKDma83Ii6 +Zg== +=lYau +-----END PGP PUBLIC KEY BLOCK----- diff -Nru fio-2.1.3/debian/watch fio-3.16/debian/watch --- fio-2.1.3/debian/watch 2013-10-07 08:31:01.000000000 +0000 +++ fio-3.16/debian/watch 2019-10-21 06:18:14.000000000 +0000 @@ -1,2 +1,3 @@ -version=3 -http://brick.kernel.dk/snaps/fio-(\d.*)\.tar\.gz +version=4 +opts=pgpmode=mangle +opts=pgpsigurlmangle=s/$/.asc/ http://brick.kernel.dk/snaps/fio-(\d.*)\.tar\.gz diff -Nru fio-2.1.3/debug.c fio-3.16/debug.c --- fio-2.1.3/debug.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/debug.c 
2019-09-20 01:01:52.000000000 +0000 @@ -1,26 +1,18 @@ +#include #include -#include -#include + #include "debug.h" +#include "log.h" #ifdef FIO_INC_DEBUG void __dprint(int type, const char *str, ...) { va_list args; - pid_t pid; assert(type < FD_DEBUG_MAX); - pid = getpid(); - if (fio_debug_jobp && *fio_debug_jobp != -1U - && pid != *fio_debug_jobp) - return; - - log_info("%-8s ", debug_levels[type].name); - log_info("%-5u ", (int) pid); - va_start(args, str); - log_valist(str, args); + log_prevalist(type, str, args); va_end(args); } #endif diff -Nru fio-2.1.3/debug.h fio-3.16/debug.h --- fio-2.1.3/debug.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/debug.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,8 +1,7 @@ #ifndef FIO_DEBUG_H #define FIO_DEBUG_H -#include -#include "log.h" +#include "lib/types.h" enum { FD_PROCESS = 0, @@ -19,10 +18,33 @@ FD_PROFILE, FD_TIME, FD_NET, + FD_RATE, + FD_COMPRESS, + FD_STEADYSTATE, + FD_HELPERTHREAD, + FD_ZBD, FD_DEBUG_MAX, }; -extern unsigned int fio_debug_jobno, *fio_debug_jobp; +extern unsigned int fio_debug_jobno, *fio_debug_jobp, *fio_warned; + +static inline bool fio_did_warn(unsigned int mask) +{ + if (*fio_warned & mask) + return true; + + *fio_warned |= mask; + return false; +} + +enum { + FIO_WARN_ROOT_FLUSH = 1, + FIO_WARN_VERIFY_BUF = 2, + FIO_WARN_ZONED_BUG = 4, + FIO_WARN_IOLOG_DROP = 8, + FIO_WARN_FADVISE = 16, + FIO_WARN_BTRACE_ZERO = 32, +}; #ifdef FIO_INC_DEBUG struct debug_level { @@ -31,7 +53,7 @@ unsigned long shift; unsigned int jobno; }; -extern struct debug_level debug_levels[]; +extern const struct debug_level debug_levels[]; extern unsigned long fio_debug; @@ -39,7 +61,7 @@ #define dprint(type, str, args...) 
\ do { \ - if ((((1 << type)) & fio_debug) == 0) \ + if (((1 << type) & fio_debug) == 0) \ break; \ __dprint((type), (str), ##args); \ } while (0) \ diff -Nru fio-2.1.3/diskutil.c fio-3.16/diskutil.c --- fio-2.1.3/diskutil.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/diskutil.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,22 +1,25 @@ #include #include -#include #include #include +#include #include #include -#include +#ifdef CONFIG_VALGRIND_DEV +#include +#else +#define DRD_IGNORE_VAR(x) do { } while (0) +#endif #include "fio.h" #include "smalloc.h" #include "diskutil.h" +#include "helper_thread.h" static int last_majdev, last_mindev; static struct disk_util *last_du; -static struct fio_mutex *disk_util_mutex; - -FLIST_HEAD(disk_list); +static struct fio_sem *disk_util_sem; static struct disk_util *__init_per_file_disk_util(struct thread_data *td, int majdev, int mindev, char *path); @@ -29,12 +32,13 @@ while (!flist_empty(&du->slaves)) { struct disk_util *slave; - slave = flist_entry(du->slaves.next, struct disk_util, slavelist); + slave = flist_first_entry(&du->slaves, struct disk_util, slavelist); flist_del(&slave->slavelist); slave->users--; } - fio_mutex_remove(du->lock); + fio_sem_remove(du->lock); + free(du->sysfs_root); sfree(du); } @@ -61,23 +65,29 @@ dprint(FD_DISKUTIL, "%s: %s", du->path, p); - ret = sscanf(p, "%u %u %llu %u %u %u %llu %u %u %u %u\n", &dus->ios[0], - &dus->merges[0], §ors[0], - &dus->ticks[0], &dus->ios[1], - &dus->merges[1], §ors[1], - &dus->ticks[1], &in_flight, - &dus->io_ticks, &dus->time_in_queue); + ret = sscanf(p, "%llu %llu %llu %llu %llu %llu %llu %llu %u %llu %llu\n", + (unsigned long long *) &dus->s.ios[0], + (unsigned long long *) &dus->s.merges[0], + §ors[0], + (unsigned long long *) &dus->s.ticks[0], + (unsigned long long *) &dus->s.ios[1], + (unsigned long long *) &dus->s.merges[1], + §ors[1], + (unsigned long long *) &dus->s.ticks[1], + &in_flight, + (unsigned long long *) &dus->s.io_ticks, + (unsigned long long *) 
&dus->s.time_in_queue); fclose(f); dprint(FD_DISKUTIL, "%s: stat read ok? %d\n", du->path, ret == 1); - dus->sectors[0] = sectors[0]; - dus->sectors[1] = sectors[1]; + dus->s.sectors[0] = sectors[0]; + dus->s.sectors[1] = sectors[1]; return ret != 11; } static void update_io_tick_disk(struct disk_util *du) { struct disk_util_stat __dus, *dus, *ldus; - struct timeval t; + struct timespec t; if (!du->users) return; @@ -87,21 +97,21 @@ dus = &du->dus; ldus = &du->last_dus; - dus->sectors[0] += (__dus.sectors[0] - ldus->sectors[0]); - dus->sectors[1] += (__dus.sectors[1] - ldus->sectors[1]); - dus->ios[0] += (__dus.ios[0] - ldus->ios[0]); - dus->ios[1] += (__dus.ios[1] - ldus->ios[1]); - dus->merges[0] += (__dus.merges[0] - ldus->merges[0]); - dus->merges[1] += (__dus.merges[1] - ldus->merges[1]); - dus->ticks[0] += (__dus.ticks[0] - ldus->ticks[0]); - dus->ticks[1] += (__dus.ticks[1] - ldus->ticks[1]); - dus->io_ticks += (__dus.io_ticks - ldus->io_ticks); - dus->time_in_queue += (__dus.time_in_queue - ldus->time_in_queue); + dus->s.sectors[0] += (__dus.s.sectors[0] - ldus->s.sectors[0]); + dus->s.sectors[1] += (__dus.s.sectors[1] - ldus->s.sectors[1]); + dus->s.ios[0] += (__dus.s.ios[0] - ldus->s.ios[0]); + dus->s.ios[1] += (__dus.s.ios[1] - ldus->s.ios[1]); + dus->s.merges[0] += (__dus.s.merges[0] - ldus->s.merges[0]); + dus->s.merges[1] += (__dus.s.merges[1] - ldus->s.merges[1]); + dus->s.ticks[0] += (__dus.s.ticks[0] - ldus->s.ticks[0]); + dus->s.ticks[1] += (__dus.s.ticks[1] - ldus->s.ticks[1]); + dus->s.io_ticks += (__dus.s.io_ticks - ldus->s.io_ticks); + dus->s.time_in_queue += (__dus.s.time_in_queue - ldus->s.time_in_queue); fio_gettime(&t, NULL); - dus->msec += mtime_since(&du->time, &t); + dus->s.msec += mtime_since(&du->time, &t); memcpy(&du->time, &t, sizeof(t)); - memcpy(ldus, &__dus, sizeof(__dus)); + memcpy(&ldus->s, &__dus.s, sizeof(__dus.s)); } int update_io_ticks(void) @@ -112,9 +122,9 @@ dprint(FD_DISKUTIL, "update io ticks\n"); - 
fio_mutex_down(disk_util_mutex); + fio_sem_down(disk_util_sem); - if (!disk_util_exit) { + if (!helper_should_exit()) { flist_for_each(entry, &disk_list) { du = flist_entry(entry, struct disk_util, list); update_io_tick_disk(du); @@ -122,7 +132,7 @@ } else ret = 1; - fio_mutex_up(disk_util_mutex); + fio_sem_up(disk_util_sem); return ret; } @@ -131,18 +141,18 @@ struct flist_head *entry; struct disk_util *du; - fio_mutex_down(disk_util_mutex); + fio_sem_down(disk_util_sem); flist_for_each(entry, &disk_list) { du = flist_entry(entry, struct disk_util, list); if (major == du->major && minor == du->minor) { - fio_mutex_up(disk_util_mutex); + fio_sem_up(disk_util_sem); return du; } } - fio_mutex_up(disk_util_mutex); + fio_sem_up(disk_util_sem); return NULL; } @@ -171,7 +181,7 @@ /* * must be a file, open "." in that path */ - strncpy(tempname, file_name, PATH_MAX - 1); + snprintf(tempname, ARRAY_SIZE(tempname), "%s", file_name); p = dirname(tempname); if (stat(p, &st)) { perror("disk util stat"); @@ -231,21 +241,29 @@ !strcmp(dirent->d_name, "..")) continue; - sprintf(temppath, "%s%s%s", slavesdir, FIO_OS_PATH_SEPARATOR, dirent->d_name); + nowarn_snprintf(temppath, sizeof(temppath), "%s/%s", slavesdir, + dirent->d_name); /* Can we always assume that the slaves device entries * are links to the real directories for the slave * devices? 
*/ - linklen = readlink(temppath, slavepath, PATH_MAX - 0); - if (linklen < 0) { + linklen = readlink(temppath, slavepath, PATH_MAX - 1); + if (linklen < 0) { perror("readlink() for slave device."); + closedir(dirhandle); return; } slavepath[linklen] = '\0'; - sprintf(temppath, "%s/%s/dev", slavesdir, slavepath); + nowarn_snprintf(temppath, sizeof(temppath), "%s/%s/dev", + slavesdir, slavepath); + if (access(temppath, F_OK) != 0) + nowarn_snprintf(temppath, sizeof(temppath), + "%s/%s/device/dev", slavesdir, + slavepath); if (read_block_dev_entry(temppath, &majdev, &mindev)) { - perror("Error getting slave device numbers."); + perror("Error getting slave device numbers"); + closedir(dirhandle); return; } @@ -256,7 +274,8 @@ if (slavedu) continue; - sprintf(temppath, "%s%s%s", slavesdir, FIO_OS_PATH_SEPARATOR, slavepath); + nowarn_snprintf(temppath, sizeof(temppath), "%s/%s", slavesdir, + slavepath); __init_per_file_disk_util(td, majdev, mindev, temppath); slavedu = disk_util_exists(majdev, mindev); @@ -281,11 +300,10 @@ dprint(FD_DISKUTIL, "add maj/min %d/%d: %s\n", majdev, mindev, path); du = smalloc(sizeof(*du)); - if (!du) { - log_err("fio: smalloc() pool exhausted\n"); + if (!du) return NULL; - } + DRD_IGNORE_VAR(du->users); memset(du, 0, sizeof(*du)); INIT_FLIST_HEAD(&du->list); l = snprintf(du->path, sizeof(du->path), "%s/stat", path); @@ -295,16 +313,17 @@ sfree(du); return NULL; } - strncpy((char *) du->dus.name, basename(path), FIO_DU_NAME_SZ); - du->sysfs_root = path; + snprintf((char *) du->dus.name, ARRAY_SIZE(du->dus.name), "%s", + basename(path)); + du->sysfs_root = strdup(path); du->major = majdev; du->minor = mindev; INIT_FLIST_HEAD(&du->slavelist); INIT_FLIST_HEAD(&du->slaves); - du->lock = fio_mutex_init(FIO_MUTEX_UNLOCKED); + du->lock = fio_sem_init(FIO_SEM_UNLOCKED); du->users = 0; - fio_mutex_down(disk_util_mutex); + fio_sem_down(disk_util_sem); flist_for_each(entry, &disk_list) { __du = flist_entry(entry, struct disk_util, list); @@ -313,7 
+332,7 @@ if (!strcmp((char *) du->dus.name, (char *) __du->dus.name)) { disk_util_free(du); - fio_mutex_up(disk_util_mutex); + fio_sem_up(disk_util_sem); return __du; } } @@ -324,7 +343,7 @@ get_io_ticks(du, &du->last_dus); flist_add_tail(&du->list, &disk_list); - fio_mutex_up(disk_util_mutex); + fio_sem_up(disk_util_sem); find_add_disk_slaves(td, path, du); return du; @@ -355,12 +374,12 @@ return 0; while ((dir = readdir(D)) != NULL) { - char full_path[256]; + char full_path[257]; if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) continue; - sprintf(full_path, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, dir->d_name); + sprintf(full_path, "%s/%s", path, dir->d_name); if (!strcmp(dir->d_name, "dev")) { if (!check_dev_match(majdev, mindev, full_path)) { @@ -416,13 +435,10 @@ log_err("unknown sysfs layout\n"); return NULL; } - strncpy(tmp, p, PATH_MAX - 1); + snprintf(tmp, ARRAY_SIZE(tmp), "%s", p); sprintf(path, "%s", tmp); } - if (td->o.ioscheduler && !td->sysfs_root) - td->sysfs_root = strdup(path); - return disk_util_add(td, majdev, mindev, path); } @@ -441,12 +457,8 @@ mindev); du = disk_util_exists(majdev, mindev); - if (du) { - if (td->o.ioscheduler && !td->sysfs_root) - td->sysfs_root = strdup(du->sysfs_root); - + if (du) return du; - } /* * for an fs without a device, we will repeatedly stat through @@ -479,39 +491,41 @@ unsigned int i; if (!td->o.do_disk_util || - (td->io_ops->flags & (FIO_DISKLESSIO | FIO_NODISKUTIL))) + td_ioengine_flagged(td, FIO_DISKLESSIO | FIO_NODISKUTIL)) return; for_each_file(td, f, i) f->du = __init_disk_util(td, f); } -static void show_agg_stats(struct disk_util_agg *agg, int terse) +static void show_agg_stats(struct disk_util_agg *agg, int terse, + struct buf_output *out) { if (!agg->slavecount) return; if (!terse) { - log_info(", aggrios=%u/%u, aggrmerge=%u/%u, aggrticks=%u/%u," - " aggrin_queue=%u, aggrutil=%3.2f%%", - agg->ios[0] / agg->slavecount, - agg->ios[1] / agg->slavecount, - agg->merges[0] / agg->slavecount, - 
agg->merges[1] / agg->slavecount, - agg->ticks[0] / agg->slavecount, - agg->ticks[1] / agg->slavecount, - agg->time_in_queue / agg->slavecount, - agg->max_util.u.f); + log_buf(out, ", aggrios=%llu/%llu, aggrmerge=%llu/%llu, " + "aggrticks=%llu/%llu, aggrin_queue=%llu, " + "aggrutil=%3.2f%%", + (unsigned long long) agg->ios[0] / agg->slavecount, + (unsigned long long) agg->ios[1] / agg->slavecount, + (unsigned long long) agg->merges[0] / agg->slavecount, + (unsigned long long) agg->merges[1] / agg->slavecount, + (unsigned long long) agg->ticks[0] / agg->slavecount, + (unsigned long long) agg->ticks[1] / agg->slavecount, + (unsigned long long) agg->time_in_queue / agg->slavecount, + agg->max_util.u.f); } else { - log_info(";slaves;%u;%u;%u;%u;%u;%u;%u;%3.2f%%", - agg->ios[0] / agg->slavecount, - agg->ios[1] / agg->slavecount, - agg->merges[0] / agg->slavecount, - agg->merges[1] / agg->slavecount, - agg->ticks[0] / agg->slavecount, - agg->ticks[1] / agg->slavecount, - agg->time_in_queue / agg->slavecount, - agg->max_util.u.f); + log_buf(out, ";slaves;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%", + (unsigned long long) agg->ios[0] / agg->slavecount, + (unsigned long long) agg->ios[1] / agg->slavecount, + (unsigned long long) agg->merges[0] / agg->slavecount, + (unsigned long long) agg->merges[1] / agg->slavecount, + (unsigned long long) agg->ticks[0] / agg->slavecount, + (unsigned long long) agg->ticks[1] / agg->slavecount, + (unsigned long long) agg->time_in_queue / agg->slavecount, + agg->max_util.u.f); } } @@ -526,18 +540,18 @@ flist_for_each(entry, &masterdu->slaves) { slavedu = flist_entry(entry, struct disk_util, slavelist); dus = &slavedu->dus; - agg->ios[0] += dus->ios[0]; - agg->ios[1] += dus->ios[1]; - agg->merges[0] += dus->merges[0]; - agg->merges[1] += dus->merges[1]; - agg->sectors[0] += dus->sectors[0]; - agg->sectors[1] += dus->sectors[1]; - agg->ticks[0] += dus->ticks[0]; - agg->ticks[1] += dus->ticks[1]; - agg->time_in_queue += dus->time_in_queue; + 
agg->ios[0] += dus->s.ios[0]; + agg->ios[1] += dus->s.ios[1]; + agg->merges[0] += dus->s.merges[0]; + agg->merges[1] += dus->s.merges[1]; + agg->sectors[0] += dus->s.sectors[0]; + agg->sectors[1] += dus->s.sectors[1]; + agg->ticks[0] += dus->s.ticks[0]; + agg->ticks[1] += dus->s.ticks[1]; + agg->time_in_queue += dus->s.time_in_queue; agg->slavecount++; - util = (double) (100 * dus->io_ticks / (double) slavedu->dus.msec); + util = (double) (100 * dus->s.io_ticks / (double) slavedu->dus.s.msec); /* System utilization is the utilization of the * component with the highest utilization. */ @@ -552,83 +566,91 @@ void disk_util_prune_entries(void) { - fio_mutex_down(disk_util_mutex); + fio_sem_down(disk_util_sem); while (!flist_empty(&disk_list)) { struct disk_util *du; - du = flist_entry(disk_list.next, struct disk_util, list); + du = flist_first_entry(&disk_list, struct disk_util, list); flist_del(&du->list); disk_util_free(du); } last_majdev = last_mindev = -1; - fio_mutex_up(disk_util_mutex); - fio_mutex_remove(disk_util_mutex); + fio_sem_up(disk_util_sem); + fio_sem_remove(disk_util_sem); } void print_disk_util(struct disk_util_stat *dus, struct disk_util_agg *agg, - int terse) + int terse, struct buf_output *out) { double util = 0; - if (dus->msec) - util = (double) 100 * dus->io_ticks / (double) dus->msec; + if (dus->s.msec) + util = (double) 100 * dus->s.io_ticks / (double) dus->s.msec; if (util > 100.0) util = 100.0; if (!terse) { if (agg->slavecount) - log_info(" "); + log_buf(out, " "); - log_info(" %s: ios=%u/%u, merge=%u/%u, ticks=%u/%u, " - "in_queue=%u, util=%3.2f%%", dus->name, - dus->ios[0], dus->ios[1], - dus->merges[0], dus->merges[1], - dus->ticks[0], dus->ticks[1], - dus->time_in_queue, util); + log_buf(out, " %s: ios=%llu/%llu, merge=%llu/%llu, " + "ticks=%llu/%llu, in_queue=%llu, util=%3.2f%%", + dus->name, + (unsigned long long) dus->s.ios[0], + (unsigned long long) dus->s.ios[1], + (unsigned long long) dus->s.merges[0], + (unsigned long long) 
dus->s.merges[1], + (unsigned long long) dus->s.ticks[0], + (unsigned long long) dus->s.ticks[1], + (unsigned long long) dus->s.time_in_queue, + util); } else { - log_info(";%s;%u;%u;%u;%u;%u;%u;%u;%3.2f%%", - dus->name, dus->ios[0], dus->ios[1], - dus->merges[0], dus->merges[1], - dus->ticks[0], dus->ticks[1], - dus->time_in_queue, util); + log_buf(out, ";%s;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%", + dus->name, + (unsigned long long) dus->s.ios[0], + (unsigned long long) dus->s.ios[1], + (unsigned long long) dus->s.merges[0], + (unsigned long long) dus->s.merges[1], + (unsigned long long) dus->s.ticks[0], + (unsigned long long) dus->s.ticks[1], + (unsigned long long) dus->s.time_in_queue, + util); } /* * If the device has slaves, aggregate the stats for * those slave devices also. */ - show_agg_stats(agg, terse); + show_agg_stats(agg, terse, out); if (!terse) - log_info("\n"); + log_buf(out, "\n"); } -static void print_disk_util_json(struct disk_util *du, struct json_array *array) +void json_array_add_disk_util(struct disk_util_stat *dus, + struct disk_util_agg *agg, struct json_array *array) { - double util = 0; - struct disk_util_stat *dus = &du->dus; - struct disk_util_agg *agg = &du->agg; struct json_object *obj; + double util = 0; - obj = json_create_object(); - json_array_add_value_object(array, obj); - - if (dus->msec) - util = (double) 100 * dus->io_ticks / (double) dus->msec; + if (dus->s.msec) + util = (double) 100 * dus->s.io_ticks / (double) dus->s.msec; if (util > 100.0) util = 100.0; + obj = json_create_object(); + json_array_add_value_object(array, obj); json_object_add_value_string(obj, "name", dus->name); - json_object_add_value_int(obj, "read_ios", dus->ios[0]); - json_object_add_value_int(obj, "write_ios", dus->ios[1]); - json_object_add_value_int(obj, "read_merges", dus->merges[0]); - json_object_add_value_int(obj, "write_merges", dus->merges[1]); - json_object_add_value_int(obj, "read_ticks", dus->ticks[0]); - 
json_object_add_value_int(obj, "write_ticks", dus->ticks[1]); - json_object_add_value_int(obj, "in_queue", dus->time_in_queue); + json_object_add_value_int(obj, "read_ios", dus->s.ios[0]); + json_object_add_value_int(obj, "write_ios", dus->s.ios[1]); + json_object_add_value_int(obj, "read_merges", dus->s.merges[0]); + json_object_add_value_int(obj, "write_merges", dus->s.merges[1]); + json_object_add_value_int(obj, "read_ticks", dus->s.ticks[0]); + json_object_add_value_int(obj, "write_ticks", dus->s.ticks[1]); + json_object_add_value_int(obj, "in_queue", dus->s.time_in_queue); json_object_add_value_float(obj, "util", util); /* @@ -654,41 +676,63 @@ json_object_add_value_float(obj, "aggr_util", agg->max_util.u.f); } -void show_disk_util(int terse, struct json_object *parent) +static void json_object_add_disk_utils(struct json_object *obj, + struct flist_head *head) { + struct json_array *array = json_create_array(); struct flist_head *entry; struct disk_util *du; - struct json_array *array = NULL; - fio_mutex_down(disk_util_mutex); + json_object_add_value_array(obj, "disk_util", array); - if (flist_empty(&disk_list)) { - fio_mutex_up(disk_util_mutex); - return; + flist_for_each(entry, head) { + du = flist_entry(entry, struct disk_util, list); + + aggregate_slaves_stats(du); + json_array_add_disk_util(&du->dus, &du->agg, array); } +} - if (!terse) - log_info("\nDisk stats (read/write):\n"); +void show_disk_util(int terse, struct json_object *parent, + struct buf_output *out) +{ + struct flist_head *entry; + struct disk_util *du; + bool do_json; + + if (!is_running_backend()) + return; + + fio_sem_down(disk_util_sem); - if (output_format == FIO_OUTPUT_JSON) { - array = json_create_array(); - json_object_add_value_array(parent, "disk_util", array); + if (flist_empty(&disk_list)) { + fio_sem_up(disk_util_sem); + return; } - flist_for_each(entry, &disk_list) { - du = flist_entry(entry, struct disk_util, list); + if ((output_format & FIO_OUTPUT_JSON) && parent) + do_json 
= true; + else + do_json = false; + + if (!terse && !do_json) + log_buf(out, "\nDisk stats (read/write):\n"); + + if (do_json) + json_object_add_disk_utils(parent, &disk_list); + else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) { + flist_for_each(entry, &disk_list) { + du = flist_entry(entry, struct disk_util, list); - aggregate_slaves_stats(du); - if (output_format == FIO_OUTPUT_JSON) - print_disk_util_json(du, array); - else - print_disk_util(&du->dus, &du->agg, terse); + aggregate_slaves_stats(du); + print_disk_util(&du->dus, &du->agg, terse, out); + } } - fio_mutex_up(disk_util_mutex); + fio_sem_up(disk_util_sem); } void setup_disk_util(void) { - disk_util_mutex = fio_mutex_init(FIO_MUTEX_UNLOCKED); + disk_util_sem = fio_sem_init(FIO_SEM_UNLOCKED); } diff -Nru fio-2.1.3/diskutil.h fio-3.16/diskutil.h --- fio-2.1.3/diskutil.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/diskutil.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,30 +3,36 @@ #include "json.h" #define FIO_DU_NAME_SZ 64 -extern volatile int disk_util_exit; +#include "helper_thread.h" +#include "fio_sem.h" + +struct disk_util_stats { + uint64_t ios[2]; + uint64_t merges[2]; + uint64_t sectors[2]; + uint64_t ticks[2]; + uint64_t io_ticks; + uint64_t time_in_queue; + uint64_t msec; +}; /* * Disk utils as read in /sys/block//stat */ struct disk_util_stat { uint8_t name[FIO_DU_NAME_SZ]; - uint32_t ios[2]; - uint32_t merges[2]; - uint64_t sectors[2]; - uint32_t ticks[2]; - uint32_t io_ticks; - uint32_t time_in_queue; - uint64_t msec; + struct disk_util_stats s; }; struct disk_util_agg { - uint32_t ios[2]; - uint32_t merges[2]; + uint64_t ios[2]; + uint64_t merges[2]; uint64_t sectors[2]; - uint32_t ticks[2]; - uint32_t io_ticks; - uint32_t time_in_queue; + uint64_t ticks[2]; + uint64_t io_ticks; + uint64_t time_in_queue; uint32_t slavecount; + uint32_t pad; fio_fp64_t max_util; }; @@ -40,7 +46,6 @@ */ struct flist_head slavelist; - char *name; char *sysfs_root; char path[PATH_MAX]; int 
major, minor; @@ -59,9 +64,9 @@ */ struct flist_head slaves; - struct timeval time; + struct timespec time; - struct fio_mutex *lock; + struct fio_sem *lock; unsigned long users; }; @@ -70,7 +75,7 @@ if (du) { struct flist_head *n; - fio_mutex_down(du->lock); + fio_sem_down(du->lock); du->users += val; flist_for_each(n, &du->slavelist) { @@ -79,7 +84,7 @@ slave = flist_entry(n, struct disk_util, slavelist); slave->users += val; } - fio_mutex_up(du->lock); + fio_sem_up(du->lock); } } static inline void disk_util_inc(struct disk_util *du) @@ -96,36 +101,35 @@ extern struct flist_head disk_list; -extern void wait_for_disk_thread_exit(void); - /* * disk util stuff */ #ifdef FIO_HAVE_DISK_UTIL -extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse); -extern void show_disk_util(int terse, struct json_object *parent); +extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse, struct buf_output *); +extern void show_disk_util(int terse, struct json_object *parent, struct buf_output *); +extern void json_array_add_disk_util(struct disk_util_stat *dus, + struct disk_util_agg *agg, struct json_array *parent); extern void init_disk_util(struct thread_data *); extern int update_io_ticks(void); extern void setup_disk_util(void); extern void disk_util_prune_entries(void); #else +/* keep this as a function to avoid a warning in handle_du() */ static inline void print_disk_util(struct disk_util_stat *du, - struct disk_util_agg *agg, int terse) + struct disk_util_agg *agg, int terse, + struct buf_output *out) { } -#define show_disk_util(terse, parent) +#define show_disk_util(terse, parent, out) #define disk_util_prune_entries() #define init_disk_util(td) #define setup_disk_util() +#define json_array_add_disk_util(dus, agg, parent) + static inline int update_io_ticks(void) { - return disk_util_exit; + return helper_should_exit(); } #endif -static inline void disk_util_start_exit(void) -{ - disk_util_exit = 1; -} - #endif 
diff -Nru fio-2.1.3/doc/conf.py fio-3.16/doc/conf.py --- fio-2.1.3/doc/conf.py 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/doc/conf.py 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,362 @@ +# -*- coding: utf-8 -*- +# +# fio documentation build configuration file, created by +# sphinx-quickstart on Mon Nov 14 13:56:30 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +from __future__ import absolute_import +from __future__ import print_function + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +# +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = 'fio' +copyright = '2017, Jens Axboe ' +author = 'Jens Axboe ' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# + +# The short X.Y version. +# version = '1' +# The full version, including alpha/beta/rc tags. +# release = '1' + +def fio_version(): + + from os.path import exists, dirname, join + wsroot = dirname(dirname(__file__)) + version_file = join(wsroot, "FIO-VERSION-FILE") + if not exists(version_file): + version_gen = join(wsroot, "FIO-VERSION-GEN") + from subprocess import call + rc = call(version_gen, shell=True, cwd=wsroot) + if rc: + print("Couldn't generate version file. rc=%r" % rc) + return "Unknown", "Unknown" + + vsl = open(version_file).read().strip().split('-') + version = vsl[1] + release = '-'.join(vsl[1:]) + return version, release + +version, release = fio_version() + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# +# today = '' +# +# Else, today_fmt is used as the format for a strftime call. +# +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['output', 'Thumbs.db', '.DS_Store', 'fio_examples.rst'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. 
+# +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. +# " v documentation" by default. +# +# html_title = 'fio v1' + +# A shorter title for the navigation bar. Default is the same as html_title. +# +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# +# html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# +# html_extra_path = [] + +# If not None, a 'Last updated on:' timestamp is inserted at every page +# bottom, using the given strftime format. +# The empty string is equivalent to '%b %d, %Y'. +# +# html_last_updated_fmt = None + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# +# html_additional_pages = {} + +# If false, no module index is generated. +# +# html_domain_indices = True + +# If false, no index is generated. +# +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. 
+# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' +# +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# 'ja' uses this config value. +# 'zh' user can custom change `jieba` dictionary path. +# +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# +# html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'fiodoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'fio.tex', 'fio Documentation', + 'a', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# +# latex_use_parts = False + +# If true, show page references after internal links. +# +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# +# latex_appendices = [] + +# It false, will not define \strong, \code, itleref, \crossref ... but only +# \sphinxstrong, ..., \sphinxtitleref, ... 
To help avoid clash with user added +# packages. +# +# latex_keep_old_macro_names = True + +# If false, no module index is generated. +# +# latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('fio_man', 'fio', 'flexible I/O tester', + [author], 1) +] + +# If true, show URL addresses after external links. +# +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'fio', 'fio Documentation', + author, 'fio', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# +# texinfo_appendices = [] + +# If false, no module index is generated. +# +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# +# texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +# +# texinfo_no_detailmenu = False diff -Nru fio-2.1.3/doc/fio_doc.rst fio-3.16/doc/fio_doc.rst --- fio-2.1.3/doc/fio_doc.rst 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/doc/fio_doc.rst 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,51 @@ +fio - Flexible I/O tester rev. |version| +======================================== + + +.. include:: ../README + + +.. include:: ../HOWTO + + + +Examples +======== + +.. include:: fio_examples.rst + + + +TODO +==== + + +GFIO TODO +--------- + +.. include:: ../GFIO-TODO + + +Server TODO +----------- + +.. include:: ../SERVER-TODO + + +Steady State TODO +----------------- + +.. include:: ../STEADYSTATE-TODO + + + +Moral License +============= + +.. include:: ../MORAL-LICENSE + + +License +======= + +.. 
literalinclude:: ../COPYING diff -Nru fio-2.1.3/doc/fio_examples.rst fio-3.16/doc/fio_examples.rst --- fio-2.1.3/doc/fio_examples.rst 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/doc/fio_examples.rst 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,72 @@ +Some job file examples. + + +Poisson request flow +-------------------- + +.. only:: builder_html + +:download:`Download poisson-rate-submission.fio <../examples/poisson-rate-submission.fio>` + +.. literalinclude:: ../examples/poisson-rate-submission.fio + :language: ini + +Latency profile +--------------- + +.. only:: builder_html + +:download:`Download latency-profile.fio <../examples/latency-profile.fio>` + +.. literalinclude:: ../examples/latency-profile.fio + :language: ini + +Read 4 files with aio at different depths +----------------------------------------- + +.. only:: builder_html + +:download:`Download aio-read.fio <../examples/aio-read.fio>` + +.. literalinclude:: ../examples/aio-read.fio + :language: ini + +Read backwards in a file +------------------------ + +.. only:: builder_html + +:download:`Download backwards-read.fio <../examples/backwards-read.fio>` + +.. literalinclude:: ../examples/backwards-read.fio + :language: ini + +Basic verification +------------------ + +.. only:: builder_html + +:download:`Download basic-verify.fio <../examples/basic-verify.fio>` + +.. literalinclude:: ../examples/basic-verify.fio + :language: ini + +Fixed rate submission +--------------------- + +.. only:: builder_html + +:download:`Download fixed-rate-submission.fio <../examples/fixed-rate-submission.fio>` + +.. literalinclude:: ../examples/fixed-rate-submission.fio + :language: ini + +Butterfly seek pattern +----------------------- + +.. only:: builder_html + +:download:`Download butterfly.fio <../examples/butterfly.fio>` + +.. 
literalinclude:: ../examples/butterfly.fio + :language: ini Binary files /tmp/tmpaiUoyN/55oBVA8RWn/fio-2.1.3/doc/fio-histo-log-pctiles.pdf and /tmp/tmpaiUoyN/623n9XWbO1/fio-3.16/doc/fio-histo-log-pctiles.pdf differ diff -Nru fio-2.1.3/doc/fio_man.rst fio-3.16/doc/fio_man.rst --- fio-2.1.3/doc/fio_man.rst 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/doc/fio_man.rst 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,12 @@ +:orphan: + +Fio Manpage +=========== + +(rev. |release|) + + +.. include:: ../README + + +.. include:: ../HOWTO diff -Nru fio-2.1.3/doc/index.rst fio-3.16/doc/index.rst --- fio-2.1.3/doc/index.rst 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/doc/index.rst 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,25 @@ +.. FIO documentation master file, created by + sphinx-quickstart on Thu Mar 20 16:24:25 2015. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to FIO's documentation! +=============================== + +**Version:** |release| + +Contents: + +.. toctree:: + :maxdepth: 3 + :numbered: + + fio - Flexible I/O tester |version| + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` + diff -Nru fio-2.1.3/doc/make.bat fio-3.16/doc/make.bat --- fio-2.1.3/doc/make.bat 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/doc/make.bat 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,281 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. 
dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. epub3 to make an epub3 + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + echo. dummy to check syntax errors of document sources + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\fio.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\fio.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. 
+ goto end +) + +if "%1" == "epub3" ( + %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub3 file is in %BUILDDIR%/epub3. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. 
+ echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +if "%1" == "dummy" ( + %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. Dummy builder generates no files. + goto end +) + +:end diff -Nru fio-2.1.3/doc/Makefile fio-3.16/doc/Makefile --- fio-2.1.3/doc/Makefile 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/doc/Makefile 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,225 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = output + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " epub3 to make an epub3" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + @echo " dummy to check syntax errors of document sources" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
+ +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fio.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fio.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/fio" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/fio" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
+ +.PHONY: epub3 +epub3: + $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 + @echo + @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. 
The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +.PHONY: dummy +dummy: + $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy + @echo + @echo "Build finished. Dummy builder generates no files." diff -Nru fio-2.1.3/engines/binject.c fio-3.16/engines/binject.c --- fio-2.1.3/engines/binject.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/binject.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,451 +0,0 @@ -/* - * binject engine - * - * IO engine that uses the Linux binject interface to directly inject - * bio's to block devices. 
- * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../fio.h" - -#ifdef FIO_HAVE_BINJECT - -struct binject_data { - struct b_user_cmd *cmds; - struct io_u **events; - struct pollfd *pfds; - int *fd_flags; -}; - -struct binject_file { - unsigned int bs; - int minor; - int fd; -}; - -static void binject_buc_init(struct binject_data *bd, struct io_u *io_u) -{ - struct b_user_cmd *buc = &io_u->buc; - - memset(buc, 0, sizeof(*buc)); - binject_buc_set_magic(buc); - - buc->buf = (unsigned long) io_u->xfer_buf; - buc->len = io_u->xfer_buflen; - buc->offset = io_u->offset; - buc->usr_ptr = (unsigned long) io_u; - - buc->flags = B_FLAG_NOIDLE | B_FLAG_UNPLUG; - assert(buc->buf); -} - -static int pollin_events(struct pollfd *pfds, int fds) -{ - int i; - - for (i = 0; i < fds; i++) - if (pfds[i].revents & POLLIN) - return 1; - - return 0; -} - -static unsigned int binject_read_commands(struct thread_data *td, void *p, - int left, int *err) -{ - struct binject_file *bf; - struct fio_file *f; - int i, ret, events; - -one_more: - events = 0; - for_each_file(td, f, i) { - bf = (struct binject_file *) (uintptr_t) f->engine_data; - ret = read(bf->fd, p, left * sizeof(struct b_user_cmd)); - if (ret < 0) { - if (errno == EAGAIN) - continue; - *err = -errno; - td_verror(td, errno, "read"); - break; - } else if (ret) { - p += ret; - events += ret / sizeof(struct b_user_cmd); - } - } - - if (*err || events) - return events; - - usleep(1000); - goto one_more; -} - -static int fio_binject_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec fio_unused *t) -{ - struct binject_data *bd = td->io_ops->data; - int left = max, ret, r = 0, ev_index = 0; - void *buf = bd->cmds; - unsigned int i, events; - struct fio_file *f; - struct binject_file *bf; - - /* - * Fill in the file descriptors - */ - for_each_file(td, f, i) { - bf = (struct binject_file *) (uintptr_t) f->engine_data; - - /* - * don't block 
for min events == 0 - */ - if (!min) { - bd->fd_flags[i] = fcntl(bf->fd, F_GETFL); - fcntl(bf->fd, F_SETFL, bd->fd_flags[i] | O_NONBLOCK); - } - bd->pfds[i].fd = bf->fd; - bd->pfds[i].events = POLLIN; - } - - while (left) { - while (!min) { - ret = poll(bd->pfds, td->o.nr_files, -1); - if (ret < 0) { - if (!r) - r = -errno; - td_verror(td, errno, "poll"); - break; - } else if (!ret) - continue; - - if (pollin_events(bd->pfds, td->o.nr_files)) - break; - } - - if (r < 0) - break; - - events = binject_read_commands(td, buf, left, &r); - - if (r < 0) - break; - - left -= events; - r += events; - - for (i = 0; i < events; i++) { - struct b_user_cmd *buc = (struct b_user_cmd *) buf + i; - - bd->events[ev_index] = (struct io_u *) (unsigned long) buc->usr_ptr; - ev_index++; - } - } - - if (!min) { - for_each_file(td, f, i) { - bf = (struct binject_file *) (uintptr_t) f->engine_data; - fcntl(bf->fd, F_SETFL, bd->fd_flags[i]); - } - } - - if (r > 0) - assert(ev_index == r); - - return r; -} - -static int fio_binject_doio(struct thread_data *td, struct io_u *io_u) -{ - struct b_user_cmd *buc = &io_u->buc; - struct binject_file *bf = (struct binject_file *) (uintptr_t) io_u->file->engine_data; - int ret; - - ret = write(bf->fd, buc, sizeof(*buc)); - if (ret < 0) - return ret; - - return FIO_Q_QUEUED; -} - -static int fio_binject_prep(struct thread_data *td, struct io_u *io_u) -{ - struct binject_data *bd = td->io_ops->data; - struct b_user_cmd *buc = &io_u->buc; - struct binject_file *bf = (struct binject_file *) (uintptr_t) io_u->file->engine_data; - - if (io_u->xfer_buflen & (bf->bs - 1)) { - log_err("read/write not sector aligned\n"); - return EINVAL; - } - - if (io_u->ddir == DDIR_READ) { - binject_buc_init(bd, io_u); - buc->type = B_TYPE_READ; - } else if (io_u->ddir == DDIR_WRITE) { - binject_buc_init(bd, io_u); - if (io_u->flags & IO_U_F_BARRIER) - buc->type = B_TYPE_WRITEBARRIER; - else - buc->type = B_TYPE_WRITE; - } else if (io_u->ddir == DDIR_TRIM) { - 
binject_buc_init(bd, io_u); - buc->type = B_TYPE_DISCARD; - } else { - assert(0); - } - - return 0; -} - -static int fio_binject_queue(struct thread_data *td, struct io_u *io_u) -{ - int ret; - - fio_ro_check(td, io_u); - - ret = fio_binject_doio(td, io_u); - - if (ret < 0) - io_u->error = errno; - - if (io_u->error) { - td_verror(td, io_u->error, "xfer"); - return FIO_Q_COMPLETED; - } - - return ret; -} - -static struct io_u *fio_binject_event(struct thread_data *td, int event) -{ - struct binject_data *bd = td->io_ops->data; - - return bd->events[event]; -} - -static int binject_open_ctl(struct thread_data *td) -{ - int fd; - - fd = open("/dev/binject-ctl", O_RDWR); - if (fd < 0) - td_verror(td, errno, "open binject-ctl"); - - return fd; -} - -static void binject_unmap_dev(struct thread_data *td, struct binject_file *bf) -{ - struct b_ioctl_cmd bic; - int fdb; - - if (bf->fd >= 0) { - close(bf->fd); - bf->fd = -1; - } - - fdb = binject_open_ctl(td); - if (fdb < 0) - return; - - bic.minor = bf->minor; - - if (ioctl(fdb, B_IOCTL_DEL, &bic) < 0) - td_verror(td, errno, "binject dev unmap"); - - close(fdb); -} - -static int binject_map_dev(struct thread_data *td, struct binject_file *bf, - int fd) -{ - struct b_ioctl_cmd bic; - char name[80]; - struct stat sb; - int fdb, dev_there, loops; - - fdb = binject_open_ctl(td); - if (fdb < 0) - return 1; - - bic.fd = fd; - - if (ioctl(fdb, B_IOCTL_ADD, &bic) < 0) { - td_verror(td, errno, "binject dev map"); - close(fdb); - return 1; - } - - bf->minor = bic.minor; - - sprintf(name, "/dev/binject%u", bf->minor); - - /* - * Wait for udev to create the node... 
- */ - dev_there = loops = 0; - do { - if (!stat(name, &sb)) { - dev_there = 1; - break; - } - - usleep(10000); - } while (++loops < 100); - - close(fdb); - - if (!dev_there) { - log_err("fio: timed out waiting for binject dev\n"); - goto err_unmap; - } - - bf->fd = open(name, O_RDWR); - if (bf->fd < 0) { - td_verror(td, errno, "binject dev open"); -err_unmap: - binject_unmap_dev(td, bf); - return 1; - } - - return 0; -} - -static int fio_binject_close_file(struct thread_data *td, struct fio_file *f) -{ - struct binject_file *bf = (struct binject_file *) (uintptr_t) f->engine_data; - - if (bf) { - binject_unmap_dev(td, bf); - free(bf); - f->engine_data = 0; - return generic_close_file(td, f); - } - - return 0; -} - -static int fio_binject_open_file(struct thread_data *td, struct fio_file *f) -{ - struct binject_file *bf; - unsigned int bs; - int ret; - - ret = generic_open_file(td, f); - if (ret) - return 1; - - if (f->filetype != FIO_TYPE_BD) { - log_err("fio: binject only works with block devices\n"); - goto err_close; - } - if (ioctl(f->fd, BLKSSZGET, &bs) < 0) { - td_verror(td, errno, "BLKSSZGET"); - goto err_close; - } - - bf = malloc(sizeof(*bf)); - bf->bs = bs; - bf->minor = bf->fd = -1; - f->engine_data = (uintptr_t) bf; - - if (binject_map_dev(td, bf, f->fd)) { -err_close: - ret = generic_close_file(td, f); - return 1; - } - - return 0; -} - -static void fio_binject_cleanup(struct thread_data *td) -{ - struct binject_data *bd = td->io_ops->data; - - if (bd) { - free(bd->events); - free(bd->cmds); - free(bd->fd_flags); - free(bd->pfds); - free(bd); - } -} - -static int fio_binject_init(struct thread_data *td) -{ - struct binject_data *bd; - - bd = malloc(sizeof(*bd)); - memset(bd, 0, sizeof(*bd)); - - bd->cmds = malloc(td->o.iodepth * sizeof(struct b_user_cmd)); - memset(bd->cmds, 0, td->o.iodepth * sizeof(struct b_user_cmd)); - - bd->events = malloc(td->o.iodepth * sizeof(struct io_u *)); - memset(bd->events, 0, td->o.iodepth * sizeof(struct io_u *)); - - 
bd->pfds = malloc(sizeof(struct pollfd) * td->o.nr_files); - memset(bd->pfds, 0, sizeof(struct pollfd) * td->o.nr_files); - - bd->fd_flags = malloc(sizeof(int) * td->o.nr_files); - memset(bd->fd_flags, 0, sizeof(int) * td->o.nr_files); - - td->io_ops->data = bd; - return 0; -} - -static struct ioengine_ops ioengine = { - .name = "binject", - .version = FIO_IOOPS_VERSION, - .init = fio_binject_init, - .prep = fio_binject_prep, - .queue = fio_binject_queue, - .getevents = fio_binject_getevents, - .event = fio_binject_event, - .cleanup = fio_binject_cleanup, - .open_file = fio_binject_open_file, - .close_file = fio_binject_close_file, - .get_file_size = generic_get_file_size, - .flags = FIO_RAWIO | FIO_BARRIER | FIO_MEMALIGN, -}; - -#else /* FIO_HAVE_BINJECT */ - -/* - * When we have a proper configure system in place, we simply wont build - * and install this io engine. For now install a crippled version that - * just complains and fails to load. - */ -static int fio_binject_init(struct thread_data fio_unused *td) -{ - log_err("fio: ioengine binject not available\n"); - return 1; -} - -static struct ioengine_ops ioengine = { - .name = "binject", - .version = FIO_IOOPS_VERSION, - .init = fio_binject_init, -}; - -#endif - -static void fio_init fio_binject_register(void) -{ - register_ioengine(&ioengine); -} - -static void fio_exit fio_binject_unregister(void) -{ - unregister_ioengine(&ioengine); -} diff -Nru fio-2.1.3/engines/cpu.c fio-3.16/engines/cpu.c --- fio-2.1.3/engines/cpu.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/cpu.c 2019-09-20 01:01:52.000000000 +0000 @@ -6,11 +6,13 @@ * */ #include "../fio.h" +#include "../optgroup.h" struct cpu_options { - struct thread_data *td; + void *pad; unsigned int cpuload; unsigned int cpucycle; + unsigned int exit_io_done; }; static struct fio_option options[] = { @@ -20,7 +22,7 @@ .type = FIO_OPT_INT, .off1 = offsetof(struct cpu_options, cpuload), .help = "Use this percentage of CPU", - .category = 
FIO_OPT_C_GENERAL, + .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_INVALID, }, { @@ -32,7 +34,17 @@ .def = "50000", .parent = "cpuload", .hide = 1, - .category = FIO_OPT_C_GENERAL, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "exit_on_io_done", + .lname = "Exit when IO threads are done", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct cpu_options, exit_io_done), + .help = "Exit when IO threads finish", + .def = "0", + .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_INVALID, }, { @@ -41,10 +53,16 @@ }; -static int fio_cpuio_queue(struct thread_data *td, struct io_u fio_unused *io_u) +static enum fio_q_status fio_cpuio_queue(struct thread_data *td, + struct io_u fio_unused *io_u) { struct cpu_options *co = td->eo; + if (co->exit_io_done && !fio_running_or_pending_io_threads()) { + td->done = 1; + return FIO_Q_BUSY; + } + usec_spin(co->cpucycle); return FIO_Q_COMPLETED; } @@ -67,12 +85,12 @@ */ o->thinktime_blocks = 1; o->thinktime_spin = 0; - o->thinktime = (co->cpucycle * (100 - co->cpuload)) / co->cpuload; + o->thinktime = ((unsigned long long) co->cpucycle * (100 - co->cpuload)) / co->cpuload; o->nr_files = o->open_files = 1; - log_info("%s: ioengine=cpu, cpuload=%u, cpucycle=%u\n", td->o.name, - co->cpuload, co->cpucycle); + log_info("%s: ioengine=%s, cpuload=%u, cpucycle=%u\n", + td->o.name, td->io_ops->name, co->cpuload, co->cpucycle); return 0; } diff -Nru fio-2.1.3/engines/dev-dax.c fio-3.16/engines/dev-dax.c --- fio-2.1.3/engines/dev-dax.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/dev-dax.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,351 @@ +/* + * device DAX engine + * + * IO engine that reads/writes from files by doing memcpy to/from + * a memory mapped region of DAX enabled device. 
+ * + * Copyright (C) 2016 Intel Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License, + * version 2 as published by the Free Software Foundation.. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +/* + * device dax engine + * IO engine that access a DAX device directly for read and write data + * + * To use: + * ioengine=dev-dax + * + * Other relevant settings: + * iodepth=1 + * direct=0 REQUIRED + * filename=/dev/daxN.N + * bs=2m + * + * direct should be left to 0. Using dev-dax implies that memory access + * is direct. However, dev-dax does not support O_DIRECT flag by design + * since it is not necessary. + * + * bs should adhere to the device dax alignment at minimally. + * + * libpmem.so + * By default, the dev-dax engine will let the system find the libpmem.so + * that it uses. You can use an alternative libpmem by setting the + * FIO_PMEM_LIB environment variable to the full path to the desired + * libpmem.so. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../verify.h" + +/* + * Limits us to 1GiB of mapped files in total to model after + * mmap engine behavior + */ +#define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL) + +struct fio_devdax_data { + void *devdax_ptr; + size_t devdax_sz; + off_t devdax_off; +}; + +static int fio_devdax_file(struct thread_data *td, struct fio_file *f, + size_t length, off_t off) +{ + struct fio_devdax_data *fdd = FILE_ENG_DATA(f); + int flags = 0; + + if (td_rw(td)) + flags = PROT_READ | PROT_WRITE; + else if (td_write(td)) { + flags = PROT_WRITE; + + if (td->o.verify != VERIFY_NONE) + flags |= PROT_READ; + } else + flags = PROT_READ; + + fdd->devdax_ptr = mmap(NULL, length, flags, MAP_SHARED, f->fd, off); + if (fdd->devdax_ptr == MAP_FAILED) { + fdd->devdax_ptr = NULL; + td_verror(td, errno, "mmap"); + } + + if (td->error && fdd->devdax_ptr) + munmap(fdd->devdax_ptr, length); + + return td->error; +} + +/* + * Just mmap an appropriate portion, we cannot mmap the full extent + */ +static int fio_devdax_prep_limited(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct fio_devdax_data *fdd = FILE_ENG_DATA(f); + + if (io_u->buflen > f->real_file_size) { + log_err("dev-dax: bs too big for dev-dax engine\n"); + return EIO; + } + + fdd->devdax_sz = min(MMAP_TOTAL_SZ, f->real_file_size); + if (fdd->devdax_sz > f->io_size) + fdd->devdax_sz = f->io_size; + + fdd->devdax_off = io_u->offset; + + return fio_devdax_file(td, f, fdd->devdax_sz, fdd->devdax_off); +} + +/* + * Attempt to mmap the entire file + */ +static int fio_devdax_prep_full(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct fio_devdax_data *fdd = FILE_ENG_DATA(f); + int ret; + + if (fio_file_partial_mmap(f)) + return EINVAL; + + if (io_u->offset != (size_t) io_u->offset || + f->io_size != (size_t) f->io_size) { + 
fio_file_set_partial_mmap(f); + return EINVAL; + } + + fdd->devdax_sz = f->io_size; + fdd->devdax_off = 0; + + ret = fio_devdax_file(td, f, fdd->devdax_sz, fdd->devdax_off); + if (ret) + fio_file_set_partial_mmap(f); + + return ret; +} + +static int fio_devdax_prep(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct fio_devdax_data *fdd = FILE_ENG_DATA(f); + int ret; + + /* + * It fits within existing mapping, use it + */ + if (io_u->offset >= fdd->devdax_off && + io_u->offset + io_u->buflen <= fdd->devdax_off + fdd->devdax_sz) + goto done; + + /* + * unmap any existing mapping + */ + if (fdd->devdax_ptr) { + if (munmap(fdd->devdax_ptr, fdd->devdax_sz) < 0) + return errno; + fdd->devdax_ptr = NULL; + } + + if (fio_devdax_prep_full(td, io_u)) { + td_clear_error(td); + ret = fio_devdax_prep_limited(td, io_u); + if (ret) + return ret; + } + +done: + io_u->mmap_data = fdd->devdax_ptr + io_u->offset - fdd->devdax_off - + f->file_offset; + return 0; +} + +static enum fio_q_status fio_devdax_queue(struct thread_data *td, + struct io_u *io_u) +{ + fio_ro_check(td, io_u); + io_u->error = 0; + + switch (io_u->ddir) { + case DDIR_READ: + memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen); + break; + case DDIR_WRITE: + pmem_memcpy_persist(io_u->mmap_data, io_u->xfer_buf, + io_u->xfer_buflen); + break; + case DDIR_SYNC: + case DDIR_DATASYNC: + case DDIR_SYNC_FILE_RANGE: + break; + default: + io_u->error = EINVAL; + break; + } + + return FIO_Q_COMPLETED; +} + +static int fio_devdax_init(struct thread_data *td) +{ + struct thread_options *o = &td->o; + + if ((o->rw_min_bs & page_mask) && + (o->fsync_blocks || o->fdatasync_blocks)) { + log_err("dev-dax: mmap options dictate a minimum block size of %llu bytes\n", + (unsigned long long) page_size); + return 1; + } + + return 0; +} + +static int fio_devdax_open_file(struct thread_data *td, struct fio_file *f) +{ + struct fio_devdax_data *fdd; + int ret; + + ret = generic_open_file(td, f); 
+ if (ret) + return ret; + + fdd = calloc(1, sizeof(*fdd)); + if (!fdd) { + int fio_unused __ret; + __ret = generic_close_file(td, f); + return 1; + } + + FILE_SET_ENG_DATA(f, fdd); + + return 0; +} + +static int fio_devdax_close_file(struct thread_data *td, struct fio_file *f) +{ + struct fio_devdax_data *fdd = FILE_ENG_DATA(f); + + FILE_SET_ENG_DATA(f, NULL); + free(fdd); + fio_file_clear_partial_mmap(f); + + return generic_close_file(td, f); +} + +static int +fio_devdax_get_file_size(struct thread_data *td, struct fio_file *f) +{ + char spath[PATH_MAX]; + char npath[PATH_MAX]; + char *rpath, *basename; + FILE *sfile; + uint64_t size; + struct stat st; + int rc; + + if (fio_file_size_known(f)) + return 0; + + if (f->filetype != FIO_TYPE_CHAR) + return -EINVAL; + + rc = stat(f->file_name, &st); + if (rc < 0) { + log_err("%s: failed to stat file %s (%s)\n", + td->o.name, f->file_name, strerror(errno)); + return -errno; + } + + snprintf(spath, PATH_MAX, "/sys/dev/char/%d:%d/subsystem", + major(st.st_rdev), minor(st.st_rdev)); + + rpath = realpath(spath, npath); + if (!rpath) { + log_err("%s: realpath on %s failed (%s)\n", + td->o.name, spath, strerror(errno)); + return -errno; + } + + /* check if DAX device */ + basename = strrchr(rpath, '/'); + if (!basename || strcmp("dax", basename+1)) { + log_err("%s: %s not a DAX device!\n", + td->o.name, f->file_name); + } + + snprintf(spath, PATH_MAX, "/sys/dev/char/%d:%d/size", + major(st.st_rdev), minor(st.st_rdev)); + + sfile = fopen(spath, "r"); + if (!sfile) { + log_err("%s: fopen on %s failed (%s)\n", + td->o.name, spath, strerror(errno)); + return 1; + } + + rc = fscanf(sfile, "%lu", &size); + if (rc < 0) { + log_err("%s: fscanf on %s failed (%s)\n", + td->o.name, spath, strerror(errno)); + fclose(sfile); + return 1; + } + + f->real_file_size = size; + + fclose(sfile); + + if (f->file_offset > f->real_file_size) { + log_err("%s: offset extends end (%llu > %llu)\n", td->o.name, + (unsigned long long) f->file_offset, + 
(unsigned long long) f->real_file_size); + return 1; + } + + fio_file_set_size_known(f); + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "dev-dax", + .version = FIO_IOOPS_VERSION, + .init = fio_devdax_init, + .prep = fio_devdax_prep, + .queue = fio_devdax_queue, + .open_file = fio_devdax_open_file, + .close_file = fio_devdax_close_file, + .get_file_size = fio_devdax_get_file_size, + .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL, +}; + +static void fio_init fio_devdax_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_devdax_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/e4defrag.c fio-3.16/engines/e4defrag.c --- fio-2.1.3/engines/e4defrag.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/e4defrag.c 2019-09-20 01:01:52.000000000 +0000 @@ -9,14 +9,11 @@ #include #include #include -#include -#include -#include #include -#include #include #include "../fio.h" +#include "../optgroup.h" #ifndef EXT4_IOC_MOVE_EXT #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) @@ -36,7 +33,7 @@ }; struct e4defrag_options { - struct thread_data *td; + void *pad; unsigned int inplace; char * donor_name; }; @@ -44,6 +41,7 @@ static struct fio_option options[] = { { .name = "donorname", + .lname = "Donor Name", .type = FIO_OPT_STR_STORE, .off1 = offsetof(struct e4defrag_options, donor_name), .help = "File used as a block donor", @@ -52,6 +50,7 @@ }, { .name = "inplace", + .lname = "In Place", .type = FIO_OPT_INT, .off1 = offsetof(struct e4defrag_options, inplace), .minval = 0, @@ -80,7 +79,7 @@ ed = malloc(sizeof(*ed)); if (!ed) { - td_verror(td, -ENOMEM, "io_queue_init"); + td_verror(td, ENOMEM, "io_queue_init"); return 1; } memset(ed, 0 ,sizeof(*ed)); @@ -91,15 +90,15 @@ ed->donor_fd = open(donor_name, O_CREAT|O_WRONLY, 0644); if (ed->donor_fd < 0) { - td_verror(td, ed->donor_fd, "io_queue_init"); - log_err("Can't open donor file %s err:%d", donor_name, 
ed->donor_fd); + td_verror(td, errno, "io_queue_init"); + log_err("Can't open donor file %s err:%d\n", donor_name, ed->donor_fd); free(ed); return 1; } if (!o->inplace) { - long long len = td->o.file_size_high - td->o.start_offset; - r = fallocate(ed->donor_fd, 0, td->o.start_offset, len); + long long __len = td->o.file_size_high - td->o.start_offset; + r = fallocate(ed->donor_fd, 0, td->o.start_offset, __len); if (r) goto err; } @@ -108,7 +107,7 @@ goto err; ed->bsz = stub.st_blksize; - td->io_ops->data = ed; + td->io_ops_data = ed; return 0; err: td_verror(td, errno, "io_queue_init"); @@ -119,7 +118,7 @@ static void fio_e4defrag_cleanup(struct thread_data *td) { - struct e4defrag_data *ed = td->io_ops->data; + struct e4defrag_data *ed = td->io_ops_data; if (ed) { if (ed->donor_fd >= 0) close(ed->donor_fd); @@ -128,14 +127,15 @@ } -static int fio_e4defrag_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_e4defrag_queue(struct thread_data *td, + struct io_u *io_u) { int ret; unsigned long long len; struct move_extent me; struct fio_file *f = io_u->file; - struct e4defrag_data *ed = td->io_ops->data; + struct e4defrag_data *ed = td->io_ops_data; struct e4defrag_options *o = td->eo; fio_ro_check(td, io_u); @@ -169,8 +169,13 @@ len = io_u->xfer_buflen; if (len != io_u->xfer_buflen) { - io_u->resid = io_u->xfer_buflen - len; - io_u->error = 0; + if (len) { + io_u->resid = io_u->xfer_buflen - len; + io_u->error = 0; + } else { + /* access beyond i_size */ + io_u->error = EINVAL; + } } if (ret) io_u->error = errno; diff -Nru fio-2.1.3/engines/falloc.c fio-3.16/engines/falloc.c --- fio-2.1.3/engines/falloc.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/falloc.c 2019-09-20 01:01:52.000000000 +0000 @@ -4,16 +4,12 @@ * IO engine that does regular fallocate to simulate data transfer * as fio ioengine. 
* DDIR_READ does fallocate(,mode = FALLOC_FL_KEEP_SIZE,) - * DDIR_WRITE does fallocate(,mode = 0) : fallocate with size extention + * DDIR_WRITE does fallocate(,mode = 0) : fallocate with size extension * DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE) * */ #include -#include -#include -#include #include -#include #include #include "../fio.h" @@ -23,7 +19,7 @@ * generic_open_file is not appropriate because does not allow to perform * TRIM in to file */ -int open_file(struct thread_data *td, struct fio_file *f) +static int open_file(struct thread_data *td, struct fio_file *f) { int from_hash = 0; @@ -43,9 +39,10 @@ if (f->fd == -1) { char buf[FIO_VERROR_SIZE]; - int __e = errno; + int e = errno; + snprintf(buf, sizeof(buf), "open(%s)", f->file_name); - td_verror(td, __e, buf); + td_verror(td, e, buf); } if (!from_hash && f->fd != -1) { @@ -68,8 +65,10 @@ #endif #ifndef FALLOC_FL_PUNCH_HOLE #define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ -#endif -static int fio_fallocate_queue(struct thread_data *td, struct io_u *io_u) +#endif + +static enum fio_q_status fio_fallocate_queue(struct thread_data *td, + struct io_u *io_u) { struct fio_file *f = io_u->file; int ret; diff -Nru fio-2.1.3/engines/filecreate.c fio-3.16/engines/filecreate.c --- fio-2.1.3/engines/filecreate.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/filecreate.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,118 @@ +/* + * filecreate engine + * + * IO engine that doesn't do any IO, just creates files and tracks the latency + * of the file creation. 
+ */ +#include +#include +#include + +#include "../fio.h" + +struct fc_data { + enum fio_ddir stat_ddir; +}; + +static int open_file(struct thread_data *td, struct fio_file *f) +{ + struct timespec start; + int do_lat = !td->o.disable_lat; + + dprint(FD_FILE, "fd open %s\n", f->file_name); + + if (f->filetype != FIO_TYPE_FILE) { + log_err("fio: only files are supported fallocate \n"); + return 1; + } + if (!strcmp(f->file_name, "-")) { + log_err("fio: can't read/write to stdin/out\n"); + return 1; + } + + if (do_lat) + fio_gettime(&start, NULL); + + f->fd = open(f->file_name, O_CREAT|O_RDWR, 0600); + + if (f->fd == -1) { + char buf[FIO_VERROR_SIZE]; + int e = errno; + + snprintf(buf, sizeof(buf), "open(%s)", f->file_name); + td_verror(td, e, buf); + return 1; + } + + if (do_lat) { + struct fc_data *data = td->io_ops_data; + uint64_t nsec; + + nsec = ntime_since_now(&start); + add_clat_sample(td, data->stat_ddir, nsec, 0, 0); + } + + return 0; +} + +static enum fio_q_status queue_io(struct thread_data *td, + struct io_u fio_unused *io_u) +{ + return FIO_Q_COMPLETED; +} + +/* + * Ensure that we at least have a block size worth of IO to do for each + * file. If the job file has td->o.size < nr_files * block_size, then + * fio won't do anything. 
+ */ +static int get_file_size(struct thread_data *td, struct fio_file *f) +{ + f->real_file_size = td_min_bs(td); + return 0; +} + +static int init(struct thread_data *td) +{ + struct fc_data *data; + + data = calloc(1, sizeof(*data)); + + if (td_read(td)) + data->stat_ddir = DDIR_READ; + else if (td_write(td)) + data->stat_ddir = DDIR_WRITE; + + td->io_ops_data = data; + return 0; +} + +static void cleanup(struct thread_data *td) +{ + struct fc_data *data = td->io_ops_data; + + free(data); +} + +static struct ioengine_ops ioengine = { + .name = "filecreate", + .version = FIO_IOOPS_VERSION, + .init = init, + .cleanup = cleanup, + .queue = queue_io, + .get_file_size = get_file_size, + .open_file = open_file, + .close_file = generic_close_file, + .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO | + FIO_NOSTATS | FIO_NOFILEHASH, +}; + +static void fio_init fio_filecreate_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_filecreate_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/ftruncate.c fio-3.16/engines/ftruncate.c --- fio-2.1.3/engines/ftruncate.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/ftruncate.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,52 @@ +/* + * ftruncate: ioengine for git://git.kernel.dk/fio.git + * + * IO engine that does regular truncates to simulate data transfer + * as fio ioengine. 
+ * DDIR_WRITE does ftruncate + * + */ +#include +#include + +#include "../fio.h" + +static enum fio_q_status fio_ftruncate_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + int ret; + + fio_ro_check(td, io_u); + + if (io_u->ddir != DDIR_WRITE) { + io_u->error = EINVAL; + return FIO_Q_COMPLETED; + } + + ret = ftruncate(f->fd, io_u->offset); + if (ret) + io_u->error = errno; + + return FIO_Q_COMPLETED; +} + +static struct ioengine_ops ioengine = { + .name = "ftruncate", + .version = FIO_IOOPS_VERSION, + .queue = fio_ftruncate_queue, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO | FIO_FAKEIO +}; + +static void fio_init fio_syncio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_syncio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/fusion-aw.c fio-3.16/engines/fusion-aw.c --- fio-2.1.3/engines/fusion-aw.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/fusion-aw.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,183 +0,0 @@ -/* - * Custom fio(1) engine that submits synchronous atomic writes to file. - * - * Copyright (C) 2013 Fusion-io, Inc. - * Author: Santhosh Kumar Koundinya (skoundinya@fusionio.com). - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; under version 2 of the License. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License version - * 2 for more details. 
- * - * You should have received a copy of the GNU General Public License Version 2 - * along with this program; if not see - */ - -#include -#include - -#include "../fio.h" - -#include - -#define NUM_ATOMIC_CAPABILITIES (5) - -struct fas_data { - nvm_handle_t nvm_handle; - size_t xfer_buf_align; - size_t xfer_buflen_align; - size_t xfer_buflen_max; - size_t sector_size; -}; - -static int queue(struct thread_data *td, struct io_u *io_u) -{ - int rc; - struct fas_data *d = (struct fas_data *) io_u->file->engine_data; - - if (io_u->ddir != DDIR_WRITE) { - td_vmsg(td, EINVAL, "only writes supported", "io_u->ddir"); - rc = -EINVAL; - goto out; - } - - if ((size_t) io_u->xfer_buf % d->xfer_buf_align) { - td_vmsg(td, EINVAL, "unaligned data buffer", "io_u->xfer_buf"); - rc = -EINVAL; - goto out; - } - - if (io_u->xfer_buflen % d->xfer_buflen_align) { - td_vmsg(td, EINVAL, "unaligned data size", "io_u->xfer_buflen"); - rc = -EINVAL; - goto out; - } - - if (io_u->xfer_buflen > d->xfer_buflen_max) { - td_vmsg(td, EINVAL, "data too big", "io_u->xfer_buflen"); - rc = -EINVAL; - goto out; - } - - rc = nvm_atomic_write(d->nvm_handle, (uint64_t) io_u->xfer_buf, - io_u->xfer_buflen, io_u->offset / d->sector_size); - if (rc == -1) { - td_verror(td, errno, "nvm_atomic_write"); - rc = -errno; - goto out; - } - rc = FIO_Q_COMPLETED; -out: - if (rc < 0) - io_u->error = -rc; - - return rc; -} - -static int open_file(struct thread_data *td, struct fio_file *f) -{ - int rc; - int fio_unused close_file_rc; - struct fas_data *d; - nvm_version_t nvm_version; - nvm_capability_t nvm_capability[NUM_ATOMIC_CAPABILITIES]; - - - d = malloc(sizeof(*d)); - if (!d) { - td_verror(td, ENOMEM, "malloc"); - rc = ENOMEM; - goto error; - } - d->nvm_handle = -1; - f->engine_data = (uintptr_t) d; - - rc = generic_open_file(td, f); - - if (rc) - goto free_engine_data; - - /* Set the version of the library as seen when engine is compiled */ - nvm_version.major = NVM_PRIMITIVES_API_MAJOR; - nvm_version.minor = 
NVM_PRIMITIVES_API_MINOR; - nvm_version.micro = NVM_PRIMITIVES_API_MICRO; - - d->nvm_handle = nvm_get_handle(f->fd, &nvm_version); - if (d->nvm_handle == -1) { - td_vmsg(td, errno, "nvm_get_handle failed", "nvm_get_handle"); - rc = errno; - goto close_file; - } - - nvm_capability[0].cap_id = NVM_CAP_ATOMIC_WRITE_START_ALIGN_ID; - nvm_capability[1].cap_id = NVM_CAP_ATOMIC_WRITE_MULTIPLICITY_ID; - nvm_capability[2].cap_id = NVM_CAP_ATOMIC_WRITE_MAX_VECTOR_SIZE_ID; - nvm_capability[3].cap_id = NVM_CAP_SECTOR_SIZE_ID; - nvm_capability[4].cap_id = NVM_CAP_ATOMIC_MAX_IOV_ID; - rc = nvm_get_capabilities(d->nvm_handle, nvm_capability, - NUM_ATOMIC_CAPABILITIES, false); - if (rc == -1) { - td_vmsg(td, errno, "error in getting atomic write capabilities", "nvm_get_capabilities"); - rc = errno; - goto close_file; - } else if (rc < NUM_ATOMIC_CAPABILITIES) { - td_vmsg(td, EINVAL, "couldn't get all the atomic write capabilities" , "nvm_get_capabilities"); - rc = ECANCELED; - goto close_file; - } - /* Reset rc to 0 because we got all capabilities we needed */ - rc = 0; - d->xfer_buf_align = nvm_capability[0].cap_value; - d->xfer_buflen_align = nvm_capability[1].cap_value; - d->xfer_buflen_max = d->xfer_buflen_align * nvm_capability[2].cap_value * nvm_capability[4].cap_value; - d->sector_size = nvm_capability[3].cap_value; - -out: - return rc; -close_file: - close_file_rc = generic_close_file(td, f); -free_engine_data: - free(d); -error: - f->fd = -1; - f->engine_data = 0; - goto out; -} - -static int close_file(struct thread_data *td, struct fio_file *f) -{ - struct fas_data *d = (struct fas_data *) f->engine_data; - - if (d) { - if (d->nvm_handle != -1) - nvm_release_handle(d->nvm_handle); - free(d); - f->engine_data = 0; - } - - return generic_close_file(td, f); -} - -static struct ioengine_ops ioengine = { - .name = "fusion-aw-sync", - .version = FIO_IOOPS_VERSION, - .queue = queue, - .open_file = open_file, - .close_file = close_file, - .get_file_size = generic_get_file_size, 
- .flags = FIO_SYNCIO | FIO_RAWIO | FIO_MEMALIGN -}; - -static void fio_init fio_fusion_aw_init(void) -{ - register_ioengine(&ioengine); -} - -static void fio_exit fio_fusion_aw_exit(void) -{ - unregister_ioengine(&ioengine); -} diff -Nru fio-2.1.3/engines/gfapi.h fio-3.16/engines/gfapi.h --- fio-2.1.3/engines/gfapi.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/gfapi.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,23 @@ +#include +#include "../fio.h" + +struct gf_options { + void *pad; + char *gf_vol; + char *gf_brick; + int gf_single_instance; +}; + +struct gf_data { + glfs_t *fs; + glfs_fd_t *fd; + struct io_u **aio_events; +}; + +extern struct fio_option gfapi_options[]; +extern int fio_gf_setup(struct thread_data *td); +extern void fio_gf_cleanup(struct thread_data *td); +extern int fio_gf_get_file_size(struct thread_data *td, struct fio_file *f); +extern int fio_gf_open_file(struct thread_data *td, struct fio_file *f); +extern int fio_gf_close_file(struct thread_data *td, struct fio_file *f); +extern int fio_gf_unlink_file(struct thread_data *td, struct fio_file *f); diff -Nru fio-2.1.3/engines/glusterfs_async.c fio-3.16/engines/glusterfs_async.c --- fio-2.1.3/engines/glusterfs_async.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/glusterfs_async.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,193 @@ +/* + * glusterfs engine + * + * IO engine using Glusterfs's gfapi async interface + * + */ +#include "gfapi.h" +#define NOT_YET 1 +struct fio_gf_iou { + struct io_u *io_u; + int io_complete; +}; + +static struct io_u *fio_gf_event(struct thread_data *td, int event) +{ + struct gf_data *gf_data = td->io_ops_data; + + dprint(FD_IO, "%s\n", __FUNCTION__); + return gf_data->aio_events[event]; +} + +static int fio_gf_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct gf_data *g = td->io_ops_data; + unsigned int events = 0; + struct io_u *io_u; + int i; + + dprint(FD_IO, "%s\n", 
__FUNCTION__); + do { + io_u_qiter(&td->io_u_all, io_u, i) { + struct fio_gf_iou *io; + + if (!(io_u->flags & IO_U_F_FLIGHT)) + continue; + + io = io_u->engine_data; + if (io->io_complete) { + io->io_complete = 0; + g->aio_events[events] = io_u; + events++; + + if (events >= max) + break; + } + + } + if (events < min) + usleep(100); + else + break; + + } while (1); + + return events; +} + +static void fio_gf_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct fio_gf_iou *io = io_u->engine_data; + + if (io) { + if (io->io_complete) + log_err("incomplete IO found.\n"); + io_u->engine_data = NULL; + free(io); + } +} + +static int fio_gf_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct fio_gf_iou *io; + dprint(FD_FILE, "%s\n", __FUNCTION__); + + io = malloc(sizeof(struct fio_gf_iou)); + if (!io) { + td_verror(td, errno, "malloc"); + return 1; + } + io->io_complete = 0; + io->io_u = io_u; + io_u->engine_data = io; + return 0; +} + +#if defined(CONFIG_GF_NEW_API) +static void gf_async_cb(glfs_fd_t * fd, ssize_t ret, struct glfs_stat *prestat, + struct glfs_stat *poststat, void *data) +#else +static void gf_async_cb(glfs_fd_t * fd, ssize_t ret, void *data) +#endif +{ + struct io_u *io_u = data; + struct fio_gf_iou *iou = io_u->engine_data; + + dprint(FD_IO, "%s ret %zd\n", __FUNCTION__, ret); + iou->io_complete = 1; +} + +static enum fio_q_status fio_gf_async_queue(struct thread_data fio_unused * td, + struct io_u *io_u) +{ + struct gf_data *g = td->io_ops_data; + int r; + + dprint(FD_IO, "%s op %s\n", __FUNCTION__, io_ddir_name(io_u->ddir)); + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_READ) + r = glfs_pread_async(g->fd, io_u->xfer_buf, io_u->xfer_buflen, + io_u->offset, 0, gf_async_cb, io_u); + else if (io_u->ddir == DDIR_WRITE) + r = glfs_pwrite_async(g->fd, io_u->xfer_buf, io_u->xfer_buflen, + io_u->offset, 0, gf_async_cb, io_u); +#if defined(CONFIG_GF_TRIM) + else if (io_u->ddir == DDIR_TRIM) + r = glfs_discard_async(g->fd, 
io_u->offset, io_u->xfer_buflen, + gf_async_cb, io_u); +#endif + else if (io_u->ddir == DDIR_DATASYNC) + r = glfs_fdatasync_async(g->fd, gf_async_cb, io_u); + else if (io_u->ddir == DDIR_SYNC) + r = glfs_fsync_async(g->fd, gf_async_cb, io_u); + else + r = EINVAL; + + if (r) { + log_err("glfs queue failed.\n"); + io_u->error = r; + goto failed; + } + return FIO_Q_QUEUED; + +failed: + io_u->error = r; + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; +} + +static int fio_gf_async_setup(struct thread_data *td) +{ + struct gf_data *g; + int r; + +#if defined(NOT_YET) + log_err("the async interface is still very experimental...\n"); +#endif + r = fio_gf_setup(td); + if (r) + return r; + + td->o.use_thread = 1; + g = td->io_ops_data; + g->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *)); + if (!g->aio_events) { + r = -ENOMEM; + fio_gf_cleanup(td); + return r; + } + + return r; +} + +static struct ioengine_ops ioengine = { + .name = "gfapi_async", + .version = FIO_IOOPS_VERSION, + .init = fio_gf_async_setup, + .cleanup = fio_gf_cleanup, + .queue = fio_gf_async_queue, + .open_file = fio_gf_open_file, + .close_file = fio_gf_close_file, + .unlink_file = fio_gf_unlink_file, + .get_file_size = fio_gf_get_file_size, + .getevents = fio_gf_getevents, + .event = fio_gf_event, + .io_u_init = fio_gf_io_u_init, + .io_u_free = fio_gf_io_u_free, + .options = gfapi_options, + .option_struct_size = sizeof(struct gf_options), + .flags = FIO_DISKLESSIO, +}; + +static void fio_init fio_gf_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_gf_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/glusterfs.c fio-3.16/engines/glusterfs.c --- fio-2.1.3/engines/glusterfs.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/glusterfs.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,435 @@ +/* + * glusterfs engine + * + * common Glusterfs's gfapi interface + * + */ + +#include "gfapi.h" +#include 
"../optgroup.h" + +struct fio_option gfapi_options[] = { + { + .name = "volume", + .lname = "Glusterfs volume", + .type = FIO_OPT_STR_STORE, + .help = "Name of the Glusterfs volume", + .off1 = offsetof(struct gf_options, gf_vol), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_GFAPI, + }, + { + .name = "brick", + .lname = "Glusterfs brick name", + .type = FIO_OPT_STR_STORE, + .help = "Name of the Glusterfs brick to connect", + .off1 = offsetof(struct gf_options, gf_brick), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_GFAPI, + }, + { + .name = "single-instance", + .lname = "Single glusterfs instance", + .type = FIO_OPT_BOOL, + .help = "Only one glusterfs instance", + .off1 = offsetof(struct gf_options, gf_single_instance), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_GFAPI, + }, + { + .name = NULL, + }, +}; + +struct glfs_info { + struct flist_head list; + char *volume; + char *brick; + glfs_t *fs; + int refcount; +}; + +static pthread_mutex_t glfs_lock = PTHREAD_MUTEX_INITIALIZER; +static FLIST_HEAD(glfs_list_head); + +static glfs_t *fio_gf_new_fs(char *volume, char *brick) +{ + int r = 0; + glfs_t *fs; + struct stat sb = { 0, }; + + fs = glfs_new(volume); + if (!fs) { + log_err("glfs_new failed.\n"); + goto out; + } + glfs_set_logging(fs, "/tmp/fio_gfapi.log", 7); + /* default to tcp */ + r = glfs_set_volfile_server(fs, "tcp", brick, 0); + if (r) { + log_err("glfs_set_volfile_server failed.\n"); + goto out; + } + r = glfs_init(fs); + if (r) { + log_err("glfs_init failed. 
Is glusterd running on brick?\n"); + goto out; + } + sleep(2); + r = glfs_lstat(fs, ".", &sb); + if (r) { + log_err("glfs_lstat failed.\n"); + goto out; + } + +out: + if (r) { + glfs_fini(fs); + fs = NULL; + } + return fs; +} + +static glfs_t *fio_gf_get_glfs(struct gf_options *opt, + char *volume, char *brick) +{ + struct glfs_info *glfs = NULL; + struct glfs_info *tmp; + struct flist_head *entry; + + if (!opt->gf_single_instance) + return fio_gf_new_fs(volume, brick); + + pthread_mutex_lock (&glfs_lock); + + flist_for_each(entry, &glfs_list_head) { + tmp = flist_entry(entry, struct glfs_info, list); + if (!strcmp(volume, tmp->volume) && + !strcmp(brick, tmp->brick)) { + glfs = tmp; + break; + } + } + + if (glfs) { + glfs->refcount++; + } else { + glfs = malloc(sizeof(*glfs)); + if (!glfs) + goto out; + INIT_FLIST_HEAD(&glfs->list); + glfs->refcount = 0; + glfs->volume = strdup(volume); + glfs->brick = strdup(brick); + glfs->fs = fio_gf_new_fs(volume, brick); + if (!glfs->fs) { + free(glfs); + glfs = NULL; + goto out; + } + + flist_add_tail(&glfs->list, &glfs_list_head); + glfs->refcount = 1; + } + +out: + pthread_mutex_unlock (&glfs_lock); + + if (glfs) + return glfs->fs; + return NULL; +} + +static void fio_gf_put_glfs(struct gf_options *opt, glfs_t *fs) +{ + struct glfs_info *glfs = NULL; + struct glfs_info *tmp; + struct flist_head *entry; + + if (!opt->gf_single_instance) { + glfs_fini(fs); + return; + } + + pthread_mutex_lock (&glfs_lock); + + flist_for_each(entry, &glfs_list_head) { + tmp = flist_entry(entry, struct glfs_info, list); + if (tmp->fs == fs) { + glfs = tmp; + break; + } + } + + if (!glfs) { + log_err("glfs not found to fini.\n"); + } else { + glfs->refcount--; + + if (glfs->refcount == 0) { + glfs_fini(glfs->fs); + free(glfs->volume); + free(glfs->brick); + flist_del(&glfs->list); + } + } + + pthread_mutex_unlock (&glfs_lock); +} + +int fio_gf_setup(struct thread_data *td) +{ + struct gf_data *g = NULL; + struct gf_options *opt = td->eo; + + 
dprint(FD_IO, "fio setup\n"); + + if (td->io_ops_data) + return 0; + + g = malloc(sizeof(struct gf_data)); + if (!g) { + log_err("malloc failed.\n"); + return -ENOMEM; + } + g->fd = NULL; + g->aio_events = NULL; + + g->fs = fio_gf_get_glfs(opt, opt->gf_vol, opt->gf_brick); + if (!g->fs) + goto cleanup; + + dprint(FD_FILE, "fio setup %p\n", g->fs); + td->io_ops_data = g; + return 0; +cleanup: + free(g); + td->io_ops_data = NULL; + return -EIO; +} + +void fio_gf_cleanup(struct thread_data *td) +{ + struct gf_data *g = td->io_ops_data; + + if (g) { + if (g->aio_events) + free(g->aio_events); + if (g->fd) + glfs_close(g->fd); + if (g->fs) + fio_gf_put_glfs(td->eo, g->fs); + free(g); + td->io_ops_data = NULL; + } +} + +int fio_gf_get_file_size(struct thread_data *td, struct fio_file *f) +{ + struct stat buf; + int ret; + struct gf_data *g = td->io_ops_data; + + dprint(FD_FILE, "get file size %s\n", f->file_name); + + if (!g || !g->fs) { + return 0; + } + if (fio_file_size_known(f)) + return 0; + + ret = glfs_lstat(g->fs, f->file_name, &buf); + if (ret < 0) { + log_err("glfs_lstat failed.\n"); + return ret; + } + + f->real_file_size = buf.st_size; + fio_file_set_size_known(f); + + return 0; + +} + +int fio_gf_open_file(struct thread_data *td, struct fio_file *f) +{ + + int flags = 0; + int ret = 0; + struct gf_data *g = td->io_ops_data; + struct stat sb = { 0, }; + + if (td_write(td)) { + if (!read_only) + flags = O_RDWR; + } else if (td_read(td)) { + if (!read_only) + flags = O_RDWR; + else + flags = O_RDONLY; + } + + if (td->o.odirect) + flags |= OS_O_DIRECT; + if (td->o.sync_io) + flags |= O_SYNC; + + dprint(FD_FILE, "fio file %s open mode %s td rw %s\n", f->file_name, + flags & O_RDONLY ? "ro" : "rw", td_read(td) ? 
"read" : "write"); + g->fd = glfs_creat(g->fs, f->file_name, flags, 0644); + if (!g->fd) { + ret = errno; + log_err("glfs_creat failed.\n"); + return ret; + } + /* file for read doesn't exist or shorter than required, create/extend it */ + if (td_read(td)) { + if (glfs_lstat(g->fs, f->file_name, &sb) + || sb.st_size < f->real_file_size) { + dprint(FD_FILE, "fio extend file %s from %jd to %" PRIu64 "\n", + f->file_name, (intmax_t) sb.st_size, f->real_file_size); +#if defined(CONFIG_GF_NEW_API) + ret = glfs_ftruncate(g->fd, f->real_file_size, NULL, NULL); +#else + ret = glfs_ftruncate(g->fd, f->real_file_size); +#endif + if (ret) { + log_err("failed fio extend file %s to %" PRIu64 "\n", + f->file_name, f->real_file_size); + } else { + unsigned long long left; + unsigned int bs; + char *b; + int r; + + /* fill the file, copied from extend_file */ + b = malloc(td->o.max_bs[DDIR_WRITE]); + + left = f->real_file_size; + while (left && !td->terminate) { + bs = td->o.max_bs[DDIR_WRITE]; + if (bs > left) + bs = left; + + fill_io_buffer(td, b, bs, bs); + + r = glfs_write(g->fd, b, bs, 0); + dprint(FD_IO, + "fio write %d of %" PRIu64 " file %s\n", + r, f->real_file_size, + f->file_name); + + if (r > 0) { + left -= r; + continue; + } else { + if (r < 0) { + int __e = errno; + + if (__e == ENOSPC) { + if (td->o. 
+ fill_device) + break; + log_info + ("fio: ENOSPC on laying out " + "file, stopping\n"); + break; + } + td_verror(td, errno, + "write"); + } else + td_verror(td, EIO, + "write"); + + break; + } + } + + if (b) + free(b); + glfs_lseek(g->fd, 0, SEEK_SET); + + if (td->terminate && td->o.unlink) { + dprint(FD_FILE, "terminate unlink %s\n", + f->file_name); + glfs_unlink(g->fs, f->file_name); + } else if (td->o.create_fsync) { +#if defined(CONFIG_GF_NEW_API) + if (glfs_fsync(g->fd, NULL, NULL) < 0) { +#else + if (glfs_fsync(g->fd) < 0) { +#endif + dprint(FD_FILE, + "failed to sync, close %s\n", + f->file_name); + td_verror(td, errno, "fsync"); + glfs_close(g->fd); + g->fd = NULL; + return 1; + } + } + } + } + } +#if defined(GFAPI_USE_FADVISE) + { + int r = 0; + if (td_random(td)) { + r = glfs_fadvise(g->fd, 0, f->real_file_size, + POSIX_FADV_RANDOM); + } else { + r = glfs_fadvise(g->fd, 0, f->real_file_size, + POSIX_FADV_SEQUENTIAL); + } + if (r) { + dprint(FD_FILE, "fio %p fadvise %s status %d\n", g->fs, + f->file_name, r); + } + } +#endif + dprint(FD_FILE, "fio %p created %s\n", g->fs, f->file_name); + f->fd = -1; + f->shadow_fd = -1; + td->o.open_files ++; + return ret; +} + +int fio_gf_close_file(struct thread_data *td, struct fio_file *f) +{ + int ret = 0; + struct gf_data *g = td->io_ops_data; + + dprint(FD_FILE, "fd close %s\n", f->file_name); + + if (g) { + if (g->fd && glfs_close(g->fd) < 0) + ret = errno; + g->fd = NULL; + } + + return ret; +} + +int fio_gf_unlink_file(struct thread_data *td, struct fio_file *f) +{ + int ret = 0; + struct gf_data *g = td->io_ops_data; + + dprint(FD_FILE, "fd unlink %s\n", f->file_name); + + if (g) { + if (g->fd && glfs_close(g->fd) < 0) + ret = errno; + + glfs_unlink(g->fs, f->file_name); + + if (g->fs) + glfs_fini(g->fs); + + g->fd = NULL; + free(g); + } + td->io_ops_data = NULL; + + return ret; +} diff -Nru fio-2.1.3/engines/glusterfs_sync.c fio-3.16/engines/glusterfs_sync.c --- fio-2.1.3/engines/glusterfs_sync.c 1970-01-01 
00:00:00.000000000 +0000 +++ fio-3.16/engines/glusterfs_sync.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,107 @@ +/* + * glusterfs engine + * + * IO engine using Glusterfs's gfapi sync interface + * + */ + +#include "gfapi.h" + +#define LAST_POS(f) ((f)->engine_pos) +static int fio_gf_prep(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct gf_data *g = td->io_ops_data; + + dprint(FD_FILE, "fio prep\n"); + + if (!ddir_rw(io_u->ddir)) + return 0; + + if (LAST_POS(f) != -1ULL && LAST_POS(f) == io_u->offset) + return 0; + + if (glfs_lseek(g->fd, io_u->offset, SEEK_SET) < 0) { + td_verror(td, errno, "lseek"); + return 1; + } + + return 0; +} + +static enum fio_q_status fio_gf_queue(struct thread_data *td, struct io_u *io_u) +{ + struct gf_data *g = td->io_ops_data; + int ret = 0; + + dprint(FD_FILE, "fio queue len %llu\n", io_u->xfer_buflen); + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_READ) + ret = glfs_read(g->fd, io_u->xfer_buf, io_u->xfer_buflen, 0); + else if (io_u->ddir == DDIR_WRITE) + ret = glfs_write(g->fd, io_u->xfer_buf, io_u->xfer_buflen, 0); + else if (io_u->ddir == DDIR_SYNC) +#if defined(CONFIG_GF_NEW_API) + ret = glfs_fsync(g->fd, NULL, NULL); +#else + ret = glfs_fsync(g->fd); +#endif + else if (io_u->ddir == DDIR_DATASYNC) +#if defined(CONFIG_GF_NEW_API) + ret = glfs_fdatasync(g->fd, NULL, NULL); +#else + ret = glfs_fdatasync(g->fd); +#endif + else { + log_err("unsupported operation.\n"); + io_u->error = EINVAL; + return FIO_Q_COMPLETED; + } + dprint(FD_FILE, "fio len %llu ret %d\n", io_u->xfer_buflen, ret); + if (io_u->file && ret >= 0 && ddir_rw(io_u->ddir)) + LAST_POS(io_u->file) = io_u->offset + ret; + + if (ret != (int)io_u->xfer_buflen) { + if (ret >= 0) { + io_u->resid = io_u->xfer_buflen - ret; + io_u->error = 0; + return FIO_Q_COMPLETED; + } else + io_u->error = errno; + } + + if (io_u->error) { + log_err("IO failed.\n"); + td_verror(td, io_u->error, "xfer"); + } + + return FIO_Q_COMPLETED; + 
+} + +static struct ioengine_ops ioengine = { + .name = "gfapi", + .version = FIO_IOOPS_VERSION, + .init = fio_gf_setup, + .cleanup = fio_gf_cleanup, + .prep = fio_gf_prep, + .queue = fio_gf_queue, + .open_file = fio_gf_open_file, + .close_file = fio_gf_close_file, + .unlink_file = fio_gf_unlink_file, + .get_file_size = fio_gf_get_file_size, + .options = gfapi_options, + .option_struct_size = sizeof(struct gf_options), + .flags = FIO_SYNCIO | FIO_DISKLESSIO, +}; + +static void fio_init fio_gf_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_gf_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/guasi.c fio-3.16/engines/guasi.c --- fio-2.1.3/engines/guasi.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/guasi.c 2019-09-20 01:01:52.000000000 +0000 @@ -50,7 +50,7 @@ static struct io_u *fio_guasi_event(struct thread_data *td, int event) { - struct guasi_data *ld = td->io_ops->data; + struct guasi_data *ld = td->io_ops_data; struct io_u *io_u; struct guasi_reqinfo rinf; @@ -80,9 +80,9 @@ } static int fio_guasi_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec *t) + unsigned int max, const struct timespec *t) { - struct guasi_data *ld = td->io_ops->data; + struct guasi_data *ld = td->io_ops_data; int n, r; long timeo = -1; @@ -113,9 +113,10 @@ return n; } -static int fio_guasi_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_guasi_queue(struct thread_data *td, + struct io_u *io_u) { - struct guasi_data *ld = td->io_ops->data; + struct guasi_data *ld = td->io_ops_data; fio_ro_check(td, io_u); @@ -132,7 +133,7 @@ { int i; struct io_u *io_u; - struct timeval now; + struct timespec now; if (!fio_fill_issue_time(td)) return; @@ -148,7 +149,7 @@ static int fio_guasi_commit(struct thread_data *td) { - struct guasi_data *ld = td->io_ops->data; + struct guasi_data *ld = td->io_ops_data; int i; struct io_u *io_u; struct fio_file *f; @@ 
-198,7 +199,7 @@ static void fio_guasi_cleanup(struct thread_data *td) { - struct guasi_data *ld = td->io_ops->data; + struct guasi_data *ld = td->io_ops_data; int n; GDBG_PRINT(("fio_guasi_cleanup(%p)\n", ld)); @@ -235,7 +236,7 @@ ld->queued_nr = 0; ld->reqs_nr = 0; - td->io_ops->data = ld; + td->io_ops_data = ld; GDBG_PRINT(("fio_guasi_init(): depth=%d -> %p\n", td->o.iodepth, ld)); return 0; diff -Nru fio-2.1.3/engines/http.c fio-3.16/engines/http.c --- fio-2.1.3/engines/http.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/http.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,665 @@ +/* + * HTTP GET/PUT IO engine + * + * IO engine to perform HTTP(S) GET/PUT requests via libcurl-easy. + * + * Copyright (C) 2018 SUSE LLC + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License, + * version 2 as published by the Free Software Foundation.. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the Free + * Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "fio.h" +#include "../optgroup.h" + + +enum { + FIO_HTTP_WEBDAV = 0, + FIO_HTTP_S3 = 1, + FIO_HTTP_SWIFT = 2, + + FIO_HTTPS_OFF = 0, + FIO_HTTPS_ON = 1, + FIO_HTTPS_INSECURE = 2, +}; + +struct http_data { + CURL *curl; +}; + +struct http_options { + void *pad; + unsigned int https; + char *host; + char *user; + char *pass; + char *s3_key; + char *s3_keyid; + char *s3_region; + char *swift_auth_token; + int verbose; + unsigned int mode; +}; + +struct http_curl_stream { + char *buf; + size_t pos; + size_t max; +}; + +static struct fio_option options[] = { + { + .name = "https", + .lname = "https", + .type = FIO_OPT_STR, + .help = "Enable https", + .off1 = offsetof(struct http_options, https), + .def = "off", + .posval = { + { .ival = "off", + .oval = FIO_HTTPS_OFF, + .help = "No HTTPS", + }, + { .ival = "on", + .oval = FIO_HTTPS_ON, + .help = "Enable HTTPS", + }, + { .ival = "insecure", + .oval = FIO_HTTPS_INSECURE, + .help = "Enable HTTPS, disable peer verification", + }, + }, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_host", + .lname = "http_host", + .type = FIO_OPT_STR_STORE, + .help = "Hostname (S3 bucket)", + .off1 = offsetof(struct http_options, host), + .def = "localhost", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_user", + .lname = "http_user", + .type = FIO_OPT_STR_STORE, + .help = "HTTP user name", + .off1 = offsetof(struct http_options, user), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_pass", + .lname = "http_pass", + .type = FIO_OPT_STR_STORE, + .help = "HTTP password", + .off1 = offsetof(struct http_options, pass), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_s3_key", + .lname = "S3 secret key", + .type = FIO_OPT_STR_STORE, + .help = "S3 secret key", + .off1 = offsetof(struct http_options, s3_key), + .def = "", + 
.category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_s3_keyid", + .lname = "S3 key id", + .type = FIO_OPT_STR_STORE, + .help = "S3 key id", + .off1 = offsetof(struct http_options, s3_keyid), + .def = "", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_swift_auth_token", + .lname = "Swift auth token", + .type = FIO_OPT_STR_STORE, + .help = "OpenStack Swift auth token", + .off1 = offsetof(struct http_options, swift_auth_token), + .def = "", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_s3_region", + .lname = "S3 region", + .type = FIO_OPT_STR_STORE, + .help = "S3 region", + .off1 = offsetof(struct http_options, s3_region), + .def = "us-east-1", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_mode", + .lname = "Request mode to use", + .type = FIO_OPT_STR, + .help = "Whether to use WebDAV, Swift, or S3", + .off1 = offsetof(struct http_options, mode), + .def = "webdav", + .posval = { + { .ival = "webdav", + .oval = FIO_HTTP_WEBDAV, + .help = "WebDAV server", + }, + { .ival = "s3", + .oval = FIO_HTTP_S3, + .help = "S3 storage backend", + }, + { .ival = "swift", + .oval = FIO_HTTP_SWIFT, + .help = "OpenStack Swift storage", + }, + }, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_verbose", + .lname = "HTTP verbosity level", + .type = FIO_OPT_INT, + .help = "increase http engine verbosity", + .off1 = offsetof(struct http_options, verbose), + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = NULL, + }, +}; + +static char *_aws_uriencode(const char *uri) +{ + size_t bufsize = 1024; + char *r = malloc(bufsize); + char c; + int i, n; + const char *hex = "0123456789ABCDEF"; + + if (!r) { + log_err("malloc failed\n"); + return NULL; + } + + n = 0; + for (i = 0; (c = uri[i]); i++) { + if (n > bufsize-5) { + log_err("encoding the URL failed\n"); + return NULL; + } + + 
if ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') + || (c >= '0' && c <= '9') || c == '_' || c == '-' + || c == '~' || c == '.' || c == '/') + r[n++] = c; + else { + r[n++] = '%'; + r[n++] = hex[(c >> 4 ) & 0xF]; + r[n++] = hex[c & 0xF]; + } + } + r[n++] = 0; + return r; +} + +static char *_conv_hex(const unsigned char *p, size_t len) +{ + char *r; + int i,n; + const char *hex = "0123456789abcdef"; + r = malloc(len * 2 + 1); + n = 0; + for (i = 0; i < len; i++) { + r[n++] = hex[(p[i] >> 4 ) & 0xF]; + r[n++] = hex[p[i] & 0xF]; + } + r[n] = 0; + + return r; +} + +static char *_gen_hex_sha256(const char *p, size_t len) +{ + unsigned char hash[SHA256_DIGEST_LENGTH]; + + SHA256((unsigned char*)p, len, hash); + return _conv_hex(hash, SHA256_DIGEST_LENGTH); +} + +static char *_gen_hex_md5(const char *p, size_t len) +{ + unsigned char hash[MD5_DIGEST_LENGTH]; + + MD5((unsigned char*)p, len, hash); + return _conv_hex(hash, MD5_DIGEST_LENGTH); +} + +static void _hmac(unsigned char *md, void *key, int key_len, char *data) { +#ifndef CONFIG_HAVE_OPAQUE_HMAC_CTX + HMAC_CTX _ctx; +#endif + HMAC_CTX *ctx; + unsigned int hmac_len; + +#ifdef CONFIG_HAVE_OPAQUE_HMAC_CTX + ctx = HMAC_CTX_new(); +#else + ctx = &_ctx; + /* work-around crash in certain versions of libssl */ + HMAC_CTX_init(ctx); +#endif + HMAC_Init_ex(ctx, key, key_len, EVP_sha256(), NULL); + HMAC_Update(ctx, (unsigned char*)data, strlen(data)); + HMAC_Final(ctx, md, &hmac_len); +#ifdef CONFIG_HAVE_OPAQUE_HMAC_CTX + HMAC_CTX_free(ctx); +#else + HMAC_CTX_cleanup(ctx); +#endif +} + +static int _curl_trace(CURL *handle, curl_infotype type, + char *data, size_t size, + void *userp) +{ + const char *text; + (void)handle; /* prevent compiler warning */ + (void)userp; + + switch (type) { + case CURLINFO_TEXT: + fprintf(stderr, "== Info: %s", data); + /* fall through */ + default: + case CURLINFO_SSL_DATA_OUT: + /* fall through */ + case CURLINFO_SSL_DATA_IN: + return 0; + + case CURLINFO_HEADER_OUT: + text = "=> Send 
header"; + break; + case CURLINFO_DATA_OUT: + text = "=> Send data"; + break; + case CURLINFO_HEADER_IN: + text = "<= Recv header"; + break; + case CURLINFO_DATA_IN: + text = "<= Recv data"; + break; + } + + log_info("%s: %s", text, data); + return 0; +} + +/* https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-header-based-auth.html + * https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-authenticating-requests.html#signing-request-intro + */ +static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct http_options *o, + int op, const char *uri, char *buf, size_t len) +{ + char date_short[16]; + char date_iso[32]; + char method[8]; + char dkey[128]; + char creq[512]; + char sts[256]; + char s[512]; + char *uri_encoded = NULL; + char *dsha = NULL; + char *csha = NULL; + char *signature = NULL; + const char *service = "s3"; + const char *aws = "aws4_request"; + unsigned char md[SHA256_DIGEST_LENGTH]; + + time_t t = time(NULL); + struct tm *gtm = gmtime(&t); + + strftime (date_short, sizeof(date_short), "%Y%m%d", gtm); + strftime (date_iso, sizeof(date_iso), "%Y%m%dT%H%M%SZ", gtm); + uri_encoded = _aws_uriencode(uri); + + if (op == DDIR_WRITE) { + dsha = _gen_hex_sha256(buf, len); + sprintf(method, "PUT"); + } else { + /* DDIR_READ && DDIR_TRIM supply an empty body */ + if (op == DDIR_READ) + sprintf(method, "GET"); + else + sprintf(method, "DELETE"); + dsha = _gen_hex_sha256("", 0); + } + + /* Create the canonical request first */ + snprintf(creq, sizeof(creq), + "%s\n" + "%s\n" + "\n" + "host:%s\n" + "x-amz-content-sha256:%s\n" + "x-amz-date:%s\n" + "\n" + "host;x-amz-content-sha256;x-amz-date\n" + "%s" + , method + , uri_encoded, o->host, dsha, date_iso, dsha); + + csha = _gen_hex_sha256(creq, strlen(creq)); + snprintf(sts, sizeof(sts), "AWS4-HMAC-SHA256\n%s\n%s/%s/%s/%s\n%s", + date_iso, date_short, o->s3_region, service, aws, csha); + + snprintf((char *)dkey, sizeof(dkey), "AWS4%s", o->s3_key); + _hmac(md, dkey, strlen(dkey), date_short); 
+ _hmac(md, md, SHA256_DIGEST_LENGTH, o->s3_region); + _hmac(md, md, SHA256_DIGEST_LENGTH, (char*) service); + _hmac(md, md, SHA256_DIGEST_LENGTH, (char*) aws); + _hmac(md, md, SHA256_DIGEST_LENGTH, sts); + + signature = _conv_hex(md, SHA256_DIGEST_LENGTH); + + /* Surpress automatic Accept: header */ + slist = curl_slist_append(slist, "Accept:"); + + snprintf(s, sizeof(s), "x-amz-content-sha256: %s", dsha); + slist = curl_slist_append(slist, s); + + snprintf(s, sizeof(s), "x-amz-date: %s", date_iso); + slist = curl_slist_append(slist, s); + + snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request," + "SignedHeaders=host;x-amz-content-sha256;x-amz-date,Signature=%s", + o->s3_keyid, date_short, o->s3_region, signature); + slist = curl_slist_append(slist, s); + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist); + + free(uri_encoded); + free(csha); + free(dsha); + free(signature); +} + +static void _add_swift_header(CURL *curl, struct curl_slist *slist, struct http_options *o, + int op, const char *uri, char *buf, size_t len) +{ + char *dsha = NULL; + char s[512]; + + if (op == DDIR_WRITE) { + dsha = _gen_hex_md5(buf, len); + } + /* Surpress automatic Accept: header */ + slist = curl_slist_append(slist, "Accept:"); + + snprintf(s, sizeof(s), "etag: %s", dsha); + slist = curl_slist_append(slist, s); + + snprintf(s, sizeof(s), "x-auth-token: %s", o->swift_auth_token); + slist = curl_slist_append(slist, s); + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist); + + free(dsha); +} + +static void fio_http_cleanup(struct thread_data *td) +{ + struct http_data *http = td->io_ops_data; + + if (http) { + curl_easy_cleanup(http->curl); + free(http); + } +} + +static size_t _http_read(void *ptr, size_t size, size_t nmemb, void *stream) +{ + struct http_curl_stream *state = stream; + size_t len = size * nmemb; + /* We're retrieving; nothing is supposed to be read locally */ + if (!stream) + return 0; + if (len+state->pos > state->max) + len = 
state->max - state->pos; + memcpy(ptr, &state->buf[state->pos], len); + state->pos += len; + return len; +} + +static size_t _http_write(void *ptr, size_t size, size_t nmemb, void *stream) +{ + struct http_curl_stream *state = stream; + /* We're just discarding the returned body after a PUT */ + if (!stream) + return nmemb; + if (size != 1) + return CURLE_WRITE_ERROR; + if (nmemb + state->pos > state->max) + return CURLE_WRITE_ERROR; + memcpy(&state->buf[state->pos], ptr, nmemb); + state->pos += nmemb; + return nmemb; +} + +static int _http_seek(void *stream, curl_off_t offset, int origin) +{ + struct http_curl_stream *state = stream; + if (offset < state->max && origin == SEEK_SET) { + state->pos = offset; + return CURL_SEEKFUNC_OK; + } else + return CURL_SEEKFUNC_FAIL; +} + +static enum fio_q_status fio_http_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct http_data *http = td->io_ops_data; + struct http_options *o = td->eo; + struct http_curl_stream _curl_stream; + struct curl_slist *slist = NULL; + char object[512]; + char url[1024]; + long status; + CURLcode res; + int r = -1; + + fio_ro_check(td, io_u); + memset(&_curl_stream, 0, sizeof(_curl_stream)); + snprintf(object, sizeof(object), "%s_%llu_%llu", td->files[0]->file_name, + io_u->offset, io_u->xfer_buflen); + if (o->https == FIO_HTTPS_OFF) + snprintf(url, sizeof(url), "http://%s%s", o->host, object); + else + snprintf(url, sizeof(url), "https://%s%s", o->host, object); + curl_easy_setopt(http->curl, CURLOPT_URL, url); + _curl_stream.buf = io_u->xfer_buf; + _curl_stream.max = io_u->xfer_buflen; + curl_easy_setopt(http->curl, CURLOPT_SEEKDATA, &_curl_stream); + curl_easy_setopt(http->curl, CURLOPT_INFILESIZE_LARGE, (curl_off_t)io_u->xfer_buflen); + + if (o->mode == FIO_HTTP_S3) + _add_aws_auth_header(http->curl, slist, o, io_u->ddir, object, + io_u->xfer_buf, io_u->xfer_buflen); + else if (o->mode == FIO_HTTP_SWIFT) + _add_swift_header(http->curl, slist, o, io_u->ddir, object, + 
io_u->xfer_buf, io_u->xfer_buflen); + + if (io_u->ddir == DDIR_WRITE) { + curl_easy_setopt(http->curl, CURLOPT_READDATA, &_curl_stream); + curl_easy_setopt(http->curl, CURLOPT_WRITEDATA, NULL); + curl_easy_setopt(http->curl, CURLOPT_UPLOAD, 1L); + res = curl_easy_perform(http->curl); + if (res == CURLE_OK) { + curl_easy_getinfo(http->curl, CURLINFO_RESPONSE_CODE, &status); + if (status == 100 || (status >= 200 && status <= 204)) + goto out; + log_err("DDIR_WRITE failed with HTTP status code %ld\n", status); + goto err; + } + } else if (io_u->ddir == DDIR_READ) { + curl_easy_setopt(http->curl, CURLOPT_READDATA, NULL); + curl_easy_setopt(http->curl, CURLOPT_WRITEDATA, &_curl_stream); + curl_easy_setopt(http->curl, CURLOPT_HTTPGET, 1L); + res = curl_easy_perform(http->curl); + if (res == CURLE_OK) { + curl_easy_getinfo(http->curl, CURLINFO_RESPONSE_CODE, &status); + if (status == 200) + goto out; + else if (status == 404) { + /* Object doesn't exist. Pretend we read + * zeroes */ + memset(io_u->xfer_buf, 0, io_u->xfer_buflen); + goto out; + } + log_err("DDIR_READ failed with HTTP status code %ld\n", status); + } + goto err; + } else if (io_u->ddir == DDIR_TRIM) { + curl_easy_setopt(http->curl, CURLOPT_HTTPGET, 1L); + curl_easy_setopt(http->curl, CURLOPT_CUSTOMREQUEST, "DELETE"); + curl_easy_setopt(http->curl, CURLOPT_INFILESIZE_LARGE, (curl_off_t)0); + curl_easy_setopt(http->curl, CURLOPT_READDATA, NULL); + curl_easy_setopt(http->curl, CURLOPT_WRITEDATA, NULL); + res = curl_easy_perform(http->curl); + if (res == CURLE_OK) { + curl_easy_getinfo(http->curl, CURLINFO_RESPONSE_CODE, &status); + if (status == 200 || status == 202 || status == 204 || status == 404) + goto out; + log_err("DDIR_TRIM failed with HTTP status code %ld\n", status); + } + goto err; + } + + log_err("WARNING: Only DDIR_READ/DDIR_WRITE/DDIR_TRIM are supported!\n"); + +err: + io_u->error = r; + td_verror(td, io_u->error, "transfer"); +out: + curl_slist_free_all(slist); + return FIO_Q_COMPLETED; +} + 
+static struct io_u *fio_http_event(struct thread_data *td, int event) +{ + /* sync IO engine - never any outstanding events */ + return NULL; +} + +int fio_http_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + /* sync IO engine - never any outstanding events */ + return 0; +} + +static int fio_http_setup(struct thread_data *td) +{ + struct http_data *http = NULL; + struct http_options *o = td->eo; + + /* allocate engine specific structure to deal with libhttp. */ + http = calloc(1, sizeof(*http)); + if (!http) { + log_err("calloc failed.\n"); + goto cleanup; + } + + http->curl = curl_easy_init(); + if (o->verbose) + curl_easy_setopt(http->curl, CURLOPT_VERBOSE, 1L); + if (o->verbose > 1) + curl_easy_setopt(http->curl, CURLOPT_DEBUGFUNCTION, &_curl_trace); + curl_easy_setopt(http->curl, CURLOPT_NOPROGRESS, 1L); + curl_easy_setopt(http->curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(http->curl, CURLOPT_PROTOCOLS, CURLPROTO_HTTP|CURLPROTO_HTTPS); + if (o->https == FIO_HTTPS_INSECURE) { + curl_easy_setopt(http->curl, CURLOPT_SSL_VERIFYPEER, 0L); + curl_easy_setopt(http->curl, CURLOPT_SSL_VERIFYHOST, 0L); + } + curl_easy_setopt(http->curl, CURLOPT_READFUNCTION, _http_read); + curl_easy_setopt(http->curl, CURLOPT_WRITEFUNCTION, _http_write); + curl_easy_setopt(http->curl, CURLOPT_SEEKFUNCTION, &_http_seek); + if (o->user && o->pass) { + curl_easy_setopt(http->curl, CURLOPT_USERNAME, o->user); + curl_easy_setopt(http->curl, CURLOPT_PASSWORD, o->pass); + curl_easy_setopt(http->curl, CURLOPT_HTTPAUTH, CURLAUTH_ANY); + } + + td->io_ops_data = http; + + /* Force single process mode. 
*/ + td->o.use_thread = 1; + + return 0; +cleanup: + fio_http_cleanup(td); + return 1; +} + +static int fio_http_open(struct thread_data *td, struct fio_file *f) +{ + return 0; +} +static int fio_http_invalidate(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "http", + .version = FIO_IOOPS_VERSION, + .flags = FIO_DISKLESSIO | FIO_SYNCIO, + .setup = fio_http_setup, + .queue = fio_http_queue, + .getevents = fio_http_getevents, + .event = fio_http_event, + .cleanup = fio_http_cleanup, + .open_file = fio_http_open, + .invalidate = fio_http_invalidate, + .options = options, + .option_struct_size = sizeof(struct http_options), +}; + +static void fio_init fio_http_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_http_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/ime.c fio-3.16/engines/ime.c --- fio-2.1.3/engines/ime.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/ime.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,899 @@ +/* + * FIO engines for DDN's Infinite Memory Engine. + * This file defines 3 engines: ime_psync, ime_psyncv, and ime_aio + * + * Copyright (C) 2018 DataDirect Networks. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License, + * version 2 as published by the Free Software Foundation.. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* + * Some details about the new engines are given below: + * + * + * ime_psync: + * Most basic engine that issues calls to ime_native whenever an IO is queued. 
+ * + * ime_psyncv: + * This engine tries to queue the IOs (by creating iovecs) if asked by FIO (via + * iodepth_batch). It refuses to queue when the iovecs can't be appended, and + * waits for FIO to issue a commit. After a call to commit and get_events, new + * IOs can be queued. + * + * ime_aio: + * This engine tries to queue the IOs (by creating iovecs) if asked by FIO (via + * iodepth_batch). When the iovecs can't be appended to the current request, a + * new request for IME is created. These requests will be issued to IME when + * commit is called. Contrary to ime_psyncv, there can be several requests at + * once. We don't need to wait for a request to terminate before creating a new + * one. + */ + +#include +#include +#include +#include +#include + +#include "../fio.h" + + +/************************************************************** + * Types and constants definitions + * + **************************************************************/ + +/* define constants for async IOs */ +#define FIO_IME_IN_PROGRESS -1 +#define FIO_IME_REQ_ERROR -2 + +/* This flag is used when some jobs were created using threads. In that + case, IME can't be finalized in the engine-specific cleanup function, + because other threads might still use IME. Instead, IME is finalized + in the destructor (see fio_ime_unregister), only when the flag + fio_ime_is_initialized is true (which means at least one thread has + initialized IME). 
*/ +static bool fio_ime_is_initialized = false; + +struct imesio_req { + int fd; /* File descriptor */ + enum fio_ddir ddir; /* Type of IO (read or write) */ + off_t offset; /* File offset */ +}; +struct imeaio_req { + struct ime_aiocb iocb; /* IME aio request */ + ssize_t status; /* Status of the IME request */ + enum fio_ddir ddir; /* Type of IO (read or write) */ + pthread_cond_t cond_endio; /* Condition var to notify FIO */ + pthread_mutex_t status_mutex; /* Mutex for cond_endio */ +}; + +/* This structure will be used for 2 engines: ime_psyncv and ime_aio */ +struct ime_data { + union { + struct imeaio_req *aioreqs; /* array of aio requests */ + struct imesio_req *sioreq; /* pointer to the only syncio request */ + }; + struct iovec *iovecs; /* array of queued iovecs */ + struct io_u **io_us; /* array of queued io_u pointers */ + struct io_u **event_io_us; /* array of the events retieved afer get_events*/ + unsigned int queued; /* iovecs/io_us in the queue */ + unsigned int events; /* number of committed iovecs/io_us */ + + /* variables used to implement a "ring" queue */ + unsigned int depth; /* max entries in the queue */ + unsigned int head; /* index used to append */ + unsigned int tail; /* index used to pop */ + unsigned int cur_commit; /* index of the first uncommitted req */ + + /* offset used by the last iovec (used to check if the iovecs can be appended)*/ + unsigned long long last_offset; + + /* The variables below are used for aio only */ + struct imeaio_req *last_req; /* last request awaiting committing */ +}; + + +/************************************************************** + * Private functions for queueing/unqueueing + * + **************************************************************/ + +static void fio_ime_queue_incr (struct ime_data *ime_d) +{ + ime_d->head = (ime_d->head + 1) % ime_d->depth; + ime_d->queued++; +} + +static void fio_ime_queue_red (struct ime_data *ime_d) +{ + ime_d->tail = (ime_d->tail + 1) % ime_d->depth; + 
ime_d->queued--; + ime_d->events--; +} + +static void fio_ime_queue_commit (struct ime_data *ime_d, int iovcnt) +{ + ime_d->cur_commit = (ime_d->cur_commit + iovcnt) % ime_d->depth; + ime_d->events += iovcnt; +} + +static void fio_ime_queue_reset (struct ime_data *ime_d) +{ + ime_d->head = 0; + ime_d->tail = 0; + ime_d->cur_commit = 0; + ime_d->queued = 0; + ime_d->events = 0; +} + +/************************************************************** + * General IME functions + * (needed for both sync and async IOs) + **************************************************************/ + +static char *fio_set_ime_filename(char* filename) +{ + static __thread char ime_filename[PATH_MAX]; + int ret; + + ret = snprintf(ime_filename, PATH_MAX, "%s%s", DEFAULT_IME_FILE_PREFIX, filename); + if (ret < PATH_MAX) + return ime_filename; + + return NULL; +} + +static int fio_ime_get_file_size(struct thread_data *td, struct fio_file *f) +{ + struct stat buf; + int ret; + char *ime_filename; + + dprint(FD_FILE, "get file size %s\n", f->file_name); + + ime_filename = fio_set_ime_filename(f->file_name); + if (ime_filename == NULL) + return 1; + ret = ime_native_stat(ime_filename, &buf); + if (ret == -1) { + td_verror(td, errno, "fstat"); + return 1; + } + + f->real_file_size = buf.st_size; + return 0; +} + +/* This functions mimics the generic_file_open function, but issues + IME native calls instead of POSIX calls. 
*/ +static int fio_ime_open_file(struct thread_data *td, struct fio_file *f) +{ + int flags = 0; + int ret; + uint64_t desired_fs; + char *ime_filename; + + dprint(FD_FILE, "fd open %s\n", f->file_name); + + if (td_trim(td)) { + td_verror(td, EINVAL, "IME does not support TRIM operation"); + return 1; + } + + if (td->o.oatomic) { + td_verror(td, EINVAL, "IME does not support atomic IO"); + return 1; + } + if (td->o.odirect) + flags |= O_DIRECT; + if (td->o.sync_io) + flags |= O_SYNC; + if (td->o.create_on_open && td->o.allow_create) + flags |= O_CREAT; + + if (td_write(td)) { + if (!read_only) + flags |= O_RDWR; + + if (td->o.allow_create) + flags |= O_CREAT; + } else if (td_read(td)) { + flags |= O_RDONLY; + } else { + /* We should never go here. */ + td_verror(td, EINVAL, "Unsopported open mode"); + return 1; + } + + ime_filename = fio_set_ime_filename(f->file_name); + if (ime_filename == NULL) + return 1; + f->fd = ime_native_open(ime_filename, flags, 0600); + if (f->fd == -1) { + char buf[FIO_VERROR_SIZE]; + int __e = errno; + + snprintf(buf, sizeof(buf), "open(%s)", f->file_name); + td_verror(td, __e, buf); + return 1; + } + + /* Now we need to make sure the real file size is sufficient for FIO + to do its things. 
This is normally done before the file open function + is called, but because FIO would use POSIX calls, we need to do it + ourselves */ + ret = fio_ime_get_file_size(td, f); + if (ret < 0) { + ime_native_close(f->fd); + td_verror(td, errno, "ime_get_file_size"); + return 1; + } + + desired_fs = f->io_size + f->file_offset; + if (td_write(td)) { + dprint(FD_FILE, "Laying out file %s%s\n", + DEFAULT_IME_FILE_PREFIX, f->file_name); + if (!td->o.create_on_open && + f->real_file_size < desired_fs && + ime_native_ftruncate(f->fd, desired_fs) < 0) { + ime_native_close(f->fd); + td_verror(td, errno, "ime_native_ftruncate"); + return 1; + } + if (f->real_file_size < desired_fs) + f->real_file_size = desired_fs; + } else if (td_read(td) && f->real_file_size < desired_fs) { + ime_native_close(f->fd); + log_err("error: can't read %lu bytes from file with " + "%lu bytes\n", desired_fs, f->real_file_size); + return 1; + } + + return 0; +} + +static int fio_ime_close_file(struct thread_data fio_unused *td, struct fio_file *f) +{ + int ret = 0; + + dprint(FD_FILE, "fd close %s\n", f->file_name); + + if (ime_native_close(f->fd) < 0) + ret = errno; + + f->fd = -1; + return ret; +} + +static int fio_ime_unlink_file(struct thread_data *td, struct fio_file *f) +{ + char *ime_filename = fio_set_ime_filename(f->file_name); + int ret; + + if (ime_filename == NULL) + return 1; + + ret = unlink(ime_filename); + return ret < 0 ? errno : 0; +} + +static struct io_u *fio_ime_event(struct thread_data *td, int event) +{ + struct ime_data *ime_d = td->io_ops_data; + + return ime_d->event_io_us[event]; +} + +/* Setup file used to replace get_file_sizes when settin up the file. + Instead we will set real_file_sie to 0 for each file. This way we + can avoid calling ime_native_init before the forks are created. 
*/ +static int fio_ime_setup(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + + for_each_file(td, f, i) { + dprint(FD_FILE, "setup: set file size to 0 for %p/%d/%s\n", + f, i, f->file_name); + f->real_file_size = 0; + } + + return 0; +} + +static int fio_ime_engine_init(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + + dprint(FD_IO, "ime engine init\n"); + if (fio_ime_is_initialized && !td->o.use_thread) { + log_err("Warning: something might go wrong. Not all threads/forks were" + " created before the FIO jobs were initialized.\n"); + } + + ime_native_init(); + fio_ime_is_initialized = true; + + /* We have to temporarily set real_file_size so that + FIO can initialize properly. It will be corrected + on file open. */ + for_each_file(td, f, i) + f->real_file_size = f->io_size + f->file_offset; + + return 0; +} + +static void fio_ime_engine_finalize(struct thread_data *td) +{ + /* Only finalize IME when using forks */ + if (!td->o.use_thread) { + if (ime_native_finalize() < 0) + log_err("error in ime_native_finalize\n"); + fio_ime_is_initialized = false; + } +} + + +/************************************************************** + * Private functions for blocking IOs + * (without iovecs) + **************************************************************/ + +/* Notice: this function comes from the sync engine */ +/* It is used by the commit function to return a proper code and fill + some attributes in the io_u used for the IO. 
*/ +static int fio_ime_psync_end(struct thread_data *td, struct io_u *io_u, ssize_t ret) +{ + if (ret != (ssize_t) io_u->xfer_buflen) { + if (ret >= 0) { + io_u->resid = io_u->xfer_buflen - ret; + io_u->error = 0; + return FIO_Q_COMPLETED; + } else + io_u->error = errno; + } + + if (io_u->error) { + io_u_log_error(td, io_u); + td_verror(td, io_u->error, "xfer"); + } + + return FIO_Q_COMPLETED; +} + +static enum fio_q_status fio_ime_psync_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + ssize_t ret; + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_READ) + ret = ime_native_pread(f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + else if (io_u->ddir == DDIR_WRITE) + ret = ime_native_pwrite(f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + else if (io_u->ddir == DDIR_SYNC) + ret = ime_native_fsync(f->fd); + else { + ret = io_u->xfer_buflen; + io_u->error = EINVAL; + } + + return fio_ime_psync_end(td, io_u, ret); +} + + +/************************************************************** + * Private functions for blocking IOs + * (with iovecs) + **************************************************************/ + +static bool fio_ime_psyncv_can_queue(struct ime_data *ime_d, struct io_u *io_u) +{ + /* We can only queue if: + - There are no queued iovecs + - Or if there is at least one: + - There must be no event waiting for retrieval + - The offsets must be contiguous + - The ddir and fd must be the same */ + return (ime_d->queued == 0 || ( + ime_d->events == 0 && + ime_d->last_offset == io_u->offset && + ime_d->sioreq->ddir == io_u->ddir && + ime_d->sioreq->fd == io_u->file->fd)); +} + +/* Before using this function, we should have already + ensured that the queue is not full */ +static void fio_ime_psyncv_enqueue(struct ime_data *ime_d, struct io_u *io_u) +{ + struct imesio_req *ioreq = ime_d->sioreq; + struct iovec *iov = &ime_d->iovecs[ime_d->head]; + + iov->iov_base = io_u->xfer_buf; + iov->iov_len = 
io_u->xfer_buflen; + + if (ime_d->queued == 0) { + ioreq->offset = io_u->offset; + ioreq->ddir = io_u->ddir; + ioreq->fd = io_u->file->fd; + } + + ime_d->io_us[ime_d->head] = io_u; + ime_d->last_offset = io_u->offset + io_u->xfer_buflen; + fio_ime_queue_incr(ime_d); +} + +/* Tries to queue an IO. It will fail if the IO can't be appended to the + current request or if the current request has been committed but not + yet retrieved by get_events. */ +static enum fio_q_status fio_ime_psyncv_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct ime_data *ime_d = td->io_ops_data; + + fio_ro_check(td, io_u); + + if (ime_d->queued == ime_d->depth) + return FIO_Q_BUSY; + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + if (!fio_ime_psyncv_can_queue(ime_d, io_u)) + return FIO_Q_BUSY; + + dprint(FD_IO, "queue: ddir=%d at %u commit=%u queued=%u events=%u\n", + io_u->ddir, ime_d->head, ime_d->cur_commit, + ime_d->queued, ime_d->events); + fio_ime_psyncv_enqueue(ime_d, io_u); + return FIO_Q_QUEUED; + } else if (io_u->ddir == DDIR_SYNC) { + if (ime_native_fsync(io_u->file->fd) < 0) { + io_u->error = errno; + td_verror(td, io_u->error, "fsync"); + } + return FIO_Q_COMPLETED; + } else { + io_u->error = EINVAL; + td_verror(td, io_u->error, "wrong ddir"); + return FIO_Q_COMPLETED; + } +} + +/* Notice: this function comes from the sync engine */ +/* It is used by the commit function to return a proper code and fill + some attributes in the io_us appended to the current request. 
*/ +static int fio_ime_psyncv_end(struct thread_data *td, ssize_t bytes) +{ + struct ime_data *ime_d = td->io_ops_data; + struct io_u *io_u; + unsigned int i; + int err = errno; + + for (i = 0; i < ime_d->queued; i++) { + io_u = ime_d->io_us[i]; + + if (bytes == -1) + io_u->error = err; + else { + unsigned int this_io; + + this_io = bytes; + if (this_io > io_u->xfer_buflen) + this_io = io_u->xfer_buflen; + + io_u->resid = io_u->xfer_buflen - this_io; + io_u->error = 0; + bytes -= this_io; + } + } + + if (bytes == -1) { + td_verror(td, err, "xfer psyncv"); + return -err; + } + + return 0; +} + +/* Commits the current request by calling ime_native (with one or several + iovecs). After this commit, the corresponding events (one per iovec) + can be retrieved by get_events. */ +static int fio_ime_psyncv_commit(struct thread_data *td) +{ + struct ime_data *ime_d = td->io_ops_data; + struct imesio_req *ioreq; + int ret = 0; + + /* Exit if there are no (new) events to commit + or if the previous committed event haven't been retrieved */ + if (!ime_d->queued || ime_d->events) + return 0; + + ioreq = ime_d->sioreq; + ime_d->events = ime_d->queued; + if (ioreq->ddir == DDIR_READ) + ret = ime_native_preadv(ioreq->fd, ime_d->iovecs, ime_d->queued, ioreq->offset); + else + ret = ime_native_pwritev(ioreq->fd, ime_d->iovecs, ime_d->queued, ioreq->offset); + + dprint(FD_IO, "committed %d iovecs\n", ime_d->queued); + + return fio_ime_psyncv_end(td, ret); +} + +static int fio_ime_psyncv_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct ime_data *ime_d = td->io_ops_data; + struct io_u *io_u; + int events = 0; + unsigned int count; + + if (ime_d->events) { + for (count = 0; count < ime_d->events; count++) { + io_u = ime_d->io_us[count]; + ime_d->event_io_us[events] = io_u; + events++; + } + fio_ime_queue_reset(ime_d); + } + + dprint(FD_IO, "getevents(%u,%u) ret=%d queued=%u events=%u\n", + min, max, events, ime_d->queued, 
ime_d->events); + return events; +} + +static int fio_ime_psyncv_init(struct thread_data *td) +{ + struct ime_data *ime_d; + + if (fio_ime_engine_init(td) < 0) + return 1; + + ime_d = calloc(1, sizeof(*ime_d)); + + ime_d->sioreq = malloc(sizeof(struct imesio_req)); + ime_d->iovecs = malloc(td->o.iodepth * sizeof(struct iovec)); + ime_d->io_us = malloc(2 * td->o.iodepth * sizeof(struct io_u *)); + ime_d->event_io_us = ime_d->io_us + td->o.iodepth; + + ime_d->depth = td->o.iodepth; + + td->io_ops_data = ime_d; + return 0; +} + +static void fio_ime_psyncv_clean(struct thread_data *td) +{ + struct ime_data *ime_d = td->io_ops_data; + + if (ime_d) { + free(ime_d->sioreq); + free(ime_d->iovecs); + free(ime_d->io_us); + free(ime_d); + td->io_ops_data = NULL; + } + + fio_ime_engine_finalize(td); +} + + +/************************************************************** + * Private functions for non-blocking IOs + * + **************************************************************/ + +void fio_ime_aio_complete_cb (struct ime_aiocb *aiocb, int err, + ssize_t bytes) +{ + struct imeaio_req *ioreq = (struct imeaio_req *) aiocb->user_context; + + pthread_mutex_lock(&ioreq->status_mutex); + ioreq->status = err == 0 ? bytes : FIO_IME_REQ_ERROR; + pthread_mutex_unlock(&ioreq->status_mutex); + + pthread_cond_signal(&ioreq->cond_endio); +} + +static bool fio_ime_aio_can_queue (struct ime_data *ime_d, struct io_u *io_u) +{ + /* So far we can queue in any case. 
*/ + return true; +} +static bool fio_ime_aio_can_append (struct ime_data *ime_d, struct io_u *io_u) +{ + /* We can only append if: + - The iovecs will be contiguous in the array + - There is already a queued iovec + - The offsets are contiguous + - The ddir and fs are the same */ + return (ime_d->head != 0 && + ime_d->queued - ime_d->events > 0 && + ime_d->last_offset == io_u->offset && + ime_d->last_req->ddir == io_u->ddir && + ime_d->last_req->iocb.fd == io_u->file->fd); +} + +/* Before using this function, we should have already + ensured that the queue is not full */ +static void fio_ime_aio_enqueue(struct ime_data *ime_d, struct io_u *io_u) +{ + struct imeaio_req *ioreq = &ime_d->aioreqs[ime_d->head]; + struct ime_aiocb *iocb = &ioreq->iocb; + struct iovec *iov = &ime_d->iovecs[ime_d->head]; + + iov->iov_base = io_u->xfer_buf; + iov->iov_len = io_u->xfer_buflen; + + if (fio_ime_aio_can_append(ime_d, io_u)) + ime_d->last_req->iocb.iovcnt++; + else { + ioreq->status = FIO_IME_IN_PROGRESS; + ioreq->ddir = io_u->ddir; + ime_d->last_req = ioreq; + + iocb->complete_cb = &fio_ime_aio_complete_cb; + iocb->fd = io_u->file->fd; + iocb->file_offset = io_u->offset; + iocb->iov = iov; + iocb->iovcnt = 1; + iocb->flags = 0; + iocb->user_context = (intptr_t) ioreq; + } + + ime_d->io_us[ime_d->head] = io_u; + ime_d->last_offset = io_u->offset + io_u->xfer_buflen; + fio_ime_queue_incr(ime_d); +} + +/* Tries to queue an IO. It will create a new request if the IO can't be + appended to the current request. It will fail if the queue can't contain + any more io_u/iovec. In this case, commit and then get_events need to be + called. 
*/ +static enum fio_q_status fio_ime_aio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct ime_data *ime_d = td->io_ops_data; + + fio_ro_check(td, io_u); + + dprint(FD_IO, "queue: ddir=%d at %u commit=%u queued=%u events=%u\n", + io_u->ddir, ime_d->head, ime_d->cur_commit, + ime_d->queued, ime_d->events); + + if (ime_d->queued == ime_d->depth) + return FIO_Q_BUSY; + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + if (!fio_ime_aio_can_queue(ime_d, io_u)) + return FIO_Q_BUSY; + + fio_ime_aio_enqueue(ime_d, io_u); + return FIO_Q_QUEUED; + } else if (io_u->ddir == DDIR_SYNC) { + if (ime_native_fsync(io_u->file->fd) < 0) { + io_u->error = errno; + td_verror(td, io_u->error, "fsync"); + } + return FIO_Q_COMPLETED; + } else { + io_u->error = EINVAL; + td_verror(td, io_u->error, "wrong ddir"); + return FIO_Q_COMPLETED; + } +} + +static int fio_ime_aio_commit(struct thread_data *td) +{ + struct ime_data *ime_d = td->io_ops_data; + struct imeaio_req *ioreq; + int ret = 0; + + /* Loop while there are events to commit */ + while (ime_d->queued - ime_d->events) { + ioreq = &ime_d->aioreqs[ime_d->cur_commit]; + if (ioreq->ddir == DDIR_READ) + ret = ime_native_aio_read(&ioreq->iocb); + else + ret = ime_native_aio_write(&ioreq->iocb); + + fio_ime_queue_commit(ime_d, ioreq->iocb.iovcnt); + + /* fio needs a negative error code */ + if (ret < 0) { + ioreq->status = FIO_IME_REQ_ERROR; + return -errno; + } + + io_u_mark_submit(td, ioreq->iocb.iovcnt); + dprint(FD_IO, "committed %d iovecs commit=%u queued=%u events=%u\n", + ioreq->iocb.iovcnt, ime_d->cur_commit, + ime_d->queued, ime_d->events); + } + + return 0; +} + +static int fio_ime_aio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct ime_data *ime_d = td->io_ops_data; + struct imeaio_req *ioreq; + struct io_u *io_u; + int events = 0; + unsigned int count; + ssize_t bytes; + + while (ime_d->events) { + ioreq = &ime_d->aioreqs[ime_d->tail]; + + 
/* Break if we already got events, and if we will + exceed max if we append the next events */ + if (events && events + ioreq->iocb.iovcnt > max) + break; + + if (ioreq->status != FIO_IME_IN_PROGRESS) { + + bytes = ioreq->status; + for (count = 0; count < ioreq->iocb.iovcnt; count++) { + io_u = ime_d->io_us[ime_d->tail]; + ime_d->event_io_us[events] = io_u; + events++; + fio_ime_queue_red(ime_d); + + if (ioreq->status == FIO_IME_REQ_ERROR) + io_u->error = EIO; + else { + io_u->resid = bytes > io_u->xfer_buflen ? + 0 : io_u->xfer_buflen - bytes; + io_u->error = 0; + bytes -= io_u->xfer_buflen - io_u->resid; + } + } + } else { + pthread_mutex_lock(&ioreq->status_mutex); + while (ioreq->status == FIO_IME_IN_PROGRESS) + pthread_cond_wait(&ioreq->cond_endio, &ioreq->status_mutex); + pthread_mutex_unlock(&ioreq->status_mutex); + } + + } + + dprint(FD_IO, "getevents(%u,%u) ret=%d queued=%u events=%u\n", min, max, + events, ime_d->queued, ime_d->events); + return events; +} + +static int fio_ime_aio_init(struct thread_data *td) +{ + struct ime_data *ime_d; + struct imeaio_req *ioreq; + unsigned int i; + + if (fio_ime_engine_init(td) < 0) + return 1; + + ime_d = calloc(1, sizeof(*ime_d)); + + ime_d->aioreqs = malloc(td->o.iodepth * sizeof(struct imeaio_req)); + ime_d->iovecs = malloc(td->o.iodepth * sizeof(struct iovec)); + ime_d->io_us = malloc(2 * td->o.iodepth * sizeof(struct io_u *)); + ime_d->event_io_us = ime_d->io_us + td->o.iodepth; + + ime_d->depth = td->o.iodepth; + for (i = 0; i < ime_d->depth; i++) { + ioreq = &ime_d->aioreqs[i]; + pthread_cond_init(&ioreq->cond_endio, NULL); + pthread_mutex_init(&ioreq->status_mutex, NULL); + } + + td->io_ops_data = ime_d; + return 0; +} + +static void fio_ime_aio_clean(struct thread_data *td) +{ + struct ime_data *ime_d = td->io_ops_data; + struct imeaio_req *ioreq; + unsigned int i; + + if (ime_d) { + for (i = 0; i < ime_d->depth; i++) { + ioreq = &ime_d->aioreqs[i]; + pthread_cond_destroy(&ioreq->cond_endio); + 
pthread_mutex_destroy(&ioreq->status_mutex); + } + free(ime_d->aioreqs); + free(ime_d->iovecs); + free(ime_d->io_us); + free(ime_d); + td->io_ops_data = NULL; + } + + fio_ime_engine_finalize(td); +} + + +/************************************************************** + * IO engines definitions + * + **************************************************************/ + +/* The FIO_DISKLESSIO flag used for these engines is necessary to prevent + FIO from using POSIX calls. See fio_ime_open_file for more details. */ + +static struct ioengine_ops ioengine_prw = { + .name = "ime_psync", + .version = FIO_IOOPS_VERSION, + .setup = fio_ime_setup, + .init = fio_ime_engine_init, + .cleanup = fio_ime_engine_finalize, + .queue = fio_ime_psync_queue, + .open_file = fio_ime_open_file, + .close_file = fio_ime_close_file, + .get_file_size = fio_ime_get_file_size, + .unlink_file = fio_ime_unlink_file, + .flags = FIO_SYNCIO | FIO_DISKLESSIO, +}; + +static struct ioengine_ops ioengine_pvrw = { + .name = "ime_psyncv", + .version = FIO_IOOPS_VERSION, + .setup = fio_ime_setup, + .init = fio_ime_psyncv_init, + .cleanup = fio_ime_psyncv_clean, + .queue = fio_ime_psyncv_queue, + .commit = fio_ime_psyncv_commit, + .getevents = fio_ime_psyncv_getevents, + .event = fio_ime_event, + .open_file = fio_ime_open_file, + .close_file = fio_ime_close_file, + .get_file_size = fio_ime_get_file_size, + .unlink_file = fio_ime_unlink_file, + .flags = FIO_SYNCIO | FIO_DISKLESSIO, +}; + +static struct ioengine_ops ioengine_aio = { + .name = "ime_aio", + .version = FIO_IOOPS_VERSION, + .setup = fio_ime_setup, + .init = fio_ime_aio_init, + .cleanup = fio_ime_aio_clean, + .queue = fio_ime_aio_queue, + .commit = fio_ime_aio_commit, + .getevents = fio_ime_aio_getevents, + .event = fio_ime_event, + .open_file = fio_ime_open_file, + .close_file = fio_ime_close_file, + .get_file_size = fio_ime_get_file_size, + .unlink_file = fio_ime_unlink_file, + .flags = FIO_DISKLESSIO, +}; + +static void fio_init 
fio_ime_register(void) +{ + register_ioengine(&ioengine_prw); + register_ioengine(&ioengine_pvrw); + register_ioengine(&ioengine_aio); +} + +static void fio_exit fio_ime_unregister(void) +{ + unregister_ioengine(&ioengine_prw); + unregister_ioengine(&ioengine_pvrw); + unregister_ioengine(&ioengine_aio); + + if (fio_ime_is_initialized && ime_native_finalize() < 0) + log_err("Warning: IME did not finalize properly\n"); +} diff -Nru fio-2.1.3/engines/io_uring.c fio-3.16/engines/io_uring.c --- fio-2.1.3/engines/io_uring.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/io_uring.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,671 @@ +/* + * io_uring engine + * + * IO engine using the new native Linux aio io_uring interface. See: + * + * http://git.kernel.dk/cgit/linux-block/log/?h=io_uring + * + */ +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../lib/pow2.h" +#include "../optgroup.h" +#include "../lib/memalign.h" +#include "../lib/fls.h" + +#ifdef ARCH_HAVE_IOURING + +#include "../lib/types.h" +#include "../os/linux/io_uring.h" + +struct io_sq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + unsigned *flags; + unsigned *array; +}; + +struct io_cq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + struct io_uring_cqe *cqes; +}; + +struct ioring_mmap { + void *ptr; + size_t len; +}; + +struct ioring_data { + int ring_fd; + + struct io_u **io_u_index; + + int *fds; + + struct io_sq_ring sq_ring; + struct io_uring_sqe *sqes; + struct iovec *iovecs; + unsigned sq_ring_mask; + + struct io_cq_ring cq_ring; + unsigned cq_ring_mask; + + int queued; + int cq_ring_off; + unsigned iodepth; + + struct ioring_mmap mmap[3]; +}; + +struct ioring_options { + void *pad; + unsigned int hipri; + unsigned int fixedbufs; + unsigned int registerfiles; + unsigned int sqpoll_thread; + unsigned int sqpoll_set; + unsigned int sqpoll_cpu; +}; + +static int 
fio_ioring_sqpoll_cb(void *data, unsigned long long *val) +{ + struct ioring_options *o = data; + + o->sqpoll_cpu = *val; + o->sqpoll_set = 1; + return 0; +} + +static struct fio_option options[] = { + { + .name = "hipri", + .lname = "High Priority", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct ioring_options, hipri), + .help = "Use polled IO completions", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "fixedbufs", + .lname = "Fixed (pre-mapped) IO buffers", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct ioring_options, fixedbufs), + .help = "Pre map IO buffers", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "registerfiles", + .lname = "Register file set", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct ioring_options, registerfiles), + .help = "Pre-open/register files", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "sqthread_poll", + .lname = "Kernel SQ thread polling", + .type = FIO_OPT_INT, + .off1 = offsetof(struct ioring_options, sqpoll_thread), + .help = "Offload submission/completion to kernel thread", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "sqthread_poll_cpu", + .lname = "SQ Thread Poll CPU", + .type = FIO_OPT_INT, + .cb = fio_ioring_sqpoll_cb, + .help = "What CPU to run SQ thread polling on", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = NULL, + }, +}; + +static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit, + unsigned int min_complete, unsigned int flags) +{ + return syscall(__NR_sys_io_uring_enter, ld->ring_fd, to_submit, + min_complete, flags, NULL, 0); +} + +static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + struct fio_file *f = io_u->file; + struct io_uring_sqe *sqe; + + sqe = &ld->sqes[io_u->index]; + + /* zero out fields not used 
in this submission */ + memset(sqe, 0, sizeof(*sqe)); + + if (o->registerfiles) { + sqe->fd = f->engine_pos; + sqe->flags = IOSQE_FIXED_FILE; + } else { + sqe->fd = f->fd; + } + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + if (o->fixedbufs) { + if (io_u->ddir == DDIR_READ) + sqe->opcode = IORING_OP_READ_FIXED; + else + sqe->opcode = IORING_OP_WRITE_FIXED; + sqe->addr = (unsigned long) io_u->xfer_buf; + sqe->len = io_u->xfer_buflen; + sqe->buf_index = io_u->index; + } else { + if (io_u->ddir == DDIR_READ) + sqe->opcode = IORING_OP_READV; + else + sqe->opcode = IORING_OP_WRITEV; + sqe->addr = (unsigned long) &ld->iovecs[io_u->index]; + sqe->len = 1; + } + sqe->off = io_u->offset; + } else if (ddir_sync(io_u->ddir)) { + if (io_u->ddir == DDIR_SYNC_FILE_RANGE) { + sqe->off = f->first_write; + sqe->len = f->last_write - f->first_write; + sqe->sync_range_flags = td->o.sync_file_range; + sqe->opcode = IORING_OP_SYNC_FILE_RANGE; + } else { + if (io_u->ddir == DDIR_DATASYNC) + sqe->fsync_flags |= IORING_FSYNC_DATASYNC; + sqe->opcode = IORING_OP_FSYNC; + } + } + + sqe->user_data = (unsigned long) io_u; + return 0; +} + +static struct io_u *fio_ioring_event(struct thread_data *td, int event) +{ + struct ioring_data *ld = td->io_ops_data; + struct io_uring_cqe *cqe; + struct io_u *io_u; + unsigned index; + + index = (event + ld->cq_ring_off) & ld->cq_ring_mask; + + cqe = &ld->cq_ring.cqes[index]; + io_u = (struct io_u *) (uintptr_t) cqe->user_data; + + if (cqe->res != io_u->xfer_buflen) { + if (cqe->res > io_u->xfer_buflen) + io_u->error = -cqe->res; + else + io_u->resid = io_u->xfer_buflen - cqe->res; + } else + io_u->error = 0; + + return io_u; +} + +static int fio_ioring_cqring_reap(struct thread_data *td, unsigned int events, + unsigned int max) +{ + struct ioring_data *ld = td->io_ops_data; + struct io_cq_ring *ring = &ld->cq_ring; + unsigned head, reaped = 0; + + head = *ring->head; + do { + read_barrier(); + if (head == *ring->tail) + break; + 
reaped++; + head++; + } while (reaped + events < max); + + *ring->head = head; + write_barrier(); + return reaped; +} + +static int fio_ioring_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct ioring_data *ld = td->io_ops_data; + unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min; + struct ioring_options *o = td->eo; + struct io_cq_ring *ring = &ld->cq_ring; + unsigned events = 0; + int r; + + ld->cq_ring_off = *ring->head; + do { + r = fio_ioring_cqring_reap(td, events, max); + if (r) { + events += r; + if (actual_min != 0) + actual_min -= r; + continue; + } + + if (!o->sqpoll_thread) { + r = io_uring_enter(ld, 0, actual_min, + IORING_ENTER_GETEVENTS); + if (r < 0) { + if (errno == EAGAIN || errno == EINTR) + continue; + td_verror(td, errno, "io_uring_enter"); + break; + } + } + } while (events < min); + + return r < 0 ? r : events; +} + +static enum fio_q_status fio_ioring_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + struct io_sq_ring *ring = &ld->sq_ring; + unsigned tail, next_tail; + + fio_ro_check(td, io_u); + + if (ld->queued == ld->iodepth) + return FIO_Q_BUSY; + + if (io_u->ddir == DDIR_TRIM) { + if (ld->queued) + return FIO_Q_BUSY; + + do_io_u_trim(td, io_u); + io_u_mark_submit(td, 1); + io_u_mark_complete(td, 1); + return FIO_Q_COMPLETED; + } + + tail = *ring->tail; + next_tail = tail + 1; + read_barrier(); + if (next_tail == *ring->head) + return FIO_Q_BUSY; + + /* ensure sqe stores are ordered with tail update */ + write_barrier(); + ring->array[tail & ld->sq_ring_mask] = io_u->index; + *ring->tail = next_tail; + write_barrier(); + + ld->queued++; + return FIO_Q_QUEUED; +} + +static void fio_ioring_queued(struct thread_data *td, int start, int nr) +{ + struct ioring_data *ld = td->io_ops_data; + struct timespec now; + + if (!fio_fill_issue_time(td)) + return; + + fio_gettime(&now, NULL); + + while (nr--) { + struct 
io_sq_ring *ring = &ld->sq_ring; + int index = ring->array[start & ld->sq_ring_mask]; + struct io_u *io_u = ld->io_u_index[index]; + + memcpy(&io_u->issue_time, &now, sizeof(now)); + io_u_queued(td, io_u); + + start++; + } +} + +static int fio_ioring_commit(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + int ret; + + if (!ld->queued) + return 0; + + /* + * Kernel side does submission. just need to check if the ring is + * flagged as needing a kick, if so, call io_uring_enter(). This + * only happens if we've been idle too long. + */ + if (o->sqpoll_thread) { + struct io_sq_ring *ring = &ld->sq_ring; + + read_barrier(); + if (*ring->flags & IORING_SQ_NEED_WAKEUP) + io_uring_enter(ld, ld->queued, 0, + IORING_ENTER_SQ_WAKEUP); + ld->queued = 0; + return 0; + } + + do { + unsigned start = *ld->sq_ring.head; + long nr = ld->queued; + + ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS); + if (ret > 0) { + fio_ioring_queued(td, start, ret); + io_u_mark_submit(td, ret); + + ld->queued -= ret; + ret = 0; + } else if (!ret) { + io_u_mark_submit(td, ret); + continue; + } else { + if (errno == EAGAIN || errno == EINTR) { + ret = fio_ioring_cqring_reap(td, 0, ld->queued); + if (ret) + continue; + /* Shouldn't happen */ + usleep(1); + continue; + } + td_verror(td, errno, "io_uring_enter submit"); + break; + } + } while (ld->queued); + + return ret; +} + +static void fio_ioring_unmap(struct ioring_data *ld) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ld->mmap); i++) + munmap(ld->mmap[i].ptr, ld->mmap[i].len); + close(ld->ring_fd); +} + +static void fio_ioring_cleanup(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + + if (ld) { + if (!(td->flags & TD_F_CHILD)) + fio_ioring_unmap(ld); + + free(ld->io_u_index); + free(ld->iovecs); + free(ld->fds); + free(ld); + } +} + +static int fio_ioring_mmap(struct ioring_data *ld, struct io_uring_params *p) +{ + struct io_sq_ring *sring = &ld->sq_ring; + 
struct io_cq_ring *cring = &ld->cq_ring; + void *ptr; + + ld->mmap[0].len = p->sq_off.array + p->sq_entries * sizeof(__u32); + ptr = mmap(0, ld->mmap[0].len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, ld->ring_fd, + IORING_OFF_SQ_RING); + ld->mmap[0].ptr = ptr; + sring->head = ptr + p->sq_off.head; + sring->tail = ptr + p->sq_off.tail; + sring->ring_mask = ptr + p->sq_off.ring_mask; + sring->ring_entries = ptr + p->sq_off.ring_entries; + sring->flags = ptr + p->sq_off.flags; + sring->array = ptr + p->sq_off.array; + ld->sq_ring_mask = *sring->ring_mask; + + ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe); + ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, ld->ring_fd, + IORING_OFF_SQES); + ld->mmap[1].ptr = ld->sqes; + + ld->mmap[2].len = p->cq_off.cqes + + p->cq_entries * sizeof(struct io_uring_cqe); + ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, ld->ring_fd, + IORING_OFF_CQ_RING); + ld->mmap[2].ptr = ptr; + cring->head = ptr + p->cq_off.head; + cring->tail = ptr + p->cq_off.tail; + cring->ring_mask = ptr + p->cq_off.ring_mask; + cring->ring_entries = ptr + p->cq_off.ring_entries; + cring->cqes = ptr + p->cq_off.cqes; + ld->cq_ring_mask = *cring->ring_mask; + return 0; +} + +static int fio_ioring_queue_init(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + int depth = td->o.iodepth; + struct io_uring_params p; + int ret; + + memset(&p, 0, sizeof(p)); + + if (o->hipri) + p.flags |= IORING_SETUP_IOPOLL; + if (o->sqpoll_thread) { + p.flags |= IORING_SETUP_SQPOLL; + if (o->sqpoll_set) { + p.flags |= IORING_SETUP_SQ_AFF; + p.sq_thread_cpu = o->sqpoll_cpu; + } + } + + ret = syscall(__NR_sys_io_uring_setup, depth, &p); + if (ret < 0) + return ret; + + ld->ring_fd = ret; + + if (o->fixedbufs) { + struct rlimit rlim = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + if 
(setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) + return -1; + + ret = syscall(__NR_sys_io_uring_register, ld->ring_fd, + IORING_REGISTER_BUFFERS, ld->iovecs, depth); + if (ret < 0) + return ret; + } + + return fio_ioring_mmap(ld, &p); +} + +static int fio_ioring_register_files(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct fio_file *f; + unsigned int i; + int ret; + + ld->fds = calloc(td->o.nr_files, sizeof(int)); + + for_each_file(td, f, i) { + ret = generic_open_file(td, f); + if (ret) + goto err; + ld->fds[i] = f->fd; + f->engine_pos = i; + } + + ret = syscall(__NR_sys_io_uring_register, ld->ring_fd, + IORING_REGISTER_FILES, ld->fds, td->o.nr_files); + if (ret) { +err: + free(ld->fds); + ld->fds = NULL; + } + + /* + * Pretend the file is closed again, and really close it if we hit + * an error. + */ + for_each_file(td, f, i) { + if (ret) { + int fio_unused ret2; + ret2 = generic_close_file(td, f); + } else + f->fd = -1; + } + + return ret; +} + +static int fio_ioring_post_init(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + struct io_u *io_u; + int err, i; + + for (i = 0; i < td->o.iodepth; i++) { + struct iovec *iov = &ld->iovecs[i]; + + io_u = ld->io_u_index[i]; + iov->iov_base = io_u->buf; + iov->iov_len = td_max_bs(td); + } + + err = fio_ioring_queue_init(td); + if (err) { + td_verror(td, errno, "io_queue_init"); + return 1; + } + + if (o->registerfiles) { + err = fio_ioring_register_files(td); + if (err) { + td_verror(td, errno, "ioring_register_files"); + return 1; + } + } + + return 0; +} + +static unsigned roundup_pow2(unsigned depth) +{ + return 1UL << __fls(depth - 1); +} + +static int fio_ioring_init(struct thread_data *td) +{ + struct ioring_options *o = td->eo; + struct ioring_data *ld; + + /* sqthread submission requires registered files */ + if (o->sqpoll_thread) + o->registerfiles = 1; + + if (o->registerfiles && td->o.nr_files != td->o.open_files) { + 
log_err("fio: io_uring registered files require nr_files to " + "be identical to open_files\n"); + return 1; + } + + ld = calloc(1, sizeof(*ld)); + + /* ring depth must be a power-of-2 */ + ld->iodepth = td->o.iodepth; + td->o.iodepth = roundup_pow2(td->o.iodepth); + + /* io_u index */ + ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *)); + ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec)); + + td->io_ops_data = ld; + return 0; +} + +static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + + ld->io_u_index[io_u->index] = io_u; + return 0; +} + +static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + + if (!ld || !o->registerfiles) + return generic_open_file(td, f); + + f->fd = ld->fds[f->engine_pos]; + return 0; +} + +static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + + if (!ld || !o->registerfiles) + return generic_close_file(td, f); + + f->fd = -1; + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "io_uring", + .version = FIO_IOOPS_VERSION, + .flags = FIO_ASYNCIO_SYNC_TRIM, + .init = fio_ioring_init, + .post_init = fio_ioring_post_init, + .io_u_init = fio_ioring_io_u_init, + .prep = fio_ioring_prep, + .queue = fio_ioring_queue, + .commit = fio_ioring_commit, + .getevents = fio_ioring_getevents, + .event = fio_ioring_event, + .cleanup = fio_ioring_cleanup, + .open_file = fio_ioring_open_file, + .close_file = fio_ioring_close_file, + .get_file_size = generic_get_file_size, + .options = options, + .option_struct_size = sizeof(struct ioring_options), +}; + +static void fio_init fio_ioring_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_ioring_unregister(void) +{ + unregister_ioengine(&ioengine); +} +#endif diff -Nru 
fio-2.1.3/engines/libaio.c fio-3.16/engines/libaio.c --- fio-2.1.3/engines/libaio.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/libaio.c 2019-09-20 01:01:52.000000000 +0000 @@ -4,26 +4,47 @@ * IO engine using the Linux native aio interface. * */ -#include #include #include #include -#include #include +#include +#include #include "../fio.h" +#include "../lib/pow2.h" +#include "../optgroup.h" +#include "../lib/memalign.h" + +static int fio_libaio_commit(struct thread_data *td); struct libaio_data { io_context_t aio_ctx; struct io_event *aio_events; struct iocb **iocbs; struct io_u **io_us; - int iocbs_nr; + + struct io_u **io_u_index; + + /* + * Basic ring buffer. 'head' is incremented in _queue(), and + * 'tail' is incremented in _commit(). We keep 'queued' so + * that we know if the ring is full or empty, when + * 'head' == 'tail'. 'entries' is the ring size, and + * 'is_pow2' is just an optimization to use AND instead of + * modulus to get the remainder on ring increment. + */ + int is_pow2; + unsigned int entries; + unsigned int queued; + unsigned int head; + unsigned int tail; }; struct libaio_options { - struct thread_data *td; + void *pad; unsigned int userspace_reap; + unsigned int hipri; }; static struct fio_option options[] = { @@ -41,23 +62,33 @@ }, }; +static inline void ring_inc(struct libaio_data *ld, unsigned int *val, + unsigned int add) +{ + if (ld->is_pow2) + *val = (*val + add) & (ld->entries - 1); + else + *val = (*val + add) % ld->entries; +} + static int fio_libaio_prep(struct thread_data fio_unused *td, struct io_u *io_u) { struct fio_file *f = io_u->file; + struct iocb *iocb = &io_u->iocb; - if (io_u->ddir == DDIR_READ) - io_prep_pread(&io_u->iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); - else if (io_u->ddir == DDIR_WRITE) - io_prep_pwrite(&io_u->iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); - else if (ddir_sync(io_u->ddir)) - io_prep_fsync(&io_u->iocb, f->fd); + if (io_u->ddir == DDIR_READ) { + 
io_prep_pread(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + } else if (io_u->ddir == DDIR_WRITE) { + io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + } else if (ddir_sync(io_u->ddir)) + io_prep_fsync(iocb, f->fd); return 0; } static struct io_u *fio_libaio_event(struct thread_data *td, int event) { - struct libaio_data *ld = td->io_ops->data; + struct libaio_data *ld = td->io_ops_data; struct io_event *ev; struct io_u *io_u; @@ -117,13 +148,19 @@ } static int fio_libaio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec *t) + unsigned int max, const struct timespec *t) { - struct libaio_data *ld = td->io_ops->data; + struct libaio_data *ld = td->io_ops_data; struct libaio_options *o = td->eo; - unsigned actual_min = td->o.iodepth_batch_complete == 0 ? 0 : min; + unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min; + struct timespec __lt, *lt = NULL; int r, events = 0; + if (t) { + __lt = *t; + lt = &__lt; + } + do { if (o->userspace_reap == 1 && actual_min == 0 @@ -133,24 +170,29 @@ ld->aio_events + events); } else { r = io_getevents(ld->aio_ctx, actual_min, - max, ld->aio_events + events, t); + max, ld->aio_events + events, lt); } - if (r >= 0) + if (r > 0) events += r; - else if (r == -EAGAIN) - usleep(100); + else if ((min && r == 0) || r == -EAGAIN) { + fio_libaio_commit(td); + if (actual_min) + usleep(10); + } else if (r != -EINTR) + break; } while (events < min); return r < 0 ? r : events; } -static int fio_libaio_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_libaio_queue(struct thread_data *td, + struct io_u *io_u) { - struct libaio_data *ld = td->io_ops->data; + struct libaio_data *ld = td->io_ops_data; fio_ro_check(td, io_u); - if (ld->iocbs_nr == (int) td->o.iodepth) + if (ld->queued == td->o.iodepth) return FIO_Q_BUSY; /* @@ -160,7 +202,7 @@ * have pending io, to let fio complete those first. 
*/ if (ddir_sync(io_u->ddir)) { - if (ld->iocbs_nr) + if (ld->queued) return FIO_Q_BUSY; do_io_u_sync(td, io_u); @@ -168,23 +210,26 @@ } if (io_u->ddir == DDIR_TRIM) { - if (ld->iocbs_nr) + if (ld->queued) return FIO_Q_BUSY; do_io_u_trim(td, io_u); + io_u_mark_submit(td, 1); + io_u_mark_complete(td, 1); return FIO_Q_COMPLETED; } - ld->iocbs[ld->iocbs_nr] = &io_u->iocb; - ld->io_us[ld->iocbs_nr] = io_u; - ld->iocbs_nr++; + ld->iocbs[ld->head] = &io_u->iocb; + ld->io_us[ld->head] = io_u; + ring_inc(ld, &ld->head, 1); + ld->queued++; return FIO_Q_QUEUED; } static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us, unsigned int nr) { - struct timeval now; + struct timespec now; unsigned int i; if (!fio_fill_issue_time(td)) @@ -202,49 +247,94 @@ static int fio_libaio_commit(struct thread_data *td) { - struct libaio_data *ld = td->io_ops->data; + struct libaio_data *ld = td->io_ops_data; struct iocb **iocbs; struct io_u **io_us; - int ret; + struct timespec ts; + int ret, wait_start = 0; - if (!ld->iocbs_nr) + if (!ld->queued) return 0; - io_us = ld->io_us; - iocbs = ld->iocbs; do { - ret = io_submit(ld->aio_ctx, ld->iocbs_nr, iocbs); + long nr = ld->queued; + + nr = min((unsigned int) nr, ld->entries - ld->tail); + io_us = ld->io_us + ld->tail; + iocbs = ld->iocbs + ld->tail; + + ret = io_submit(ld->aio_ctx, nr, iocbs); if (ret > 0) { fio_libaio_queued(td, io_us, ret); io_u_mark_submit(td, ret); - ld->iocbs_nr -= ret; - io_us += ret; - iocbs += ret; + + ld->queued -= ret; + ring_inc(ld, &ld->tail, ret); ret = 0; - } else if (!ret || ret == -EAGAIN || ret == -EINTR) { + wait_start = 0; + } else if (ret == -EINTR || !ret) { if (!ret) io_u_mark_submit(td, ret); + wait_start = 0; + continue; + } else if (ret == -EAGAIN) { + /* + * If we get EAGAIN, we should break out without + * error and let the upper layer reap some + * events for us. If we have no queued IO, we + * must loop here. 
If we loop for more than 30s, + * just error out, something must be buggy in the + * IO path. + */ + if (ld->queued) { + ret = 0; + break; + } + if (!wait_start) { + fio_gettime(&ts, NULL); + wait_start = 1; + } else if (mtime_since_now(&ts) > 30000) { + log_err("fio: aio appears to be stalled, giving up\n"); + break; + } + usleep(1); continue; + } else if (ret == -ENOMEM) { + /* + * If we get -ENOMEM, reap events if we can. If + * we cannot, treat it as a fatal event since there's + * nothing we can do about it. + */ + if (ld->queued) + ret = 0; + break; } else break; - } while (ld->iocbs_nr); + } while (ld->queued); return ret; } static int fio_libaio_cancel(struct thread_data *td, struct io_u *io_u) { - struct libaio_data *ld = td->io_ops->data; + struct libaio_data *ld = td->io_ops_data; return io_cancel(ld->aio_ctx, &io_u->iocb, ld->aio_events); } static void fio_libaio_cleanup(struct thread_data *td) { - struct libaio_data *ld = td->io_ops->data; + struct libaio_data *ld = td->io_ops_data; if (ld) { - io_destroy(ld->aio_ctx); + /* + * Work-around to avoid huge RCU stalls at exit time. If we + * don't do this here, then it'll be torn down by exit_aio(). + * But for that case we can parallellize the freeing, thus + * speeding it up a lot. + */ + if (!(td->flags & TD_F_CHILD)) + io_destroy(ld->aio_ctx); free(ld->aio_events); free(ld->iocbs); free(ld->io_us); @@ -252,46 +342,42 @@ } } -static int fio_libaio_init(struct thread_data *td) +static int fio_libaio_post_init(struct thread_data *td) { - struct libaio_data *ld = malloc(sizeof(*ld)); - struct libaio_options *o = td->eo; - int err = 0; - - memset(ld, 0, sizeof(*ld)); + struct libaio_data *ld = td->io_ops_data; + int err; - /* - * First try passing in 0 for queue depth, since we don't - * care about the user ring. If that fails, the kernel is too old - * and we need the right depth. 
- */ - if (!o->userspace_reap) - err = io_queue_init(INT_MAX, &ld->aio_ctx); - if (o->userspace_reap || err == -EINVAL) - err = io_queue_init(td->o.iodepth, &ld->aio_ctx); + err = io_queue_init(td->o.iodepth, &ld->aio_ctx); if (err) { td_verror(td, -err, "io_queue_init"); - log_err("fio: check /proc/sys/fs/aio-max-nr\n"); - free(ld); return 1; } - ld->aio_events = malloc(td->o.iodepth * sizeof(struct io_event)); - memset(ld->aio_events, 0, td->o.iodepth * sizeof(struct io_event)); - ld->iocbs = malloc(td->o.iodepth * sizeof(struct iocb *)); - memset(ld->iocbs, 0, sizeof(struct iocb *)); - ld->io_us = malloc(td->o.iodepth * sizeof(struct io_u *)); - memset(ld->io_us, 0, td->o.iodepth * sizeof(struct io_u *)); - ld->iocbs_nr = 0; + return 0; +} + +static int fio_libaio_init(struct thread_data *td) +{ + struct libaio_data *ld; + + ld = calloc(1, sizeof(*ld)); + + ld->entries = td->o.iodepth; + ld->is_pow2 = is_power_of_2(ld->entries); + ld->aio_events = calloc(ld->entries, sizeof(struct io_event)); + ld->iocbs = calloc(ld->entries, sizeof(struct iocb *)); + ld->io_us = calloc(ld->entries, sizeof(struct io_u *)); - td->io_ops->data = ld; + td->io_ops_data = ld; return 0; } static struct ioengine_ops ioengine = { .name = "libaio", .version = FIO_IOOPS_VERSION, + .flags = FIO_ASYNCIO_SYNC_TRIM, .init = fio_libaio_init, + .post_init = fio_libaio_post_init, .prep = fio_libaio_prep, .queue = fio_libaio_queue, .commit = fio_libaio_commit, diff -Nru fio-2.1.3/engines/libhdfs.c fio-3.16/engines/libhdfs.c --- fio-2.1.3/engines/libhdfs.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/libhdfs.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,421 @@ +/* + * libhdfs engine + * + * this engine helps perform read/write operations on hdfs cluster using + * libhdfs. hdfs doesnot support modification of data once file is created. + * + * so to mimic that create many files of small size (e.g 256k), and this + * engine select a file based on the offset generated by fio. 
+ * + * thus, random reads and writes can also be achieved with this logic. + * + */ + +#include +#include + +#include "../fio.h" +#include "../optgroup.h" + +#define CHUNCK_NAME_LENGTH_MAX 80 +#define CHUNCK_CREATION_BUFFER_SIZE 65536 + +struct hdfsio_data { + hdfsFS fs; + hdfsFile fp; + uint64_t curr_file_id; +}; + +struct hdfsio_options { + void *pad; /* needed because offset can't be 0 for a option defined used offsetof */ + char *host; + char *directory; + unsigned int port; + unsigned int chunck_size; + unsigned int single_instance; + unsigned int use_direct; +}; + +static struct fio_option options[] = { + { + .name = "namenode", + .lname = "hfds namenode", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct hdfsio_options, host), + .def = "localhost", + .help = "Namenode of the HDFS cluster", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "hostname", + .lname = "hfds namenode", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct hdfsio_options, host), + .def = "localhost", + .help = "Namenode of the HDFS cluster", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "port", + .lname = "hdfs namenode port", + .type = FIO_OPT_INT, + .off1 = offsetof(struct hdfsio_options, port), + .def = "9000", + .minval = 1, + .maxval = 65535, + .help = "Port used by the HDFS cluster namenode", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "hdfsdirectory", + .lname = "hfds directory", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct hdfsio_options, directory), + .def = "/", + .help = "The HDFS directory where fio will create chuncks", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "chunk_size", + .alias = "chunck_size", + .lname = "Chunk size", + .type = FIO_OPT_INT, + .off1 = offsetof(struct hdfsio_options, chunck_size), + .def = "1048576", + .help = "Size of individual chunck", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { 
+ .name = "single_instance", + .lname = "Single Instance", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct hdfsio_options, single_instance), + .def = "1", + .help = "Use a single instance", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = "hdfs_use_direct", + .lname = "HDFS Use Direct", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct hdfsio_options, use_direct), + .def = "0", + .help = "Use readDirect instead of hdfsRead", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HDFS, + }, + { + .name = NULL, + }, +}; + + +static int get_chunck_name(char *dest, char *file_name, uint64_t chunk_id) { + return snprintf(dest, CHUNCK_NAME_LENGTH_MAX, "%s_%lu", file_name, chunk_id); +} + +static int fio_hdfsio_prep(struct thread_data *td, struct io_u *io_u) +{ + struct hdfsio_options *options = td->eo; + struct hdfsio_data *hd = td->io_ops_data; + unsigned long f_id; + char fname[CHUNCK_NAME_LENGTH_MAX]; + int open_flags; + + /* find out file id based on the offset generated by fio */ + f_id = floor(io_u->offset / options-> chunck_size); + + if (f_id == hd->curr_file_id) { + /* file is already open */ + return 0; + } + + if (hd->curr_file_id != -1) { + if ( hdfsCloseFile(hd->fs, hd->fp) == -1) { + log_err("hdfs: unable to close file: %s\n", strerror(errno)); + return errno; + } + hd->curr_file_id = -1; + } + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_SYNC) { + open_flags = O_RDONLY; + } else if (io_u->ddir == DDIR_WRITE) { + open_flags = O_WRONLY; + } else { + log_err("hdfs: Invalid I/O Operation\n"); + return 0; + } + + get_chunck_name(fname, io_u->file->file_name, f_id); + hd->fp = hdfsOpenFile(hd->fs, fname, open_flags, 0, 0, + options->chunck_size); + if(hd->fp == NULL) { + log_err("hdfs: unable to open file: %s: %d\n", fname, strerror(errno)); + return errno; + } + hd->curr_file_id = f_id; + + return 0; +} + +static enum fio_q_status fio_hdfsio_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct hdfsio_data *hd = 
td->io_ops_data; + struct hdfsio_options *options = td->eo; + int ret; + unsigned long offset; + + offset = io_u->offset % options->chunck_size; + + if( (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) && + hdfsTell(hd->fs, hd->fp) != offset && hdfsSeek(hd->fs, hd->fp, offset) != 0 ) { + log_err("hdfs: seek failed: %s, are you doing random write smaller than chunck size ?\n", strerror(errno)); + io_u->error = errno; + return FIO_Q_COMPLETED; + }; + + // do the IO + if (io_u->ddir == DDIR_READ) { + if (options->use_direct) { + ret = readDirect(hd->fs, hd->fp, io_u->xfer_buf, io_u->xfer_buflen); + } else { + ret = hdfsRead(hd->fs, hd->fp, io_u->xfer_buf, io_u->xfer_buflen); + } + } else if (io_u->ddir == DDIR_WRITE) { + ret = hdfsWrite(hd->fs, hd->fp, io_u->xfer_buf, + io_u->xfer_buflen); + } else if (io_u->ddir == DDIR_SYNC) { + ret = hdfsFlush(hd->fs, hd->fp); + } else { + log_err("hdfs: Invalid I/O Operation: %d\n", io_u->ddir); + ret = EINVAL; + } + + // Check if the IO went fine, or is incomplete + if (ret != (int)io_u->xfer_buflen) { + if (ret >= 0) { + io_u->resid = io_u->xfer_buflen - ret; + io_u->error = 0; + return FIO_Q_COMPLETED; + } else { + io_u->error = errno; + } + } + + if (io_u->error) + td_verror(td, io_u->error, "xfer"); + + return FIO_Q_COMPLETED; +} + +int fio_hdfsio_open_file(struct thread_data *td, struct fio_file *f) +{ + if (td->o.odirect) { + td->error = EINVAL; + return 0; + } + + return 0; +} + +int fio_hdfsio_close_file(struct thread_data *td, struct fio_file *f) +{ + struct hdfsio_data *hd = td->io_ops_data; + + if (hd->curr_file_id != -1) { + if ( hdfsCloseFile(hd->fs, hd->fp) == -1) { + log_err("hdfs: unable to close file: %s\n", strerror(errno)); + return errno; + } + hd->curr_file_id = -1; + } + return 0; +} + +static int fio_hdfsio_init(struct thread_data *td) +{ + struct hdfsio_options *options = td->eo; + struct hdfsio_data *hd = td->io_ops_data; + struct fio_file *f; + uint64_t j,k; + int i, failure = 0; + uint8_t 
buffer[CHUNCK_CREATION_BUFFER_SIZE]; + uint64_t bytes_left; + char fname[CHUNCK_NAME_LENGTH_MAX]; + hdfsFile fp; + hdfsFileInfo *fi; + tOffset fi_size; + + for_each_file(td, f, i) { + k = 0; + for(j=0; j < f->real_file_size; j += options->chunck_size) { + get_chunck_name(fname, f->file_name, k++); + fi = hdfsGetPathInfo(hd->fs, fname); + fi_size = fi ? fi->mSize : 0; + // fill exist and is big enough, nothing to do + if( fi && fi_size >= options->chunck_size) { + continue; + } + fp = hdfsOpenFile(hd->fs, fname, O_WRONLY, 0, 0, + options->chunck_size); + if(fp == NULL) { + failure = errno; + log_err("hdfs: unable to prepare file chunk %s: %s\n", fname, strerror(errno)); + break; + } + bytes_left = options->chunck_size; + memset(buffer, 0, CHUNCK_CREATION_BUFFER_SIZE); + while( bytes_left > CHUNCK_CREATION_BUFFER_SIZE) { + if( hdfsWrite(hd->fs, fp, buffer, CHUNCK_CREATION_BUFFER_SIZE) + != CHUNCK_CREATION_BUFFER_SIZE) { + failure = errno; + log_err("hdfs: unable to prepare file chunk %s: %s\n", fname, strerror(errno)); + break; + }; + bytes_left -= CHUNCK_CREATION_BUFFER_SIZE; + } + if(bytes_left > 0) { + if( hdfsWrite(hd->fs, fp, buffer, bytes_left) + != bytes_left) { + failure = errno; + break; + }; + } + if( hdfsCloseFile(hd->fs, fp) != 0) { + failure = errno; + log_err("hdfs: unable to prepare file chunk %s: %s\n", fname, strerror(errno)); + break; + } + } + if(failure) { + break; + } + } + + if( !failure ) { + fio_file_set_size_known(f); + } + + return failure; +} + +static int fio_hdfsio_setup(struct thread_data *td) +{ + struct hdfsio_data *hd; + struct fio_file *f; + int i; + uint64_t file_size, total_file_size; + + if (!td->io_ops_data) { + hd = malloc(sizeof(*hd)); + memset(hd, 0, sizeof(*hd)); + + hd->curr_file_id = -1; + + td->io_ops_data = hd; + } + + total_file_size = 0; + file_size = 0; + + for_each_file(td, f, i) { + if(!td->o.file_size_low) { + file_size = floor(td->o.size / td->o.nr_files); + total_file_size += file_size; + } + else if 
(td->o.file_size_low == td->o.file_size_high) + file_size = td->o.file_size_low; + else { + file_size = get_rand_file_size(td); + } + f->real_file_size = file_size; + } + /* If the size doesn't divide nicely with the chunck size, + * make the last files bigger. + * Used only if filesize was not explicitely given + */ + if (!td->o.file_size_low && total_file_size < td->o.size) { + f->real_file_size += (td->o.size - total_file_size); + } + + return 0; +} + +static int fio_hdfsio_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct hdfsio_data *hd = td->io_ops_data; + struct hdfsio_options *options = td->eo; + int failure; + struct hdfsBuilder *bld; + + if (options->host == NULL || options->port == 0) { + log_err("hdfs: server not defined\n"); + return EINVAL; + } + + bld = hdfsNewBuilder(); + if (!bld) { + failure = errno; + log_err("hdfs: unable to allocate connect builder\n"); + return failure; + } + hdfsBuilderSetNameNode(bld, options->host); + hdfsBuilderSetNameNodePort(bld, options->port); + if(! 
options->single_instance) { + hdfsBuilderSetForceNewInstance(bld); + } + hd->fs = hdfsBuilderConnect(bld); + + /* hdfsSetWorkingDirectory succeed on non existend directory */ + if (hdfsExists(hd->fs, options->directory) < 0 || hdfsSetWorkingDirectory(hd->fs, options->directory) < 0) { + failure = errno; + log_err("hdfs: invalid working directory %s: %s\n", options->directory, strerror(errno)); + return failure; + } + + return 0; +} + +static void fio_hdfsio_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct hdfsio_data *hd = td->io_ops_data; + + if (hd->fs && hdfsDisconnect(hd->fs) < 0) { + log_err("hdfs: disconnect failed: %d\n", errno); + } +} + +static struct ioengine_ops ioengine_hdfs = { + .name = "libhdfs", + .version = FIO_IOOPS_VERSION, + .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NODISKUTIL, + .setup = fio_hdfsio_setup, + .init = fio_hdfsio_init, + .prep = fio_hdfsio_prep, + .queue = fio_hdfsio_queue, + .open_file = fio_hdfsio_open_file, + .close_file = fio_hdfsio_close_file, + .io_u_init = fio_hdfsio_io_u_init, + .io_u_free = fio_hdfsio_io_u_free, + .option_struct_size = sizeof(struct hdfsio_options), + .options = options, +}; + + +static void fio_init fio_hdfsio_register(void) +{ + register_ioengine(&ioengine_hdfs); +} + +static void fio_exit fio_hdfsio_unregister(void) +{ + unregister_ioengine(&ioengine_hdfs); +} diff -Nru fio-2.1.3/engines/libiscsi.c fio-3.16/engines/libiscsi.c --- fio-2.1.3/engines/libiscsi.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/libiscsi.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,411 @@ +/* + * libiscsi engine + * + * this engine read/write iscsi lun with libiscsi. 
+ */ + + +#include "../fio.h" +#include "../optgroup.h" + +#include +#include +#include +#include + +struct iscsi_lun; +struct iscsi_info; + +struct iscsi_task { + struct scsi_task *scsi_task; + struct iscsi_lun *iscsi_lun; + struct io_u *io_u; +}; + +struct iscsi_lun { + struct iscsi_info *iscsi_info; + struct iscsi_context *iscsi; + struct iscsi_url *url; + int block_size; + uint64_t num_blocks; +}; + +struct iscsi_info { + struct iscsi_lun **luns; + int nr_luns; + struct pollfd *pfds; + struct iscsi_task **complete_events; + int nr_events; +}; + +struct iscsi_options { + void *pad; + char *initiator; +}; + +static struct fio_option options[] = { + { + .name = "initiator", + .lname = "initiator", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct iscsi_options, initiator), + .def = "iqn.2019-04.org.fio:fio", + .help = "initiator name", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_ISCSI, + }, + + { + .name = NULL, + }, +}; + +static int fio_iscsi_setup_lun(struct iscsi_info *iscsi_info, + char *initiator, struct fio_file *f, int i) +{ + struct iscsi_lun *iscsi_lun = NULL; + struct scsi_task *task = NULL; + struct scsi_readcapacity16 *rc16 = NULL; + int ret = 0; + + iscsi_lun = malloc(sizeof(struct iscsi_lun)); + memset(iscsi_lun, 0, sizeof(struct iscsi_lun)); + + iscsi_lun->iscsi_info = iscsi_info; + + iscsi_lun->url = iscsi_parse_full_url(NULL, f->file_name); + if (iscsi_lun->url == NULL) { + log_err("iscsi: failed to parse url: %s\n", f->file_name); + ret = EINVAL; + goto out; + } + + iscsi_lun->iscsi = iscsi_create_context(initiator); + if (iscsi_lun->iscsi == NULL) { + log_err("iscsi: failed to create iscsi context.\n"); + ret = 1; + goto out; + } + + if (iscsi_set_targetname(iscsi_lun->iscsi, iscsi_lun->url->target)) { + log_err("iscsi: failed to set target name.\n"); + ret = EINVAL; + goto out; + } + + if (iscsi_set_session_type(iscsi_lun->iscsi, ISCSI_SESSION_NORMAL) != 0) { + log_err("iscsi: failed to set session type.\n"); + ret = EINVAL; + 
goto out; + } + + if (iscsi_set_header_digest(iscsi_lun->iscsi, + ISCSI_HEADER_DIGEST_NONE_CRC32C) != 0) { + log_err("iscsi: failed to set header digest.\n"); + ret = EINVAL; + goto out; + } + + if (iscsi_full_connect_sync(iscsi_lun->iscsi, + iscsi_lun->url->portal, + iscsi_lun->url->lun)) { + log_err("sicsi: failed to connect to LUN : %s\n", + iscsi_get_error(iscsi_lun->iscsi)); + ret = EINVAL; + goto out; + } + + task = iscsi_readcapacity16_sync(iscsi_lun->iscsi, iscsi_lun->url->lun); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + log_err("iscsi: failed to send readcapacity command: %s\n", + iscsi_get_error(iscsi_lun->iscsi)); + ret = EINVAL; + goto out; + } + + rc16 = scsi_datain_unmarshall(task); + if (rc16 == NULL) { + log_err("iscsi: failed to unmarshal readcapacity16 data.\n"); + ret = EINVAL; + goto out; + } + + iscsi_lun->block_size = rc16->block_length; + iscsi_lun->num_blocks = rc16->returned_lba + 1; + + scsi_free_scsi_task(task); + task = NULL; + + f->real_file_size = iscsi_lun->num_blocks * iscsi_lun->block_size; + f->engine_data = iscsi_lun; + + iscsi_info->luns[i] = iscsi_lun; + iscsi_info->pfds[i].fd = iscsi_get_fd(iscsi_lun->iscsi); + +out: + if (task) { + scsi_free_scsi_task(task); + } + + if (ret && iscsi_lun) { + if (iscsi_lun->iscsi != NULL) { + if (iscsi_is_logged_in(iscsi_lun->iscsi)) { + iscsi_logout_sync(iscsi_lun->iscsi); + } + iscsi_destroy_context(iscsi_lun->iscsi); + } + free(iscsi_lun); + } + + return ret; +} + +static int fio_iscsi_setup(struct thread_data *td) +{ + struct iscsi_options *options = td->eo; + struct iscsi_info *iscsi_info = NULL; + int ret = 0; + struct fio_file *f; + int i; + + iscsi_info = malloc(sizeof(struct iscsi_info)); + iscsi_info->nr_luns = td->o.nr_files; + iscsi_info->luns = calloc(iscsi_info->nr_luns, sizeof(struct iscsi_lun*)); + iscsi_info->pfds = calloc(iscsi_info->nr_luns, sizeof(struct pollfd)); + + iscsi_info->nr_events = 0; + iscsi_info->complete_events = calloc(td->o.iodepth, 
sizeof(struct iscsi_task*)); + + td->io_ops_data = iscsi_info; + + for_each_file(td, f, i) { + ret = fio_iscsi_setup_lun(iscsi_info, options->initiator, f, i); + if (ret < 0) break; + } + + return ret; +} + +static int fio_iscsi_init(struct thread_data *td) { + return 0; +} + +static void fio_iscsi_cleanup_lun(struct iscsi_lun *iscsi_lun) { + if (iscsi_lun->iscsi != NULL) { + if (iscsi_is_logged_in(iscsi_lun->iscsi)) { + iscsi_logout_sync(iscsi_lun->iscsi); + } + iscsi_destroy_context(iscsi_lun->iscsi); + } + free(iscsi_lun); +} + +static void fio_iscsi_cleanup(struct thread_data *td) +{ + struct iscsi_info *iscsi_info = td->io_ops_data; + + for (int i = 0; i < iscsi_info->nr_luns; i++) { + if (iscsi_info->luns[i]) { + fio_iscsi_cleanup_lun(iscsi_info->luns[i]); + iscsi_info->luns[i] = NULL; + } + } + + free(iscsi_info->luns); + free(iscsi_info->pfds); + free(iscsi_info->complete_events); + free(iscsi_info); +} + +static int fio_iscsi_prep(struct thread_data *td, struct io_u *io_u) +{ + return 0; +} + +static int fio_iscsi_open_file(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static int fio_iscsi_close_file(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static void iscsi_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *private_data) +{ + struct iscsi_task *iscsi_task = (struct iscsi_task*)private_data; + struct iscsi_lun *iscsi_lun = iscsi_task->iscsi_lun; + struct iscsi_info *iscsi_info = iscsi_lun->iscsi_info; + struct io_u *io_u = iscsi_task->io_u; + + if (status == SCSI_STATUS_GOOD) { + io_u->error = 0; + } else { + log_err("iscsi: request failed with error %s.\n", + iscsi_get_error(iscsi_lun->iscsi)); + + io_u->error = 1; + io_u->resid = io_u->xfer_buflen; + } + + iscsi_info->complete_events[iscsi_info->nr_events] = iscsi_task; + iscsi_info->nr_events++; +} + +static enum fio_q_status fio_iscsi_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct iscsi_lun *iscsi_lun = 
io_u->file->engine_data; + struct scsi_task *scsi_task = NULL; + struct iscsi_task *iscsi_task = malloc(sizeof(struct iscsi_task)); + int ret = -1; + + if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + if (io_u->offset % iscsi_lun->block_size != 0) { + log_err("iscsi: offset is not align to block size.\n"); + ret = -1; + goto out; + } + + if (io_u->xfer_buflen % iscsi_lun->block_size != 0) { + log_err("iscsi: buflen is not align to block size.\n"); + ret = -1; + goto out; + } + } + + if (io_u->ddir == DDIR_READ) { + scsi_task = scsi_cdb_read16(io_u->offset / iscsi_lun->block_size, + io_u->xfer_buflen, + iscsi_lun->block_size, + 0, 0, 0, 0, 0); + ret = scsi_task_add_data_in_buffer(scsi_task, io_u->xfer_buflen, + io_u->xfer_buf); + if (ret < 0) { + log_err("iscsi: failed to add data in buffer.\n"); + goto out; + } + } else if (io_u->ddir == DDIR_WRITE) { + scsi_task = scsi_cdb_write16(io_u->offset / iscsi_lun->block_size, + io_u->xfer_buflen, + iscsi_lun->block_size, + 0, 0, 0, 0, 0); + ret = scsi_task_add_data_out_buffer(scsi_task, io_u->xfer_buflen, + io_u->xfer_buf); + if (ret < 0) { + log_err("iscsi: failed to add data out buffer.\n"); + goto out; + } + } else if (ddir_sync(io_u->ddir)) { + scsi_task = scsi_cdb_synchronizecache16( + 0, iscsi_lun->num_blocks * iscsi_lun->block_size, 0, 0); + } else { + log_err("iscsi: invalid I/O operation: %d\n", io_u->ddir); + ret = EINVAL; + goto out; + } + + iscsi_task->scsi_task = scsi_task; + iscsi_task->iscsi_lun = iscsi_lun; + iscsi_task->io_u = io_u; + + ret = iscsi_scsi_command_async(iscsi_lun->iscsi, iscsi_lun->url->lun, + scsi_task, iscsi_cb, NULL, iscsi_task); + if (ret < 0) { + log_err("iscsi: failed to send scsi command.\n"); + goto out; + } + + return FIO_Q_QUEUED; + +out: + if (iscsi_task) { + free(iscsi_task); + } + + if (scsi_task) { + scsi_free_scsi_task(scsi_task); + } + + if (ret) { + io_u->error = ret; + } + return FIO_Q_COMPLETED; +} + +static int fio_iscsi_getevents(struct thread_data *td, 
unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct iscsi_info *iscsi_info = td->io_ops_data; + int ret = 0; + + iscsi_info->nr_events = 0; + + while (iscsi_info->nr_events < min) { + for (int i = 0; i < iscsi_info->nr_luns; i++) { + int events = iscsi_which_events(iscsi_info->luns[i]->iscsi); + iscsi_info->pfds[i].events = events; + } + + ret = poll(iscsi_info->pfds, iscsi_info->nr_luns, -1); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + log_err("iscsi: failed to poll events: %s.\n", + strerror(errno)); + break; + } + + for (int i = 0; i < iscsi_info->nr_luns; i++) { + ret = iscsi_service(iscsi_info->luns[i]->iscsi, + iscsi_info->pfds[i].revents); + assert(ret >= 0); + } + } + + return ret < 0 ? ret : iscsi_info->nr_events; +} + +static struct io_u *fio_iscsi_event(struct thread_data *td, int event) +{ + struct iscsi_info *iscsi_info = (struct iscsi_info*)td->io_ops_data; + struct iscsi_task *iscsi_task = iscsi_info->complete_events[event]; + struct io_u *io_u = iscsi_task->io_u; + + iscsi_info->complete_events[event] = NULL; + + scsi_free_scsi_task(iscsi_task->scsi_task); + free(iscsi_task); + + return io_u; +} + +static struct ioengine_ops ioengine_iscsi = { + .name = "libiscsi", + .version = FIO_IOOPS_VERSION, + .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NODISKUTIL, + .setup = fio_iscsi_setup, + .init = fio_iscsi_init, + .prep = fio_iscsi_prep, + .queue = fio_iscsi_queue, + .getevents = fio_iscsi_getevents, + .event = fio_iscsi_event, + .cleanup = fio_iscsi_cleanup, + .open_file = fio_iscsi_open_file, + .close_file = fio_iscsi_close_file, + .option_struct_size = sizeof(struct iscsi_options), + .options = options, +}; + +static void fio_init fio_iscsi_register(void) +{ + register_ioengine(&ioengine_iscsi); +} + +static void fio_exit fio_iscsi_unregister(void) +{ + unregister_ioengine(&ioengine_iscsi); +} diff -Nru fio-2.1.3/engines/libpmem.c fio-3.16/engines/libpmem.c --- fio-2.1.3/engines/libpmem.c 
1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/libpmem.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,592 @@ +/* + * libpmem: IO engine that uses PMDK libpmem to read and write data + * + * Copyright (C) 2017 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License, + * version 2 as published by the Free Software Foundation.. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +/* + * libpmem engine + * + * IO engine that uses libpmem to read and write data + * + * To use: + * ioengine=libpmem + * + * Other relevant settings: + * iodepth=1 + * direct=1 + * directory=/mnt/pmem0/ + * bs=4k + * + * direct=1 means that pmem_drain() is executed for each write operation. + * In contrast, direct=0 means that pmem_drain() is not executed. + * + * The pmem device must have a DAX-capable filesystem and be mounted + * with DAX enabled. directory must point to a mount point of DAX FS. + * + * Example: + * mkfs.xfs /dev/pmem0 + * mkdir /mnt/pmem0 + * mount -o dax /dev/pmem0 /mnt/pmem0 + * + * + * See examples/libpmem.fio for more. + * + * + * libpmem.so + * By default, the libpmem engine will let the system find the libpmem.so + * that it uses. You can use an alternative libpmem by setting the + * FIO_PMEM_LIB environment variable to the full path to the desired + * libpmem.so. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../verify.h" + +/* + * Limits us to 1GiB of mapped files in total to model after + * libpmem engine behavior + */ +#define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL) + +struct fio_libpmem_data { + void *libpmem_ptr; + size_t libpmem_sz; + off_t libpmem_off; +}; + +#define MEGABYTE ((uintptr_t)1 << 20) +#define GIGABYTE ((uintptr_t)1 << 30) +#define PROCMAXLEN 2048 /* maximum expected line length in /proc files */ +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) + +static bool Mmap_no_random; +static void *Mmap_hint; +static unsigned long long Mmap_align; + +/* + * util_map_hint_align -- choose the desired mapping alignment + * + * Use 2MB/1GB page alignment only if the mapping length is at least + * twice as big as the page size. + */ +static inline size_t util_map_hint_align(size_t len, size_t req_align) +{ + size_t align = Mmap_align; + + dprint(FD_IO, "DEBUG util_map_hint_align\n" ); + + if (req_align) + align = req_align; + else if (len >= 2 * GIGABYTE) + align = GIGABYTE; + else if (len >= 4 * MEGABYTE) + align = 2 * MEGABYTE; + + dprint(FD_IO, "align=%d\n", (int)align); + return align; +} + +#ifdef __FreeBSD__ +static const char *sscanf_os = "%p %p"; +#define MAP_NORESERVE 0 +#define OS_MAPFILE "/proc/curproc/map" +#else +static const char *sscanf_os = "%p-%p"; +#define OS_MAPFILE "/proc/self/maps" +#endif + +/* + * util_map_hint_unused -- use /proc to determine a hint address for mmap() + * + * This is a helper function for util_map_hint(). + * It opens up /proc/self/maps and looks for the first unused address + * in the process address space that is: + * - greater or equal 'minaddr' argument, + * - large enough to hold range of given length, + * - aligned to the specified unit. + * + * Asking for aligned address like this will allow the DAX code to use large + * mappings. 
It is not an error if mmap() ignores the hint and chooses + * different address. + */ +static char *util_map_hint_unused(void *minaddr, size_t len, size_t align) +{ + char *lo = NULL; /* beginning of current range in maps file */ + char *hi = NULL; /* end of current range in maps file */ + char *raddr = minaddr; /* ignore regions below 'minaddr' */ + +#ifdef WIN32 + MEMORY_BASIC_INFORMATION mi; +#else + FILE *fp; + char line[PROCMAXLEN]; /* for fgets() */ +#endif + + dprint(FD_IO, "DEBUG util_map_hint_unused\n"); + assert(align > 0); + + if (raddr == NULL) + raddr += page_size; + + raddr = (char *)roundup((uintptr_t)raddr, align); + +#ifdef WIN32 + while ((uintptr_t)raddr < UINTPTR_MAX - len) { + size_t ret = VirtualQuery(raddr, &mi, sizeof(mi)); + if (ret == 0) { + ERR("VirtualQuery %p", raddr); + return MAP_FAILED; + } + dprint(FD_IO, "addr %p len %zu state %d", + mi.BaseAddress, mi.RegionSize, mi.State); + + if ((mi.State != MEM_FREE) || (mi.RegionSize < len)) { + raddr = (char *)mi.BaseAddress + mi.RegionSize; + raddr = (char *)roundup((uintptr_t)raddr, align); + dprint(FD_IO, "nearest aligned addr %p", raddr); + } else { + dprint(FD_IO, "unused region of size %zu found at %p", + mi.RegionSize, mi.BaseAddress); + return mi.BaseAddress; + } + } + + dprint(FD_IO, "end of address space reached"); + return MAP_FAILED; +#else + fp = fopen(OS_MAPFILE, "r"); + if (!fp) { + log_err("!%s\n", OS_MAPFILE); + return MAP_FAILED; + } + + while (fgets(line, PROCMAXLEN, fp) != NULL) { + /* check for range line */ + if (sscanf(line, sscanf_os, &lo, &hi) == 2) { + dprint(FD_IO, "%p-%p\n", lo, hi); + if (lo > raddr) { + if ((uintptr_t)(lo - raddr) >= len) { + dprint(FD_IO, "unused region of size " + "%zu found at %p\n", + lo - raddr, raddr); + break; + } else { + dprint(FD_IO, "region is too small: " + "%zu < %zu\n", + lo - raddr, len); + } + } + + if (hi > raddr) { + raddr = (char *)roundup((uintptr_t)hi, align); + dprint(FD_IO, "nearest aligned addr %p\n", + raddr); + } + + if 
(raddr == 0) { + dprint(FD_IO, "end of address space reached\n"); + break; + } + } + } + + /* + * Check for a case when this is the last unused range in the address + * space, but is not large enough. (very unlikely) + */ + if ((raddr != NULL) && (UINTPTR_MAX - (uintptr_t)raddr < len)) { + dprint(FD_IO, "end of address space reached"); + raddr = MAP_FAILED; + } + + fclose(fp); + + dprint(FD_IO, "returning %p", raddr); + return raddr; +#endif +} + +/* + * util_map_hint -- determine hint address for mmap() + * + * If PMEM_MMAP_HINT environment variable is not set, we let the system to pick + * the randomized mapping address. Otherwise, a user-defined hint address + * is used. + * + * Windows Environment: + * XXX - Windows doesn't support large DAX pages yet, so there is + * no point in aligning for the same. + * + * Except for Windows Environment: + * ALSR in 64-bit Linux kernel uses 28-bit of randomness for mmap + * (bit positions 12-39), which means the base mapping address is randomized + * within [0..1024GB] range, with 4KB granularity. Assuming additional + * 1GB alignment, it results in 1024 possible locations. + * + * Configuring the hint address via PMEM_MMAP_HINT environment variable + * disables address randomization. In such case, the function will search for + * the first unused, properly aligned region of given size, above the + * specified address. 
+ */ +static char *util_map_hint(size_t len, size_t req_align) +{ + char *addr; + size_t align = 0; + char *e = NULL; + + dprint(FD_IO, "DEBUG util_map_hint\n"); + dprint(FD_IO, "len %zu req_align %zu\n", len, req_align); + + /* choose the desired alignment based on the requested length */ + align = util_map_hint_align(len, req_align); + + e = getenv("PMEM_MMAP_HINT"); + if (e) { + char *endp; + unsigned long long val = 0; + + errno = 0; + + val = strtoull(e, &endp, 16); + if (errno || endp == e) { + dprint(FD_IO, "Invalid PMEM_MMAP_HINT\n"); + } else { + Mmap_hint = (void *)val; + Mmap_no_random = true; + dprint(FD_IO, "PMEM_MMAP_HINT set to %p\n", Mmap_hint); + } + } + + if (Mmap_no_random) { + dprint(FD_IO, "user-defined hint %p\n", (void *)Mmap_hint); + addr = util_map_hint_unused((void *)Mmap_hint, len, align); + } else { + /* + * Create dummy mapping to find an unused region of given size. + * * Request for increased size for later address alignment. + * + * Windows Environment: + * Use MAP_NORESERVE flag to only reserve the range of pages + * rather than commit. We don't want the pages to be actually + * backed by the operating system paging file, as the swap + * file is usually too small to handle terabyte pools. + * + * Except for Windows Environment: + * Use MAP_PRIVATE with read-only access to simulate + * zero cost for overcommit accounting. Note: MAP_NORESERVE + * flag is ignored if overcommit is disabled (mode 2). 
+ */ +#ifndef WIN32 + addr = mmap(NULL, len + align, PROT_READ, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); +#else + addr = mmap(NULL, len + align, PROT_READ, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0); +#endif + if (addr != MAP_FAILED) { + dprint(FD_IO, "system choice %p\n", addr); + munmap(addr, len + align); + addr = (char *)roundup((uintptr_t)addr, align); + } + } + + dprint(FD_IO, "hint %p\n", addr); + + return addr; +} + +/* + * This is the mmap execution function + */ +static int fio_libpmem_file(struct thread_data *td, struct fio_file *f, + size_t length, off_t off) +{ + struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); + int flags = 0; + void *addr = NULL; + + dprint(FD_IO, "DEBUG fio_libpmem_file\n"); + + if (td_rw(td)) + flags = PROT_READ | PROT_WRITE; + else if (td_write(td)) { + flags = PROT_WRITE; + + if (td->o.verify != VERIFY_NONE) + flags |= PROT_READ; + } else + flags = PROT_READ; + + dprint(FD_IO, "f->file_name = %s td->o.verify = %d \n", f->file_name, + td->o.verify); + dprint(FD_IO, "length = %ld flags = %d f->fd = %d off = %ld \n", + length, flags, f->fd,off); + + addr = util_map_hint(length, 0); + + fdd->libpmem_ptr = mmap(addr, length, flags, MAP_SHARED, f->fd, off); + if (fdd->libpmem_ptr == MAP_FAILED) { + fdd->libpmem_ptr = NULL; + td_verror(td, errno, "mmap"); + } + + if (td->error && fdd->libpmem_ptr) + munmap(fdd->libpmem_ptr, length); + + return td->error; +} + +/* + * XXX Just mmap an appropriate portion, we cannot mmap the full extent + */ +static int fio_libpmem_prep_limited(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); + + dprint(FD_IO, "DEBUG fio_libpmem_prep_limited\n" ); + + if (io_u->buflen > f->real_file_size) { + log_err("libpmem: bs too big for libpmem engine\n"); + return EIO; + } + + fdd->libpmem_sz = min(MMAP_TOTAL_SZ, f->real_file_size); + if (fdd->libpmem_sz > f->io_size) + fdd->libpmem_sz = f->io_size; + + fdd->libpmem_off = 
io_u->offset; + + return fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off); +} + +/* + * Attempt to mmap the entire file + */ +static int fio_libpmem_prep_full(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); + int ret; + + dprint(FD_IO, "DEBUG fio_libpmem_prep_full\n" ); + + if (fio_file_partial_mmap(f)) + return EINVAL; + + dprint(FD_IO," f->io_size %ld : io_u->offset %lld \n", + f->io_size, io_u->offset); + + if (io_u->offset != (size_t) io_u->offset || + f->io_size != (size_t) f->io_size) { + fio_file_set_partial_mmap(f); + return EINVAL; + } + fdd->libpmem_sz = f->io_size; + fdd->libpmem_off = 0; + + ret = fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off); + if (ret) + fio_file_set_partial_mmap(f); + + return ret; +} + +static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct fio_libpmem_data *fdd = FILE_ENG_DATA(f); + int ret; + + dprint(FD_IO, "DEBUG fio_libpmem_prep\n" ); + /* + * It fits within existing mapping, use it + */ + dprint(FD_IO," io_u->offset %llu : fdd->libpmem_off %llu : " + "io_u->buflen %llu : fdd->libpmem_sz %llu\n", + io_u->offset, (unsigned long long) fdd->libpmem_off, + io_u->buflen, (unsigned long long) fdd->libpmem_sz); + + if (io_u->offset >= fdd->libpmem_off && + (io_u->offset + io_u->buflen <= + fdd->libpmem_off + fdd->libpmem_sz)) + goto done; + + /* + * unmap any existing mapping + */ + if (fdd->libpmem_ptr) { + dprint(FD_IO,"munmap \n"); + if (munmap(fdd->libpmem_ptr, fdd->libpmem_sz) < 0) + return errno; + fdd->libpmem_ptr = NULL; + } + + if (fio_libpmem_prep_full(td, io_u)) { + td_clear_error(td); + ret = fio_libpmem_prep_limited(td, io_u); + if (ret) + return ret; + } + +done: + io_u->mmap_data = fdd->libpmem_ptr + io_u->offset - fdd->libpmem_off + - f->file_offset; + return 0; +} + +static enum fio_q_status fio_libpmem_queue(struct thread_data *td, + struct io_u 
*io_u) +{ + fio_ro_check(td, io_u); + io_u->error = 0; + + dprint(FD_IO, "DEBUG fio_libpmem_queue\n"); + + switch (io_u->ddir) { + case DDIR_READ: + memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen); + break; + case DDIR_WRITE: + dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n", + io_u->mmap_data, io_u->xfer_buf ); + dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect); + if (td->o.odirect) { + pmem_memcpy_persist(io_u->mmap_data, + io_u->xfer_buf, + io_u->xfer_buflen); + } else { + pmem_memcpy_nodrain(io_u->mmap_data, + io_u->xfer_buf, + io_u->xfer_buflen); + } + break; + case DDIR_SYNC: + case DDIR_DATASYNC: + case DDIR_SYNC_FILE_RANGE: + break; + default: + io_u->error = EINVAL; + break; + } + + return FIO_Q_COMPLETED; +} + +static int fio_libpmem_init(struct thread_data *td) +{ + struct thread_options *o = &td->o; + + dprint(FD_IO,"o->rw_min_bs %llu \n o->fsync_blocks %d \n o->fdatasync_blocks %d \n", + o->rw_min_bs,o->fsync_blocks,o->fdatasync_blocks); + dprint(FD_IO, "DEBUG fio_libpmem_init\n"); + + if ((o->rw_min_bs & page_mask) && + (o->fsync_blocks || o->fdatasync_blocks)) { + log_err("libpmem: mmap options dictate a minimum block size of " + "%llu bytes\n", (unsigned long long) page_size); + return 1; + } + return 0; +} + +static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f) +{ + struct fio_libpmem_data *fdd; + int ret; + + dprint(FD_IO,"DEBUG fio_libpmem_open_file\n"); + dprint(FD_IO,"f->io_size=%ld \n",f->io_size); + dprint(FD_IO,"td->o.size=%lld \n",td->o.size); + dprint(FD_IO,"td->o.iodepth=%d\n",td->o.iodepth); + dprint(FD_IO,"td->o.iodepth_batch=%d \n",td->o.iodepth_batch); + + ret = generic_open_file(td, f); + if (ret) + return ret; + + fdd = calloc(1, sizeof(*fdd)); + if (!fdd) { + int fio_unused __ret; + __ret = generic_close_file(td, f); + return 1; + } + + FILE_SET_ENG_DATA(f, fdd); + + return 0; +} + +static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f) +{ + struct fio_libpmem_data *fdd = 
FILE_ENG_DATA(f); + + dprint(FD_IO,"DEBUG fio_libpmem_close_file\n"); + dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect); + + if (!td->o.odirect) { + dprint(FD_IO,"pmem_drain\n"); + pmem_drain(); + } + + FILE_SET_ENG_DATA(f, NULL); + free(fdd); + fio_file_clear_partial_mmap(f); + + return generic_close_file(td, f); +} + +static struct ioengine_ops ioengine = { + .name = "libpmem", + .version = FIO_IOOPS_VERSION, + .init = fio_libpmem_init, + .prep = fio_libpmem_prep, + .queue = fio_libpmem_queue, + .open_file = fio_libpmem_open_file, + .close_file = fio_libpmem_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO |FIO_NOEXTEND, +}; + +static void fio_init fio_libpmem_register(void) +{ +#ifndef WIN32 + Mmap_align = page_size; +#else + if (Mmap_align == 0) { + SYSTEM_INFO si; + + GetSystemInfo(&si); + Mmap_align = si.dwAllocationGranularity; + } +#endif + + register_ioengine(&ioengine); +} + +static void fio_exit fio_libpmem_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/mmap.c fio-3.16/engines/mmap.c --- fio-2.1.3/engines/mmap.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/mmap.c 2019-09-20 01:01:52.000000000 +0000 @@ -7,29 +7,104 @@ */ #include #include -#include #include #include #include "../fio.h" +#include "../optgroup.h" #include "../verify.h" /* - * Limits us to 1GB of mapped files in total + * Limits us to 1GiB of mapped files in total */ #define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL) static unsigned long mmap_map_size; -static unsigned long mmap_map_mask; + +struct fio_mmap_data { + void *mmap_ptr; + size_t mmap_sz; + off_t mmap_off; +}; + +#ifdef CONFIG_HAVE_THP +struct mmap_options { + void *pad; + unsigned int thp; +}; + +static struct fio_option options[] = { + { + .name = "thp", + .lname = "Transparent Huge Pages", + .type = FIO_OPT_INT, + .off1 = offsetof(struct mmap_options, thp), + .help = "Memory Advise Huge Page", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_MMAP, 
+ }, + { + .name = NULL, + }, +}; +#endif + +static bool fio_madvise_file(struct thread_data *td, struct fio_file *f, + size_t length) + +{ + struct fio_mmap_data *fmd = FILE_ENG_DATA(f); +#ifdef CONFIG_HAVE_THP + struct mmap_options *o = td->eo; + + /* Ignore errors on this optional advisory */ + if (o->thp) + madvise(fmd->mmap_ptr, length, MADV_HUGEPAGE); +#endif + + if (!td->o.fadvise_hint) + return true; + + if (!td_random(td)) { + if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_SEQUENTIAL) < 0) { + td_verror(td, errno, "madvise"); + return false; + } + } else { + if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_RANDOM) < 0) { + td_verror(td, errno, "madvise"); + return false; + } + } + + return true; +} + +#ifdef CONFIG_HAVE_THP +static int fio_mmap_get_shared(struct thread_data *td) +{ + struct mmap_options *o = td->eo; + + if (o->thp) + return MAP_PRIVATE; + return MAP_SHARED; +} +#else +static int fio_mmap_get_shared(struct thread_data *td) +{ + return MAP_SHARED; +} +#endif static int fio_mmap_file(struct thread_data *td, struct fio_file *f, size_t length, off_t off) { - int flags = 0; + struct fio_mmap_data *fmd = FILE_ENG_DATA(f); + int flags = 0, shared = fio_mmap_get_shared(td); - if (td_rw(td)) + if (td_rw(td) && !td->o.verify_only) flags = PROT_READ | PROT_WRITE; - else if (td_write(td)) { + else if (td_write(td) && !td->o.verify_only) { flags = PROT_WRITE; if (td->o.verify != VERIFY_NONE) @@ -37,28 +112,29 @@ } else flags = PROT_READ; - f->mmap_ptr = mmap(NULL, length, flags, MAP_SHARED, f->fd, off); - if (f->mmap_ptr == MAP_FAILED) { - f->mmap_ptr = NULL; + fmd->mmap_ptr = mmap(NULL, length, flags, shared, f->fd, off); + if (fmd->mmap_ptr == MAP_FAILED) { + fmd->mmap_ptr = NULL; td_verror(td, errno, "mmap"); goto err; } - if (!td_random(td)) { - if (posix_madvise(f->mmap_ptr, length, POSIX_MADV_SEQUENTIAL) < 0) { - td_verror(td, errno, "madvise"); - goto err; - } - } else { - if (posix_madvise(f->mmap_ptr, length, POSIX_MADV_RANDOM) < 0) { - 
td_verror(td, errno, "madvise"); - goto err; - } + if (!fio_madvise_file(td, f, length)) + goto err; + + if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_DONTNEED) < 0) { + td_verror(td, errno, "madvise"); + goto err; } +#ifdef FIO_MADV_FREE + if (f->filetype == FIO_TYPE_BLOCK) + (void) posix_madvise(fmd->mmap_ptr, fmd->mmap_sz, FIO_MADV_FREE); +#endif + err: - if (td->error && f->mmap_ptr) - munmap(f->mmap_ptr, length); + if (td->error && fmd->mmap_ptr) + munmap(fmd->mmap_ptr, length); return td->error; } @@ -69,19 +145,20 @@ static int fio_mmapio_prep_limited(struct thread_data *td, struct io_u *io_u) { struct fio_file *f = io_u->file; + struct fio_mmap_data *fmd = FILE_ENG_DATA(f); if (io_u->buflen > mmap_map_size) { log_err("fio: bs too big for mmap engine\n"); return EIO; } - f->mmap_sz = mmap_map_size; - if (f->mmap_sz > f->io_size) - f->mmap_sz = f->io_size; + fmd->mmap_sz = mmap_map_size; + if (fmd->mmap_sz > f->io_size) + fmd->mmap_sz = f->io_size; - f->mmap_off = io_u->offset; + fmd->mmap_off = io_u->offset; - return fio_mmap_file(td, f, f->mmap_sz, f->mmap_off); + return fio_mmap_file(td, f, fmd->mmap_sz, fmd->mmap_off); } /* @@ -90,15 +167,21 @@ static int fio_mmapio_prep_full(struct thread_data *td, struct io_u *io_u) { struct fio_file *f = io_u->file; + struct fio_mmap_data *fmd = FILE_ENG_DATA(f); int ret; if (fio_file_partial_mmap(f)) return EINVAL; + if (io_u->offset != (size_t) io_u->offset || + f->io_size != (size_t) f->io_size) { + fio_file_set_partial_mmap(f); + return EINVAL; + } - f->mmap_sz = f->io_size; - f->mmap_off = 0; + fmd->mmap_sz = f->io_size; + fmd->mmap_off = 0; - ret = fio_mmap_file(td, f, f->mmap_sz, f->mmap_off); + ret = fio_mmap_file(td, f, fmd->mmap_sz, fmd->mmap_off); if (ret) fio_file_set_partial_mmap(f); @@ -108,22 +191,23 @@ static int fio_mmapio_prep(struct thread_data *td, struct io_u *io_u) { struct fio_file *f = io_u->file; + struct fio_mmap_data *fmd = FILE_ENG_DATA(f); int ret; /* * It fits within existing 
mapping, use it */ - if (io_u->offset >= f->mmap_off && - io_u->offset + io_u->buflen < f->mmap_off + f->mmap_sz) + if (io_u->offset >= fmd->mmap_off && + io_u->offset + io_u->buflen <= fmd->mmap_off + fmd->mmap_sz) goto done; /* * unmap any existing mapping */ - if (f->mmap_ptr) { - if (munmap(f->mmap_ptr, f->mmap_sz) < 0) + if (fmd->mmap_ptr) { + if (munmap(fmd->mmap_ptr, fmd->mmap_sz) < 0) return errno; - f->mmap_ptr = NULL; + fmd->mmap_ptr = NULL; } if (fio_mmapio_prep_full(td, io_u)) { @@ -134,14 +218,16 @@ } done: - io_u->mmap_data = f->mmap_ptr + io_u->offset - f->mmap_off - + io_u->mmap_data = fmd->mmap_ptr + io_u->offset - fmd->mmap_off - f->file_offset; return 0; } -static int fio_mmapio_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_mmapio_queue(struct thread_data *td, + struct io_u *io_u) { struct fio_file *f = io_u->file; + struct fio_mmap_data *fmd = FILE_ENG_DATA(f); fio_ro_check(td, io_u); @@ -150,7 +236,7 @@ else if (io_u->ddir == DDIR_WRITE) memcpy(io_u->mmap_data, io_u->xfer_buf, io_u->xfer_buflen); else if (ddir_sync(io_u->ddir)) { - if (msync(f->mmap_ptr, f->mmap_sz, MS_SYNC)) { + if (msync(fmd->mmap_ptr, fmd->mmap_sz, MS_SYNC)) { io_u->error = errno; td_verror(td, io_u->error, "msync"); } @@ -182,39 +268,63 @@ static int fio_mmapio_init(struct thread_data *td) { struct thread_options *o = &td->o; - unsigned long shift, mask; - if ((td->o.rw_min_bs & page_mask) && + if ((o->rw_min_bs & page_mask) && (o->odirect || o->fsync_blocks || o->fdatasync_blocks)) { log_err("fio: mmap options dictate a minimum block size of " "%llu bytes\n", (unsigned long long) page_size); return 1; } - mmap_map_size = MMAP_TOTAL_SZ / td->o.nr_files; - mask = mmap_map_size; - shift = 0; - do { - mask >>= 1; - if (!mask) - break; - shift++; - } while (1); + mmap_map_size = MMAP_TOTAL_SZ / o->nr_files; + return 0; +} - mmap_map_mask = 1UL << shift; +static int fio_mmapio_open_file(struct thread_data *td, struct fio_file *f) +{ + struct 
fio_mmap_data *fmd; + int ret; + + ret = generic_open_file(td, f); + if (ret) + return ret; + + fmd = calloc(1, sizeof(*fmd)); + if (!fmd) { + int fio_unused __ret; + __ret = generic_close_file(td, f); + return 1; + } + + FILE_SET_ENG_DATA(f, fmd); return 0; } +static int fio_mmapio_close_file(struct thread_data *td, struct fio_file *f) +{ + struct fio_mmap_data *fmd = FILE_ENG_DATA(f); + + FILE_SET_ENG_DATA(f, NULL); + free(fmd); + fio_file_clear_partial_mmap(f); + + return generic_close_file(td, f); +} + static struct ioengine_ops ioengine = { .name = "mmap", .version = FIO_IOOPS_VERSION, .init = fio_mmapio_init, .prep = fio_mmapio_prep, .queue = fio_mmapio_queue, - .open_file = generic_open_file, - .close_file = generic_close_file, + .open_file = fio_mmapio_open_file, + .close_file = fio_mmapio_close_file, .get_file_size = generic_get_file_size, .flags = FIO_SYNCIO | FIO_NOEXTEND, +#ifdef CONFIG_HAVE_THP + .options = options, + .option_struct_size = sizeof(struct mmap_options), +#endif }; static void fio_init fio_mmapio_register(void) diff -Nru fio-2.1.3/engines/mtd.c fio-3.16/engines/mtd.c --- fio-2.1.3/engines/mtd.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/mtd.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,233 @@ +/* + * MTD engine + * + * IO engine that reads/writes from MTD character devices. 
+ * + */ +#include +#include +#include +#include +#include + +#include "../fio.h" +#include "../optgroup.h" +#include "../oslib/libmtd.h" + +static libmtd_t desc; + +struct fio_mtd_data { + struct mtd_dev_info info; +}; + +struct fio_mtd_options { + void *pad; /* avoid off1 == 0 */ + unsigned int skip_bad; +}; + +static struct fio_option options[] = { + { + .name = "skip_bad", + .lname = "Skip operations against bad blocks", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct fio_mtd_options, skip_bad), + .help = "Skip operations against known bad blocks.", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_MTD, + }, + { + .name = NULL, + }, +}; + +static int fio_mtd_maybe_mark_bad(struct thread_data *td, + struct fio_mtd_data *fmd, + struct io_u *io_u, int eb) +{ + int ret; + if (errno == EIO) { + ret = mtd_mark_bad(&fmd->info, io_u->file->fd, eb); + if (ret != 0) { + io_u->error = errno; + td_verror(td, errno, "mtd_mark_bad"); + return -1; + } + } + return 0; +} + +static int fio_mtd_is_bad(struct thread_data *td, + struct fio_mtd_data *fmd, + struct io_u *io_u, int eb) +{ + int ret = mtd_is_bad(&fmd->info, io_u->file->fd, eb); + if (ret == -1) { + io_u->error = errno; + td_verror(td, errno, "mtd_is_bad"); + } else if (ret == 1) + io_u->error = EIO; /* Silent failure--don't flood stderr */ + return ret; +} + +static enum fio_q_status fio_mtd_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct fio_mtd_data *fmd = FILE_ENG_DATA(f); + struct fio_mtd_options *o = td->eo; + int local_offs = 0; + int ret; + + fio_ro_check(td, io_u); + + /* + * Errors tend to pertain to particular erase blocks, so divide up + * I/O to erase block size. + * If an error is encountered, log it and keep going onto the next + * block because the error probably just pertains to that block. + * TODO(dehrenberg): Divide up reads and writes into page-sized + * operations to get more fine-grained information about errors. 
+ */ + while (local_offs < io_u->buflen) { + int eb = (io_u->offset + local_offs) / fmd->info.eb_size; + int eb_offs = (io_u->offset + local_offs) % fmd->info.eb_size; + /* The length is the smaller of the length remaining in the + * buffer and the distance to the end of the erase block */ + int len = min((int)io_u->buflen - local_offs, + (int)fmd->info.eb_size - eb_offs); + char *buf = ((char *)io_u->buf) + local_offs; + + if (o->skip_bad) { + ret = fio_mtd_is_bad(td, fmd, io_u, eb); + if (ret == -1) + break; + else if (ret == 1) + goto next; + } + if (io_u->ddir == DDIR_READ) { + ret = mtd_read(&fmd->info, f->fd, eb, eb_offs, buf, len); + if (ret != 0) { + io_u->error = errno; + td_verror(td, errno, "mtd_read"); + if (fio_mtd_maybe_mark_bad(td, fmd, io_u, eb)) + break; + } + } else if (io_u->ddir == DDIR_WRITE) { + ret = mtd_write(desc, &fmd->info, f->fd, eb, + eb_offs, buf, len, NULL, 0, 0); + if (ret != 0) { + io_u->error = errno; + td_verror(td, errno, "mtd_write"); + if (fio_mtd_maybe_mark_bad(td, fmd, io_u, eb)) + break; + } + } else if (io_u->ddir == DDIR_TRIM) { + if (eb_offs != 0 || len != fmd->info.eb_size) { + io_u->error = EINVAL; + td_verror(td, EINVAL, + "trim on MTD must be erase block-aligned"); + } + ret = mtd_erase(desc, &fmd->info, f->fd, eb); + if (ret != 0) { + io_u->error = errno; + td_verror(td, errno, "mtd_erase"); + if (fio_mtd_maybe_mark_bad(td, fmd, io_u, eb)) + break; + } + } else { + io_u->error = ENOTSUP; + td_verror(td, io_u->error, "operation not supported on mtd"); + } + +next: + local_offs += len; + } + + return FIO_Q_COMPLETED; +} + +static int fio_mtd_open_file(struct thread_data *td, struct fio_file *f) +{ + struct fio_mtd_data *fmd; + int ret; + + ret = generic_open_file(td, f); + if (ret) + return ret; + + fmd = calloc(1, sizeof(*fmd)); + if (!fmd) + goto err_close; + + ret = mtd_get_dev_info(desc, f->file_name, &fmd->info); + if (ret != 0) { + td_verror(td, errno, "mtd_get_dev_info"); + goto err_free; + } + + 
FILE_SET_ENG_DATA(f, fmd); + return 0; + +err_free: + free(fmd); +err_close: + { + int fio_unused __ret; + __ret = generic_close_file(td, f); + return 1; + } +} + +static int fio_mtd_close_file(struct thread_data *td, struct fio_file *f) +{ + struct fio_mtd_data *fmd = FILE_ENG_DATA(f); + + FILE_SET_ENG_DATA(f, NULL); + free(fmd); + + return generic_close_file(td, f); +} + +static int fio_mtd_get_file_size(struct thread_data *td, struct fio_file *f) +{ + struct mtd_dev_info info; + + int ret = mtd_get_dev_info(desc, f->file_name, &info); + if (ret != 0) { + td_verror(td, errno, "mtd_get_dev_info"); + return errno; + } + f->real_file_size = info.size; + + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "mtd", + .version = FIO_IOOPS_VERSION, + .queue = fio_mtd_queue, + .open_file = fio_mtd_open_file, + .close_file = fio_mtd_close_file, + .get_file_size = fio_mtd_get_file_size, + .flags = FIO_SYNCIO | FIO_NOEXTEND, + .options = options, + .option_struct_size = sizeof(struct fio_mtd_options), +}; + +static void fio_init fio_mtd_register(void) +{ + desc = libmtd_open(); + register_ioengine(&ioengine); +} + +static void fio_exit fio_mtd_unregister(void) +{ + unregister_ioengine(&ioengine); + libmtd_close(desc); + desc = NULL; +} + + + diff -Nru fio-2.1.3/engines/nbd.c fio-3.16/engines/nbd.c --- fio-2.1.3/engines/nbd.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/nbd.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,359 @@ +/* + * NBD engine + * + * IO engine that talks to an NBD server. + * + * Copyright (C) 2019 Red Hat Inc. + * Written by Richard W.M. Jones + * + */ + +#include +#include +#include +#include + +#include + +#include "../fio.h" +#include "../optgroup.h" + +/* Actually this differs across servers, but for nbdkit ... */ +#define NBD_MAX_REQUEST_SIZE (64 * 1024 * 1024) + +/* Storage for the NBD handle. */ +struct nbd_data { + struct nbd_handle *nbd; + int debug; + + /* The list of completed io_u structs. 
*/ + struct io_u **completed; + size_t nr_completed; +}; + +/* Options. */ +struct nbd_options { + void *padding; + char *uri; +}; + +static struct fio_option options[] = { + { + .name = "uri", + .lname = "NBD URI", + .help = "Name of NBD URI", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_NBD, + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct nbd_options, uri), + }, + { + .name = NULL, + }, +}; + +/* Alocates nbd_data. */ +static int nbd_setup(struct thread_data *td) +{ + struct nbd_data *nbd_data; + struct nbd_options *o = td->eo; + struct fio_file *f; + int r; + int64_t size; + + nbd_data = calloc(1, sizeof(*nbd_data)); + if (!nbd_data) { + td_verror(td, errno, "calloc"); + return 1; + } + td->io_ops_data = nbd_data; + + /* Pretend to deal with files. See engines/rbd.c */ + if (!td->files_index) { + add_file(td, "nbd", 0, 0); + td->o.nr_files = td->o.nr_files ? : 1; + td->o.open_files++; + } + f = td->files[0]; + + nbd_data->nbd = nbd_create(); + if (!nbd_data->nbd) { + log_err("fio: nbd_create: %s\n", nbd_get_error()); + return 1; + } + + /* Get the debug flag which can be set through LIBNBD_DEBUG=1. */ + nbd_data->debug = nbd_get_debug(nbd_data->nbd); + + /* Connect synchronously here so we can check for the size and + * in future other properties of the server. + */ + if (!o->uri) { + log_err("fio: nbd: uri parameter was not specified\n"); + return 1; + } + r = nbd_connect_uri(nbd_data->nbd, o->uri); + if (r == -1) { + log_err("fio: nbd_connect_uri: %s\n", nbd_get_error()); + return 1; + } + size = nbd_get_size(nbd_data->nbd); + if (size == -1) { + log_err("fio: nbd_get_size: %s\n", nbd_get_error()); + return 1; + } + + f->real_file_size = size; + + nbd_close (nbd_data->nbd); + nbd_data->nbd = NULL; + + return 0; +} + +/* Closes socket and frees nbd_data -- the opposite of nbd_setup. 
*/ +static void nbd_cleanup(struct thread_data *td) +{ + struct nbd_data *nbd_data = td->io_ops_data; + + if (nbd_data) { + if (nbd_data->nbd) + nbd_close(nbd_data->nbd); + free(nbd_data); + } +} + +/* Connect to the server from each thread. */ +static int nbd_init(struct thread_data *td) +{ + struct nbd_options *o = td->eo; + struct nbd_data *nbd_data = td->io_ops_data; + int r; + + if (!o->uri) { + log_err("fio: nbd: uri parameter was not specified\n"); + return 1; + } + + nbd_data->nbd = nbd_create(); + if (!nbd_data->nbd) { + log_err("fio: nbd_create: %s\n", nbd_get_error()); + return 1; + } + /* This is actually a synchronous connect and handshake. */ + r = nbd_connect_uri(nbd_data->nbd, o->uri); + if (r == -1) { + log_err("fio: nbd_connect_uri: %s\n", nbd_get_error()); + return 1; + } + + log_info("fio: connected to NBD server\n"); + return 0; +} + +/* A command in flight has been completed. */ +static int cmd_completed (void *vp, int *error) +{ + struct io_u *io_u; + struct nbd_data *nbd_data; + struct io_u **completed; + + io_u = vp; + nbd_data = io_u->engine_data; + + if (nbd_data->debug) + log_info("fio: nbd: command completed\n"); + + if (*error != 0) + io_u->error = *error; + else + io_u->error = 0; + + /* Add this completion to the list so it can be picked up + * later by ->event. + */ + completed = realloc(nbd_data->completed, + sizeof(struct io_u *) * + (nbd_data->nr_completed+1)); + if (completed == NULL) { + io_u->error = errno; + return 0; + } + + nbd_data->completed = completed; + nbd_data->completed[nbd_data->nr_completed] = io_u; + nbd_data->nr_completed++; + + return 0; +} + +/* Begin read or write request. 
*/ +static enum fio_q_status nbd_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct nbd_data *nbd_data = td->io_ops_data; + nbd_completion_callback completion = { .callback = cmd_completed, + .user_data = io_u }; + int r; + + fio_ro_check(td, io_u); + + io_u->engine_data = nbd_data; + + if (io_u->ddir == DDIR_WRITE || io_u->ddir == DDIR_READ) + assert(io_u->xfer_buflen <= NBD_MAX_REQUEST_SIZE); + + switch (io_u->ddir) { + case DDIR_READ: + r = nbd_aio_pread(nbd_data->nbd, + io_u->xfer_buf, io_u->xfer_buflen, + io_u->offset, completion, 0); + break; + case DDIR_WRITE: + r = nbd_aio_pwrite(nbd_data->nbd, + io_u->xfer_buf, io_u->xfer_buflen, + io_u->offset, completion, 0); + break; + case DDIR_TRIM: + r = nbd_aio_trim(nbd_data->nbd, io_u->xfer_buflen, + io_u->offset, completion, 0); + break; + case DDIR_SYNC: + /* XXX We could probably also handle + * DDIR_SYNC_FILE_RANGE with a bit of effort. + */ + r = nbd_aio_flush(nbd_data->nbd, completion, 0); + break; + default: + io_u->error = EINVAL; + return FIO_Q_COMPLETED; + } + + if (r == -1) { + /* errno is optional information on libnbd error path; + * if it's 0, set it to a default value + */ + io_u->error = nbd_get_errno(); + if (io_u->error == 0) + io_u->error = EIO; + return FIO_Q_COMPLETED; + } + + if (nbd_data->debug) + log_info("fio: nbd: command issued\n"); + io_u->error = 0; + return FIO_Q_QUEUED; +} + +static unsigned retire_commands(struct nbd_handle *nbd) +{ + int64_t cookie; + unsigned r = 0; + + while ((cookie = nbd_aio_peek_command_completed(nbd)) > 0) { + /* Ignore the return value. cmd_completed has already + * checked for an error and set io_u->error. We only + * have to call this to retire the command. 
+ */ + nbd_aio_command_completed(nbd, cookie); + r++; + } + + if (nbd_get_debug(nbd)) + log_info("fio: nbd: %u commands retired\n", r); + return r; +} + +static int nbd_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct nbd_data *nbd_data = td->io_ops_data; + int r; + unsigned events = 0; + int timeout; + + /* XXX This handling of timeout is wrong because it will wait + * for up to loop iterations * timeout. + */ + timeout = !t ? -1 : t->tv_sec * 1000 + t->tv_nsec / 1000000; + + while (events < min) { + r = nbd_poll(nbd_data->nbd, timeout); + if (r == -1) { + /* error in poll */ + log_err("fio: nbd_poll: %s\n", nbd_get_error()); + return -1; + } + else { + /* poll made progress */ + events += retire_commands(nbd_data->nbd); + } + } + + return events; +} + +static struct io_u *nbd_event(struct thread_data *td, int event) +{ + struct nbd_data *nbd_data = td->io_ops_data; + + if (nbd_data->nr_completed == 0) + return NULL; + + /* XXX We ignore the event number and assume fio calls us + * exactly once for [0..nr_events-1]. + */ + nbd_data->nr_completed--; + return nbd_data->completed[nbd_data->nr_completed]; +} + +static int nbd_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + io_u->engine_data = NULL; + return 0; +} + +static void nbd_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + /* Nothing needs to be done. 
*/ +} + +static int nbd_open_file(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static int nbd_invalidate(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "nbd", + .version = FIO_IOOPS_VERSION, + .options = options, + .option_struct_size = sizeof(struct nbd_options), + .flags = FIO_DISKLESSIO | FIO_NOEXTEND, + + .setup = nbd_setup, + .init = nbd_init, + .cleanup = nbd_cleanup, + .queue = nbd_queue, + .getevents = nbd_getevents, + .event = nbd_event, + .io_u_init = nbd_io_u_init, + .io_u_free = nbd_io_u_free, + + .open_file = nbd_open_file, + .invalidate = nbd_invalidate, +}; + +static void fio_init fio_nbd_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_nbd_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/net.c fio-3.16/engines/net.c --- fio-2.1.3/engines/net.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/net.c 2019-09-20 01:01:52.000000000 +0000 @@ -9,25 +9,29 @@ #include #include #include -#include #include #include #include #include -#include -#include +#include #include #include #include #include "../fio.h" +#include "../verify.h" +#include "../optgroup.h" struct netio_data { int listenfd; int use_splice; + int seq_off; int pipes[2]; struct sockaddr_in addr; + struct sockaddr_in6 addr6; struct sockaddr_un addr_un; + uint64_t udp_send_seq; + uint64_t udp_recv_seq; }; struct netio_options { @@ -38,7 +42,9 @@ unsigned int pingpong; unsigned int nodelay; unsigned int ttl; - char * interface; + unsigned int window_size; + unsigned int mss; + char *intfc; }; struct udp_close_msg { @@ -46,14 +52,23 @@ uint32_t cmd; }; +struct udp_seq { + uint64_t magic; + uint64_t seq; + uint64_t bs; +}; + enum { FIO_LINK_CLOSE = 0x89, FIO_LINK_OPEN_CLOSE_MAGIC = 0x6c696e6b, FIO_LINK_OPEN = 0x98, + FIO_UDP_SEQ_MAGIC = 0x657375716e556563ULL, FIO_TYPE_TCP = 1, FIO_TYPE_UDP = 2, FIO_TYPE_UNIX = 3, + FIO_TYPE_TCP_V6 = 4, + 
FIO_TYPE_UDP_V6 = 5, }; static int str_hostname_cb(void *data, const char *input); @@ -91,10 +106,22 @@ .oval = FIO_TYPE_TCP, .help = "Transmission Control Protocol", }, +#ifdef CONFIG_IPV6 + { .ival = "tcpv6", + .oval = FIO_TYPE_TCP_V6, + .help = "Transmission Control Protocol V6", + }, +#endif { .ival = "udp", .oval = FIO_TYPE_UDP, .help = "User Datagram Protocol", }, +#ifdef CONFIG_IPV6 + { .ival = "udpv6", + .oval = FIO_TYPE_UDP_V6, + .help = "User Datagram Protocol V6", + }, +#endif { .ival = "unix", .oval = FIO_TYPE_UNIX, .help = "UNIX domain socket", @@ -106,6 +133,7 @@ #ifdef CONFIG_TCP_NODELAY { .name = "nodelay", + .lname = "No Delay", .type = FIO_OPT_BOOL, .off1 = offsetof(struct netio_options, nodelay), .help = "Use TCP_NODELAY on TCP connections", @@ -124,6 +152,7 @@ }, { .name = "pingpong", + .lname = "Ping Pong", .type = FIO_OPT_STR_SET, .off1 = offsetof(struct netio_options, pingpong), .help = "Ping-pong IO requests", @@ -134,7 +163,7 @@ .name = "interface", .lname = "net engine interface", .type = FIO_OPT_STR_STORE, - .off1 = offsetof(struct netio_options, interface), + .off1 = offsetof(struct netio_options, intfc), .help = "Network interface to use", .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_NETIO, @@ -150,11 +179,109 @@ .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_NETIO, }, +#ifdef CONFIG_NET_WINDOWSIZE + { + .name = "window_size", + .lname = "Window Size", + .type = FIO_OPT_INT, + .off1 = offsetof(struct netio_options, window_size), + .minval = 0, + .help = "Set socket buffer window size", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_NETIO, + }, +#endif +#ifdef CONFIG_NET_MSS + { + .name = "mss", + .lname = "Maximum segment size", + .type = FIO_OPT_INT, + .off1 = offsetof(struct netio_options, mss), + .minval = 0, + .help = "Set TCP maximum segment size", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_NETIO, + }, +#endif { .name = NULL, }, }; +static inline int is_udp(struct netio_options *o) +{ + return o->proto == 
FIO_TYPE_UDP || o->proto == FIO_TYPE_UDP_V6; +} + +static inline int is_tcp(struct netio_options *o) +{ + return o->proto == FIO_TYPE_TCP || o->proto == FIO_TYPE_TCP_V6; +} + +static inline int is_ipv6(struct netio_options *o) +{ + return o->proto == FIO_TYPE_UDP_V6 || o->proto == FIO_TYPE_TCP_V6; +} + +static int set_window_size(struct thread_data *td, int fd) +{ +#ifdef CONFIG_NET_WINDOWSIZE + struct netio_options *o = td->eo; + unsigned int wss; + int snd, rcv, ret; + + if (!o->window_size) + return 0; + + rcv = o->listen || o->pingpong; + snd = !o->listen || o->pingpong; + wss = o->window_size; + ret = 0; + + if (rcv) { + ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &wss, + sizeof(wss)); + if (ret < 0) + td_verror(td, errno, "rcvbuf window size"); + } + if (snd && !ret) { + ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &wss, + sizeof(wss)); + if (ret < 0) + td_verror(td, errno, "sndbuf window size"); + } + + return ret; +#else + td_verror(td, -EINVAL, "setsockopt window size"); + return -1; +#endif +} + +static int set_mss(struct thread_data *td, int fd) +{ +#ifdef CONFIG_NET_MSS + struct netio_options *o = td->eo; + unsigned int mss; + int ret; + + if (!o->mss || !is_tcp(o)) + return 0; + + mss = o->mss; + ret = setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, (void *) &mss, + sizeof(mss)); + if (ret < 0) + td_verror(td, errno, "setsockopt TCP_MAXSEG"); + + return ret; +#else + td_verror(td, -EINVAL, "setsockopt TCP_MAXSEG"); + return -1; +#endif +} + + /* * Return -1 for error and 'nr events' for a positive number * of events @@ -207,7 +334,7 @@ /* * Make sure we don't see spurious reads to a receiver, and vice versa */ - if (o->proto == FIO_TYPE_TCP) + if (is_tcp(o)) return 0; if ((o->listen && io_u->ddir == DDIR_WRITE) || @@ -247,7 +374,7 @@ */ static int splice_in(struct thread_data *td, struct io_u *io_u) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; return splice_io_u(io_u->file->fd, nd->pipes[1], 
io_u->xfer_buflen); } @@ -258,7 +385,7 @@ static int splice_out(struct thread_data *td, struct io_u *io_u, unsigned int len) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; return splice_io_u(nd->pipes[0], io_u->file->fd, len); } @@ -296,7 +423,7 @@ static int vmsplice_io_u_out(struct thread_data *td, struct io_u *io_u, unsigned int len) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; return vmsplice_io_u(io_u, nd->pipes[0], len); } @@ -306,7 +433,7 @@ */ static int vmsplice_io_u_in(struct thread_data *td, struct io_u *io_u) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; return vmsplice_io_u(io_u, nd->pipes[1], io_u->xfer_buflen); } @@ -354,19 +481,71 @@ } #endif +static void store_udp_seq(struct netio_data *nd, struct io_u *io_u) +{ + struct udp_seq *us; + + if (io_u->xfer_buflen < sizeof(*us)) + return; + + us = io_u->xfer_buf + io_u->xfer_buflen - sizeof(*us); + us->magic = cpu_to_le64((uint64_t) FIO_UDP_SEQ_MAGIC); + us->bs = cpu_to_le64((uint64_t) io_u->xfer_buflen); + us->seq = cpu_to_le64(nd->udp_send_seq++); +} + +static void verify_udp_seq(struct thread_data *td, struct netio_data *nd, + struct io_u *io_u) +{ + struct udp_seq *us; + uint64_t seq; + + if (io_u->xfer_buflen < sizeof(*us)) + return; + + if (nd->seq_off) + return; + + us = io_u->xfer_buf + io_u->xfer_buflen - sizeof(*us); + if (le64_to_cpu(us->magic) != FIO_UDP_SEQ_MAGIC) + return; + if (le64_to_cpu(us->bs) != io_u->xfer_buflen) { + nd->seq_off = 1; + return; + } + + seq = le64_to_cpu(us->seq); + + if (seq != nd->udp_recv_seq) + td->ts.drop_io_u[io_u->ddir] += seq - nd->udp_recv_seq; + + nd->udp_recv_seq = seq + 1; +} + static int fio_netio_send(struct thread_data *td, struct io_u *io_u) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; struct netio_options *o = td->eo; int ret, flags = 0; do { - if (o->proto == FIO_TYPE_UDP) { - 
struct sockaddr *to = (struct sockaddr *) &nd->addr; + if (is_udp(o)) { + const struct sockaddr *to; + socklen_t len; + + if (is_ipv6(o)) { + to = (struct sockaddr *) &nd->addr6; + len = sizeof(nd->addr6); + } else { + to = (struct sockaddr *) &nd->addr; + len = sizeof(nd->addr); + } + + if (td->o.verify == VERIFY_NONE) + store_udp_seq(nd, io_u); ret = sendto(io_u->file->fd, io_u->xfer_buf, - io_u->xfer_buflen, flags, to, - sizeof(*to)); + io_u->xfer_buflen, flags, to, len); } else { /* * if we are going to write more, set MSG_MORE @@ -390,7 +569,7 @@ return ret; } -static int is_udp_close(struct io_u *io_u, int len) +static int is_close_msg(struct io_u *io_u, int len) { struct udp_close_msg *msg; @@ -398,9 +577,9 @@ return 0; msg = io_u->xfer_buf; - if (ntohl(msg->magic) != FIO_LINK_OPEN_CLOSE_MAGIC) + if (le32_to_cpu(msg->magic) != FIO_LINK_OPEN_CLOSE_MAGIC) return 0; - if (ntohl(msg->cmd) != FIO_LINK_CLOSE) + if (le32_to_cpu(msg->cmd) != FIO_LINK_CLOSE) return 0; return 1; @@ -408,19 +587,23 @@ static int fio_netio_recv(struct thread_data *td, struct io_u *io_u) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; struct netio_options *o = td->eo; int ret, flags = 0; do { - if (o->proto == FIO_TYPE_UDP) { - socklen_t l; - socklen_t *len = &l; + if (is_udp(o)) { struct sockaddr *from; + socklen_t l, *len = &l; if (o->listen) { - from = (struct sockaddr *) &nd->addr; - *len = sizeof(nd->addr); + if (!is_ipv6(o)) { + from = (struct sockaddr *) &nd->addr; + *len = sizeof(nd->addr); + } else { + from = (struct sockaddr *) &nd->addr6; + *len = sizeof(nd->addr6); + } } else { from = NULL; len = NULL; @@ -428,13 +611,19 @@ ret = recvfrom(io_u->file->fd, io_u->xfer_buf, io_u->xfer_buflen, flags, from, len); - if (is_udp_close(io_u, ret)) { + + if (is_close_msg(io_u, ret)) { td->done = 1; return 0; } } else { ret = recv(io_u->file->fd, io_u->xfer_buf, io_u->xfer_buflen, flags); + + if (is_close_msg(io_u, ret)) { + td->done = 1; + return 
0; + } } if (ret > 0) break; @@ -447,24 +636,28 @@ flags |= MSG_WAITALL; } while (1); + if (is_udp(o) && td->o.verify == VERIFY_NONE) + verify_udp_seq(td, nd, io_u); + return ret; } -static int __fio_netio_queue(struct thread_data *td, struct io_u *io_u, - enum fio_ddir ddir) +static enum fio_q_status __fio_netio_queue(struct thread_data *td, + struct io_u *io_u, + enum fio_ddir ddir) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; struct netio_options *o = td->eo; int ret; if (ddir == DDIR_WRITE) { - if (!nd->use_splice || o->proto == FIO_TYPE_UDP || + if (!nd->use_splice || is_udp(o) || o->proto == FIO_TYPE_UNIX) ret = fio_netio_send(td, io_u); else ret = fio_netio_splice_out(td, io_u); } else if (ddir == DDIR_READ) { - if (!nd->use_splice || o->proto == FIO_TYPE_UDP || + if (!nd->use_splice || is_udp(o) || o->proto == FIO_TYPE_UNIX) ret = fio_netio_recv(td, io_u); else @@ -473,11 +666,13 @@ ret = 0; /* must be a SYNC */ if (ret != (int) io_u->xfer_buflen) { - if (ret >= 0) { + if (ret > 0) { io_u->resid = io_u->xfer_buflen - ret; io_u->error = 0; return FIO_Q_COMPLETED; - } else { + } else if (!ret) + return FIO_Q_BUSY; + else { int err = errno; if (ddir == DDIR_WRITE && err == EMSGSIZE) @@ -493,7 +688,8 @@ return FIO_Q_COMPLETED; } -static int fio_netio_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_netio_queue(struct thread_data *td, + struct io_u *io_u) { struct netio_options *o = td->eo; int ret; @@ -517,16 +713,22 @@ static int fio_netio_connect(struct thread_data *td, struct fio_file *f) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; struct netio_options *o = td->eo; int type, domain; if (o->proto == FIO_TYPE_TCP) { domain = AF_INET; type = SOCK_STREAM; + } else if (o->proto == FIO_TYPE_TCP_V6) { + domain = AF_INET6; + type = SOCK_STREAM; } else if (o->proto == FIO_TYPE_UDP) { domain = AF_INET; type = SOCK_DGRAM; + } else if (o->proto == 
FIO_TYPE_UDP_V6) { + domain = AF_INET6; + type = SOCK_DGRAM; } else if (o->proto == FIO_TYPE_UNIX) { domain = AF_UNIX; type = SOCK_STREAM; @@ -543,7 +745,7 @@ } #ifdef CONFIG_TCP_NODELAY - if (o->nodelay && o->proto == FIO_TYPE_TCP) { + if (o->nodelay && is_tcp(o)) { int optval = 1; if (setsockopt(f->fd, IPPROTO_TCP, TCP_NODELAY, (void *) &optval, sizeof(int)) < 0) { @@ -553,24 +755,39 @@ } #endif - if (o->proto == FIO_TYPE_UDP) { + if (set_window_size(td, f->fd)) { + close(f->fd); + return 1; + } + if (set_mss(td, f->fd)) { + close(f->fd); + return 1; + } + + if (is_udp(o)) { if (!fio_netio_is_multicast(td->o.filename)) return 0; + if (is_ipv6(o)) { + log_err("fio: multicast not supported on IPv6\n"); + close(f->fd); + return 1; + } - if (o->interface) { + if (o->intfc) { struct in_addr interface_addr; - if (inet_aton(o->interface, &interface_addr) == 0) { + + if (inet_aton(o->intfc, &interface_addr) == 0) { log_err("fio: interface not valid interface IP\n"); close(f->fd); return 1; } - if (setsockopt(f->fd, IPPROTO_IP, IP_MULTICAST_IF, &interface_addr, sizeof(interface_addr)) < 0) { + if (setsockopt(f->fd, IPPROTO_IP, IP_MULTICAST_IF, (const char*)&interface_addr, sizeof(interface_addr)) < 0) { td_verror(td, errno, "setsockopt IP_MULTICAST_IF"); close(f->fd); return 1; } } - if (setsockopt(f->fd, IPPROTO_IP, IP_MULTICAST_TTL, &o->ttl, sizeof(o->ttl)) < 0) { + if (setsockopt(f->fd, IPPROTO_IP, IP_MULTICAST_TTL, (const char*)&o->ttl, sizeof(o->ttl)) < 0) { td_verror(td, errno, "setsockopt IP_MULTICAST_TTL"); close(f->fd); return 1; @@ -584,6 +801,15 @@ close(f->fd); return 1; } + } else if (o->proto == FIO_TYPE_TCP_V6) { + socklen_t len = sizeof(nd->addr6); + + if (connect(f->fd, (struct sockaddr *) &nd->addr6, len) < 0) { + td_verror(td, errno, "connect"); + close(f->fd); + return 1; + } + } else { struct sockaddr_un *addr = &nd->addr_un; socklen_t len; @@ -602,12 +828,12 @@ static int fio_netio_accept(struct thread_data *td, struct fio_file *f) { - struct 
netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; struct netio_options *o = td->eo; - socklen_t socklen = sizeof(nd->addr); + socklen_t socklen; int state; - if (o->proto == FIO_TYPE_UDP) { + if (is_udp(o)) { f->fd = nd->listenfd; return 0; } @@ -620,14 +846,21 @@ if (poll_wait(td, nd->listenfd, POLLIN) < 0) goto err; - f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr, &socklen); + if (o->proto == FIO_TYPE_TCP) { + socklen = sizeof(nd->addr); + f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr, &socklen); + } else { + socklen = sizeof(nd->addr6); + f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr6, &socklen); + } + if (f->fd < 0) { td_verror(td, errno, "accept"); goto err; } #ifdef CONFIG_TCP_NODELAY - if (o->nodelay && o->proto == FIO_TYPE_TCP) { + if (o->nodelay && is_tcp(o)) { int optval = 1; if (setsockopt(f->fd, IPPROTO_TCP, TCP_NODELAY, (void *) &optval, sizeof(int)) < 0) { @@ -645,44 +878,58 @@ return 1; } -static void fio_netio_udp_close(struct thread_data *td, struct fio_file *f) +static void fio_netio_send_close(struct thread_data *td, struct fio_file *f) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; struct udp_close_msg msg; - struct sockaddr *to = (struct sockaddr *) &nd->addr; + struct sockaddr *to; + socklen_t len; int ret; - msg.magic = htonl(FIO_LINK_OPEN_CLOSE_MAGIC); - msg.cmd = htonl(FIO_LINK_CLOSE); + if (is_ipv6(o)) { + to = (struct sockaddr *) &nd->addr6; + len = sizeof(nd->addr6); + } else { + to = (struct sockaddr *) &nd->addr; + len = sizeof(nd->addr); + } + + msg.magic = cpu_to_le32((uint32_t) FIO_LINK_OPEN_CLOSE_MAGIC); + msg.cmd = cpu_to_le32((uint32_t) FIO_LINK_CLOSE); - ret = sendto(f->fd, (void *) &msg, sizeof(msg), MSG_WAITALL, to, - sizeof(nd->addr)); + ret = sendto(f->fd, (void *) &msg, sizeof(msg), MSG_WAITALL, to, len); if (ret < 0) td_verror(td, errno, "sendto udp link close"); } static int 
fio_netio_close_file(struct thread_data *td, struct fio_file *f) { - struct netio_options *o = td->eo; - /* - * If this is an UDP connection, notify the receiver that we are - * closing down the link + * Notify the receiver that we are closing down the link */ - if (o->proto == FIO_TYPE_UDP) - fio_netio_udp_close(td, f); + fio_netio_send_close(td, f); return generic_close_file(td, f); } static int fio_netio_udp_recv_open(struct thread_data *td, struct fio_file *f) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; struct udp_close_msg msg; - struct sockaddr *to = (struct sockaddr *) &nd->addr; - socklen_t len = sizeof(nd->addr); + struct sockaddr *to; + socklen_t len; int ret; + if (is_ipv6(o)) { + len = sizeof(nd->addr6); + to = (struct sockaddr *) &nd->addr6; + } else { + len = sizeof(nd->addr); + to = (struct sockaddr *) &nd->addr; + } + ret = recvfrom(f->fd, (void *) &msg, sizeof(msg), MSG_WAITALL, to, &len); if (ret < 0) { td_verror(td, errno, "recvfrom udp link open"); @@ -696,21 +943,31 @@ return -1; } + fio_gettime(&td->start, NULL); return 0; } -static int fio_netio_udp_send_open(struct thread_data *td, struct fio_file *f) +static int fio_netio_send_open(struct thread_data *td, struct fio_file *f) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; struct udp_close_msg msg; - struct sockaddr *to = (struct sockaddr *) &nd->addr; + struct sockaddr *to; + socklen_t len; int ret; + if (is_ipv6(o)) { + len = sizeof(nd->addr6); + to = (struct sockaddr *) &nd->addr6; + } else { + len = sizeof(nd->addr); + to = (struct sockaddr *) &nd->addr; + } + msg.magic = htonl(FIO_LINK_OPEN_CLOSE_MAGIC); msg.cmd = htonl(FIO_LINK_OPEN); - ret = sendto(f->fd, (void *) &msg, sizeof(msg), MSG_WAITALL, to, - sizeof(nd->addr)); + ret = sendto(f->fd, (void *) &msg, sizeof(msg), MSG_WAITALL, to, len); if (ret < 0) { td_verror(td, errno, 
"sendto udp link open"); return ret; @@ -734,9 +991,9 @@ return ret; } - if (o->proto == FIO_TYPE_UDP) { + if (is_udp(o)) { if (td_write(td)) - ret = fio_netio_udp_send_open(td, f); + ret = fio_netio_send_open(td, f); else { int state; @@ -753,10 +1010,52 @@ return ret; } +static int fio_fill_addr(struct thread_data *td, const char *host, int af, + void *dst, struct addrinfo **res) +{ + struct netio_options *o = td->eo; + struct addrinfo hints; + int ret; + + if (inet_pton(af, host, dst)) + return 0; + + memset(&hints, 0, sizeof(hints)); + + if (is_tcp(o)) + hints.ai_socktype = SOCK_STREAM; + else + hints.ai_socktype = SOCK_DGRAM; + + if (is_ipv6(o)) + hints.ai_family = AF_INET6; + else + hints.ai_family = AF_INET; + + ret = getaddrinfo(host, NULL, &hints, res); + if (ret) { + int e = EINVAL; + char str[128]; + + if (ret == EAI_SYSTEM) + e = errno; + + snprintf(str, sizeof(str), "getaddrinfo: %s", gai_strerror(ret)); + td_verror(td, e, str); + return 1; + } + + return 0; +} + static int fio_netio_setup_connect_inet(struct thread_data *td, const char *host, unsigned short port) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; + struct netio_options *o = td->eo; + struct addrinfo *res = NULL; + void *dst, *src; + int af, len; if (!host) { log_err("fio: connect with no host to connect to.\n"); @@ -769,30 +1068,44 @@ nd->addr.sin_family = AF_INET; nd->addr.sin_port = htons(port); + nd->addr6.sin6_family = AF_INET6; + nd->addr6.sin6_port = htons(port); - if (inet_aton(host, &nd->addr.sin_addr) != 1) { - struct hostent *hent; + if (is_ipv6(o)) { + af = AF_INET6; + dst = &nd->addr6.sin6_addr; + } else { + af = AF_INET; + dst = &nd->addr.sin_addr; + } - hent = gethostbyname(host); - if (!hent) { - td_verror(td, errno, "gethostbyname"); - return 1; - } + if (fio_fill_addr(td, host, af, dst, &res)) + return 1; + + if (!res) + return 0; - memcpy(&nd->addr.sin_addr, hent->h_addr, 4); + if (is_ipv6(o)) { + len = 
sizeof(nd->addr6.sin6_addr); + src = &((struct sockaddr_in6 *) res->ai_addr)->sin6_addr; + } else { + len = sizeof(nd->addr.sin_addr); + src = &((struct sockaddr_in *) res->ai_addr)->sin_addr; } + memcpy(dst, src, len); + freeaddrinfo(res); return 0; } static int fio_netio_setup_connect_unix(struct thread_data *td, const char *path) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; struct sockaddr_un *soun = &nd->addr_un; soun->sun_family = AF_UNIX; - strcpy(soun->sun_path, path); + snprintf(soun->sun_path, sizeof(soun->sun_path), "%s", path); return 0; } @@ -800,7 +1113,7 @@ { struct netio_options *o = td->eo; - if (o->proto == FIO_TYPE_UDP || o->proto == FIO_TYPE_TCP) + if (is_udp(o) || is_tcp(o)) return fio_netio_setup_connect_inet(td, td->o.filename,o->port); else return fio_netio_setup_connect_unix(td, td->o.filename); @@ -808,7 +1121,7 @@ static int fio_netio_setup_listen_unix(struct thread_data *td, const char *path) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; struct sockaddr_un *addr = &nd->addr_un; mode_t mode; int len, fd; @@ -821,9 +1134,8 @@ mode = umask(000); - memset(addr, 0, sizeof(*addr)); addr->sun_family = AF_UNIX; - strcpy(addr->sun_path, path); + snprintf(addr->sun_path, sizeof(addr->sun_path), "%s", path); unlink(path); len = sizeof(addr->sun_family) + strlen(path) + 1; @@ -841,19 +1153,34 @@ static int fio_netio_setup_listen_inet(struct thread_data *td, short port) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; struct netio_options *o = td->eo; struct ip_mreq mr; struct sockaddr_in sin; - int fd, opt, type; + struct sockaddr *saddr; + int fd, opt, type, domain; + socklen_t len; memset(&sin, 0, sizeof(sin)); - if (o->proto == FIO_TYPE_TCP) + + if (o->proto == FIO_TYPE_TCP) { type = SOCK_STREAM; - else + domain = AF_INET; + } else if (o->proto == FIO_TYPE_TCP_V6) { + type = SOCK_STREAM; + domain = AF_INET6; + } else if 
(o->proto == FIO_TYPE_UDP) { + type = SOCK_DGRAM; + domain = AF_INET; + } else if (o->proto == FIO_TYPE_UDP_V6) { type = SOCK_DGRAM; + domain = AF_INET6; + } else { + log_err("fio: unknown proto %d\n", o->proto); + return 1; + } - fd = socket(AF_INET, type, 0); + fd = socket(domain, type, 0); if (fd < 0) { td_verror(td, errno, "socket"); return 1; @@ -873,19 +1200,32 @@ } #endif - if (td->o.filename){ - if(o->proto != FIO_TYPE_UDP || - !fio_netio_is_multicast(td->o.filename)) { + if (set_window_size(td, fd)) { + close(fd); + return 1; + } + if (set_mss(td, fd)) { + close(fd); + return 1; + } + + if (td->o.filename) { + if (!is_udp(o) || !fio_netio_is_multicast(td->o.filename)) { log_err("fio: hostname not valid for non-multicast inbound network IO\n"); close(fd); return 1; } + if (is_ipv6(o)) { + log_err("fio: IPv6 not supported for multicast network IO\n"); + close(fd); + return 1; + } inet_aton(td->o.filename, &sin.sin_addr); mr.imr_multiaddr = sin.sin_addr; - if (o->interface) { - if (inet_aton(o->interface, &mr.imr_interface) == 0) { + if (o->intfc) { + if (inet_aton(o->intfc, &mr.imr_interface) == 0) { log_err("fio: interface not valid interface IP\n"); close(fd); return 1; @@ -893,18 +1233,32 @@ } else { mr.imr_interface.s_addr = htonl(INADDR_ANY); } - if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr)) < 0) { + + if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, (const char*)&mr, sizeof(mr)) < 0) { td_verror(td, errno, "setsockopt IP_ADD_MEMBERSHIP"); close(fd); return 1; } } - nd->addr.sin_family = AF_INET; - nd->addr.sin_addr.s_addr = sin.sin_addr.s_addr ? sin.sin_addr.s_addr : htonl(INADDR_ANY); - nd->addr.sin_port = htons(port); + if (!is_ipv6(o)) { + saddr = (struct sockaddr *) &nd->addr; + len = sizeof(nd->addr); + + nd->addr.sin_family = AF_INET; + nd->addr.sin_addr.s_addr = sin.sin_addr.s_addr ? 
sin.sin_addr.s_addr : htonl(INADDR_ANY); + nd->addr.sin_port = htons(port); + } else { + saddr = (struct sockaddr *) &nd->addr6; + len = sizeof(nd->addr6); + + nd->addr6.sin6_family = AF_INET6; + nd->addr6.sin6_addr = in6addr_any; + nd->addr6.sin6_port = htons(port); + } - if (bind(fd, (struct sockaddr *) &nd->addr, sizeof(nd->addr)) < 0) { + if (bind(fd, saddr, len) < 0) { + close(fd); td_verror(td, errno, "bind"); return 1; } @@ -915,18 +1269,18 @@ static int fio_netio_setup_listen(struct thread_data *td) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; struct netio_options *o = td->eo; int ret; - if (o->proto == FIO_TYPE_UDP || o->proto == FIO_TYPE_TCP) + if (is_udp(o) || is_tcp(o)) ret = fio_netio_setup_listen_inet(td, o->port); else ret = fio_netio_setup_listen_unix(td, td->o.filename); if (ret) return ret; - if (o->proto == FIO_TYPE_UDP) + if (is_udp(o)) return 0; if (listen(nd->listenfd, 10) < 0) { @@ -961,7 +1315,9 @@ return 1; } - if (o->proto != FIO_TYPE_TCP) { + o->port += td->subjob_number; + + if (!is_tcp(o)) { if (o->listen) { log_err("fio: listen only valid for TCP proto IO\n"); return 1; @@ -988,7 +1344,7 @@ static void fio_netio_cleanup(struct thread_data *td) { - struct netio_data *nd = td->io_ops->data; + struct netio_data *nd = td->io_ops_data; if (nd) { if (nd->listenfd != -1) @@ -1007,17 +1363,18 @@ struct netio_data *nd; if (!td->files_index) { - add_file(td, td->o.filename ?: "net"); + add_file(td, td->o.filename ?: "net", 0, 0); td->o.nr_files = td->o.nr_files ?: 1; + td->o.open_files++; } - if (!td->io_ops->data) { - nd = malloc(sizeof(*nd));; + if (!td->io_ops_data) { + nd = malloc(sizeof(*nd)); memset(nd, 0, sizeof(*nd)); nd->listenfd = -1; nd->pipes[0] = nd->pipes[1] = -1; - td->io_ops->data = nd; + td->io_ops_data = nd; } return 0; @@ -1025,7 +1382,7 @@ static void fio_netio_terminate(struct thread_data *td) { - kill(td->pid, SIGUSR2); + kill(td->pid, SIGTERM); } #ifdef CONFIG_LINUX_SPLICE @@ 
-1035,7 +1392,7 @@ fio_netio_setup(td); - nd = td->io_ops->data; + nd = td->io_ops_data; if (nd) { if (pipe(nd->pipes) < 0) return 1; diff -Nru fio-2.1.3/engines/null.c fio-3.16/engines/null.c --- fio-2.1.3/engines/null.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/null.c 2019-09-20 01:01:52.000000000 +0000 @@ -4,11 +4,16 @@ * IO engine that doesn't do any real IO transfers, it just pretends to. * The main purpose is to test fio itself. * + * It also can act as external C++ engine - compiled with: + * + * g++ -O2 -g -shared -rdynamic -fPIC -o cpp_null null.c -DFIO_EXTERNAL_ENGINE + * + * to test it execute: + * + * LD_LIBRARY_PATH=./engines ./fio examples/cpp_null.fio + * */ -#include #include -#include -#include #include #include "../fio.h" @@ -19,20 +24,17 @@ int events; }; -static struct io_u *fio_null_event(struct thread_data *td, int event) +static struct io_u *null_event(struct null_data *nd, int event) { - struct null_data *nd = td->io_ops->data; - return nd->io_us[event]; } -static int fio_null_getevents(struct thread_data *td, unsigned int min_events, - unsigned int fio_unused max, - struct timespec fio_unused *t) +static int null_getevents(struct null_data *nd, unsigned int min_events, + unsigned int fio_unused max, + const struct timespec fio_unused *t) { - struct null_data *nd = td->io_ops->data; int ret = 0; - + if (min_events) { ret = nd->events; nd->events = 0; @@ -41,12 +43,12 @@ return ret; } -static int fio_null_commit(struct thread_data *td) +static int null_commit(struct thread_data *td, struct null_data *nd) { - struct null_data *nd = td->io_ops->data; - if (!nd->events) { +#ifndef FIO_EXTERNAL_ENGINE io_u_mark_submit(td, nd->queued); +#endif nd->events = nd->queued; nd->queued = 0; } @@ -54,10 +56,9 @@ return 0; } -static int fio_null_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status null_queue(struct thread_data *td, + struct null_data *nd, struct io_u *io_u) { - struct null_data *nd = td->io_ops->data; - 
fio_ro_check(td, io_u); if (td->io_ops->flags & FIO_SYNCIO) @@ -69,36 +70,74 @@ return FIO_Q_QUEUED; } -static int fio_null_open(struct thread_data fio_unused *td, - struct fio_file fio_unused *f) +static int null_open(struct null_data fio_unused *nd, + struct fio_file fio_unused *f) { return 0; } -static void fio_null_cleanup(struct thread_data *td) +static void null_cleanup(struct null_data *nd) { - struct null_data *nd = td->io_ops->data; - if (nd) { - if (nd->io_us) - free(nd->io_us); + free(nd->io_us); free(nd); } } -static int fio_null_init(struct thread_data *td) +static struct null_data *null_init(struct thread_data *td) { - struct null_data *nd = malloc(sizeof(*nd)); + struct null_data *nd = (struct null_data *) malloc(sizeof(*nd)); memset(nd, 0, sizeof(*nd)); if (td->o.iodepth != 1) { - nd->io_us = malloc(td->o.iodepth * sizeof(struct io_u *)); + nd->io_us = (struct io_u **) malloc(td->o.iodepth * sizeof(struct io_u *)); memset(nd->io_us, 0, td->o.iodepth * sizeof(struct io_u *)); } else td->io_ops->flags |= FIO_SYNCIO; - td->io_ops->data = nd; + return nd; +} + +#ifndef __cplusplus + +static struct io_u *fio_null_event(struct thread_data *td, int event) +{ + return null_event(td->io_ops_data, event); +} + +static int fio_null_getevents(struct thread_data *td, unsigned int min_events, + unsigned int max, const struct timespec *t) +{ + struct null_data *nd = td->io_ops_data; + return null_getevents(nd, min_events, max, t); +} + +static int fio_null_commit(struct thread_data *td) +{ + return null_commit(td, td->io_ops_data); +} + +static enum fio_q_status fio_null_queue(struct thread_data *td, + struct io_u *io_u) +{ + return null_queue(td, td->io_ops_data, io_u); +} + +static int fio_null_open(struct thread_data *td, struct fio_file *f) +{ + return null_open(td->io_ops_data, f); +} + +static void fio_null_cleanup(struct thread_data *td) +{ + null_cleanup(td->io_ops_data); +} + +static int fio_null_init(struct thread_data *td) +{ + td->io_ops_data = 
null_init(td); + assert(td->io_ops_data); return 0; } @@ -112,7 +151,7 @@ .init = fio_null_init, .cleanup = fio_null_cleanup, .open_file = fio_null_open, - .flags = FIO_DISKLESSIO, + .flags = FIO_DISKLESSIO | FIO_FAKEIO, }; static void fio_init fio_null_register(void) @@ -124,3 +163,114 @@ { unregister_ioengine(&ioengine); } + +#else + +#ifdef FIO_EXTERNAL_ENGINE + +struct NullData { + NullData(struct thread_data *td) + { + impl_ = null_init(td); + assert(impl_); + } + + ~NullData() + { + null_cleanup(impl_); + } + + static NullData *get(struct thread_data *td) + { + return reinterpret_cast(td->io_ops_data); + } + + io_u *fio_null_event(struct thread_data *, int event) + { + return null_event(impl_, event); + } + + int fio_null_getevents(struct thread_data *, unsigned int min_events, + unsigned int max, const struct timespec *t) + { + return null_getevents(impl_, min_events, max, t); + } + + int fio_null_commit(struct thread_data *td) + { + return null_commit(td, impl_); + } + + int fio_null_queue(struct thread_data *td, struct io_u *io_u) + { + return null_queue(td, impl_, io_u); + } + + int fio_null_open(struct thread_data *, struct fio_file *f) + { + return null_open(impl_, f); + } + +private: + struct null_data *impl_; +}; + +extern "C" { + +static struct io_u *fio_null_event(struct thread_data *td, int event) +{ + return NullData::get(td)->fio_null_event(td, event); +} + +static int fio_null_getevents(struct thread_data *td, unsigned int min_events, + unsigned int max, const struct timespec *t) +{ + return NullData::get(td)->fio_null_getevents(td, min_events, max, t); +} + +static int fio_null_commit(struct thread_data *td) +{ + return NullData::get(td)->fio_null_commit(td); +} + +static int fio_null_queue(struct thread_data *td, struct io_u *io_u) +{ + return NullData::get(td)->fio_null_queue(td, io_u); +} + +static int fio_null_open(struct thread_data *td, struct fio_file *f) +{ + return NullData::get(td)->fio_null_open(td, f); +} + +static int 
fio_null_init(struct thread_data *td) +{ + td->io_ops_data = new NullData(td); + return 0; +} + +static void fio_null_cleanup(struct thread_data *td) +{ + delete NullData::get(td); +} + +static struct ioengine_ops ioengine; +void get_ioengine(struct ioengine_ops **ioengine_ptr) +{ + *ioengine_ptr = &ioengine; + + ioengine.name = "cpp_null"; + ioengine.version = FIO_IOOPS_VERSION; + ioengine.queue = fio_null_queue; + ioengine.commit = fio_null_commit; + ioengine.getevents = fio_null_getevents; + ioengine.event = fio_null_event; + ioengine.init = fio_null_init; + ioengine.cleanup = fio_null_cleanup; + ioengine.open_file = fio_null_open; + ioengine.flags = FIO_DISKLESSIO | FIO_FAKEIO; +} +} +#endif /* FIO_EXTERNAL_ENGINE */ + +#endif /* __cplusplus */ diff -Nru fio-2.1.3/engines/pmemblk.c fio-3.16/engines/pmemblk.c --- fio-2.1.3/engines/pmemblk.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/pmemblk.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,446 @@ +/* + * pmemblk: IO engine that uses PMDK libpmemblk to read and write data + * + * Copyright (C) 2016 Hewlett Packard Enterprise Development LP + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License, + * version 2 as published by the Free Software Foundation.. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the Free + * Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +/* + * pmemblk engine + * + * IO engine that uses libpmemblk to read and write data + * + * To use: + * ioengine=pmemblk + * + * Other relevant settings: + * thread=1 REQUIRED + * iodepth=1 + * direct=1 + * unlink=1 + * filename=/mnt/pmem0/fiotestfile,BSIZE,FSIZEMiB + * + * thread must be set to 1 for pmemblk as multiple processes cannot + * open the same block pool file. + * + * iodepth should be set to 1 as pmemblk is always synchronous. + * Use numjobs to scale up. + * + * direct=1 is implied as pmemblk is always direct. A warning message + * is printed if this is not specified. + * + * unlink=1 removes the block pool file after testing, and is optional. + * + * The pmem device must have a DAX-capable filesystem and be mounted + * with DAX enabled. filename must point to a file on that filesystem. + * + * Example: + * mkfs.xfs /dev/pmem0 + * mkdir /mnt/pmem0 + * mount -o dax /dev/pmem0 /mnt/pmem0 + * + * When specifying the filename, if the block pool file does not already + * exist, then the pmemblk engine creates the pool file if you specify + * the block and file sizes. BSIZE is the block size in bytes. + * FSIZEMB is the pool file size in MiB. + * + * See examples/pmemblk.fio for more. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../fio.h" + +/* + * libpmemblk + */ +typedef struct fio_pmemblk_file *fio_pmemblk_file_t; + +struct fio_pmemblk_file { + fio_pmemblk_file_t pmb_next; + char *pmb_filename; + uint64_t pmb_refcnt; + PMEMblkpool *pmb_pool; + size_t pmb_bsize; + size_t pmb_nblocks; +}; + +static fio_pmemblk_file_t Cache; + +static pthread_mutex_t CacheLock = PTHREAD_MUTEX_INITIALIZER; + +#define PMB_CREATE (0x0001) /* should create file */ + +fio_pmemblk_file_t fio_pmemblk_cache_lookup(const char *filename) +{ + fio_pmemblk_file_t i; + + for (i = Cache; i != NULL; i = i->pmb_next) + if (!strcmp(filename, i->pmb_filename)) + return i; + + return NULL; +} + +static void fio_pmemblk_cache_insert(fio_pmemblk_file_t pmb) +{ + pmb->pmb_next = Cache; + Cache = pmb; +} + +static void fio_pmemblk_cache_remove(fio_pmemblk_file_t pmb) +{ + fio_pmemblk_file_t i; + + if (pmb == Cache) { + Cache = Cache->pmb_next; + pmb->pmb_next = NULL; + return; + } + + for (i = Cache; i != NULL; i = i->pmb_next) + if (pmb == i->pmb_next) { + i->pmb_next = i->pmb_next->pmb_next; + pmb->pmb_next = NULL; + return; + } +} + +/* + * to control block size and gross file size at the libpmemblk + * level, we allow the block size and file size to be appended + * to the file name: + * + * path[,bsize,fsizemib] + * + * note that we do not use the fio option "filesize" to dictate + * the file size because we can only give libpmemblk the gross + * file size, which is different from the net or usable file + * size (which is probably what fio wants). + * + * the final path without the parameters is returned in ppath. + * the block size and file size are returned in pbsize and fsize. + * + * note that the user specifies the file size in MiB, but + * we return bytes from here. 
+ */ +static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize, + uint64_t *pfsize) +{ + char *path; + char *s; + uint64_t bsize; + uint64_t fsizemib; + + path = strdup(pathspec); + if (!path) { + *ppath = NULL; + return; + } + + /* extract sizes, if given */ + s = strrchr(path, ','); + if (s && (fsizemib = strtoull(s + 1, NULL, 10))) { + *s = 0; + s = strrchr(path, ','); + if (s && (bsize = strtoull(s + 1, NULL, 10))) { + *s = 0; + *ppath = path; + *pbsize = bsize; + *pfsize = fsizemib << 20; + return; + } + } + + /* size specs not found */ + strcpy(path, pathspec); + *ppath = path; + *pbsize = 0; + *pfsize = 0; +} + +static fio_pmemblk_file_t pmb_open(const char *pathspec, int flags) +{ + fio_pmemblk_file_t pmb; + char *path = NULL; + uint64_t bsize = 0; + uint64_t fsize = 0; + + pmb_parse_path(pathspec, &path, &bsize, &fsize); + if (!path) + return NULL; + + pthread_mutex_lock(&CacheLock); + + pmb = fio_pmemblk_cache_lookup(path); + if (!pmb) { + pmb = malloc(sizeof(*pmb)); + if (!pmb) + goto error; + + /* try opening existing first, create it if needed */ + pmb->pmb_pool = pmemblk_open(path, bsize); + if (!pmb->pmb_pool && (errno == ENOENT) && + (flags & PMB_CREATE) && (0 < fsize) && (0 < bsize)) { + pmb->pmb_pool = + pmemblk_create(path, bsize, fsize, 0644); + } + if (!pmb->pmb_pool) { + log_err("pmemblk: unable to open pmemblk pool file %s (%s)\n", + path, strerror(errno)); + goto error; + } + + pmb->pmb_filename = path; + pmb->pmb_next = NULL; + pmb->pmb_refcnt = 0; + pmb->pmb_bsize = pmemblk_bsize(pmb->pmb_pool); + pmb->pmb_nblocks = pmemblk_nblock(pmb->pmb_pool); + + fio_pmemblk_cache_insert(pmb); + } + + pmb->pmb_refcnt += 1; + + pthread_mutex_unlock(&CacheLock); + + return pmb; + +error: + if (pmb) { + if (pmb->pmb_pool) + pmemblk_close(pmb->pmb_pool); + pmb->pmb_pool = NULL; + pmb->pmb_filename = NULL; + free(pmb); + } + if (path) + free(path); + + pthread_mutex_unlock(&CacheLock); + return NULL; +} + +static void 
pmb_close(fio_pmemblk_file_t pmb, const bool keep) +{ + pthread_mutex_lock(&CacheLock); + + pmb->pmb_refcnt--; + + if (!keep && !pmb->pmb_refcnt) { + pmemblk_close(pmb->pmb_pool); + pmb->pmb_pool = NULL; + free(pmb->pmb_filename); + pmb->pmb_filename = NULL; + fio_pmemblk_cache_remove(pmb); + free(pmb); + } + + pthread_mutex_unlock(&CacheLock); +} + +static int pmb_get_flags(struct thread_data *td, uint64_t *pflags) +{ + static int thread_warned = 0; + static int odirect_warned = 0; + + uint64_t flags = 0; + + if (!td->o.use_thread) { + if (!thread_warned) { + thread_warned = 1; + log_err("pmemblk: must set thread=1 for pmemblk engine\n"); + } + return 1; + } + + if (!td->o.odirect && !odirect_warned) { + odirect_warned = 1; + log_info("pmemblk: direct == 0, but pmemblk is always direct\n"); + } + + if (td->o.allow_create) + flags |= PMB_CREATE; + + (*pflags) = flags; + return 0; +} + +static int fio_pmemblk_open_file(struct thread_data *td, struct fio_file *f) +{ + uint64_t flags = 0; + fio_pmemblk_file_t pmb; + + if (pmb_get_flags(td, &flags)) + return 1; + + pmb = pmb_open(f->file_name, flags); + if (!pmb) + return 1; + + FILE_SET_ENG_DATA(f, pmb); + return 0; +} + +static int fio_pmemblk_close_file(struct thread_data fio_unused *td, + struct fio_file *f) +{ + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); + + if (pmb) + pmb_close(pmb, false); + + FILE_SET_ENG_DATA(f, NULL); + return 0; +} + +static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file *f) +{ + uint64_t flags = 0; + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); + + if (fio_file_size_known(f)) + return 0; + + if (!pmb) { + if (pmb_get_flags(td, &flags)) + return 1; + pmb = pmb_open(f->file_name, flags); + if (!pmb) + return 1; + } + + f->real_file_size = pmb->pmb_bsize * pmb->pmb_nblocks; + + fio_file_set_size_known(f); + + if (!FILE_ENG_DATA(f)) + pmb_close(pmb, true); + + return 0; +} + +static enum fio_q_status fio_pmemblk_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct 
fio_file *f = io_u->file; + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); + + unsigned long long off; + unsigned long len; + void *buf; + + fio_ro_check(td, io_u); + + switch (io_u->ddir) { + case DDIR_READ: + case DDIR_WRITE: + off = io_u->offset; + len = io_u->xfer_buflen; + + io_u->error = EINVAL; + if (off % pmb->pmb_bsize) + break; + if (len % pmb->pmb_bsize) + break; + if ((off + len) / pmb->pmb_bsize > pmb->pmb_nblocks) + break; + + io_u->error = 0; + buf = io_u->xfer_buf; + off /= pmb->pmb_bsize; + len /= pmb->pmb_bsize; + while (0 < len) { + if (io_u->ddir == DDIR_READ && + 0 != pmemblk_read(pmb->pmb_pool, buf, off)) { + io_u->error = errno; + break; + } else if (0 != pmemblk_write(pmb->pmb_pool, buf, off)) { + io_u->error = errno; + break; + } + buf += pmb->pmb_bsize; + off++; + len--; + } + off *= pmb->pmb_bsize; + len *= pmb->pmb_bsize; + io_u->resid = io_u->xfer_buflen - (off - io_u->offset); + break; + case DDIR_SYNC: + case DDIR_DATASYNC: + case DDIR_SYNC_FILE_RANGE: + /* we're always sync'd */ + io_u->error = 0; + break; + default: + io_u->error = EINVAL; + break; + } + + return FIO_Q_COMPLETED; +} + +static int fio_pmemblk_unlink_file(struct thread_data *td, struct fio_file *f) +{ + char *path = NULL; + uint64_t bsize = 0; + uint64_t fsize = 0; + + /* + * we need our own unlink in case the user has specified + * the block and file sizes in the path name. we parse + * the file_name to determine the file name we actually used. 
+ */ + + pmb_parse_path(f->file_name, &path, &bsize, &fsize); + if (!path) + return ENOENT; + + unlink(path); + free(path); + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "pmemblk", + .version = FIO_IOOPS_VERSION, + .queue = fio_pmemblk_queue, + .open_file = fio_pmemblk_open_file, + .close_file = fio_pmemblk_close_file, + .get_file_size = fio_pmemblk_get_file_size, + .unlink_file = fio_pmemblk_unlink_file, + .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL, +}; + +static void fio_init fio_pmemblk_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_pmemblk_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/posixaio.c fio-3.16/engines/posixaio.c --- fio-2.1.3/engines/posixaio.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/posixaio.c 2019-09-20 01:01:52.000000000 +0000 @@ -91,9 +91,9 @@ #define SUSPEND_ENTRIES 8 static int fio_posixaio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec *t) + unsigned int max, const struct timespec *t) { - struct posixaio_data *pd = td->io_ops->data; + struct posixaio_data *pd = td->io_ops_data; os_aiocb_t *suspend_list[SUSPEND_ENTRIES]; struct timespec start; int have_timeout = 0; @@ -109,7 +109,7 @@ r = 0; restart: - memset(suspend_list, 0, sizeof(*suspend_list)); + memset(suspend_list, 0, sizeof(suspend_list)); suspend_entries = 0; io_u_qiter(&td->io_u_all, io_u, i) { int err; @@ -161,15 +161,15 @@ static struct io_u *fio_posixaio_event(struct thread_data *td, int event) { - struct posixaio_data *pd = td->io_ops->data; + struct posixaio_data *pd = td->io_ops_data; return pd->aio_events[event]; } -static int fio_posixaio_queue(struct thread_data *td, - struct io_u *io_u) +static enum fio_q_status fio_posixaio_queue(struct thread_data *td, + struct io_u *io_u) { - struct posixaio_data *pd = td->io_ops->data; + struct posixaio_data *pd = td->io_ops_data; os_aiocb_t *aiocb = 
&io_u->aiocb; int ret; @@ -196,18 +196,20 @@ return FIO_Q_COMPLETED; #endif } - + if (ret) { + int aio_err = errno; + /* * At least OSX has a very low limit on the number of pending * IOs, so if it returns EAGAIN, we are out of resources * to queue more. Just return FIO_Q_BUSY to naturally * drop off at this depth. */ - if (errno == EAGAIN) + if (aio_err == EAGAIN) return FIO_Q_BUSY; - io_u->error = errno; + io_u->error = aio_err; td_verror(td, io_u->error, "xfer"); return FIO_Q_COMPLETED; } @@ -218,7 +220,7 @@ static void fio_posixaio_cleanup(struct thread_data *td) { - struct posixaio_data *pd = td->io_ops->data; + struct posixaio_data *pd = td->io_ops_data; if (pd) { free(pd->aio_events); @@ -234,13 +236,14 @@ pd->aio_events = malloc(td->o.iodepth * sizeof(struct io_u *)); memset(pd->aio_events, 0, td->o.iodepth * sizeof(struct io_u *)); - td->io_ops->data = pd; + td->io_ops_data = pd; return 0; } static struct ioengine_ops ioengine = { .name = "posixaio", .version = FIO_IOOPS_VERSION, + .flags = FIO_ASYNCIO_SYNC_TRIM, .init = fio_posixaio_init, .prep = fio_posixaio_prep, .queue = fio_posixaio_queue, diff -Nru fio-2.1.3/engines/rados.c fio-3.16/engines/rados.c --- fio-2.1.3/engines/rados.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/rados.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,458 @@ +/* + * Ceph Rados engine + * + * IO engine using Ceph's RADOS interface to test low-level performance of + * Ceph OSDs. 
+ * + */ + +#include +#include +#include "fio.h" +#include "../optgroup.h" + +struct fio_rados_iou { + struct thread_data *td; + struct io_u *io_u; + rados_completion_t completion; + rados_write_op_t write_op; +}; + +struct rados_data { + rados_t cluster; + rados_ioctx_t io_ctx; + struct io_u **aio_events; + bool connected; +}; + +/* fio configuration options read from the job file */ +struct rados_options { + void *pad; + char *cluster_name; + char *pool_name; + char *client_name; + int busy_poll; +}; + +static struct fio_option options[] = { + { + .name = "clustername", + .lname = "ceph cluster name", + .type = FIO_OPT_STR_STORE, + .help = "Cluster name for ceph", + .off1 = offsetof(struct rados_options, cluster_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "pool", + .lname = "pool name to use", + .type = FIO_OPT_STR_STORE, + .help = "Ceph pool name to benchmark against", + .off1 = offsetof(struct rados_options, pool_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "clientname", + .lname = "rados engine clientname", + .type = FIO_OPT_STR_STORE, + .help = "Name of the ceph client to access RADOS engine", + .off1 = offsetof(struct rados_options, client_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "busy_poll", + .lname = "busy poll mode", + .type = FIO_OPT_BOOL, + .help = "Busy poll for completions instead of sleeping", + .off1 = offsetof(struct rados_options, busy_poll), + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = NULL, + }, +}; + +static int _fio_setup_rados_data(struct thread_data *td, + struct rados_data **rados_data_ptr) +{ + struct rados_data *rados; + + if (td->io_ops_data) + return 0; + + rados = calloc(1, sizeof(struct rados_data)); + if (!rados) + goto failed; + + rados->connected = false; + + rados->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *)); + if (!rados->aio_events) + goto failed; + 
*rados_data_ptr = rados; + return 0; + +failed: + if (rados) { + if (rados->aio_events) + free(rados->aio_events); + free(rados); + } + return 1; +} + +static void _fio_rados_rm_objects(struct thread_data *td, struct rados_data *rados) +{ + size_t i; + for (i = 0; i < td->o.nr_files; i++) { + struct fio_file *f = td->files[i]; + rados_remove(rados->io_ctx, f->file_name); + } +} + +static int _fio_rados_connect(struct thread_data *td) +{ + struct rados_data *rados = td->io_ops_data; + struct rados_options *o = td->eo; + int r; + const uint64_t file_size = + td->o.size / (td->o.nr_files ? td->o.nr_files : 1u); + struct fio_file *f; + uint32_t i; + + if (o->cluster_name) { + char *client_name = NULL; + + /* + * If we specify cluser name, the rados_create2 + * will not assume 'client.'. name is considered + * as a full type.id namestr + */ + if (o->client_name) { + if (!index(o->client_name, '.')) { + client_name = calloc(1, strlen("client.") + + strlen(o->client_name) + 1); + strcat(client_name, "client."); + strcat(client_name, o->client_name); + } else { + client_name = o->client_name; + } + } + + r = rados_create2(&rados->cluster, o->cluster_name, + client_name, 0); + + if (client_name && !index(o->client_name, '.')) + free(client_name); + } else + r = rados_create(&rados->cluster, o->client_name); + + if (o->pool_name == NULL) { + log_err("rados pool name must be provided.\n"); + goto failed_early; + } + + if (r < 0) { + log_err("rados_create failed.\n"); + goto failed_early; + } + + r = rados_conf_read_file(rados->cluster, NULL); + if (r < 0) { + log_err("rados_conf_read_file failed.\n"); + goto failed_early; + } + + r = rados_connect(rados->cluster); + if (r < 0) { + log_err("rados_connect failed.\n"); + goto failed_early; + } + + r = rados_ioctx_create(rados->cluster, o->pool_name, &rados->io_ctx); + if (r < 0) { + log_err("rados_ioctx_create failed.\n"); + goto failed_shutdown; + } + + for (i = 0; i < td->o.nr_files; i++) { + f = td->files[i]; + 
f->real_file_size = file_size; + r = rados_write(rados->io_ctx, f->file_name, "", 0, 0); + if (r < 0) { + goto failed_obj_create; + } + } + return 0; + +failed_obj_create: + _fio_rados_rm_objects(td, rados); + rados_ioctx_destroy(rados->io_ctx); + rados->io_ctx = NULL; +failed_shutdown: + rados_shutdown(rados->cluster); + rados->cluster = NULL; +failed_early: + return 1; +} + +static void _fio_rados_disconnect(struct rados_data *rados) +{ + if (!rados) + return; + + if (rados->io_ctx) { + rados_ioctx_destroy(rados->io_ctx); + rados->io_ctx = NULL; + } + + if (rados->cluster) { + rados_shutdown(rados->cluster); + rados->cluster = NULL; + } +} + +static void fio_rados_cleanup(struct thread_data *td) +{ + struct rados_data *rados = td->io_ops_data; + + if (rados) { + _fio_rados_rm_objects(td, rados); + _fio_rados_disconnect(rados); + free(rados->aio_events); + free(rados); + } +} + +static enum fio_q_status fio_rados_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct rados_data *rados = td->io_ops_data; + struct fio_rados_iou *fri = io_u->engine_data; + char *object = io_u->file->file_name; + int r = -1; + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_WRITE) { + r = rados_aio_create_completion(fri, NULL, + NULL, &fri->completion); + if (r < 0) { + log_err("rados_aio_create_completion failed.\n"); + goto failed; + } + + r = rados_aio_write(rados->io_ctx, object, fri->completion, + io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + if (r < 0) { + log_err("rados_write failed.\n"); + goto failed_comp; + } + return FIO_Q_QUEUED; + } else if (io_u->ddir == DDIR_READ) { + r = rados_aio_create_completion(fri, NULL, + NULL, &fri->completion); + if (r < 0) { + log_err("rados_aio_create_completion failed.\n"); + goto failed; + } + r = rados_aio_read(rados->io_ctx, object, fri->completion, + io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + if (r < 0) { + log_err("rados_aio_read failed.\n"); + goto failed_comp; + } + return FIO_Q_QUEUED; + } else if 
(io_u->ddir == DDIR_TRIM) { + r = rados_aio_create_completion(fri, NULL, + NULL , &fri->completion); + if (r < 0) { + log_err("rados_aio_create_completion failed.\n"); + goto failed; + } + fri->write_op = rados_create_write_op(); + if (fri->write_op == NULL) { + log_err("rados_create_write_op failed.\n"); + goto failed_comp; + } + rados_write_op_zero(fri->write_op, io_u->offset, + io_u->xfer_buflen); + r = rados_aio_write_op_operate(fri->write_op, rados->io_ctx, + fri->completion, object, NULL, 0); + if (r < 0) { + log_err("rados_aio_write_op_operate failed.\n"); + goto failed_write_op; + } + return FIO_Q_QUEUED; + } + + log_err("WARNING: Only DDIR_READ, DDIR_WRITE and DDIR_TRIM are supported!"); + +failed_write_op: + rados_release_write_op(fri->write_op); +failed_comp: + rados_aio_release(fri->completion); +failed: + io_u->error = -r; + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; +} + +static struct io_u *fio_rados_event(struct thread_data *td, int event) +{ + struct rados_data *rados = td->io_ops_data; + return rados->aio_events[event]; +} + +int fio_rados_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct rados_data *rados = td->io_ops_data; + struct rados_options *o = td->eo; + int busy_poll = o->busy_poll; + unsigned int events = 0; + struct io_u *u; + struct fio_rados_iou *fri; + unsigned int i; + rados_completion_t first_unfinished; + int observed_new = 0; + + /* loop through inflight ios until we find 'min' completions */ + do { + first_unfinished = NULL; + io_u_qiter(&td->io_u_all, u, i) { + if (!(u->flags & IO_U_F_FLIGHT)) + continue; + + fri = u->engine_data; + if (fri->completion) { + if (rados_aio_is_complete(fri->completion)) { + if (fri->write_op != NULL) { + rados_release_write_op(fri->write_op); + fri->write_op = NULL; + } + rados_aio_release(fri->completion); + fri->completion = NULL; + rados->aio_events[events] = u; + events++; + observed_new = 1; + } else if 
(first_unfinished == NULL) { + first_unfinished = fri->completion; + } + } + if (events >= max) + break; + } + if (events >= min) + return events; + if (first_unfinished == NULL || busy_poll) + continue; + + if (!observed_new) + rados_aio_wait_for_complete(first_unfinished); + } while (1); + return events; +} + +static int fio_rados_setup(struct thread_data *td) +{ + struct rados_data *rados = NULL; + int r; + /* allocate engine specific structure to deal with librados. */ + r = _fio_setup_rados_data(td, &rados); + if (r) { + log_err("fio_setup_rados_data failed.\n"); + goto cleanup; + } + td->io_ops_data = rados; + + /* Force single process mode. + */ + td->o.use_thread = 1; + + /* connect in the main thread to determine to determine + * the size of the given RADOS block device. And disconnect + * later on. + */ + r = _fio_rados_connect(td); + if (r) { + log_err("fio_rados_connect failed.\n"); + goto cleanup; + } + rados->connected = true; + + return 0; +cleanup: + fio_rados_cleanup(td); + return r; +} + +/* open/invalidate are noops. 
we set the FIO_DISKLESSIO flag in ioengine_ops to + prevent fio from creating the files +*/ +static int fio_rados_open(struct thread_data *td, struct fio_file *f) +{ + return 0; +} +static int fio_rados_invalidate(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static void fio_rados_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rados_iou *fri = io_u->engine_data; + + if (fri) { + io_u->engine_data = NULL; + fri->td = NULL; + if (fri->completion) + rados_aio_release(fri->completion); + if (fri->write_op) + rados_release_write_op(fri->write_op); + free(fri); + } +} + +static int fio_rados_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rados_iou *fri; + fri = calloc(1, sizeof(*fri)); + fri->io_u = io_u; + fri->td = td; + io_u->engine_data = fri; + return 0; +} + +/* ioengine_ops for get_ioengine() */ +static struct ioengine_ops ioengine = { + .name = "rados", + .version = FIO_IOOPS_VERSION, + .flags = FIO_DISKLESSIO, + .setup = fio_rados_setup, + .queue = fio_rados_queue, + .getevents = fio_rados_getevents, + .event = fio_rados_event, + .cleanup = fio_rados_cleanup, + .open_file = fio_rados_open, + .invalidate = fio_rados_invalidate, + .options = options, + .io_u_init = fio_rados_io_u_init, + .io_u_free = fio_rados_io_u_free, + .option_struct_size = sizeof(struct rados_options), +}; + +static void fio_init fio_rados_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_rados_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/rbd.c fio-3.16/engines/rbd.c --- fio-2.1.3/engines/rbd.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/engines/rbd.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,688 @@ +/* + * rbd engine + * + * IO engine using Ceph's librbd to test RADOS Block Devices. 
+ * + */ + +#include + +#include "../fio.h" +#include "../optgroup.h" + +#ifdef CONFIG_RBD_POLL +/* add for poll */ +#include +#include +#endif + +struct fio_rbd_iou { + struct io_u *io_u; + rbd_completion_t completion; + int io_seen; + int io_complete; +}; + +struct rbd_data { + rados_t cluster; + rados_ioctx_t io_ctx; + rbd_image_t image; + struct io_u **aio_events; + struct io_u **sort_events; + int fd; /* add for poll */ + bool connected; +}; + +struct rbd_options { + void *pad; + char *cluster_name; + char *rbd_name; + char *pool_name; + char *client_name; + int busy_poll; +}; + +static struct fio_option options[] = { + { + .name = "clustername", + .lname = "ceph cluster name", + .type = FIO_OPT_STR_STORE, + .help = "Cluster name for ceph", + .off1 = offsetof(struct rbd_options, cluster_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "rbdname", + .lname = "rbd engine rbdname", + .type = FIO_OPT_STR_STORE, + .help = "RBD name for RBD engine", + .off1 = offsetof(struct rbd_options, rbd_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "pool", + .lname = "rbd engine pool", + .type = FIO_OPT_STR_STORE, + .help = "Name of the pool hosting the RBD for the RBD engine", + .off1 = offsetof(struct rbd_options, pool_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "clientname", + .lname = "rbd engine clientname", + .type = FIO_OPT_STR_STORE, + .help = "Name of the ceph client to access the RBD for the RBD engine", + .off1 = offsetof(struct rbd_options, client_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "busy_poll", + .lname = "Busy poll", + .type = FIO_OPT_BOOL, + .help = "Busy poll for completions instead of sleeping", + .off1 = offsetof(struct rbd_options, busy_poll), + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = NULL, + }, +}; + +static int _fio_setup_rbd_data(struct thread_data 
*td, + struct rbd_data **rbd_data_ptr) +{ + struct rbd_data *rbd; + + if (td->io_ops_data) + return 0; + + rbd = calloc(1, sizeof(struct rbd_data)); + if (!rbd) + goto failed; + + rbd->connected = false; + + /* add for poll, init fd: -1 */ + rbd->fd = -1; + + rbd->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *)); + if (!rbd->aio_events) + goto failed; + + rbd->sort_events = calloc(td->o.iodepth, sizeof(struct io_u *)); + if (!rbd->sort_events) + goto failed; + + *rbd_data_ptr = rbd; + return 0; + +failed: + if (rbd) { + if (rbd->aio_events) + free(rbd->aio_events); + if (rbd->sort_events) + free(rbd->sort_events); + free(rbd); + } + return 1; + +} + +#ifdef CONFIG_RBD_POLL +static bool _fio_rbd_setup_poll(struct rbd_data *rbd) +{ + int r; + + /* add for rbd poll */ + rbd->fd = eventfd(0, EFD_SEMAPHORE); + if (rbd->fd < 0) { + log_err("eventfd failed.\n"); + return false; + } + + r = rbd_set_image_notification(rbd->image, rbd->fd, EVENT_TYPE_EVENTFD); + if (r < 0) { + log_err("rbd_set_image_notification failed.\n"); + close(rbd->fd); + rbd->fd = -1; + return false; + } + + return true; +} +#else +static bool _fio_rbd_setup_poll(struct rbd_data *rbd) +{ + return true; +} +#endif + +static int _fio_rbd_connect(struct thread_data *td) +{ + struct rbd_data *rbd = td->io_ops_data; + struct rbd_options *o = td->eo; + int r; + + if (o->cluster_name) { + char *client_name = NULL; + + /* + * If we specify cluser name, the rados_create2 + * will not assume 'client.'. 
name is considered + * as a full type.id namestr + */ + if (o->client_name) { + if (!index(o->client_name, '.')) { + client_name = calloc(1, strlen("client.") + + strlen(o->client_name) + 1); + strcat(client_name, "client."); + strcat(client_name, o->client_name); + } else { + client_name = o->client_name; + } + } + + r = rados_create2(&rbd->cluster, o->cluster_name, + client_name, 0); + + if (client_name && !index(o->client_name, '.')) + free(client_name); + } else + r = rados_create(&rbd->cluster, o->client_name); + + if (r < 0) { + log_err("rados_create failed.\n"); + goto failed_early; + } + + r = rados_conf_read_file(rbd->cluster, NULL); + if (r < 0) { + log_err("rados_conf_read_file failed.\n"); + goto failed_early; + } + + r = rados_connect(rbd->cluster); + if (r < 0) { + log_err("rados_connect failed.\n"); + goto failed_shutdown; + } + + r = rados_ioctx_create(rbd->cluster, o->pool_name, &rbd->io_ctx); + if (r < 0) { + log_err("rados_ioctx_create failed.\n"); + goto failed_shutdown; + } + + r = rbd_open(rbd->io_ctx, o->rbd_name, &rbd->image, NULL /*snap */ ); + if (r < 0) { + log_err("rbd_open failed.\n"); + goto failed_open; + } + + if (!_fio_rbd_setup_poll(rbd)) + goto failed_poll; + + return 0; + +failed_poll: + rbd_close(rbd->image); + rbd->image = NULL; +failed_open: + rados_ioctx_destroy(rbd->io_ctx); + rbd->io_ctx = NULL; +failed_shutdown: + rados_shutdown(rbd->cluster); + rbd->cluster = NULL; +failed_early: + return 1; +} + +static void _fio_rbd_disconnect(struct rbd_data *rbd) +{ + if (!rbd) + return; + + /* close eventfd */ + if (rbd->fd != -1) { + close(rbd->fd); + rbd->fd = -1; + } + + /* shutdown everything */ + if (rbd->image) { + rbd_close(rbd->image); + rbd->image = NULL; + } + + if (rbd->io_ctx) { + rados_ioctx_destroy(rbd->io_ctx); + rbd->io_ctx = NULL; + } + + if (rbd->cluster) { + rados_shutdown(rbd->cluster); + rbd->cluster = NULL; + } +} + +static void _fio_rbd_finish_aiocb(rbd_completion_t comp, void *data) +{ + struct fio_rbd_iou 
*fri = data; + struct io_u *io_u = fri->io_u; + ssize_t ret; + + /* + * Looks like return value is 0 for success, or < 0 for + * a specific error. So we have to assume that it can't do + * partial completions. + */ + ret = rbd_aio_get_return_value(fri->completion); + if (ret < 0) { + io_u->error = -ret; + io_u->resid = io_u->xfer_buflen; + } else + io_u->error = 0; + + fri->io_complete = 1; +} + +static struct io_u *fio_rbd_event(struct thread_data *td, int event) +{ + struct rbd_data *rbd = td->io_ops_data; + + return rbd->aio_events[event]; +} + +static inline int fri_check_complete(struct rbd_data *rbd, struct io_u *io_u, + unsigned int *events) +{ + struct fio_rbd_iou *fri = io_u->engine_data; + + if (fri->io_complete) { + fri->io_seen = 1; + rbd->aio_events[*events] = io_u; + (*events)++; + + rbd_aio_release(fri->completion); + return 1; + } + + return 0; +} + +#ifndef CONFIG_RBD_POLL +static inline int rbd_io_u_seen(struct io_u *io_u) +{ + struct fio_rbd_iou *fri = io_u->engine_data; + + return fri->io_seen; +} +#endif + +static void rbd_io_u_wait_complete(struct io_u *io_u) +{ + struct fio_rbd_iou *fri = io_u->engine_data; + + rbd_aio_wait_for_complete(fri->completion); +} + +static int rbd_io_u_cmp(const void *p1, const void *p2) +{ + const struct io_u **a = (const struct io_u **) p1; + const struct io_u **b = (const struct io_u **) p2; + uint64_t at, bt; + + at = utime_since_now(&(*a)->start_time); + bt = utime_since_now(&(*b)->start_time); + + if (at < bt) + return -1; + else if (at == bt) + return 0; + else + return 1; +} + +static int rbd_iter_events(struct thread_data *td, unsigned int *events, + unsigned int min_evts, int wait) +{ + struct rbd_data *rbd = td->io_ops_data; + unsigned int this_events = 0; + struct io_u *io_u; + int i, sidx = 0; + +#ifdef CONFIG_RBD_POLL + int ret = 0; + int event_num = 0; + struct fio_rbd_iou *fri = NULL; + rbd_completion_t comps[min_evts]; + uint64_t counter; + bool completed; + + struct pollfd pfd; + pfd.fd = rbd->fd; 
+ pfd.events = POLLIN; + + ret = poll(&pfd, 1, wait ? -1 : 0); + if (ret <= 0) + return 0; + if (!(pfd.revents & POLLIN)) + return 0; + + event_num = rbd_poll_io_events(rbd->image, comps, min_evts); + + for (i = 0; i < event_num; i++) { + fri = rbd_aio_get_arg(comps[i]); + io_u = fri->io_u; + + /* best effort to decrement the semaphore */ + ret = read(rbd->fd, &counter, sizeof(counter)); + if (ret <= 0) + log_err("rbd_iter_events failed to decrement semaphore.\n"); + + completed = fri_check_complete(rbd, io_u, events); + assert(completed); + + this_events++; + } +#else + io_u_qiter(&td->io_u_all, io_u, i) { + if (!(io_u->flags & IO_U_F_FLIGHT)) + continue; + if (rbd_io_u_seen(io_u)) + continue; + + if (fri_check_complete(rbd, io_u, events)) + this_events++; + else if (wait) + rbd->sort_events[sidx++] = io_u; + } +#endif + + if (!wait || !sidx) + return this_events; + + /* + * Sort events, oldest issue first, then wait on as many as we + * need in order of age. If we have enough events, stop waiting, + * and just check if any of the older ones are done. + */ + if (sidx > 1) + qsort(rbd->sort_events, sidx, sizeof(struct io_u *), rbd_io_u_cmp); + + for (i = 0; i < sidx; i++) { + io_u = rbd->sort_events[i]; + + if (fri_check_complete(rbd, io_u, events)) { + this_events++; + continue; + } + + /* + * Stop waiting when we have enough, but continue checking + * all pending IOs if they are complete. 
+ */ + if (*events >= min_evts) + continue; + + rbd_io_u_wait_complete(io_u); + + if (fri_check_complete(rbd, io_u, events)) + this_events++; + } + + return this_events; +} + +static int fio_rbd_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + unsigned int this_events, events = 0; + struct rbd_options *o = td->eo; + int wait = 0; + + do { + this_events = rbd_iter_events(td, &events, min, wait); + + if (events >= min) + break; + if (this_events) + continue; + + if (!o->busy_poll) + wait = 1; + else + nop; + } while (1); + + return events; +} + +static enum fio_q_status fio_rbd_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct rbd_data *rbd = td->io_ops_data; + struct fio_rbd_iou *fri = io_u->engine_data; + int r = -1; + + fio_ro_check(td, io_u); + + fri->io_seen = 0; + fri->io_complete = 0; + + r = rbd_aio_create_completion(fri, _fio_rbd_finish_aiocb, + &fri->completion); + if (r < 0) { + log_err("rbd_aio_create_completion failed.\n"); + goto failed; + } + + if (io_u->ddir == DDIR_WRITE) { + r = rbd_aio_write(rbd->image, io_u->offset, io_u->xfer_buflen, + io_u->xfer_buf, fri->completion); + if (r < 0) { + log_err("rbd_aio_write failed.\n"); + goto failed_comp; + } + + } else if (io_u->ddir == DDIR_READ) { + r = rbd_aio_read(rbd->image, io_u->offset, io_u->xfer_buflen, + io_u->xfer_buf, fri->completion); + + if (r < 0) { + log_err("rbd_aio_read failed.\n"); + goto failed_comp; + } + } else if (io_u->ddir == DDIR_TRIM) { + r = rbd_aio_discard(rbd->image, io_u->offset, + io_u->xfer_buflen, fri->completion); + if (r < 0) { + log_err("rbd_aio_discard failed.\n"); + goto failed_comp; + } + } else if (io_u->ddir == DDIR_SYNC) { + r = rbd_aio_flush(rbd->image, fri->completion); + if (r < 0) { + log_err("rbd_flush failed.\n"); + goto failed_comp; + } + } else { + dprint(FD_IO, "%s: Warning: unhandled ddir: %d\n", __func__, + io_u->ddir); + r = -EINVAL; + goto failed_comp; + } + + return FIO_Q_QUEUED; 
+failed_comp: + rbd_aio_release(fri->completion); +failed: + io_u->error = -r; + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; +} + +static int fio_rbd_init(struct thread_data *td) +{ + int r; + struct rbd_data *rbd = td->io_ops_data; + + if (rbd->connected) + return 0; + + r = _fio_rbd_connect(td); + if (r) { + log_err("fio_rbd_connect failed, return code: %d .\n", r); + goto failed; + } + + return 0; + +failed: + return 1; +} + +static void fio_rbd_cleanup(struct thread_data *td) +{ + struct rbd_data *rbd = td->io_ops_data; + + if (rbd) { + _fio_rbd_disconnect(rbd); + free(rbd->aio_events); + free(rbd->sort_events); + free(rbd); + } +} + +static int fio_rbd_setup(struct thread_data *td) +{ + rbd_image_info_t info; + struct fio_file *f; + struct rbd_data *rbd = NULL; + int r; + + /* allocate engine specific structure to deal with librbd. */ + r = _fio_setup_rbd_data(td, &rbd); + if (r) { + log_err("fio_setup_rbd_data failed.\n"); + goto cleanup; + } + td->io_ops_data = rbd; + + /* librbd does not allow us to run first in the main thread and later + * in a fork child. It needs to be the same process context all the + * time. + */ + td->o.use_thread = 1; + + /* connect in the main thread to determine to determine + * the size of the given RADOS block device. And disconnect + * later on. + */ + r = _fio_rbd_connect(td); + if (r) { + log_err("fio_rbd_connect failed.\n"); + goto cleanup; + } + rbd->connected = true; + + /* get size of the RADOS block device */ + r = rbd_stat(rbd->image, &info, sizeof(info)); + if (r < 0) { + log_err("rbd_status failed.\n"); + goto cleanup; + } else if (info.size == 0) { + log_err("image size should be larger than zero.\n"); + r = -EINVAL; + goto cleanup; + } + + dprint(FD_IO, "rbd-engine: image size: %" PRIu64 "\n", info.size); + + /* taken from "net" engine. Pretend we deal with files, + * even if we do not have any ideas about files. + * The size of the RBD is set instead of a artificial file. 
+ */ + if (!td->files_index) { + add_file(td, td->o.filename ? : "rbd", 0, 0); + td->o.nr_files = td->o.nr_files ? : 1; + td->o.open_files++; + } + f = td->files[0]; + f->real_file_size = info.size; + + return 0; + +cleanup: + fio_rbd_cleanup(td); + return r; +} + +static int fio_rbd_open(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static int fio_rbd_invalidate(struct thread_data *td, struct fio_file *f) +{ +#if defined(CONFIG_RBD_INVAL) + struct rbd_data *rbd = td->io_ops_data; + + return rbd_invalidate_cache(rbd->image); +#else + return 0; +#endif +} + +static void fio_rbd_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rbd_iou *fri = io_u->engine_data; + + if (fri) { + io_u->engine_data = NULL; + free(fri); + } +} + +static int fio_rbd_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rbd_iou *fri; + + fri = calloc(1, sizeof(*fri)); + fri->io_u = io_u; + io_u->engine_data = fri; + return 0; +} + +static struct ioengine_ops ioengine = { + .name = "rbd", + .version = FIO_IOOPS_VERSION, + .setup = fio_rbd_setup, + .init = fio_rbd_init, + .queue = fio_rbd_queue, + .getevents = fio_rbd_getevents, + .event = fio_rbd_event, + .cleanup = fio_rbd_cleanup, + .open_file = fio_rbd_open, + .invalidate = fio_rbd_invalidate, + .options = options, + .io_u_init = fio_rbd_io_u_init, + .io_u_free = fio_rbd_io_u_free, + .option_struct_size = sizeof(struct rbd_options), +}; + +static void fio_init fio_rbd_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_rbd_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff -Nru fio-2.1.3/engines/rdma.c fio-3.16/engines/rdma.c --- fio-2.1.3/engines/rdma.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/rdma.c 2019-09-20 01:01:52.000000000 +0000 @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include @@ -41,9 +41,9 @@ #include "../fio.h" #include "../hash.h" +#include "../optgroup.h" #include -#include 
#define FIO_RDMA_MAX_IO_DEPTH 512 @@ -55,6 +55,88 @@ FIO_RDMA_CHA_RECV }; +struct rdmaio_options { + struct thread_data *td; + unsigned int port; + enum rdma_io_mode verb; + char *bindname; +}; + +static int str_hostname_cb(void *data, const char *input) +{ + struct rdmaio_options *o = data; + + if (o->td->o.filename) + free(o->td->o.filename); + o->td->o.filename = strdup(input); + return 0; +} + +static struct fio_option options[] = { + { + .name = "hostname", + .lname = "rdma engine hostname", + .type = FIO_OPT_STR_STORE, + .cb = str_hostname_cb, + .help = "Hostname for RDMA IO engine", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RDMA, + }, + { + .name = "bindname", + .lname = "rdma engine bindname", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct rdmaio_options, bindname), + .help = "Bind for RDMA IO engine", + .def = "", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RDMA, + }, + { + .name = "port", + .lname = "rdma engine port", + .type = FIO_OPT_INT, + .off1 = offsetof(struct rdmaio_options, port), + .minval = 1, + .maxval = 65535, + .help = "Port to use for RDMA connections", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RDMA, + }, + { + .name = "verb", + .lname = "RDMA engine verb", + .alias = "proto", + .type = FIO_OPT_STR, + .off1 = offsetof(struct rdmaio_options, verb), + .help = "RDMA engine verb", + .def = "write", + .posval = { + { .ival = "write", + .oval = FIO_RDMA_MEM_WRITE, + .help = "Memory Write", + }, + { .ival = "read", + .oval = FIO_RDMA_MEM_READ, + .help = "Memory Read", + }, + { .ival = "send", + .oval = FIO_RDMA_CHA_SEND, + .help = "Posted Send", + }, + { .ival = "recv", + .oval = FIO_RDMA_CHA_RECV, + .help = "Posted Receive", + }, + }, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RDMA, + }, + { + .name = NULL, + }, +}; + struct remote_u { uint64_t buf; uint32_t rkey; @@ -66,6 +148,7 @@ uint32_t nr; /* client: io depth server: number of records for memory semantic */ + uint32_t max_bs; /* maximum block 
size */ struct remote_u rmt_us[FIO_RDMA_MAX_IO_DEPTH]; }; @@ -118,13 +201,22 @@ static int client_recv(struct thread_data *td, struct ibv_wc *wc) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; + unsigned int max_bs; if (wc->byte_len != sizeof(rd->recv_buf)) { log_err("Received bogus data, size %d\n", wc->byte_len); return 1; } + max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]); + if (max_bs > ntohl(rd->recv_buf.max_bs)) { + log_err("fio: Server's block size (%d) must be greater than or " + "equal to the client's block size (%d)!\n", + ntohl(rd->recv_buf.max_bs), max_bs); + return 1; + } + /* store mr info for MEMORY semantic */ if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) || (rd->rdma_protocol == FIO_RDMA_MEM_READ)) { @@ -134,7 +226,7 @@ rd->rmt_nr = ntohl(rd->recv_buf.nr); for (i = 0; i < rd->rmt_nr; i++) { - rd->rmt_us[i].buf = ntohll(rd->recv_buf.rmt_us[i].buf); + rd->rmt_us[i].buf = be64_to_cpu(rd->recv_buf.rmt_us[i].buf); rd->rmt_us[i].rkey = ntohl(rd->recv_buf.rmt_us[i].rkey); rd->rmt_us[i].size = ntohl(rd->recv_buf.rmt_us[i].size); @@ -150,7 +242,8 @@ static int server_recv(struct thread_data *td, struct ibv_wc *wc) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; + unsigned int max_bs; if (wc->wr_id == FIO_RDMA_MAX_IO_DEPTH) { rd->rdma_protocol = ntohl(rd->recv_buf.mode); @@ -158,6 +251,15 @@ /* CHANNEL semantic, do nothing */ if (rd->rdma_protocol == FIO_RDMA_CHA_SEND) rd->rdma_protocol = FIO_RDMA_CHA_RECV; + + max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]); + if (max_bs < ntohl(rd->recv_buf.max_bs)) { + log_err("fio: Server's block size (%d) must be greater than or " + "equal to the client's block size (%d)!\n", + ntohl(rd->recv_buf.max_bs), max_bs); + return 1; + } + } return 0; @@ -165,7 +267,7 @@ static int cq_event_handler(struct thread_data *td, enum ibv_wc_opcode opcode) { - struct rdmaio_data *rd = td->io_ops->data; + struct 
rdmaio_data *rd = td->io_ops_data; struct ibv_wc wc; struct rdma_io_u_data *r_io_u_d; int ret; @@ -186,9 +288,12 @@ case IBV_WC_RECV: if (rd->is_client == 1) - client_recv(td, &wc); + ret = client_recv(td, &wc); else - server_recv(td, &wc); + ret = server_recv(td, &wc); + + if (ret) + return -1; if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH) break; @@ -258,6 +363,7 @@ } rd->cq_event_num++; } + if (ret) { log_err("fio: poll error %d\n", ret); return 1; @@ -272,7 +378,7 @@ */ static int rdma_poll_wait(struct thread_data *td, enum ibv_wc_opcode opcode) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct ibv_cq *ev_cq; void *ev_ctx; int ret; @@ -297,7 +403,7 @@ } ret = cq_event_handler(td, opcode); - if (ret < 1) + if (ret == 0) goto again; ibv_ack_cq_events(rd->cq, ret); @@ -309,7 +415,7 @@ static int fio_rdmaio_setup_qp(struct thread_data *td) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct ibv_qp_init_attr init_attr; int qp_depth = td->o.iodepth * 2; /* 2 times of io depth */ @@ -319,7 +425,7 @@ rd->pd = ibv_alloc_pd(rd->cm_id->verbs); if (rd->pd == NULL) { - log_err("fio: ibv_alloc_pd fail\n"); + log_err("fio: ibv_alloc_pd fail: %m\n"); return 1; } @@ -328,7 +434,7 @@ else rd->channel = ibv_create_comp_channel(rd->cm_id->verbs); if (rd->channel == NULL) { - log_err("fio: ibv_create_comp_channel fail\n"); + log_err("fio: ibv_create_comp_channel fail: %m\n"); goto err1; } @@ -342,12 +448,12 @@ rd->cq = ibv_create_cq(rd->cm_id->verbs, qp_depth, rd, rd->channel, 0); if (rd->cq == NULL) { - log_err("fio: ibv_create_cq failed\n"); + log_err("fio: ibv_create_cq failed: %m\n"); goto err2; } if (ibv_req_notify_cq(rd->cq, 0) != 0) { - log_err("fio: ibv_create_cq failed\n"); + log_err("fio: ibv_req_notify_cq failed: %m\n"); goto err3; } @@ -363,13 +469,13 @@ if (rd->is_client == 0) { if (rdma_create_qp(rd->child_cm_id, rd->pd, &init_attr) != 0) { - log_err("fio: rdma_create_qp failed\n"); 
+ log_err("fio: rdma_create_qp failed: %m\n"); goto err3; } rd->qp = rd->child_cm_id->qp; } else { if (rdma_create_qp(rd->cm_id, rd->pd, &init_attr) != 0) { - log_err("fio: rdma_create_qp failed\n"); + log_err("fio: rdma_create_qp failed: %m\n"); goto err3; } rd->qp = rd->cm_id->qp; @@ -389,19 +495,19 @@ static int fio_rdmaio_setup_control_msg_buffers(struct thread_data *td) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; rd->recv_mr = ibv_reg_mr(rd->pd, &rd->recv_buf, sizeof(rd->recv_buf), IBV_ACCESS_LOCAL_WRITE); if (rd->recv_mr == NULL) { - log_err("fio: recv_buf reg_mr failed\n"); + log_err("fio: recv_buf reg_mr failed: %m\n"); return 1; } rd->send_mr = ibv_reg_mr(rd->pd, &rd->send_buf, sizeof(rd->send_buf), 0); if (rd->send_mr == NULL) { - log_err("fio: send_buf reg_mr failed\n"); + log_err("fio: send_buf reg_mr failed: %m\n"); ibv_dereg_mr(rd->recv_mr); return 1; } @@ -433,7 +539,7 @@ struct rdma_event_channel *channel, enum rdma_cm_event_type wait_event) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct rdma_cm_event *event; int ret; @@ -465,7 +571,7 @@ static int fio_rdmaio_prep(struct thread_data *td, struct io_u *io_u) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct rdma_io_u_data *r_io_u_d; r_io_u_d = io_u->engine_data; @@ -508,7 +614,7 @@ static struct io_u *fio_rdmaio_event(struct thread_data *td, int event) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct io_u *io_u; int i; @@ -524,9 +630,9 @@ } static int fio_rdmaio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec *t) + unsigned int max, const struct timespec *t) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; enum ibv_wc_opcode comp_opcode; struct ibv_cq *ev_cq; void *ev_ctx; @@ -588,7 +694,7 @@ static int fio_rdmaio_send(struct 
thread_data *td, struct io_u **io_us, unsigned int nr) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct ibv_send_wr *bad_wr; #if 0 enum ibv_wc_opcode comp_opcode; @@ -635,7 +741,7 @@ } if (ibv_post_send(rd->qp, &r_io_u_d->sq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_send fail\n"); + log_err("fio: ibv_post_send fail: %m\n"); return -1; } @@ -651,7 +757,7 @@ static int fio_rdmaio_recv(struct thread_data *td, struct io_u **io_us, unsigned int nr) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct ibv_recv_wr *bad_wr; struct rdma_io_u_data *r_io_u_d; int i; @@ -663,7 +769,7 @@ r_io_u_d = io_us[i]->engine_data; if (ibv_post_recv(rd->qp, &r_io_u_d->rq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_recv fail\n"); + log_err("fio: ibv_post_recv fail: %m\n"); return 1; } } @@ -671,7 +777,7 @@ || (rd->rdma_protocol == FIO_RDMA_MEM_WRITE)) { /* re-post the rq_wr */ if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_recv fail\n"); + log_err("fio: ibv_post_recv fail: %m\n"); return 1; } @@ -685,9 +791,10 @@ return i; } -static int fio_rdmaio_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_rdmaio_queue(struct thread_data *td, + struct io_u *io_u) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; fio_ro_check(td, io_u); @@ -705,8 +812,8 @@ static void fio_rdmaio_queued(struct thread_data *td, struct io_u **io_us, unsigned int nr) { - struct rdmaio_data *rd = td->io_ops->data; - struct timeval now; + struct rdmaio_data *rd = td->io_ops_data; + struct timespec now; unsigned int i; if (!fio_fill_issue_time(td)) @@ -728,7 +835,7 @@ static int fio_rdmaio_commit(struct thread_data *td) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct io_u **io_us; int ret; @@ -760,7 +867,7 @@ static int fio_rdmaio_connect(struct thread_data *td, struct 
fio_file *f) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct rdma_conn_param conn_param; struct ibv_send_wr *bad_wr; @@ -770,7 +877,7 @@ conn_param.retry_count = 10; if (rdma_connect(rd->cm_id, &conn_param) != 0) { - log_err("fio: rdma_connect fail\n"); + log_err("fio: rdma_connect fail: %m\n"); return 1; } @@ -785,14 +892,16 @@ rd->send_buf.nr = htonl(td->o.iodepth); if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_send fail"); + log_err("fio: ibv_post_send fail: %m\n"); return 1; } - rdma_poll_wait(td, IBV_WC_SEND); + if (rdma_poll_wait(td, IBV_WC_SEND) < 0) + return 1; /* wait for remote MR info from server side */ - rdma_poll_wait(td, IBV_WC_RECV); + if (rdma_poll_wait(td, IBV_WC_RECV) < 0) + return 1; /* In SEND/RECV test, it's a good practice to setup the iodepth of * of the RECV side deeper than that of the SEND side to @@ -809,9 +918,10 @@ static int fio_rdmaio_accept(struct thread_data *td, struct fio_file *f) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct rdma_conn_param conn_param; struct ibv_send_wr *bad_wr; + int ret = 0; /* rdma_accept() - then wait for accept success */ memset(&conn_param, 0, sizeof(conn_param)); @@ -819,7 +929,7 @@ conn_param.initiator_depth = 1; if (rdma_accept(rd->child_cm_id, &conn_param) != 0) { - log_err("fio: rdma_accept\n"); + log_err("fio: rdma_accept: %m\n"); return 1; } @@ -830,16 +940,17 @@ } /* wait for request */ - rdma_poll_wait(td, IBV_WC_RECV); + ret = rdma_poll_wait(td, IBV_WC_RECV) < 0; if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_send fail"); + log_err("fio: ibv_post_send fail: %m\n"); return 1; } - rdma_poll_wait(td, IBV_WC_SEND); + if (rdma_poll_wait(td, IBV_WC_SEND) < 0) + return 1; - return 0; + return ret; } static int fio_rdmaio_open_file(struct thread_data *td, struct fio_file *f) @@ -852,7 +963,7 @@ static int fio_rdmaio_close_file(struct 
thread_data *td, struct fio_file *f) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; struct ibv_send_wr *bad_wr; /* unregister rdma buffer */ @@ -865,11 +976,11 @@ || (rd->rdma_protocol == FIO_RDMA_MEM_READ))) { if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_send fail"); + log_err("fio: ibv_post_send fail: %m\n"); return 1; } - dprint(FD_IO, "fio: close infomation sent success\n"); + dprint(FD_IO, "fio: close information sent success\n"); rdma_poll_wait(td, IBV_WC_SEND); } @@ -905,30 +1016,53 @@ return 0; } +static int aton(struct thread_data *td, const char *host, + struct sockaddr_in *addr) +{ + if (inet_aton(host, &addr->sin_addr) != 1) { + struct hostent *hent; + + hent = gethostbyname(host); + if (!hent) { + td_verror(td, errno, "gethostbyname"); + return 1; + } + + memcpy(&addr->sin_addr, hent->h_addr, 4); + } + return 0; +} + static int fio_rdmaio_setup_connect(struct thread_data *td, const char *host, unsigned short port) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; + struct rdmaio_options *o = td->eo; + struct sockaddr_storage addrb; struct ibv_recv_wr *bad_wr; int err; rd->addr.sin_family = AF_INET; rd->addr.sin_port = htons(port); - if (inet_aton(host, &rd->addr.sin_addr) != 1) { - struct hostent *hent; + err = aton(td, host, &rd->addr); + if (err) + return err; - hent = gethostbyname(host); - if (!hent) { - td_verror(td, errno, "gethostbyname"); - return 1; - } + /* resolve route */ + if (strcmp(o->bindname, "") != 0) { + addrb.ss_family = AF_INET; + err = aton(td, o->bindname, (struct sockaddr_in *)&addrb); + if (err) + return err; + err = rdma_resolve_addr(rd->cm_id, (struct sockaddr *)&addrb, + (struct sockaddr *)&rd->addr, 2000); - memcpy(&rd->addr.sin_addr, hent->h_addr, 4); + } else { + err = rdma_resolve_addr(rd->cm_id, NULL, + (struct sockaddr *)&rd->addr, 2000); } - /* resolve route */ - err = rdma_resolve_addr(rd->cm_id, 
NULL, (struct sockaddr *)&rd->addr, 2000); if (err != 0) { log_err("fio: rdma_resolve_addr: %d\n", err); return 1; @@ -972,24 +1106,34 @@ static int fio_rdmaio_setup_listen(struct thread_data *td, short port) { - struct rdmaio_data *rd = td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; + struct rdmaio_options *o = td->eo; struct ibv_recv_wr *bad_wr; + int state = td->runstate; + + td_set_runstate(td, TD_SETTING_UP); rd->addr.sin_family = AF_INET; - rd->addr.sin_addr.s_addr = htonl(INADDR_ANY); rd->addr.sin_port = htons(port); + if (strcmp(o->bindname, "") == 0) + rd->addr.sin_addr.s_addr = htonl(INADDR_ANY); + else + rd->addr.sin_addr.s_addr = htonl(*o->bindname); + /* rdma_listen */ if (rdma_bind_addr(rd->cm_id, (struct sockaddr *)&rd->addr) != 0) { - log_err("fio: rdma_bind_addr fail\n"); + log_err("fio: rdma_bind_addr fail: %m\n"); return 1; } if (rdma_listen(rd->cm_id, 3) != 0) { - log_err("fio: rdma_listen fail\n"); + log_err("fio: rdma_listen fail: %m\n"); return 1; } + log_info("fio: waiting for connection\n"); + /* wait for CONNECT_REQUEST */ if (get_next_channel_event (td, rd->cm_channel, RDMA_CM_EVENT_CONNECT_REQUEST) != 0) { @@ -1005,10 +1149,11 @@ /* post recv buf */ if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) { - log_err("fio: ibv_post_recv fail\n"); + log_err("fio: ibv_post_recv fail: %m\n"); return 1; } + td_set_runstate(td, state); return 0; } @@ -1046,77 +1191,100 @@ return 0; } -static int fio_rdmaio_init(struct thread_data *td) +static int compat_options(struct thread_data *td) { - struct rdmaio_data *rd = td->io_ops->data; - unsigned int max_bs; - unsigned int port; - char host[64], buf[128]; - char *sep, *portp, *modep; - int ret, i; - - if (td_rw(td)) { - log_err("fio: rdma connections must be read OR write\n"); - return 1; - } - if (td_random(td)) { - log_err("fio: RDMA network IO can't be random\n"); - return 1; - } + // The original RDMA engine had an ugly / seperator + // on the filename for it's options. 
This function + // retains backwards compatibility with it. Note we do not + // support setting the bindname option is this legacy mode. + + struct rdmaio_options *o = td->eo; + char *modep, *portp; + char *filename = td->o.filename; - if (check_set_rlimits(td)) - return 1; + if (!filename) + return 0; - strcpy(buf, td->o.filename); + portp = strchr(filename, '/'); + if (portp == NULL) + return 0; - sep = strchr(buf, '/'); - if (!sep) - goto bad_host; + *portp = '\0'; + portp++; - *sep = '\0'; - sep++; - strcpy(host, buf); - if (!strlen(host)) + o->port = strtol(portp, NULL, 10); + if (!o->port || o->port > 65535) goto bad_host; - modep = NULL; - portp = sep; - sep = strchr(portp, '/'); - if (sep) { - *sep = '\0'; - modep = sep + 1; + modep = strchr(portp, '/'); + if (modep != NULL) { + *modep = '\0'; + modep++; } - port = strtol(portp, NULL, 10); - if (!port || port > 65535) - goto bad_host; - if (modep) { if (!strncmp("rdma_write", modep, strlen(modep)) || !strncmp("RDMA_WRITE", modep, strlen(modep))) - rd->rdma_protocol = FIO_RDMA_MEM_WRITE; + o->verb = FIO_RDMA_MEM_WRITE; else if (!strncmp("rdma_read", modep, strlen(modep)) || !strncmp("RDMA_READ", modep, strlen(modep))) - rd->rdma_protocol = FIO_RDMA_MEM_READ; + o->verb = FIO_RDMA_MEM_READ; else if (!strncmp("send", modep, strlen(modep)) || !strncmp("SEND", modep, strlen(modep))) - rd->rdma_protocol = FIO_RDMA_CHA_SEND; + o->verb = FIO_RDMA_CHA_SEND; else goto bad_host; } else - rd->rdma_protocol = FIO_RDMA_MEM_WRITE; + o->verb = FIO_RDMA_MEM_WRITE; + + + return 0; +bad_host: + log_err("fio: bad rdma host/port/protocol: %s\n", td->o.filename); + return 1; +} + +static int fio_rdmaio_init(struct thread_data *td) +{ + struct rdmaio_data *rd = td->io_ops_data; + struct rdmaio_options *o = td->eo; + unsigned int max_bs; + int ret, i; + + if (td_rw(td)) { + log_err("fio: rdma connections must be read OR write\n"); + return 1; + } + if (td_random(td)) { + log_err("fio: RDMA network IO can't be random\n"); + return 
1; + } + + if (compat_options(td)) + return 1; + + if (!o->port) { + log_err("fio: no port has been specified which is required " + "for the rdma engine\n"); + return 1; + } + + if (check_set_rlimits(td)) + return 1; + + rd->rdma_protocol = o->verb; rd->cq_event_num = 0; rd->cm_channel = rdma_create_event_channel(); if (!rd->cm_channel) { - log_err("fio: rdma_create_event_channel fail\n"); + log_err("fio: rdma_create_event_channel fail: %m\n"); return 1; } ret = rdma_create_id(rd->cm_channel, &rd->cm_id, rd, RDMA_PS_TCP); if (ret) { - log_err("fio: rdma_create_id fail\n"); + log_err("fio: rdma_create_id fail: %m\n"); return 1; } @@ -1143,14 +1311,17 @@ if (td_read(td)) { /* READ as the server */ rd->is_client = 0; + td->flags |= TD_F_NO_PROGRESS; /* server rd->rdma_buf_len will be setup after got request */ - ret = fio_rdmaio_setup_listen(td, port); + ret = fio_rdmaio_setup_listen(td, o->port); } else { /* WRITE as the client */ rd->is_client = 1; - ret = fio_rdmaio_setup_connect(td, host, port); + ret = fio_rdmaio_setup_connect(td, td->o.filename, o->port); } max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]); + rd->send_buf.max_bs = htonl(max_bs); + /* register each io_u in the free list */ for (i = 0; i < td->io_u_freelist.nr; i++) { struct io_u *io_u = td->io_u_freelist.io_us[i]; @@ -1164,12 +1335,12 @@ IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE); if (io_u->mr == NULL) { - log_err("fio: ibv_reg_mr io_u failed\n"); + log_err("fio: ibv_reg_mr io_u failed: %m\n"); return 1; } rd->send_buf.rmt_us[i].buf = - htonll((uint64_t) (unsigned long)io_u->buf); + cpu_to_be64((uint64_t) (unsigned long)io_u->buf); rd->send_buf.rmt_us[i].rkey = htonl(io_u->mr->rkey); rd->send_buf.rmt_us[i].size = htonl(max_bs); @@ -1181,14 +1352,11 @@ rd->send_buf.nr = htonl(i); return ret; -bad_host: - log_err("fio: bad rdma host/port/protocol: %s\n", td->o.filename); - return 1; } static void fio_rdmaio_cleanup(struct thread_data *td) { - struct rdmaio_data *rd = 
td->io_ops->data; + struct rdmaio_data *rd = td->io_ops_data; if (rd) free(rd); @@ -1198,31 +1366,39 @@ { struct rdmaio_data *rd; - if (!td->io_ops->data) { + if (!td->files_index) { + add_file(td, td->o.filename ?: "rdma", 0, 0); + td->o.nr_files = td->o.nr_files ?: 1; + td->o.open_files++; + } + + if (!td->io_ops_data) { rd = malloc(sizeof(*rd)); memset(rd, 0, sizeof(*rd)); - init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME); - td->io_ops->data = rd; + init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME, 0); + td->io_ops_data = rd; } return 0; } static struct ioengine_ops ioengine_rw = { - .name = "rdma", - .version = FIO_IOOPS_VERSION, - .setup = fio_rdmaio_setup, - .init = fio_rdmaio_init, - .prep = fio_rdmaio_prep, - .queue = fio_rdmaio_queue, - .commit = fio_rdmaio_commit, - .getevents = fio_rdmaio_getevents, - .event = fio_rdmaio_event, - .cleanup = fio_rdmaio_cleanup, - .open_file = fio_rdmaio_open_file, - .close_file = fio_rdmaio_close_file, - .flags = FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO, + .name = "rdma", + .version = FIO_IOOPS_VERSION, + .setup = fio_rdmaio_setup, + .init = fio_rdmaio_init, + .prep = fio_rdmaio_prep, + .queue = fio_rdmaio_queue, + .commit = fio_rdmaio_commit, + .getevents = fio_rdmaio_getevents, + .event = fio_rdmaio_event, + .cleanup = fio_rdmaio_cleanup, + .open_file = fio_rdmaio_open_file, + .close_file = fio_rdmaio_close_file, + .flags = FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO, + .options = options, + .option_struct_size = sizeof(struct rdmaio_options), }; static void fio_init fio_rdmaio_register(void) diff -Nru fio-2.1.3/engines/sg.c fio-3.16/engines/sg.c --- fio-2.1.3/engines/sg.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/sg.c 2019-09-20 01:01:52.000000000 +0000 @@ -3,23 +3,145 @@ * * IO engine that uses the Linux SG v3 interface to talk to SCSI devices * + * This ioengine can operate in two modes: + * sync with block devices (/dev/sdX) or + * with character devices (/dev/sgY) 
with direct=1 or sync=1 + * async with character devices with direct=0 and sync=0 + * + * What value does queue() return for the different cases? + * queue() return value + * In sync mode: + * /dev/sdX RWT FIO_Q_COMPLETED + * /dev/sgY RWT FIO_Q_COMPLETED + * with direct=1 or sync=1 + * + * In async mode: + * /dev/sgY RWT FIO_Q_QUEUED + * direct=0 and sync=0 + * + * Because FIO_SYNCIO is set for this ioengine td_io_queue() will fill in + * issue_time *before* each IO is sent to queue() + * + * Where are the IO counting functions called for the different cases? + * + * In sync mode: + * /dev/sdX (commit==NULL) + * RWT + * io_u_mark_depth() called in td_io_queue() + * io_u_mark_submit/complete() called in td_io_queue() + * issue_time set in td_io_queue() + * + * /dev/sgY with direct=1 or sync=1 (commit does nothing) + * RWT + * io_u_mark_depth() called in td_io_queue() + * io_u_mark_submit/complete() called in queue() + * issue_time set in td_io_queue() + * + * In async mode: + * /dev/sgY with direct=0 and sync=0 + * RW: read and write operations are submitted in queue() + * io_u_mark_depth() called in td_io_commit() + * io_u_mark_submit() called in queue() + * issue_time set in td_io_queue() + * T: trim operations are queued in queue() and submitted in commit() + * io_u_mark_depth() called in td_io_commit() + * io_u_mark_submit() called in commit() + * issue_time set in commit() + * */ #include #include #include #include -#include -#include +#include #include "../fio.h" +#include "../optgroup.h" #ifdef FIO_HAVE_SGIO +enum { + FIO_SG_WRITE = 1, + FIO_SG_WRITE_VERIFY = 2, + FIO_SG_WRITE_SAME = 3 +}; + +struct sg_options { + void *pad; + unsigned int readfua; + unsigned int writefua; + unsigned int write_mode; +}; + +static struct fio_option options[] = { + { + .name = "readfua", + .lname = "sg engine read fua flag support", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct sg_options, readfua), + .help = "Set FUA flag (force unit access) for all Read operations", + 
.def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_SG, + }, + { + .name = "writefua", + .lname = "sg engine write fua flag support", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct sg_options, writefua), + .help = "Set FUA flag (force unit access) for all Write operations", + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_SG, + }, + { + .name = "sg_write_mode", + .lname = "specify sg write mode", + .type = FIO_OPT_STR, + .off1 = offsetof(struct sg_options, write_mode), + .help = "Specify SCSI WRITE mode", + .def = "write", + .posval = { + { .ival = "write", + .oval = FIO_SG_WRITE, + .help = "Issue standard SCSI WRITE commands", + }, + { .ival = "verify", + .oval = FIO_SG_WRITE_VERIFY, + .help = "Issue SCSI WRITE AND VERIFY commands", + }, + { .ival = "same", + .oval = FIO_SG_WRITE_SAME, + .help = "Issue SCSI WRITE SAME commands", + }, + }, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_SG, + }, + { + .name = NULL, + }, +}; + +#define MAX_10B_LBA 0xFFFFFFFFULL +#define SCSI_TIMEOUT_MS 30000 // 30 second timeout; currently no method to override +#define MAX_SB 64 // sense block maximum return size +/* +#define FIO_SGIO_DEBUG +*/ + struct sgio_cmd { - unsigned char cdb[10]; + unsigned char cdb[16]; // enhanced from 10 to support 16 byte commands + unsigned char sb[MAX_SB]; // add sense block to commands int nr; }; +struct sgio_trim { + uint8_t *unmap_param; + unsigned int unmap_range_count; + struct io_u **trim_io_us; +}; + struct sgio_data { struct sgio_cmd *cmds; struct io_u **events; @@ -28,8 +150,49 @@ void *sgbuf; unsigned int bs; int type_checked; + struct sgio_trim **trim_queues; + int current_queue; +#ifdef FIO_SGIO_DEBUG + unsigned int *trim_queue_map; +#endif }; +static inline uint32_t sgio_get_be32(uint8_t *buf) +{ + return be32_to_cpu(*((uint32_t *) buf)); +} + +static inline uint64_t sgio_get_be64(uint8_t *buf) +{ + return be64_to_cpu(*((uint64_t *) buf)); +} + +static inline void sgio_set_be16(uint16_t val, uint8_t 
*buf) +{ + uint16_t t = cpu_to_be16(val); + + memcpy(buf, &t, sizeof(uint16_t)); +} + +static inline void sgio_set_be32(uint32_t val, uint8_t *buf) +{ + uint32_t t = cpu_to_be32(val); + + memcpy(buf, &t, sizeof(uint32_t)); +} + +static inline void sgio_set_be64(uint64_t val, uint8_t *buf) +{ + uint64_t t = cpu_to_be64(val); + + memcpy(buf, &t, sizeof(uint64_t)); +} + +static inline bool sgio_unbuffered(struct thread_data *td) +{ + return (td->o.odirect || td->o.sync_io); +} + static void sgio_hdr_init(struct sgio_data *sd, struct sg_io_hdr *hdr, struct io_u *io_u, int fs) { @@ -41,8 +204,11 @@ hdr->interface_id = 'S'; hdr->cmdp = sc->cdb; hdr->cmd_len = sizeof(sc->cdb); + hdr->sbp = sc->sb; + hdr->mx_sb_len = sizeof(sc->sb); hdr->pack_id = io_u->index; hdr->usr_ptr = io_u; + hdr->timeout = SCSI_TIMEOUT_MS; if (fs) { hdr->dxferp = io_u->xfer_buf; @@ -61,14 +227,45 @@ return 0; } +static int sg_fd_read(int fd, void *data, size_t size) +{ + int err = 0; + + while (size) { + ssize_t ret; + + ret = read(fd, data, size); + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + continue; + err = errno; + break; + } else if (!ret) + break; + else { + data += ret; + size -= ret; + } + } + + if (err) + return err; + if (size) + return EAGAIN; + + return 0; +} + static int fio_sgio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec fio_unused *t) + unsigned int max, + const struct timespec fio_unused *t) { - struct sgio_data *sd = td->io_ops->data; - int left = max, ret, r = 0; + struct sgio_data *sd = td->io_ops_data; + int left = max, eventNum, ret, r = 0, trims = 0; void *buf = sd->sgbuf; - unsigned int i, events; + unsigned int i, j, events; struct fio_file *f; + struct io_u *io_u; /* * Fill in the file descriptors @@ -77,16 +274,29 @@ /* * don't block for min events == 0 */ - if (!min) { - sd->fd_flags[i] = fcntl(f->fd, F_GETFL); - fcntl(f->fd, F_SETFL, sd->fd_flags[i] | O_NONBLOCK); - } + if (!min) + sd->fd_flags[i] = 
fio_set_fd_nonblocking(f->fd, "sg"); + else + sd->fd_flags[i] = -1; + sd->pfds[i].fd = f->fd; sd->pfds[i].events = POLLIN; } - while (left) { - void *p; + /* + ** There are two counters here: + ** - number of SCSI commands completed + ** - number of io_us completed + ** + ** These are the same with reads and writes, but + ** could differ with trim/unmap commands because + ** a single unmap can include multiple io_us + */ + + while (left > 0) { + char *p; + + dprint(FD_IO, "sgio_getevents: sd %p: min=%d, max=%d, left=%d\n", sd, min, max, left); do { if (!min) @@ -112,20 +322,27 @@ p = buf; events = 0; for_each_file(td, f, i) { - ret = read(f->fd, p, left * sizeof(struct sg_io_hdr)); - if (ret < 0) { - if (errno == EAGAIN) - continue; - r = -errno; - td_verror(td, errno, "read"); - break; - } else if (ret) { - p += ret; - events += ret / sizeof(struct sg_io_hdr); + for (eventNum = 0; eventNum < left; eventNum++) { + ret = sg_fd_read(f->fd, p, sizeof(struct sg_io_hdr)); + dprint(FD_IO, "sgio_getevents: sg_fd_read ret: %d\n", ret); + if (ret) { + r = -ret; + td_verror(td, r, "sg_read"); + break; + } + io_u = ((struct sg_io_hdr *)p)->usr_ptr; + if (io_u->ddir == DDIR_TRIM) { + events += sd->trim_queues[io_u->index]->unmap_range_count; + eventNum += sd->trim_queues[io_u->index]->unmap_range_count - 1; + } else + events++; + + p += sizeof(struct sg_io_hdr); + dprint(FD_IO, "sgio_getevents: events: %d, eventNum: %d, left: %d\n", events, eventNum, left); } } - if (r < 0) + if (r < 0 && !events) break; if (!events) { usleep(1000); @@ -137,23 +354,60 @@ for (i = 0; i < events; i++) { struct sg_io_hdr *hdr = (struct sg_io_hdr *) buf + i; + sd->events[i + trims] = hdr->usr_ptr; + io_u = (struct io_u *)(hdr->usr_ptr); - sd->events[i] = hdr->usr_ptr; + if (hdr->info & SG_INFO_CHECK) { + /* record if an io error occurred, ignore resid */ + memcpy(&io_u->hdr, hdr, sizeof(struct sg_io_hdr)); + sd->events[i + trims]->error = EIO; + } + + if (io_u->ddir == DDIR_TRIM) { + struct 
sgio_trim *st = sd->trim_queues[io_u->index]; +#ifdef FIO_SGIO_DEBUG + assert(st->trim_io_us[0] == io_u); + assert(sd->trim_queue_map[io_u->index] == io_u->index); + dprint(FD_IO, "sgio_getevents: reaping %d io_us from trim queue %d\n", st->unmap_range_count, io_u->index); + dprint(FD_IO, "sgio_getevents: reaped io_u %d and stored in events[%d]\n", io_u->index, i+trims); +#endif + for (j = 1; j < st->unmap_range_count; j++) { + ++trims; + sd->events[i + trims] = st->trim_io_us[j]; +#ifdef FIO_SGIO_DEBUG + dprint(FD_IO, "sgio_getevents: reaped io_u %d and stored in events[%d]\n", st->trim_io_us[j]->index, i+trims); + assert(sd->trim_queue_map[st->trim_io_us[j]->index] == io_u->index); +#endif + if (hdr->info & SG_INFO_CHECK) { + /* record if an io error occurred, ignore resid */ + memcpy(&st->trim_io_us[j]->hdr, hdr, sizeof(struct sg_io_hdr)); + sd->events[i + trims]->error = EIO; + } + } + events -= st->unmap_range_count - 1; + st->unmap_range_count = 0; + } } } if (!min) { - for_each_file(td, f, i) - fcntl(f->fd, F_SETFL, sd->fd_flags[i]); + for_each_file(td, f, i) { + if (sd->fd_flags[i] == -1) + continue; + + if (fcntl(f->fd, F_SETFL, sd->fd_flags[i]) < 0) + log_err("fio: sg failed to restore fcntl flags: %s\n", strerror(errno)); + } } return r; } -static int fio_sgio_ioctl_doio(struct thread_data *td, - struct fio_file *f, struct io_u *io_u) +static enum fio_q_status fio_sgio_ioctl_doio(struct thread_data *td, + struct fio_file *f, + struct io_u *io_u) { - struct sgio_data *sd = td->io_ops->data; + struct sgio_data *sd = td->io_ops_data; struct sg_io_hdr *hdr = &io_u->hdr; int ret; @@ -163,10 +417,16 @@ if (ret < 0) return ret; + /* record if an io error occurred */ + if (hdr->info & SG_INFO_CHECK) + io_u->error = EIO; + return FIO_Q_COMPLETED; } -static int fio_sgio_rw_doio(struct fio_file *f, struct io_u *io_u, int do_sync) +static enum fio_q_status fio_sgio_rw_doio(struct thread_data *td, + struct fio_file *f, + struct io_u *io_u, int do_sync) { struct 
sg_io_hdr *hdr = &io_u->hdr; int ret; @@ -176,77 +436,223 @@ return ret; if (do_sync) { - ret = read(f->fd, hdr, sizeof(*hdr)); - if (ret < 0) - return ret; + /* + * We can't just read back the first command that completes + * and assume it's the one we need, it could be any command + * that is inflight. + */ + do { + struct io_u *__io_u; + + ret = read(f->fd, hdr, sizeof(*hdr)); + if (ret < 0) + return ret; + + __io_u = hdr->usr_ptr; + + /* record if an io error occurred */ + if (hdr->info & SG_INFO_CHECK) + __io_u->error = EIO; + + if (__io_u == io_u) + break; + + if (io_u_sync_complete(td, __io_u)) { + ret = -1; + break; + } + } while (1); + return FIO_Q_COMPLETED; } return FIO_Q_QUEUED; } -static int fio_sgio_doio(struct thread_data *td, struct io_u *io_u, int do_sync) +static enum fio_q_status fio_sgio_doio(struct thread_data *td, + struct io_u *io_u, int do_sync) { struct fio_file *f = io_u->file; + enum fio_q_status ret; - if (f->filetype == FIO_TYPE_BD) - return fio_sgio_ioctl_doio(td, f, io_u); + if (f->filetype == FIO_TYPE_BLOCK) { + ret = fio_sgio_ioctl_doio(td, f, io_u); + if (io_u->error) + td_verror(td, io_u->error, __func__); + } else { + ret = fio_sgio_rw_doio(td, f, io_u, do_sync); + if (io_u->error && do_sync) + td_verror(td, io_u->error, __func__); + } + + return ret; +} + +static void fio_sgio_rw_lba(struct sg_io_hdr *hdr, unsigned long long lba, + unsigned long long nr_blocks) +{ + if (lba < MAX_10B_LBA) { + sgio_set_be32((uint32_t) lba, &hdr->cmdp[2]); + sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[7]); + } else { + sgio_set_be64(lba, &hdr->cmdp[2]); + sgio_set_be32((uint32_t) nr_blocks, &hdr->cmdp[10]); + } - return fio_sgio_rw_doio(f, io_u, do_sync); + return; } static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u) { struct sg_io_hdr *hdr = &io_u->hdr; - struct sgio_data *sd = td->io_ops->data; - int nr_blocks, lba; + struct sg_options *o = td->eo; + struct sgio_data *sd = td->io_ops_data; + unsigned long long nr_blocks, lba; 
+ int offset; if (io_u->xfer_buflen & (sd->bs - 1)) { log_err("read/write not sector aligned\n"); return EINVAL; } + nr_blocks = io_u->xfer_buflen / sd->bs; + lba = io_u->offset / sd->bs; + if (io_u->ddir == DDIR_READ) { sgio_hdr_init(sd, hdr, io_u, 1); hdr->dxfer_direction = SG_DXFER_FROM_DEV; - hdr->cmdp[0] = 0x28; + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x28; // read(10) + else + hdr->cmdp[0] = 0x88; // read(16) + + if (o->readfua) + hdr->cmdp[1] |= 0x08; + + fio_sgio_rw_lba(hdr, lba, nr_blocks); + } else if (io_u->ddir == DDIR_WRITE) { sgio_hdr_init(sd, hdr, io_u, 1); hdr->dxfer_direction = SG_DXFER_TO_DEV; - hdr->cmdp[0] = 0x2a; - } else { - sgio_hdr_init(sd, hdr, io_u, 0); + switch(o->write_mode) { + case FIO_SG_WRITE: + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x2a; // write(10) + else + hdr->cmdp[0] = 0x8a; // write(16) + if (o->writefua) + hdr->cmdp[1] |= 0x08; + break; + case FIO_SG_WRITE_VERIFY: + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x2e; // write and verify(10) + else + hdr->cmdp[0] = 0x8e; // write and verify(16) + break; + // BYTCHK is disabled by virtue of the memset in sgio_hdr_init + case FIO_SG_WRITE_SAME: + hdr->dxfer_len = sd->bs; + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x41; // write same(10) + else + hdr->cmdp[0] = 0x93; // write same(16) + break; + }; - hdr->dxfer_direction = SG_DXFER_NONE; - hdr->cmdp[0] = 0x35; - } + fio_sgio_rw_lba(hdr, lba, nr_blocks); - if (hdr->dxfer_direction != SG_DXFER_NONE) { - nr_blocks = io_u->xfer_buflen / sd->bs; - lba = io_u->offset / sd->bs; - hdr->cmdp[2] = (unsigned char) ((lba >> 24) & 0xff); - hdr->cmdp[3] = (unsigned char) ((lba >> 16) & 0xff); - hdr->cmdp[4] = (unsigned char) ((lba >> 8) & 0xff); - hdr->cmdp[5] = (unsigned char) (lba & 0xff); - hdr->cmdp[7] = (unsigned char) ((nr_blocks >> 8) & 0xff); - hdr->cmdp[8] = (unsigned char) (nr_blocks & 0xff); - } + } else if (io_u->ddir == DDIR_TRIM) { + struct sgio_trim *st; + + if (sd->current_queue == -1) { + sgio_hdr_init(sd, hdr, io_u, 0); + + 
hdr->cmd_len = 10; + hdr->dxfer_direction = SG_DXFER_TO_DEV; + hdr->cmdp[0] = 0x42; // unmap + sd->current_queue = io_u->index; + st = sd->trim_queues[sd->current_queue]; + hdr->dxferp = st->unmap_param; +#ifdef FIO_SGIO_DEBUG + assert(sd->trim_queues[io_u->index]->unmap_range_count == 0); + dprint(FD_IO, "sg: creating new queue based on io_u %d\n", io_u->index); +#endif + } + else + st = sd->trim_queues[sd->current_queue]; + + dprint(FD_IO, "sg: adding io_u %d to trim queue %d\n", io_u->index, sd->current_queue); + st->trim_io_us[st->unmap_range_count] = io_u; +#ifdef FIO_SGIO_DEBUG + sd->trim_queue_map[io_u->index] = sd->current_queue; +#endif + + offset = 8 + 16 * st->unmap_range_count; + sgio_set_be64(lba, &st->unmap_param[offset]); + sgio_set_be32((uint32_t) nr_blocks, &st->unmap_param[offset + 8]); + + st->unmap_range_count++; + + } else if (ddir_sync(io_u->ddir)) { + sgio_hdr_init(sd, hdr, io_u, 0); + hdr->dxfer_direction = SG_DXFER_NONE; + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x35; // synccache(10) + else + hdr->cmdp[0] = 0x91; // synccache(16) + } else + assert(0); return 0; } -static int fio_sgio_queue(struct thread_data *td, struct io_u *io_u) +static void fio_sgio_unmap_setup(struct sg_io_hdr *hdr, struct sgio_trim *st) +{ + uint16_t cnt = st->unmap_range_count * 16; + + hdr->dxfer_len = cnt + 8; + sgio_set_be16(cnt + 8, &hdr->cmdp[7]); + sgio_set_be16(cnt + 6, st->unmap_param); + sgio_set_be16(cnt, &st->unmap_param[2]); + + return; +} + +static enum fio_q_status fio_sgio_queue(struct thread_data *td, + struct io_u *io_u) { struct sg_io_hdr *hdr = &io_u->hdr; + struct sgio_data *sd = td->io_ops_data; int ret, do_sync = 0; fio_ro_check(td, io_u); - if (td->o.sync_io || td->o.odirect || ddir_sync(io_u->ddir)) + if (sgio_unbuffered(td) || ddir_sync(io_u->ddir)) do_sync = 1; + if (io_u->ddir == DDIR_TRIM) { + if (do_sync || io_u->file->filetype == FIO_TYPE_BLOCK) { + struct sgio_trim *st = sd->trim_queues[sd->current_queue]; + + /* finish cdb setup for 
unmap because we are + ** doing unmap commands synchronously */ +#ifdef FIO_SGIO_DEBUG + assert(st->unmap_range_count == 1); + assert(io_u == st->trim_io_us[0]); +#endif + hdr = &io_u->hdr; + + fio_sgio_unmap_setup(hdr, st); + + st->unmap_range_count = 0; + sd->current_queue = -1; + } else + /* queue up trim ranges and submit in commit() */ + return FIO_Q_QUEUED; + } + ret = fio_sgio_doio(td, io_u, do_sync); if (ret < 0) @@ -254,6 +660,14 @@ else if (hdr->status) { io_u->resid = hdr->resid; io_u->error = EIO; + } else if (td->io_ops->commit != NULL) { + if (do_sync && !ddir_sync(io_u->ddir)) { + io_u_mark_submit(td, 1); + io_u_mark_complete(td, 1); + } else if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { + io_u_mark_submit(td, 1); + io_u_queued(td, io_u); + } } if (io_u->error) { @@ -264,44 +678,167 @@ return ret; } +static int fio_sgio_commit(struct thread_data *td) +{ + struct sgio_data *sd = td->io_ops_data; + struct sgio_trim *st; + struct io_u *io_u; + struct sg_io_hdr *hdr; + struct timespec now; + unsigned int i; + int ret; + + if (sd->current_queue == -1) + return 0; + + st = sd->trim_queues[sd->current_queue]; + io_u = st->trim_io_us[0]; + hdr = &io_u->hdr; + + fio_sgio_unmap_setup(hdr, st); + + sd->current_queue = -1; + + ret = fio_sgio_rw_doio(td, io_u->file, io_u, 0); + + if (ret < 0 || hdr->status) { + int error; + + if (ret < 0) + error = errno; + else { + error = EIO; + ret = -EIO; + } + + for (i = 0; i < st->unmap_range_count; i++) { + st->trim_io_us[i]->error = error; + clear_io_u(td, st->trim_io_us[i]); + if (hdr->status) + st->trim_io_us[i]->resid = hdr->resid; + } + + td_verror(td, error, "xfer"); + return ret; + } + + if (fio_fill_issue_time(td)) { + fio_gettime(&now, NULL); + for (i = 0; i < st->unmap_range_count; i++) { + memcpy(&st->trim_io_us[i]->issue_time, &now, sizeof(now)); + io_u_queued(td, io_u); + } + } + io_u_mark_submit(td, st->unmap_range_count); + + return 0; +} + static struct io_u *fio_sgio_event(struct thread_data 
*td, int event) { - struct sgio_data *sd = td->io_ops->data; + struct sgio_data *sd = td->io_ops_data; return sd->events[event]; } -static int fio_sgio_get_bs(struct thread_data *td, unsigned int *bs) +static int fio_sgio_read_capacity(struct thread_data *td, unsigned int *bs, + unsigned long long *max_lba) { - struct sgio_data *sd = td->io_ops->data; - struct io_u io_u; - struct sg_io_hdr *hdr; - unsigned char buf[8]; + /* + * need to do read capacity operation w/o benefit of sd or + * io_u structures, which are not initialized until later. + */ + struct sg_io_hdr hdr; + unsigned long long hlba; + unsigned int blksz = 0; + unsigned char cmd[16]; + unsigned char sb[64]; + unsigned char buf[32]; // read capacity return int ret; + int fd = -1; - memset(&io_u, 0, sizeof(io_u)); - io_u.file = td->files[0]; + struct fio_file *f = td->files[0]; - hdr = &io_u.hdr; - sgio_hdr_init(sd, hdr, &io_u, 0); + /* open file independent of rest of application */ + fd = open(f->file_name, O_RDONLY); + if (fd < 0) + return -errno; + + memset(&hdr, 0, sizeof(hdr)); + memset(cmd, 0, sizeof(cmd)); + memset(sb, 0, sizeof(sb)); memset(buf, 0, sizeof(buf)); - hdr->cmdp[0] = 0x25; - hdr->dxfer_direction = SG_DXFER_FROM_DEV; - hdr->dxferp = buf; - hdr->dxfer_len = sizeof(buf); - - ret = fio_sgio_doio(td, &io_u, 1); - if (ret) + /* First let's try a 10 byte read capacity. */ + hdr.interface_id = 'S'; + hdr.cmdp = cmd; + hdr.cmd_len = 10; + hdr.sbp = sb; + hdr.mx_sb_len = sizeof(sb); + hdr.timeout = SCSI_TIMEOUT_MS; + hdr.cmdp[0] = 0x25; // Read Capacity(10) + hdr.dxfer_direction = SG_DXFER_FROM_DEV; + hdr.dxferp = buf; + hdr.dxfer_len = sizeof(buf); + + ret = ioctl(fd, SG_IO, &hdr); + if (ret < 0) { + close(fd); return ret; + } - *bs = (buf[4] << 24) | (buf[5] << 16) | (buf[6] << 8) | buf[7]; - return 0; + if (hdr.info & SG_INFO_CHECK) { + /* RCAP(10) might be unsupported by device. 
Force RCAP(16) */ + hlba = MAX_10B_LBA; + } else { + blksz = sgio_get_be32(&buf[4]); + hlba = sgio_get_be32(buf); + } + + /* + * If max lba masked by MAX_10B_LBA equals MAX_10B_LBA, + * then need to retry with 16 byte Read Capacity command. + */ + if (hlba == MAX_10B_LBA) { + hdr.cmd_len = 16; + hdr.cmdp[0] = 0x9e; // service action + hdr.cmdp[1] = 0x10; // Read Capacity(16) + sgio_set_be32(sizeof(buf), &hdr.cmdp[10]); + + hdr.dxfer_direction = SG_DXFER_FROM_DEV; + hdr.dxferp = buf; + hdr.dxfer_len = sizeof(buf); + + ret = ioctl(fd, SG_IO, &hdr); + if (ret < 0) { + close(fd); + return ret; + } + + /* record if an io error occurred */ + if (hdr.info & SG_INFO_CHECK) + td_verror(td, EIO, "fio_sgio_read_capacity"); + + blksz = sgio_get_be32(&buf[8]); + hlba = sgio_get_be64(buf); + } + + if (blksz) { + *bs = blksz; + *max_lba = hlba; + ret = 0; + } else { + ret = EIO; + } + + close(fd); + return ret; } static void fio_sgio_cleanup(struct thread_data *td) { - struct sgio_data *sd = td->io_ops->data; + struct sgio_data *sd = td->io_ops_data; + int i; if (sd) { free(sd->events); @@ -309,6 +846,17 @@ free(sd->fd_flags); free(sd->pfds); free(sd->sgbuf); +#ifdef FIO_SGIO_DEBUG + free(sd->trim_queue_map); +#endif + + for (i = 0; i < td->o.iodepth; i++) { + free(sd->trim_queues[i]->unmap_param); + free(sd->trim_queues[i]->trim_io_us); + free(sd->trim_queues[i]); + } + + free(sd->trim_queues); free(sd); } } @@ -316,21 +864,31 @@ static int fio_sgio_init(struct thread_data *td) { struct sgio_data *sd; + struct sgio_trim *st; + int i; - sd = malloc(sizeof(*sd)); - memset(sd, 0, sizeof(*sd)); - sd->cmds = malloc(td->o.iodepth * sizeof(struct sgio_cmd)); - memset(sd->cmds, 0, td->o.iodepth * sizeof(struct sgio_cmd)); - sd->events = malloc(td->o.iodepth * sizeof(struct io_u *)); - memset(sd->events, 0, td->o.iodepth * sizeof(struct io_u *)); - sd->pfds = malloc(sizeof(struct pollfd) * td->o.nr_files); - memset(sd->pfds, 0, sizeof(struct pollfd) * td->o.nr_files); - sd->fd_flags = 
malloc(sizeof(int) * td->o.nr_files); - memset(sd->fd_flags, 0, sizeof(int) * td->o.nr_files); - sd->sgbuf = malloc(sizeof(struct sg_io_hdr) * td->o.iodepth); - memset(sd->sgbuf, 0, sizeof(struct sg_io_hdr) * td->o.iodepth); + sd = calloc(1, sizeof(*sd)); + sd->cmds = calloc(td->o.iodepth, sizeof(struct sgio_cmd)); + sd->sgbuf = calloc(td->o.iodepth, sizeof(struct sg_io_hdr)); + sd->events = calloc(td->o.iodepth, sizeof(struct io_u *)); + sd->pfds = calloc(td->o.nr_files, sizeof(struct pollfd)); + sd->fd_flags = calloc(td->o.nr_files, sizeof(int)); + sd->type_checked = 0; + + sd->trim_queues = calloc(td->o.iodepth, sizeof(struct sgio_trim *)); + sd->current_queue = -1; +#ifdef FIO_SGIO_DEBUG + sd->trim_queue_map = calloc(td->o.iodepth, sizeof(int)); +#endif + for (i = 0; i < td->o.iodepth; i++) { + sd->trim_queues[i] = calloc(1, sizeof(struct sgio_trim)); + st = sd->trim_queues[i]; + st->unmap_param = calloc(td->o.iodepth + 1, sizeof(char[16])); + st->unmap_range_count = 0; + st->trim_io_us = calloc(td->o.iodepth, sizeof(struct io_u *)); + } - td->io_ops->data = sd; + td->io_ops_data = sd; /* * we want to do it, regardless of whether odirect is set or not @@ -341,10 +899,11 @@ static int fio_sgio_type_check(struct thread_data *td, struct fio_file *f) { - struct sgio_data *sd = td->io_ops->data; - unsigned int bs; + struct sgio_data *sd = td->io_ops_data; + unsigned int bs = 0; + unsigned long long max_lba = 0; - if (f->filetype == FIO_TYPE_BD) { + if (f->filetype == FIO_TYPE_BLOCK) { if (ioctl(f->fd, BLKSSZGET, &bs) < 0) { td_verror(td, errno, "ioctl"); return 1; @@ -357,27 +916,44 @@ return 1; } - ret = fio_sgio_get_bs(td, &bs); - if (ret) + ret = fio_sgio_read_capacity(td, &bs, &max_lba); + if (ret) { + td_verror(td, td->error, "fio_sgio_read_capacity"); + log_err("ioengine sg unable to read capacity successfully\n"); return 1; + } } else { - log_err("ioengine sg only works on block devices\n"); + td_verror(td, EINVAL, "wrong file type"); + log_err("ioengine sg 
only works on block or character devices\n"); return 1; } sd->bs = bs; + // Determine size of commands needed based on max_lba + if (max_lba >= MAX_10B_LBA) { + dprint(FD_IO, "sgio_type_check: using 16 byte read/write " + "commands for lba above 0x%016llx/0x%016llx\n", + MAX_10B_LBA, max_lba); + } - if (f->filetype == FIO_TYPE_BD) { + if (f->filetype == FIO_TYPE_BLOCK) { td->io_ops->getevents = NULL; td->io_ops->event = NULL; + td->io_ops->commit = NULL; + /* + ** Setting these functions to null may cause problems + ** with filename=/dev/sda:/dev/sg0 since we are only + ** considering a single file + */ } + sd->type_checked = 1; return 0; } static int fio_sgio_open(struct thread_data *td, struct fio_file *f) { - struct sgio_data *sd = td->io_ops->data; + struct sgio_data *sd = td->io_ops_data; int ret; ret = generic_open_file(td, f); @@ -392,19 +968,285 @@ return 0; } +/* + * Build an error string with details about the driver, host or scsi + * error contained in the sg header Caller will use as necessary. + */ +static char *fio_sgio_errdetails(struct io_u *io_u) +{ + struct sg_io_hdr *hdr = &io_u->hdr; +#define MAXERRDETAIL 1024 +#define MAXMSGCHUNK 128 + char *msg, msgchunk[MAXMSGCHUNK]; + int i; + + msg = calloc(1, MAXERRDETAIL); + strcpy(msg, ""); + + /* + * can't seem to find sg_err.h, so I'll just echo the define values + * so others can search on internet to find clearer clues of meaning. 
+ */ + if (hdr->info & SG_INFO_CHECK) { + if (hdr->host_status) { + snprintf(msgchunk, MAXMSGCHUNK, "SG Host Status: 0x%02x; ", hdr->host_status); + strlcat(msg, msgchunk, MAXERRDETAIL); + switch (hdr->host_status) { + case 0x01: + strlcat(msg, "SG_ERR_DID_NO_CONNECT", MAXERRDETAIL); + break; + case 0x02: + strlcat(msg, "SG_ERR_DID_BUS_BUSY", MAXERRDETAIL); + break; + case 0x03: + strlcat(msg, "SG_ERR_DID_TIME_OUT", MAXERRDETAIL); + break; + case 0x04: + strlcat(msg, "SG_ERR_DID_BAD_TARGET", MAXERRDETAIL); + break; + case 0x05: + strlcat(msg, "SG_ERR_DID_ABORT", MAXERRDETAIL); + break; + case 0x06: + strlcat(msg, "SG_ERR_DID_PARITY", MAXERRDETAIL); + break; + case 0x07: + strlcat(msg, "SG_ERR_DID_ERROR (internal error)", MAXERRDETAIL); + break; + case 0x08: + strlcat(msg, "SG_ERR_DID_RESET", MAXERRDETAIL); + break; + case 0x09: + strlcat(msg, "SG_ERR_DID_BAD_INTR (unexpected)", MAXERRDETAIL); + break; + case 0x0a: + strlcat(msg, "SG_ERR_DID_PASSTHROUGH", MAXERRDETAIL); + break; + case 0x0b: + strlcat(msg, "SG_ERR_DID_SOFT_ERROR (driver retry?)", MAXERRDETAIL); + break; + case 0x0c: + strlcat(msg, "SG_ERR_DID_IMM_RETRY", MAXERRDETAIL); + break; + case 0x0d: + strlcat(msg, "SG_ERR_DID_REQUEUE", MAXERRDETAIL); + break; + case 0x0e: + strlcat(msg, "SG_ERR_DID_TRANSPORT_DISRUPTED", MAXERRDETAIL); + break; + case 0x0f: + strlcat(msg, "SG_ERR_DID_TRANSPORT_FAILFAST", MAXERRDETAIL); + break; + case 0x10: + strlcat(msg, "SG_ERR_DID_TARGET_FAILURE", MAXERRDETAIL); + break; + case 0x11: + strlcat(msg, "SG_ERR_DID_NEXUS_FAILURE", MAXERRDETAIL); + break; + case 0x12: + strlcat(msg, "SG_ERR_DID_ALLOC_FAILURE", MAXERRDETAIL); + break; + case 0x13: + strlcat(msg, "SG_ERR_DID_MEDIUM_ERROR", MAXERRDETAIL); + break; + default: + strlcat(msg, "Unknown", MAXERRDETAIL); + break; + } + strlcat(msg, ". 
", MAXERRDETAIL); + } + if (hdr->driver_status) { + snprintf(msgchunk, MAXMSGCHUNK, "SG Driver Status: 0x%02x; ", hdr->driver_status); + strlcat(msg, msgchunk, MAXERRDETAIL); + switch (hdr->driver_status & 0x0F) { + case 0x01: + strlcat(msg, "SG_ERR_DRIVER_BUSY", MAXERRDETAIL); + break; + case 0x02: + strlcat(msg, "SG_ERR_DRIVER_SOFT", MAXERRDETAIL); + break; + case 0x03: + strlcat(msg, "SG_ERR_DRIVER_MEDIA", MAXERRDETAIL); + break; + case 0x04: + strlcat(msg, "SG_ERR_DRIVER_ERROR", MAXERRDETAIL); + break; + case 0x05: + strlcat(msg, "SG_ERR_DRIVER_INVALID", MAXERRDETAIL); + break; + case 0x06: + strlcat(msg, "SG_ERR_DRIVER_TIMEOUT", MAXERRDETAIL); + break; + case 0x07: + strlcat(msg, "SG_ERR_DRIVER_HARD", MAXERRDETAIL); + break; + case 0x08: + strlcat(msg, "SG_ERR_DRIVER_SENSE", MAXERRDETAIL); + break; + default: + strlcat(msg, "Unknown", MAXERRDETAIL); + break; + } + strlcat(msg, "; ", MAXERRDETAIL); + switch (hdr->driver_status & 0xF0) { + case 0x10: + strlcat(msg, "SG_ERR_SUGGEST_RETRY", MAXERRDETAIL); + break; + case 0x20: + strlcat(msg, "SG_ERR_SUGGEST_ABORT", MAXERRDETAIL); + break; + case 0x30: + strlcat(msg, "SG_ERR_SUGGEST_REMAP", MAXERRDETAIL); + break; + case 0x40: + strlcat(msg, "SG_ERR_SUGGEST_DIE", MAXERRDETAIL); + break; + case 0x80: + strlcat(msg, "SG_ERR_SUGGEST_SENSE", MAXERRDETAIL); + break; + } + strlcat(msg, ". 
", MAXERRDETAIL); + } + if (hdr->status) { + snprintf(msgchunk, MAXMSGCHUNK, "SG SCSI Status: 0x%02x; ", hdr->status); + strlcat(msg, msgchunk, MAXERRDETAIL); + // SCSI 3 status codes + switch (hdr->status) { + case 0x02: + strlcat(msg, "CHECK_CONDITION", MAXERRDETAIL); + break; + case 0x04: + strlcat(msg, "CONDITION_MET", MAXERRDETAIL); + break; + case 0x08: + strlcat(msg, "BUSY", MAXERRDETAIL); + break; + case 0x10: + strlcat(msg, "INTERMEDIATE", MAXERRDETAIL); + break; + case 0x14: + strlcat(msg, "INTERMEDIATE_CONDITION_MET", MAXERRDETAIL); + break; + case 0x18: + strlcat(msg, "RESERVATION_CONFLICT", MAXERRDETAIL); + break; + case 0x22: + strlcat(msg, "COMMAND_TERMINATED", MAXERRDETAIL); + break; + case 0x28: + strlcat(msg, "TASK_SET_FULL", MAXERRDETAIL); + break; + case 0x30: + strlcat(msg, "ACA_ACTIVE", MAXERRDETAIL); + break; + case 0x40: + strlcat(msg, "TASK_ABORTED", MAXERRDETAIL); + break; + default: + strlcat(msg, "Unknown", MAXERRDETAIL); + break; + } + strlcat(msg, ". ", MAXERRDETAIL); + } + if (hdr->sb_len_wr) { + snprintf(msgchunk, MAXMSGCHUNK, "Sense Data (%d bytes):", hdr->sb_len_wr); + strlcat(msg, msgchunk, MAXERRDETAIL); + for (i = 0; i < hdr->sb_len_wr; i++) { + snprintf(msgchunk, MAXMSGCHUNK, " %02x", hdr->sbp[i]); + strlcat(msg, msgchunk, MAXERRDETAIL); + } + strlcat(msg, ". ", MAXERRDETAIL); + } + if (hdr->resid != 0) { + snprintf(msgchunk, MAXMSGCHUNK, "SG Driver: %d bytes out of %d not transferred. ", hdr->resid, hdr->dxfer_len); + strlcat(msg, msgchunk, MAXERRDETAIL); + } + if (hdr->cmdp) { + strlcat(msg, "cdb:", MAXERRDETAIL); + for (i = 0; i < hdr->cmd_len; i++) { + snprintf(msgchunk, MAXMSGCHUNK, " %02x", hdr->cmdp[i]); + strlcat(msg, msgchunk, MAXERRDETAIL); + } + strlcat(msg, ". 
", MAXERRDETAIL); + if (io_u->ddir == DDIR_TRIM) { + unsigned char *param_list = hdr->dxferp; + strlcat(msg, "dxferp:", MAXERRDETAIL); + for (i = 0; i < hdr->dxfer_len; i++) { + snprintf(msgchunk, MAXMSGCHUNK, " %02x", param_list[i]); + strlcat(msg, msgchunk, MAXERRDETAIL); + } + strlcat(msg, ". ", MAXERRDETAIL); + } + } + } + + if (!(hdr->info & SG_INFO_CHECK) && !strlen(msg)) + snprintf(msg, MAXERRDETAIL, "%s", + "SG Driver did not report a Host, Driver or Device check"); + + return msg; +} + +/* + * get max file size from read capacity. + */ +static int fio_sgio_get_file_size(struct thread_data *td, struct fio_file *f) +{ + /* + * get_file_size is being called even before sgio_init is + * called, so none of the sg_io structures are + * initialized in the thread_data yet. So we need to do the + * ReadCapacity without any of those helpers. One of the effects + * is that ReadCapacity may get called 4 times on each open: + * readcap(10) followed by readcap(16) if needed - just to get + * the file size after the init occurs - it will be called + * again when "type_check" is called during structure + * initialization I'm not sure how to prevent this little + * inefficiency. 
+ */ + unsigned int bs = 0; + unsigned long long max_lba = 0; + int ret; + + if (fio_file_size_known(f)) + return 0; + + if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) { + td_verror(td, EINVAL, "wrong file type"); + log_err("ioengine sg only works on block or character devices\n"); + return 1; + } + + ret = fio_sgio_read_capacity(td, &bs, &max_lba); + if (ret ) { + td_verror(td, td->error, "fio_sgio_read_capacity"); + log_err("ioengine sg unable to successfully execute read capacity to get block size and maximum lba\n"); + return 1; + } + + f->real_file_size = (max_lba + 1) * bs; + fio_file_set_size_known(f); + return 0; +} + + static struct ioengine_ops ioengine = { .name = "sg", .version = FIO_IOOPS_VERSION, .init = fio_sgio_init, .prep = fio_sgio_prep, .queue = fio_sgio_queue, + .commit = fio_sgio_commit, .getevents = fio_sgio_getevents, + .errdetails = fio_sgio_errdetails, .event = fio_sgio_event, .cleanup = fio_sgio_cleanup, .open_file = fio_sgio_open, .close_file = generic_close_file, - .get_file_size = generic_get_file_size, + .get_file_size = fio_sgio_get_file_size, .flags = FIO_SYNCIO | FIO_RAWIO, + .options = options, + .option_struct_size = sizeof(struct sg_options) }; #else /* FIO_HAVE_SGIO */ diff -Nru fio-2.1.3/engines/skeleton_external.c fio-3.16/engines/skeleton_external.c --- fio-2.1.3/engines/skeleton_external.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/skeleton_external.c 2019-09-20 01:01:52.000000000 +0000 @@ -3,7 +3,8 @@ * * Should be compiled with: * - * gcc -Wall -O2 -g -shared -rdynamic -fPIC -o engine.o engine.c + * gcc -Wall -O2 -g -D_GNU_SOURCE -include ../config-host.h -shared -rdynamic -fPIC -o skeleton_external.o skeleton_external.c + * (also requires -D_GNU_SOURCE -DCONFIG_STRSEP on Linux) * */ #include @@ -13,6 +14,7 @@ #include #include "../fio.h" +#include "../optgroup.h" /* * The core of the module is identical to the ones included with fio, @@ -21,6 +23,32 @@ */ /* + * The io engine can define 
its own options within the io engine source. + * The option member must not be at offset 0, due to the way fio parses + * the given option. Just add a padding pointer unless the io engine has + * something usable. + */ +struct fio_skeleton_options { + void *pad; /* avoid ->off1 of fio_option becomes 0 */ + unsigned int dummy; +}; + +static struct fio_option options[] = { + { + .name = "dummy", + .lname = "ldummy", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct fio_skeleton_options, dummy), + .help = "Set dummy", + .category = FIO_OPT_C_ENGINE, /* always use this */ + .group = FIO_OPT_G_INVALID, /* this can be different */ + }, + { + .name = NULL, + }, +}; + +/* * The ->event() hook is called to match an event number with an io_u. * After the core has called ->getevents() and it has returned eg 3, * the ->event() hook must return the 3 events that have completed for @@ -38,7 +66,7 @@ * numbers. Required. */ static int fio_skeleton_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec *t) + unsigned int max, const struct timespec *t) { return 0; } @@ -62,7 +90,8 @@ * io_u->xfer_buflen. Residual data count may be set in io_u->resid * for a short read/write. */ -static int fio_skeleton_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_skeleton_queue(struct thread_data *td, + struct io_u *io_u) { /* * Double sanity check to catch errant write on a readonly setup @@ -99,7 +128,7 @@ } /* - * This is paired with the ->init() funtion and is called when a thread is + * This is paired with the ->init() function and is called when a thread is * done doing io. Should tear down anything setup by the ->init() function. * Not required. */ @@ -109,11 +138,11 @@ /* * Hook for opening the given file. Unless the engine has special - * needs, it usually just provides generic_file_open() as the handler. + * needs, it usually just provides generic_open_file() as the handler. 
*/ static int fio_skeleton_open(struct thread_data *td, struct fio_file *f) { - return generic_file_open(td, f); + return generic_open_file(td, f); } /* @@ -121,12 +150,12 @@ */ static int fio_skeleton_close(struct thread_data *td, struct fio_file *f) { - generic_file_close(td, f); + return generic_close_file(td, f); } /* * Note that the structure is exported, so that fio can get it via - * dlsym(..., "ioengine"); + * dlsym(..., "ioengine"); for (and only for) external engines. */ struct ioengine_ops ioengine = { .name = "engine_name", @@ -140,4 +169,6 @@ .cleanup = fio_skeleton_cleanup, .open_file = fio_skeleton_open, .close_file = fio_skeleton_close, + .options = options, + .option_struct_size = sizeof(struct fio_skeleton_options), }; diff -Nru fio-2.1.3/engines/solarisaio.c fio-3.16/engines/solarisaio.c --- fio-2.1.3/engines/solarisaio.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/solarisaio.c 2019-09-20 01:01:52.000000000 +0000 @@ -28,7 +28,7 @@ static int fio_solarisaio_prep(struct thread_data fio_unused *td, struct io_u *io_u) { - struct solarisaio_data *sd = td->io_ops->data; + struct solarisaio_data *sd = td->io_ops_data; io_u->resultp.aio_return = AIO_INPROGRESS; io_u->engine_data = sd; @@ -73,9 +73,9 @@ } static int fio_solarisaio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec *t) + unsigned int max, const struct timespec *t) { - struct solarisaio_data *sd = td->io_ops->data; + struct solarisaio_data *sd = td->io_ops_data; struct timeval tv; int ret; @@ -100,15 +100,15 @@ static struct io_u *fio_solarisaio_event(struct thread_data *td, int event) { - struct solarisaio_data *sd = td->io_ops->data; + struct solarisaio_data *sd = td->io_ops_data; return sd->aio_events[event]; } -static int fio_solarisaio_queue(struct thread_data fio_unused *td, +static enum fio_q_status fio_solarisaio_queue(struct thread_data fio_unused *td, struct io_u *io_u) { - struct solarisaio_data *sd = td->io_ops->data; + struct 
solarisaio_data *sd = td->io_ops_data; struct fio_file *f = io_u->file; off_t off; int ret; @@ -155,7 +155,7 @@ static void fio_solarisaio_cleanup(struct thread_data *td) { - struct solarisaio_data *sd = td->io_ops->data; + struct solarisaio_data *sd = td->io_ops_data; if (sd) { free(sd->aio_events); @@ -204,7 +204,7 @@ fio_solarisaio_init_sigio(); #endif - td->io_ops->data = sd; + td->io_ops_data = sd; return 0; } diff -Nru fio-2.1.3/engines/splice.c fio-3.16/engines/splice.c --- fio-2.1.3/engines/splice.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/splice.c 2019-09-20 01:01:52.000000000 +0000 @@ -9,8 +9,7 @@ #include #include #include -#include -#include +#include #include #include "../fio.h" @@ -28,11 +27,11 @@ */ static int fio_splice_read_old(struct thread_data *td, struct io_u *io_u) { - struct spliceio_data *sd = td->io_ops->data; + struct spliceio_data *sd = td->io_ops_data; struct fio_file *f = io_u->file; int ret, ret2, buflen; off_t offset; - void *p; + char *p; offset = io_u->offset; buflen = io_u->xfer_buflen; @@ -72,12 +71,13 @@ */ static int fio_splice_read(struct thread_data *td, struct io_u *io_u) { - struct spliceio_data *sd = td->io_ops->data; + struct spliceio_data *sd = td->io_ops_data; struct fio_file *f = io_u->file; struct iovec iov; int ret , buflen, mmap_len; off_t offset; - void *p, *map; + void *map; + char *p; ret = 0; offset = io_u->offset; @@ -166,7 +166,7 @@ */ static int fio_splice_write(struct thread_data *td, struct io_u *io_u) { - struct spliceio_data *sd = td->io_ops->data; + struct spliceio_data *sd = td->io_ops_data; struct iovec iov = { .iov_base = io_u->xfer_buf, .iov_len = io_u->xfer_buflen, @@ -199,9 +199,10 @@ return io_u->xfer_buflen; } -static int fio_spliceio_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_spliceio_queue(struct thread_data *td, + struct io_u *io_u) { - struct spliceio_data *sd = td->io_ops->data; + struct spliceio_data *sd = td->io_ops_data; int ret = 0; 
fio_ro_check(td, io_u); @@ -247,7 +248,7 @@ static void fio_spliceio_cleanup(struct thread_data *td) { - struct spliceio_data *sd = td->io_ops->data; + struct spliceio_data *sd = td->io_ops_data; if (sd) { close(sd->pipe[0]); @@ -277,14 +278,7 @@ */ sd->vmsplice_to_user_map = 1; - /* - * And if vmsplice_to_user works, we definitely need aligned - * buffers. Just set ->odirect to force that. - */ - if (td_read(td)) - td->o.mem_align = 1; - - td->io_ops->data = sd; + td->io_ops_data = sd; return 0; } diff -Nru fio-2.1.3/engines/sync.c fio-3.16/engines/sync.c --- fio-2.1.3/engines/sync.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/sync.c 2019-09-20 01:01:52.000000000 +0000 @@ -10,14 +10,15 @@ #include #include #include -#include #include "../fio.h" +#include "../optgroup.h" +#include "../lib/rand.h" /* * Sync engine uses engine_data to store last offset */ -#define LAST_POS(f) ((f)->engine_data) +#define LAST_POS(f) ((f)->engine_pos) struct syncio_data { struct iovec *iovecs; @@ -29,8 +30,45 @@ unsigned long long last_offset; struct fio_file *last_file; enum fio_ddir last_ddir; + + struct frand_state rand_state; +}; + +#ifdef FIO_HAVE_PWRITEV2 +struct psyncv2_options { + void *pad; + unsigned int hipri; + unsigned int hipri_percentage; }; +static struct fio_option options[] = { + { + .name = "hipri", + .lname = "RWF_HIPRI", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct psyncv2_options, hipri), + .help = "Set RWF_HIPRI for pwritev2/preadv2", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "hipri_percentage", + .lname = "RWF_HIPRI_PERCENTAGE", + .type = FIO_OPT_INT, + .off1 = offsetof(struct psyncv2_options, hipri_percentage), + .minval = 0, + .maxval = 100, + .def = "100", + .help = "Probabilistically set RWF_HIPRI for pwritev2/preadv2", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = NULL, + }, +}; +#endif + static int fio_syncio_prep(struct thread_data *td, struct io_u *io_u) { 
struct fio_file *f = io_u->file; @@ -63,16 +101,19 @@ io_u->error = errno; } - if (io_u->error) + if (io_u->error) { + io_u_log_error(td, io_u); td_verror(td, io_u->error, "xfer"); + } return FIO_Q_COMPLETED; } #ifdef CONFIG_PWRITEV -static int fio_pvsyncio_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_pvsyncio_queue(struct thread_data *td, + struct io_u *io_u) { - struct syncio_data *sd = td->io_ops->data; + struct syncio_data *sd = td->io_ops_data; struct iovec *iov = &sd->iovecs[0]; struct fio_file *f = io_u->file; int ret; @@ -96,7 +137,41 @@ } #endif -static int fio_psyncio_queue(struct thread_data *td, struct io_u *io_u) +#ifdef FIO_HAVE_PWRITEV2 +static enum fio_q_status fio_pvsyncio2_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct syncio_data *sd = td->io_ops_data; + struct psyncv2_options *o = td->eo; + struct iovec *iov = &sd->iovecs[0]; + struct fio_file *f = io_u->file; + int ret, flags = 0; + + fio_ro_check(td, io_u); + + if (o->hipri && + (rand_between(&sd->rand_state, 1, 100) <= o->hipri_percentage)) + flags |= RWF_HIPRI; + + iov->iov_base = io_u->xfer_buf; + iov->iov_len = io_u->xfer_buflen; + + if (io_u->ddir == DDIR_READ) + ret = preadv2(f->fd, iov, 1, io_u->offset, flags); + else if (io_u->ddir == DDIR_WRITE) + ret = pwritev2(f->fd, iov, 1, io_u->offset, flags); + else if (io_u->ddir == DDIR_TRIM) { + do_io_u_trim(td, io_u); + return FIO_Q_COMPLETED; + } else + ret = do_io_u_sync(td, io_u); + + return fio_io_end(td, io_u, ret); +} +#endif + +static enum fio_q_status fio_psyncio_queue(struct thread_data *td, + struct io_u *io_u) { struct fio_file *f = io_u->file; int ret; @@ -116,7 +191,8 @@ return fio_io_end(td, io_u, ret); } -static int fio_syncio_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_syncio_queue(struct thread_data *td, + struct io_u *io_u) { struct fio_file *f = io_u->file; int ret; @@ -138,9 +214,9 @@ static int fio_vsyncio_getevents(struct thread_data 
*td, unsigned int min, unsigned int max, - struct timespec fio_unused *t) + const struct timespec fio_unused *t) { - struct syncio_data *sd = td->io_ops->data; + struct syncio_data *sd = td->io_ops_data; int ret; if (min) { @@ -155,14 +231,14 @@ static struct io_u *fio_vsyncio_event(struct thread_data *td, int event) { - struct syncio_data *sd = td->io_ops->data; + struct syncio_data *sd = td->io_ops_data; return sd->io_us[event]; } static int fio_vsyncio_append(struct thread_data *td, struct io_u *io_u) { - struct syncio_data *sd = td->io_ops->data; + struct syncio_data *sd = td->io_ops_data; if (ddir_sync(io_u->ddir)) return 0; @@ -187,9 +263,10 @@ sd->queued++; } -static int fio_vsyncio_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_vsyncio_queue(struct thread_data *td, + struct io_u *io_u) { - struct syncio_data *sd = td->io_ops->data; + struct syncio_data *sd = td->io_ops_data; fio_ro_check(td, io_u); @@ -229,7 +306,7 @@ */ static int fio_vsyncio_end(struct thread_data *td, ssize_t bytes) { - struct syncio_data *sd = td->io_ops->data; + struct syncio_data *sd = td->io_ops_data; struct io_u *io_u; unsigned int i; int err; @@ -269,7 +346,7 @@ static int fio_vsyncio_commit(struct thread_data *td) { - struct syncio_data *sd = td->io_ops->data; + struct syncio_data *sd = td->io_ops_data; struct fio_file *f; ssize_t ret; @@ -306,18 +383,21 @@ sd->last_offset = -1ULL; sd->iovecs = malloc(td->o.iodepth * sizeof(struct iovec)); sd->io_us = malloc(td->o.iodepth * sizeof(struct io_u *)); + init_rand(&sd->rand_state, 0); - td->io_ops->data = sd; + td->io_ops_data = sd; return 0; } static void fio_vsyncio_cleanup(struct thread_data *td) { - struct syncio_data *sd = td->io_ops->data; + struct syncio_data *sd = td->io_ops_data; - free(sd->iovecs); - free(sd->io_us); - free(sd); + if (sd) { + free(sd->iovecs); + free(sd->io_us); + free(sd); + } } static struct ioengine_ops ioengine_rw = { @@ -370,6 +450,22 @@ }; #endif +#ifdef FIO_HAVE_PWRITEV2 
+static struct ioengine_ops ioengine_pvrw2 = { + .name = "pvsync2", + .version = FIO_IOOPS_VERSION, + .init = fio_vsyncio_init, + .cleanup = fio_vsyncio_cleanup, + .queue = fio_pvsyncio2_queue, + .open_file = generic_open_file, + .close_file = generic_close_file, + .get_file_size = generic_get_file_size, + .flags = FIO_SYNCIO, + .options = options, + .option_struct_size = sizeof(struct psyncv2_options), +}; +#endif + static void fio_init fio_syncio_register(void) { register_ioengine(&ioengine_rw); @@ -378,6 +474,9 @@ #ifdef CONFIG_PWRITEV register_ioengine(&ioengine_pvrw); #endif +#ifdef FIO_HAVE_PWRITEV2 + register_ioengine(&ioengine_pvrw2); +#endif } static void fio_exit fio_syncio_unregister(void) @@ -388,4 +487,7 @@ #ifdef CONFIG_PWRITEV unregister_ioengine(&ioengine_pvrw); #endif +#ifdef FIO_HAVE_PWRITEV2 + unregister_ioengine(&ioengine_pvrw2); +#endif } diff -Nru fio-2.1.3/engines/windowsaio.c fio-3.16/engines/windowsaio.c --- fio-2.1.3/engines/windowsaio.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/engines/windowsaio.c 2019-09-20 01:01:52.000000000 +0000 @@ -35,17 +35,7 @@ struct windowsaio_data *wd; }; -static BOOL timeout_expired(DWORD start_count, DWORD end_count); -static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec *t); -static struct io_u *fio_windowsaio_event(struct thread_data *td, int event); -static int fio_windowsaio_queue(struct thread_data *td, - struct io_u *io_u); -static void fio_windowsaio_cleanup(struct thread_data *td); static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter); -static int fio_windowsaio_init(struct thread_data *td); -static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f); -static int fio_windowsaio_close_file(struct thread_data fio_unused *td, struct fio_file *f); static int fio_windowsaio_init(struct thread_data *td) { @@ -84,7 +74,7 @@ } } - td->io_ops->data = wd; + td->io_ops_data = wd; if (!rc) { struct thread_ctx *ctx; @@ 
-97,26 +87,29 @@ rc = 1; } - wd = td->io_ops->data; + wd = td->io_ops_data; wd->iothread_running = TRUE; wd->iocp = hFile; if (!rc) ctx = malloc(sizeof(struct thread_ctx)); - if (!rc && ctx == NULL) - { + if (!rc && ctx == NULL) { log_err("windowsaio: failed to allocate memory for thread context structure\n"); CloseHandle(hFile); rc = 1; } - if (!rc) - { + if (!rc) { + DWORD threadid; + ctx->iocp = hFile; ctx->wd = wd; - wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, NULL); - if (wd->iothread == NULL) + wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid); + + if (wd->iothread != NULL) + fio_setaffinity(threadid, td->o.cpumask); + else log_err("windowsaio: failed to create io completion thread\n"); } @@ -131,7 +124,7 @@ { struct windowsaio_data *wd; - wd = td->io_ops->data; + wd = td->io_ops_data; if (wd != NULL) { wd->iothread_running = FALSE; @@ -143,10 +136,47 @@ free(wd->aio_events); free(wd); - td->io_ops->data = NULL; + td->io_ops_data = NULL; } } +static int windowsaio_invalidate_cache(struct fio_file *f) +{ + DWORD error; + DWORD isharemode = (FILE_SHARE_DELETE | FILE_SHARE_READ | + FILE_SHARE_WRITE); + HANDLE ihFile; + int rc = 0; + + /* + * Encourage Windows to drop cached parts of a file by temporarily + * opening it for non-buffered access. Note: this will only work when + * the following is the only thing with the file open on the whole + * system. 
+ */ + dprint(FD_IO, "windowaio: attempt invalidate cache for %s\n", + f->file_name); + ihFile = CreateFile(f->file_name, 0, isharemode, NULL, OPEN_EXISTING, + FILE_FLAG_NO_BUFFERING, NULL); + + if (ihFile != INVALID_HANDLE_VALUE) { + if (!CloseHandle(ihFile)) { + error = GetLastError(); + log_info("windowsaio: invalidation fd close %s " + "failed: error %d\n", f->file_name, error); + rc = 1; + } + } else { + error = GetLastError(); + if (error != ERROR_FILE_NOT_FOUND) { + log_info("windowsaio: cache invalidation of %s failed: " + "error %d\n", f->file_name, error); + rc = 1; + } + } + + return rc; +} static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f) { @@ -175,13 +205,26 @@ /* * Inform Windows whether we're going to be doing sequential or - * random io so it can tune the Cache Manager + * random IO so it can tune the Cache Manager */ - if (td->o.td_ddir == TD_DDIR_READ || - td->o.td_ddir == TD_DDIR_WRITE) - flags |= FILE_FLAG_SEQUENTIAL_SCAN; - else + switch (td->o.fadvise_hint) { + case F_ADV_TYPE: + if (td_random(td)) + flags |= FILE_FLAG_RANDOM_ACCESS; + else + flags |= FILE_FLAG_SEQUENTIAL_SCAN; + break; + case F_ADV_RANDOM: flags |= FILE_FLAG_RANDOM_ACCESS; + break; + case F_ADV_SEQUENTIAL: + flags |= FILE_FLAG_SEQUENTIAL_SCAN; + break; + case F_ADV_NONE: + break; + default: + log_err("fio: unknown fadvise type %d\n", td->o.fadvise_hint); + } if (!td_write(td) || read_only) access = GENERIC_READ; @@ -193,6 +236,11 @@ else openmode = OPEN_EXISTING; + /* If we're going to use direct I/O, Windows will try and invalidate + * its cache at that point so there's no need to do it here */ + if (td->o.invalidate_cache && !td->o.odirect) + windowsaio_invalidate_cache(f); + f->hFile = CreateFile(f->file_name, access, sharemode, NULL, openmode, flags, NULL); @@ -203,10 +251,10 @@ /* Only set up the completion port and thread if we're not just * querying the device size */ - if (!rc && td->io_ops->data != NULL) { + if (!rc && td->io_ops_data != 
NULL) { struct windowsaio_data *wd; - wd = td->io_ops->data; + wd = td->io_ops_data; if (CreateIoCompletionPort(f->hFile, wd->iocp, 0, 0) == NULL) { log_err("windowsaio: failed to create io completion port\n"); @@ -251,14 +299,15 @@ static struct io_u* fio_windowsaio_event(struct thread_data *td, int event) { - struct windowsaio_data *wd = td->io_ops->data; + struct windowsaio_data *wd = td->io_ops_data; return wd->aio_events[event]; } static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec *t) + unsigned int max, + const struct timespec *t) { - struct windowsaio_data *wd = td->io_ops->data; + struct windowsaio_data *wd = td->io_ops_data; unsigned int dequeued = 0; struct io_u *io_u; int i; @@ -283,14 +332,13 @@ if (fov->io_complete) { fov->io_complete = FALSE; - ResetEvent(fov->o.hEvent); wd->aio_events[dequeued] = io_u; dequeued++; } - if (dequeued >= min) - break; } + if (dequeued >= min) + break; if (dequeued < min) { status = WaitForSingleObject(wd->iocomplete_event, mswait); @@ -298,34 +346,37 @@ break; } - if (dequeued >= min || (t != NULL && timeout_expired(start_count, end_count))) + if (dequeued >= min || + (t != NULL && timeout_expired(start_count, end_count))) break; } while (1); return dequeued; } -static int fio_windowsaio_queue(struct thread_data *td, struct io_u *io_u) +static enum fio_q_status fio_windowsaio_queue(struct thread_data *td, + struct io_u *io_u) { struct fio_overlapped *o = io_u->engine_data; LPOVERLAPPED lpOvl = &o->o; - DWORD iobytes; BOOL success = FALSE; int rc = FIO_Q_COMPLETED; fio_ro_check(td, io_u); - lpOvl->Internal = STATUS_PENDING; + lpOvl->Internal = 0; lpOvl->InternalHigh = 0; lpOvl->Offset = io_u->offset & 0xFFFFFFFF; lpOvl->OffsetHigh = io_u->offset >> 32; switch (io_u->ddir) { case DDIR_WRITE: - success = WriteFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, &iobytes, lpOvl); + success = WriteFile(io_u->file->hFile, io_u->xfer_buf, + io_u->xfer_buflen, 
NULL, lpOvl); break; case DDIR_READ: - success = ReadFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, &iobytes, lpOvl); + success = ReadFile(io_u->file->hFile, io_u->xfer_buf, + io_u->xfer_buflen, NULL, lpOvl); break; case DDIR_SYNC: case DDIR_DATASYNC: @@ -337,13 +388,11 @@ } return FIO_Q_COMPLETED; - break; case DDIR_TRIM: log_err("windowsaio: manual TRIM isn't supported on Windows\n"); io_u->error = 1; io_u->resid = io_u->xfer_buflen; return FIO_Q_COMPLETED; - break; default: assert(0); break; @@ -374,7 +423,11 @@ wd = ctx->wd; do { - if (!GetQueuedCompletionStatus(ctx->iocp, &bytes, &ulKey, &ovl, 250) && ovl == NULL) + BOOL ret; + + ret = GetQueuedCompletionStatus(ctx->iocp, &bytes, &ulKey, + &ovl, 250); + if (!ret && ovl == NULL) continue; fov = CONTAINING_RECORD(ovl, struct fio_overlapped, o); @@ -402,7 +455,6 @@ struct fio_overlapped *o = io_u->engine_data; if (o) { - CloseHandle(o->o.hEvent); io_u->engine_data = NULL; free(o); } @@ -415,13 +467,7 @@ o = malloc(sizeof(*o)); o->io_complete = FALSE; o->io_u = io_u; - o->o.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); - if (o->o.hEvent == NULL) { - log_err("windowsaio: failed to create event handle\n"); - free(o); - return 1; - } - + o->o.hEvent = NULL; io_u->engine_data = o; return 0; } diff -Nru fio-2.1.3/err.h fio-3.16/err.h --- fio-2.1.3/err.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/err.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,44 @@ +#ifndef FIO_ERR_H +#define FIO_ERR_H + +/* + * Kernel pointers have redundant information, so we can use a + * scheme where we can return either an error code or a dentry + * pointer with the same return value. + * + * This should be a per-architecture thing, to allow different + * error and pointer decisions. 
+ */ +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) ((x) >= (uintptr_t)-MAX_ERRNO) + +static inline void *ERR_PTR(uintptr_t error) +{ + return (void *) error; +} + +static inline uintptr_t PTR_ERR(const void *ptr) +{ + return (uintptr_t) ptr; +} + +static inline uintptr_t IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((uintptr_t)ptr); +} + +static inline uintptr_t IS_ERR_OR_NULL(const void *ptr) +{ + return !ptr || IS_ERR_VALUE((uintptr_t)ptr); +} + +static inline int PTR_ERR_OR_ZERO(const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} + +#endif diff -Nru fio-2.1.3/eta.c fio-3.16/eta.c --- fio-2.1.3/eta.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/eta.c 2019-09-20 01:01:52.000000000 +0000 @@ -2,19 +2,43 @@ * Status and ETA code */ #include -#include #include +#ifdef CONFIG_VALGRIND_DEV +#include +#else +#define DRD_IGNORE_VAR(x) do { } while (0) +#endif #include "fio.h" +#include "lib/pow2.h" -static char run_str[REAL_MAX_JOBS + 1]; +static char __run_str[REAL_MAX_JOBS + 1]; +static char run_str[__THREAD_RUNSTR_SZ(REAL_MAX_JOBS) + 1]; + +static void update_condensed_str(char *rstr, char *run_str_condensed) +{ + if (*rstr) { + while (*rstr) { + int nr = 1; + + *run_str_condensed++ = *rstr++; + while (*(rstr - 1) == *rstr) { + rstr++; + nr++; + } + run_str_condensed += sprintf(run_str_condensed, "(%u),", nr); + } + run_str_condensed--; + } + *run_str_condensed = '\0'; +} /* * Sets the status of the 'td' in the printed status map. 
*/ static void check_str_update(struct thread_data *td) { - char c = run_str[td->thread_number - 1]; + char c = __run_str[td->thread_number - 1]; switch (td->runstate) { case TD_REAPED: @@ -74,6 +98,9 @@ case TD_FSYNCING: c = 'F'; break; + case TD_FINISHING: + c = 'f'; + break; case TD_CREATED: c = 'C'; break; @@ -88,7 +115,8 @@ log_err("state %d\n", td->runstate); } - run_str[td->thread_number - 1] = c; + __run_str[td->thread_number - 1] = c; + update_condensed_str(__run_str, run_str); } /* @@ -99,6 +127,11 @@ unsigned int d, h, m, s; int disp_hour = 0; + if (eta_sec == -1) { + sprintf(str, "--"); + return; + } + s = eta_sec % 60; eta_sec /= 60; m = eta_sec % 60; @@ -116,22 +149,27 @@ str += sprintf(str, "%02uh:", h); str += sprintf(str, "%02um:", m); - str += sprintf(str, "%02us", s); + sprintf(str, "%02us", s); } /* * Best effort calculation of the estimated pending runtime of a job. */ -static int thread_eta(struct thread_data *td) +static unsigned long thread_eta(struct thread_data *td) { unsigned long long bytes_total, bytes_done; unsigned long eta_sec = 0; unsigned long elapsed; + uint64_t timeout; elapsed = (mtime_since_now(&td->epoch) + 999) / 1000; + timeout = td->o.timeout / 1000000UL; bytes_total = td->total_io_size; + if (td->flags & TD_F_NO_PROGRESS) + return -1; + if (td->o.fill_device && td->o.size == -1ULL) { if (!td->fill_device_size || td->fill_device_size == -1ULL) return 0; @@ -139,12 +177,27 @@ bytes_total = td->fill_device_size; } - if (td->o.zone_size && td->o.zone_skip && bytes_total) { + /* + * If io_size is set, bytes_total is an exact value that does not need + * adjustment. 
+ */ + if (td->o.zone_size && td->o.zone_skip && bytes_total && + !fio_option_is_set(&td->o, io_size)) { unsigned int nr_zones; uint64_t zone_bytes; - zone_bytes = bytes_total + td->o.zone_size + td->o.zone_skip; - nr_zones = (zone_bytes - 1) / (td->o.zone_size + td->o.zone_skip); + /* + * Calculate the upper bound of the number of zones that will + * be processed, including skipped bytes between zones. If this + * is larger than total_io_size (e.g. when --io_size or --size + * specify a small value), use the lower bound to avoid + * adjustments to a negative value that would result in a very + * large bytes_total and an incorrect eta. + */ + zone_bytes = td->o.zone_size + td->o.zone_skip; + nr_zones = (bytes_total + zone_bytes - 1) / zone_bytes; + if (bytes_total < nr_zones * td->o.zone_skip) + nr_zones = bytes_total / zone_bytes; bytes_total -= nr_zones * td->o.zone_skip; } @@ -169,27 +222,43 @@ double perc, perc_t; bytes_done = ddir_rw_sum(td->io_bytes); - perc = (double) bytes_done / (double) bytes_total; - if (perc > 1.0) - perc = 1.0; + + if (bytes_total) { + perc = (double) bytes_done / (double) bytes_total; + if (perc > 1.0) + perc = 1.0; + } else + perc = 0.0; if (td->o.time_based) { - perc_t = (double) elapsed / (double) td->o.timeout; - if (perc_t < perc) - perc = perc_t; + if (timeout) { + perc_t = (double) elapsed / (double) timeout; + if (perc_t < perc) + perc = perc_t; + } else { + /* + * Will never hit, we can't have time_based + * without a timeout set. 
+ */ + perc = 0.0; + } } - eta_sec = (unsigned long) (elapsed * (1.0 / perc)) - elapsed; + if (perc == 0.0) { + eta_sec = timeout; + } else { + eta_sec = (unsigned long) (elapsed * (1.0 / perc)) - elapsed; + } if (td->o.timeout && - eta_sec > (td->o.timeout + done_secs - elapsed)) - eta_sec = td->o.timeout + done_secs - elapsed; + eta_sec > (timeout + done_secs - elapsed)) + eta_sec = timeout + done_secs - elapsed; } else if (td->runstate == TD_NOT_CREATED || td->runstate == TD_CREATED || td->runstate == TD_INITIALIZED || td->runstate == TD_SETTING_UP || td->runstate == TD_RAMP || td->runstate == TD_PRE_READING) { - int t_eta = 0, r_eta = 0; + int64_t t_eta = 0, r_eta = 0; unsigned long long rate_bytes; /* @@ -197,10 +266,17 @@ * if given, otherwise assume it'll run at the specified rate. */ if (td->o.timeout) { - t_eta = td->o.timeout + td->o.start_delay + - td->o.ramp_time; + uint64_t __timeout = td->o.timeout; + uint64_t start_delay = td->o.start_delay; + uint64_t ramp_time = td->o.ramp_time; + + t_eta = __timeout + start_delay; + if (!td->ramp_time_over) { + t_eta += ramp_time; + } + t_eta /= 1000000ULL; - if (in_ramp_time(td)) { + if ((td->runstate == TD_RAMP) && in_ramp_time(td)) { unsigned long ramp_left; ramp_left = mtime_since_now(&td->epoch); @@ -209,10 +285,17 @@ t_eta -= ramp_left; } } - rate_bytes = ddir_rw_sum(td->o.rate); + rate_bytes = 0; + if (td_read(td)) + rate_bytes = td->o.rate[DDIR_READ]; + if (td_write(td)) + rate_bytes += td->o.rate[DDIR_WRITE]; + if (td_trim(td)) + rate_bytes += td->o.rate[DDIR_TRIM]; + if (rate_bytes) { - r_eta = (bytes_total / 1024) / rate_bytes; - r_eta += td->o.start_delay; + r_eta = bytes_total / rate_bytes; + r_eta += (td->o.start_delay / 1000000ULL); } if (r_eta && t_eta) @@ -235,19 +318,24 @@ static void calc_rate(int unified_rw_rep, unsigned long mtime, unsigned long long *io_bytes, - unsigned long long *prev_io_bytes, unsigned int *rate) + unsigned long long *prev_io_bytes, uint64_t *rate) { int i; for (i = 0; i < 
DDIR_RWDIR_CNT; i++) { - unsigned long long diff; + unsigned long long diff, this_rate; diff = io_bytes[i] - prev_io_bytes[i]; + if (mtime) + this_rate = ((1000 * diff) / mtime) / 1024; /* KiB/s */ + else + this_rate = 0; + if (unified_rw_rep) { rate[i] = 0; - rate[0] += ((1000 * diff) / mtime) / 1024; + rate[0] += this_rate; } else - rate[i] = ((1000 * diff) / mtime) / 1024; + rate[i] = this_rate; prev_io_bytes[i] = io_bytes[i]; } @@ -260,46 +348,59 @@ int i; for (i = 0; i < DDIR_RWDIR_CNT; i++) { - unsigned long long diff; + unsigned long long diff, this_iops; diff = io_iops[i] - prev_io_iops[i]; + if (mtime) + this_iops = (diff * 1000) / mtime; + else + this_iops = 0; + if (unified_rw_rep) { iops[i] = 0; - iops[0] += (diff * 1000) / mtime; + iops[0] += this_iops; } else - iops[i] = (diff * 1000) / mtime; + iops[i] = this_iops; prev_io_iops[i] = io_iops[i]; } } /* + * Allow a little slack - if we're within 95% of the time, allow ETA. + */ +bool eta_time_within_slack(unsigned int time) +{ + return time > ((eta_interval_msec * 95) / 100); +} + +/* * Print status of the jobs we know about. This includes rate estimates, * ETA, thread state, etc. 
*/ -int calc_thread_status(struct jobs_eta *je, int force) +bool calc_thread_status(struct jobs_eta *je, int force) { struct thread_data *td; int i, unified_rw_rep; - unsigned long rate_time, disp_time, bw_avg_time, *eta_secs; + uint64_t rate_time, disp_time, bw_avg_time, *eta_secs; unsigned long long io_bytes[DDIR_RWDIR_CNT]; unsigned long long io_iops[DDIR_RWDIR_CNT]; - struct timeval now; + struct timespec now; static unsigned long long rate_io_bytes[DDIR_RWDIR_CNT]; static unsigned long long disp_io_bytes[DDIR_RWDIR_CNT]; static unsigned long long disp_io_iops[DDIR_RWDIR_CNT]; - static struct timeval rate_prev_time, disp_prev_time; + static struct timespec rate_prev_time, disp_prev_time; if (!force) { - if (output_format != FIO_OUTPUT_NORMAL && + if (!(output_format & FIO_OUTPUT_NORMAL) && f_out == stdout) - return 0; + return false; if (temp_stall_ts || eta_print == FIO_ETA_NEVER) - return 0; + return false; if (!isatty(STDOUT_FILENO) && (eta_print != FIO_ETA_ALWAYS)) - return 0; + return false; } if (!ddir_rw_sum(rate_io_bytes)) @@ -307,8 +408,8 @@ if (!ddir_rw_sum(disp_io_bytes)) fill_start_time(&disp_prev_time); - eta_secs = malloc(thread_number * sizeof(unsigned long)); - memset(eta_secs, 0, thread_number * sizeof(unsigned long)); + eta_secs = malloc(thread_number * sizeof(uint64_t)); + memset(eta_secs, 0, thread_number * sizeof(uint64_t)); je->elapsed_sec = (mtime_since_genesis() + 999) / 1000; @@ -325,7 +426,8 @@ bw_avg_time = td->o.bw_avg_time; if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING || td->runstate == TD_FSYNCING - || td->runstate == TD_PRE_READING) { + || td->runstate == TD_PRE_READING + || td->runstate == TD_FINISHING) { je->nr_running++; if (td_read(td)) { je->t_rate[0] += td->o.rate[DDIR_READ]; @@ -350,10 +452,9 @@ } else if (td->runstate == TD_RAMP) { je->nr_running++; je->nr_ramp++; - } else if (td->runstate == TD_SETTING_UP) { - je->nr_running++; + } else if (td->runstate == TD_SETTING_UP) je->nr_setting_up++; - } else if 
(td->runstate < TD_RUNNING) + else if (td->runstate < TD_RUNNING) je->nr_pending++; if (je->elapsed_sec >= 3) @@ -366,7 +467,7 @@ if (td->runstate > TD_SETTING_UP) { int ddir; - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { if (unified_rw_rep) { io_bytes[0] += td->io_bytes[ddir]; io_iops[0] += td->io_blocks[ddir]; @@ -378,19 +479,25 @@ } } - if (exitall_on_terminate) + if (exitall_on_terminate) { je->eta_sec = INT_MAX; - else - je->eta_sec = 0; - - for_each_td(td, i) { - if (exitall_on_terminate) { + for_each_td(td, i) { if (eta_secs[i] < je->eta_sec) je->eta_sec = eta_secs[i]; - } else { - if (eta_secs[i] > je->eta_sec) - je->eta_sec = eta_secs[i]; } + } else { + unsigned long eta_stone = 0; + + je->eta_sec = 0; + for_each_td(td, i) { + if ((td->runstate == TD_NOT_CREATED) && td->o.stonewall) + eta_stone += eta_secs[i]; + else { + if (eta_secs[i] > je->eta_sec) + je->eta_sec = eta_secs[i]; + } + } + je->eta_sec += eta_stone; } free(eta_secs); @@ -402,18 +509,15 @@ calc_rate(unified_rw_rep, rate_time, io_bytes, rate_io_bytes, je->rate); memcpy(&rate_prev_time, &now, sizeof(now)); - add_agg_sample(je->rate[DDIR_READ], DDIR_READ, 0); - add_agg_sample(je->rate[DDIR_WRITE], DDIR_WRITE, 0); - add_agg_sample(je->rate[DDIR_TRIM], DDIR_TRIM, 0); + add_agg_sample(sample_val(je->rate[DDIR_READ]), DDIR_READ, 0); + add_agg_sample(sample_val(je->rate[DDIR_WRITE]), DDIR_WRITE, 0); + add_agg_sample(sample_val(je->rate[DDIR_TRIM]), DDIR_TRIM, 0); } disp_time = mtime_since(&disp_prev_time, &now); - /* - * Allow a little slack, the target is to print it every 1000 msecs - */ - if (!force && disp_time < 900) - return 0; + if (!force && !eta_time_within_slack(disp_time)) + return false; calc_rate(unified_rw_rep, disp_time, io_bytes, disp_io_bytes, je->rate); calc_iops(unified_rw_rep, disp_time, io_iops, disp_io_iops, je->iops); @@ -421,20 +525,78 @@ memcpy(&disp_prev_time, &now, sizeof(now)); if (!force && !je->nr_running && 
!je->nr_pending) - return 0; + return false; je->nr_threads = thread_number; - memcpy(je->run_str, run_str, thread_number * sizeof(char)); - return 1; + update_condensed_str(__run_str, run_str); + memcpy(je->run_str, run_str, strlen(run_str)); + return true; +} + +static int gen_eta_str(struct jobs_eta *je, char *p, size_t left, + char **rate_str, char **iops_str) +{ + bool has_r = je->rate[DDIR_READ] || je->iops[DDIR_READ]; + bool has_w = je->rate[DDIR_WRITE] || je->iops[DDIR_WRITE]; + bool has_t = je->rate[DDIR_TRIM] || je->iops[DDIR_TRIM]; + int l = 0; + + if (!has_r && !has_w && !has_t) + return 0; + + if (has_r) { + l += snprintf(p + l, left - l, "[r=%s", rate_str[DDIR_READ]); + if (!has_w) + l += snprintf(p + l, left - l, "]"); + } + if (has_w) { + if (has_r) + l += snprintf(p + l, left - l, ","); + else + l += snprintf(p + l, left - l, "["); + l += snprintf(p + l, left - l, "w=%s", rate_str[DDIR_WRITE]); + if (!has_t) + l += snprintf(p + l, left - l, "]"); + } + if (has_t) { + if (has_r || has_w) + l += snprintf(p + l, left - l, ","); + else if (!has_r && !has_w) + l += snprintf(p + l, left - l, "["); + l += snprintf(p + l, left - l, "t=%s]", rate_str[DDIR_TRIM]); + } + if (has_r) { + l += snprintf(p + l, left - l, "[r=%s", iops_str[DDIR_READ]); + if (!has_w) + l += snprintf(p + l, left - l, " IOPS]"); + } + if (has_w) { + if (has_r) + l += snprintf(p + l, left - l, ","); + else + l += snprintf(p + l, left - l, "["); + l += snprintf(p + l, left - l, "w=%s", iops_str[DDIR_WRITE]); + if (!has_t) + l += snprintf(p + l, left - l, " IOPS]"); + } + if (has_t) { + if (has_r || has_w) + l += snprintf(p + l, left - l, ","); + else if (!has_r && !has_w) + l += snprintf(p + l, left - l, "["); + l += snprintf(p + l, left - l, "t=%s IOPS]", iops_str[DDIR_TRIM]); + } + + return l; } void display_thread_status(struct jobs_eta *je) { - static struct timeval disp_eta_new_line; + static struct timespec disp_eta_new_line; static int eta_new_line_init, eta_new_line_pending; 
static int linelen_last; static int eta_good; - char output[REAL_MAX_JOBS + 512], *p = output; + char output[__THREAD_RUNSTR_SZ(REAL_MAX_JOBS) + 512], *p = output; char eta_str[128]; double perc = 0.0; @@ -445,23 +607,33 @@ if (eta_new_line_pending) { eta_new_line_pending = 0; + linelen_last = 0; p += sprintf(p, "\n"); } p += sprintf(p, "Jobs: %d (f=%d)", je->nr_running, je->files_open); - if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) { + + /* rate limits, if any */ + if (je->m_rate[0] || je->m_rate[1] || je->m_rate[2] || + je->t_rate[0] || je->t_rate[1] || je->t_rate[2]) { char *tr, *mr; - mr = num2str(je->m_rate[0] + je->m_rate[1], 4, 0, je->is_pow2, 8); - tr = num2str(je->t_rate[0] + je->t_rate[1], 4, 0, je->is_pow2, 8); - p += sprintf(p, ", CR=%s/%s KB/s", tr, mr); + mr = num2str(je->m_rate[0] + je->m_rate[1] + je->m_rate[2], + je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC); + tr = num2str(je->t_rate[0] + je->t_rate[1] + je->t_rate[2], + je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC); + + p += sprintf(p, ", %s-%s", mr, tr); free(tr); free(mr); - } else if (je->m_iops[0] || je->m_iops[1] || je->t_iops[0] || je->t_iops[1]) { - p += sprintf(p, ", CR=%d/%d IOPS", - je->t_iops[0] + je->t_iops[1], - je->m_iops[0] + je->m_iops[1]); + } else if (je->m_iops[0] || je->m_iops[1] || je->m_iops[2] || + je->t_iops[0] || je->t_iops[1] || je->t_iops[2]) { + p += sprintf(p, ", %d-%d IOPS", + je->m_iops[0] + je->m_iops[1] + je->m_iops[2], + je->t_iops[0] + je->t_iops[1] + je->t_iops[2]); } + + /* current run string, % done, bandwidth, iops, eta */ if (je->eta_sec != INT_MAX && je->nr_running) { char perc_str[32]; char *iops_str[DDIR_RWDIR_CNT]; @@ -469,9 +641,11 @@ size_t left; int l; int ddir; + int linelen; - if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running) - strcpy(perc_str, "-.-% done"); + if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running || + je->eta_sec == -1) + strcpy(perc_str, "-.-%"); else { double mult = 100.0; @@ 
-480,41 +654,42 @@ eta_good = 1; perc *= mult; - sprintf(perc_str, "%3.1f%% done", perc); + sprintf(perc_str, "%3.1f%%", perc); } - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { - rate_str[ddir] = num2str(je->rate[ddir], 5, + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + rate_str[ddir] = num2str(je->rate[ddir], 4, 1024, je->is_pow2, je->unit_base); - iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0, 0); + iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0, N2S_NONE); } left = sizeof(output) - (p - output) - 1; - - l = snprintf(p, left, ": [%s] [%s] [%s/%s/%s /s] [%s/%s/%s iops] [eta %s]", - je->run_str, perc_str, rate_str[DDIR_READ], - rate_str[DDIR_WRITE], rate_str[DDIR_TRIM], - iops_str[DDIR_READ], iops_str[DDIR_WRITE], - iops_str[DDIR_TRIM], eta_str); + l = snprintf(p, left, ": [%s][%s]", je->run_str, perc_str); + l += gen_eta_str(je, p + l, left - l, rate_str, iops_str); + l += snprintf(p + l, left - l, "[eta %s]", eta_str); + + /* If truncation occurred adjust l so p is on the null */ + if (l >= left) + l = left - 1; p += l; - if (l >= 0 && l < linelen_last) - p += sprintf(p, "%*s", linelen_last - l, ""); - linelen_last = l; + linelen = p - output; + if (l >= 0 && linelen < linelen_last) + p += sprintf(p, "%*s", linelen_last - linelen, ""); + linelen_last = linelen; - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { free(rate_str[ddir]); free(iops_str[ddir]); } } - p += sprintf(p, "\r"); + sprintf(p, "\r"); printf("%s", output); if (!eta_new_line_init) { fio_gettime(&disp_eta_new_line, NULL); eta_new_line_init = 1; - } else if (eta_new_line && - mtime_since_now(&disp_eta_new_line) > eta_new_line * 1000) { + } else if (eta_new_line && mtime_since_now(&disp_eta_new_line) > eta_new_line) { fio_gettime(&disp_eta_new_line, NULL); eta_new_line_pending = 1; } @@ -522,19 +697,35 @@ fflush(stdout); } -void print_thread_status(void) +struct jobs_eta *get_jobs_eta(bool force, size_t *size) { struct 
jobs_eta *je; - size_t size; if (!thread_number) - return; + return NULL; - size = sizeof(*je) + thread_number * sizeof(char) + 1; - je = malloc(size); - memset(je, 0, size); + *size = sizeof(*je) + THREAD_RUNSTR_SZ + 8; + je = malloc(*size); + if (!je) + return NULL; + memset(je, 0, *size); + + if (!calc_thread_status(je, force)) { + free(je); + return NULL; + } - if (calc_thread_status(je, 0)) + *size = sizeof(*je) + strlen((char *) je->run_str) + 1; + return je; +} + +void print_thread_status(void) +{ + struct jobs_eta *je; + size_t size; + + je = get_jobs_eta(false, &size); + if (je) display_thread_status(je); free(je); @@ -542,5 +733,11 @@ void print_status_init(int thr_number) { - run_str[thr_number] = 'P'; + struct jobs_eta_packed jep; + + compiletime_assert(sizeof(struct jobs_eta) == sizeof(jep), "jobs_eta"); + + DRD_IGNORE_VAR(__run_str); + __run_str[thr_number] = 'P'; + update_condensed_str(__run_str, run_str); } diff -Nru fio-2.1.3/examples/backwards-read.fio fio-3.16/examples/backwards-read.fio --- fio-2.1.3/examples/backwards-read.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/backwards-read.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,8 @@ +# Demonstrates how to read backwards in a file. + +[backwards-read] +bs=4k +# seek -8k back for every IO +rw=read:-8k +filename=128m +size=128m diff -Nru fio-2.1.3/examples/basic-verify.fio fio-3.16/examples/basic-verify.fio --- fio-2.1.3/examples/basic-verify.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/basic-verify.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,12 @@ +# The most basic form of data verification. Write the device randomly +# in 4K chunks, then read it back and verify the contents. +[write-and-verify] +rw=randwrite +bs=4k +direct=1 +ioengine=libaio +iodepth=16 +verify=crc32c +# Use /dev/XXX. For running this on a file instead, remove the filename +# option and add a size=32G (or whatever file size you want) instead. 
+filename=/dev/XXX diff -Nru fio-2.1.3/examples/butterfly.fio fio-3.16/examples/butterfly.fio --- fio-2.1.3/examples/butterfly.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/butterfly.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,19 @@ +# Perform a butterfly/funnel seek pattern. This won't always alternate ends on +# every I/O but it will get close. + +[global] +filename=/tmp/testfile +bs=4k +direct=1 + +[forward] +rw=read +flow=2 +# Uncomment the size= and offset= lines to prevent each direction going past +# the middle of the file +#size=50% + +[backward] +rw=read:-8k +flow=-2 +#offset=50% diff -Nru fio-2.1.3/examples/cpp_null.fio fio-3.16/examples/cpp_null.fio --- fio-2.1.3/examples/cpp_null.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/cpp_null.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,10 @@ +[global] +bs=4k +gtod_reduce=1 + +[null] +ioengine=cpp_null +size=100g +rw=randread +norandommap +time_based=0 diff -Nru fio-2.1.3/examples/cross-stripe-verify.fio fio-3.16/examples/cross-stripe-verify.fio --- fio-2.1.3/examples/cross-stripe-verify.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/cross-stripe-verify.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,25 @@ +# Example of how to split a drive up into sections, manually, and perform +# verify from a bunch of jobs. This example is special in that it assumes +# the drive is at around 30 * 124G in size, so with the below settings, we'll +# cover most of the drive. It's also special in that it doesn't write +# everything, it just writes 16k at a specific boundary, for every 128k. +# This is done to exercise the split path for Intel NVMe devices, most of +# which have a 128k stripe size and require IOs to be split if the cross +# the stripe boundary. 
+# +[global] +bs=16k +direct=1 +rw=write:112k +verify=crc32c +filename=/dev/nvme0n1 +verify_backlog=1 +offset_increment=124g +io_size=120g +offset=120k +group_reporting=1 +verify_dump=1 +loops=2 + +[write-verify] +numjobs=30 diff -Nru fio-2.1.3/examples/dev-dax.fio fio-3.16/examples/dev-dax.fio --- fio-2.1.3/examples/dev-dax.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/dev-dax.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,45 @@ +[global] +bs=2m +ioengine=dev-dax +norandommap +time_based=1 +runtime=30 +group_reporting +disable_lat=1 +disable_slat=1 +disable_clat=1 +clat_percentiles=0 +cpus_allowed_policy=split + +# For the dev-dax engine: +# +# IOs always complete immediately +# IOs are always direct +# +iodepth=1 +direct=0 +thread=1 +numjobs=16 +# +# The dev-dax engine does IO to DAX device that are special character +# devices exported by the kernel (e.g. /dev/dax0.0). The device is +# opened normally and then the region is accessible via mmap. We do +# not use the O_DIRECT flag because the device is naturally direct +# access. The O_DIRECT flags will result in failure. The engine +# access the underlying NVDIMM directly once the mmapping is setup. +# +# Check the alignment requirement of your DAX device. Currently the default +# should be 2M. Blocksize (bs) should meet alignment requirement. 
+# +# An example of creating a dev dax device node from pmem: +# ndctl create-namespace --reconfig=namespace0.0 --mode=dax --force +# +filename=/dev/dax0.0 + +[dev-dax-write] +rw=randwrite +stonewall + +[dev-dax-read] +rw=randread +stonewall diff -Nru fio-2.1.3/examples/e4defrag2.fio fio-3.16/examples/e4defrag2.fio --- fio-2.1.3/examples/e4defrag2.fio 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/examples/e4defrag2.fio 2019-09-20 01:01:52.000000000 +0000 @@ -1,6 +1,6 @@ ################################################# # Hardcode defragmentation patterns -# Please be carefull, it can trigger kernel panic +# Please be careful, it can trigger kernel panic ################################################# [global] ioengine=e4defrag diff -Nru fio-2.1.3/examples/filecreate-ioengine.fio fio-3.16/examples/filecreate-ioengine.fio --- fio-2.1.3/examples/filecreate-ioengine.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/filecreate-ioengine.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,35 @@ +# Example filecreate job +# +# create_on_open is needed so that the open happens during the run and not the +# setup. +# +# openfiles needs to be set so that you do not exceed the maximum allowed open +# files. +# +# filesize needs to be set to a non zero value so fio will actually run, but the +# IO will not really be done and the write latency numbers will only reflect the +# open times. 
+[global] +create_on_open=1 +nrfiles=31250 +ioengine=filecreate +fallocate=none +filesize=4k +openfiles=1 + +[t0] +[t1] +[t2] +[t3] +[t4] +[t5] +[t6] +[t7] +[t8] +[t9] +[t10] +[t11] +[t12] +[t13] +[t14] +[t15] diff -Nru fio-2.1.3/examples/fio-rand-read.fio fio-3.16/examples/fio-rand-read.fio --- fio-2.1.3/examples/fio-rand-read.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/fio-rand-read.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,16 @@ +; fio-rand-read.job for fiotest + +[global] +name=fio-rand-read +filename=fio-rand-read +rw=randread +bs=4K +direct=0 +numjobs=1 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff -Nru fio-2.1.3/examples/fio-rand-RW.fio fio-3.16/examples/fio-rand-RW.fio --- fio-2.1.3/examples/fio-rand-RW.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/fio-rand-RW.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,18 @@ +; fio-rand-RW.job for fiotest + +[global] +name=fio-rand-RW +filename=fio-rand-RW +rw=randrw +rwmixread=60 +rwmixwrite=40 +bs=4K +direct=0 +numjobs=4 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff -Nru fio-2.1.3/examples/fio-rand-write.fio fio-3.16/examples/fio-rand-write.fio --- fio-2.1.3/examples/fio-rand-write.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/fio-rand-write.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,16 @@ +; fio-rand-write.job for fiotest + +[global] +name=fio-rand-write +filename=fio-rand-write +rw=randwrite +bs=4K +direct=0 +numjobs=4 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff -Nru fio-2.1.3/examples/fio-seq-read.fio fio-3.16/examples/fio-seq-read.fio --- fio-2.1.3/examples/fio-seq-read.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/fio-seq-read.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,14 @@ +[global] +name=fio-seq-reads +filename=fio-seq-reads +rw=read +bs=256K +direct=1 +numjobs=1 +time_based=1 +runtime=900 + +[file1] 
+size=10G +ioengine=libaio +iodepth=16 diff -Nru fio-2.1.3/examples/fio-seq-RW.fio fio-3.16/examples/fio-seq-RW.fio --- fio-2.1.3/examples/fio-seq-RW.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/fio-seq-RW.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,18 @@ +; fio-seq-RW.job for fiotest + +[global] +name=fio-seq-RW +filename=fio-seq-RW +rw=rw +rwmixread=60 +rwmixwrite=40 +bs=256K +direct=0 +numjobs=4 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff -Nru fio-2.1.3/examples/fio-seq-write.fio fio-3.16/examples/fio-seq-write.fio --- fio-2.1.3/examples/fio-seq-write.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/fio-seq-write.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,16 @@ +; fio-seq-write.job for fiotest + +[global] +name=fio-seq-write +filename=fio-seq-write +rw=write +bs=256K +direct=0 +numjobs=1 +time_based=1 +runtime=900 + +[file1] +size=10G +ioengine=libaio +iodepth=16 diff -Nru fio-2.1.3/examples/fixed-rate-submission.fio fio-3.16/examples/fixed-rate-submission.fio --- fio-2.1.3/examples/fixed-rate-submission.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/fixed-rate-submission.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,10 @@ +[fixed-rate-submit] +size=128m +rw=read +ioengine=libaio +iodepth=32 +direct=1 +# by setting the submit mode to offload, we can guarantee a fixed rate of +# submission regardless of what the device completion rate is. +io_submit_mode=offload +rate_iops=1000 diff -Nru fio-2.1.3/examples/ftruncate.fio fio-3.16/examples/ftruncate.fio --- fio-2.1.3/examples/ftruncate.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/ftruncate.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,27 @@ +# Example ftruncate engine jobs + +[global] +ioengine=ftruncate +directory=/scratch +size=102404k ; 100Mb+4k +stonewall +filename=truncate +runtime=10s +time_based +direct=1 +# +# bs option is stub here. 
Truncation is performed on the current block offset. +# blocksize value is ignored +bs=4k + +# truncate the file to 4Kbytes then repeatedly grow the file back to just over +# its original size using subsequent truncates +[grow-truncate] +rw=write + +# Repeatedly change a file to a random size between 0Kbytes and 100Mb +# using truncates +[rand-truncate] +rw=randwrite +norandommap + diff -Nru fio-2.1.3/examples/fusion-aw-sync.fio fio-3.16/examples/fusion-aw-sync.fio --- fio-2.1.3/examples/fusion-aw-sync.fio 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/examples/fusion-aw-sync.fio 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -# Example Job File that randomly writes 8k worth of data atomically for -# 60 seconds. -[rw_aw_file_sync] -rw=randwrite -ioengine=fusion-aw-sync -blocksize=8k -blockalign=8k - -# if file system supports atomic write -filename=/mnt/fs/file -# or test on a direct block device instead -#filename=/dev/fioa -randrepeat=1 -fallocate=none -direct=1 -invalidate=0 -runtime=60 -time_based diff -Nru fio-2.1.3/examples/gfapi.fio fio-3.16/examples/gfapi.fio --- fio-2.1.3/examples/gfapi.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/gfapi.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,16 @@ +# Test opening a file from multiple jobs. 
+# Originally authored by Castor Fu +[global] +ioengine=gfapi +volume=vol +brick=localhost +create_on_open=1 +rw=write + +[reopen_file_test] +nrfiles=4 +filesize=16k +size=64k +openfiles=2 +rw=write +filename_format=reopen_test.$filenum diff -Nru fio-2.1.3/examples/gpudirect-rdmaio-client.fio fio-3.16/examples/gpudirect-rdmaio-client.fio --- fio-2.1.3/examples/gpudirect-rdmaio-client.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/gpudirect-rdmaio-client.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,15 @@ +# Example gpudirect rdma client job +[global] +ioengine=rdma +hostname=[hostname] +port=[port] +verb=[read/write/send/recv] +mem=cudamalloc +gpu_dev_id=0 +bs=1m +size=100g + +[sender] +rw=write +iodepth=1 +iodepth_batch_complete=1 diff -Nru fio-2.1.3/examples/gpudirect-rdmaio-server.fio fio-3.16/examples/gpudirect-rdmaio-server.fio --- fio-2.1.3/examples/gpudirect-rdmaio-server.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/gpudirect-rdmaio-server.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,12 @@ +# Example rdma server job +[global] +ioengine=rdma +port=[port] +mem=cudamalloc +gpu_dev_id=0 +bs=1m +size=100g + +[receiver] +rw=read +iodepth=16 diff -Nru fio-2.1.3/examples/http-s3.fio fio-3.16/examples/http-s3.fio --- fio-2.1.3/examples/http-s3.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/http-s3.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,34 @@ +# Example test for the HTTP engine's S3 support against Amazon AWS. +# Obviously, you have to adjust the S3 credentials; for this example, +# they're passed in via the environment. 
+# + +[global] +ioengine=http +name=test +direct=1 +filename=/larsmb-fio-test/object +http_verbose=0 +https=on +http_mode=s3 +http_s3_key=${S3_KEY} +http_s3_keyid=${S3_ID} +http_host=s3.eu-central-1.amazonaws.com +http_s3_region=eu-central-1 +group_reporting + +# With verify, this both writes and reads the object +[create] +rw=write +bs=4k +size=64k +io_size=4k +verify=sha256 + +[trim] +stonewall +rw=trim +bs=4k +size=64k +io_size=4k + diff -Nru fio-2.1.3/examples/http-swift.fio fio-3.16/examples/http-swift.fio --- fio-2.1.3/examples/http-swift.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/http-swift.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,32 @@ +[global] +ioengine=http +rw=randwrite +name=test +direct=1 +http_verbose=0 +http_mode=swift +https=on +# This is the hostname and port portion of the public access link for +# the container: +http_host=swift.srv.openstack.local:8081 +filename_format=/swift/v1/fio-test/bucket.$jobnum +group_reporting +bs=64k +size=1M +# Currently, fio cannot yet generate the Swift Auth-Token itself. 
+# You need to set this prior to running fio via +# eval $(openstack token issue -f shell --prefix SWIFT_) ; export SWIFT_id +http_swift_auth_token=${SWIFT_id} + +[create] +numjobs=1 +rw=randwrite +io_size=256k +verify=sha256 + +# This will delete all created objects again +[trim] +stonewall +numjobs=1 +rw=trim +io_size=64k diff -Nru fio-2.1.3/examples/http-webdav.fio fio-3.16/examples/http-webdav.fio --- fio-2.1.3/examples/http-webdav.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/http-webdav.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,26 @@ +[global] +ioengine=http +rw=randwrite +name=test +direct=1 +http_verbose=0 +http_mode=webdav +https=off +http_host=localhost +filename_format=/dav/bucket.$jobnum +group_reporting +bs=64k +size=1M + +[create] +numjobs=16 +rw=randwrite +io_size=10M +verify=sha256 + +# This will delete all created objects again +[trim] +stonewall +numjobs=16 +rw=trim +io_size=1M diff -Nru fio-2.1.3/examples/ime.fio fio-3.16/examples/ime.fio --- fio-2.1.3/examples/ime.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/ime.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,51 @@ +# This jobfile performs basic write+read operations using +# DDN's Infinite Memory Engine. + +[global] + +# Use as much jobs as possible to maximize performance +numjobs=8 + +# The filename should be uniform so that "read" jobs can read what +# the "write" jobs have written. +filename_format=fio-test-ime.$jobnum.$filenum + +size=25g +bs=128k + +# These settings are useful for the asynchronous ime_aio engine: +# by setting the io depth to twice the size of a "batch", we can +# queue IOs while other IOs are "in-flight". 
+iodepth=32 +iodepth_batch=16 +iodepth_batch_complete=16 + +[write-psync] +stonewall +rw=write +ioengine=ime_psync + +[read-psync] +stonewall +rw=read +ioengine=ime_psync + +[write-psyncv] +stonewall +rw=write +ioengine=ime_psyncv + +[read-psyncv] +stonewall +rw=read +ioengine=ime_psyncv + +[write-aio] +stonewall +rw=write +ioengine=ime_aio + +[read-aio] +stonewall +rw=read +ioengine=ime_aio \ No newline at end of file diff -Nru fio-2.1.3/examples/jesd219.fio fio-3.16/examples/jesd219.fio --- fio-2.1.3/examples/jesd219.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/jesd219.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,20 @@ +# Sample implementation of the JESD219 workload for SSD endurance +# testing. It uses a specific distribution of block sizes and +# read/write mix, as well as a specific distribution of where on +# the device the IO accesses will land. Based on posting from +# Jeff Furlong +[JESD219] +ioengine=libaio +direct=1 +rw=randrw +norandommap +randrepeat=0 +rwmixread=40 +rwmixwrite=60 +iodepth=256 +numjobs=4 +bssplit=512/4:1024/1:1536/1:2048/1:2560/1:3072/1:3584/1:4k/67:8k/10:16k/7:32k/3:64k/3 +blockalign=4k +random_distribution=zoned:50/5:30/15:20/80 +filename=/dev/nvme0n1 +group_reporting=1 diff -Nru fio-2.1.3/examples/latency-profile.fio fio-3.16/examples/latency-profile.fio --- fio-2.1.3/examples/latency-profile.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/latency-profile.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,21 @@ +# Test job that demonstrates how to use the latency target +# profiling. Fio will find the queue depth between 1..128 +# that fits within the latency constraints of this 4k random +# read workload. 
+ +[global] +bs=4k +rw=randread +random_generator=lfsr +direct=1 +ioengine=libaio +iodepth=128 +# Set max acceptable latency to 500msec +latency_target=500000 +# profile over a 5s window +latency_window=5000000 +# 99.9% of IOs must be below the target +latency_percentile=99.9 + +[device] +filename=/dev/sda diff -Nru fio-2.1.3/examples/libhdfs.fio fio-3.16/examples/libhdfs.fio --- fio-2.1.3/examples/libhdfs.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/libhdfs.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,8 @@ +[global] +runtime=300 + +[hdfs] +filename=dfs-perftest-base.dfs-perftest-base,9000 +ioengine=libhdfs +rw=read +bs=256k diff -Nru fio-2.1.3/examples/libiscsi.fio fio-3.16/examples/libiscsi.fio --- fio-2.1.3/examples/libiscsi.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/libiscsi.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,3 @@ +[iscsi] +ioengine=libiscsi +filename=iscsi\://127.0.0.1/iqn.2016-02.com.fio\:system\:fio/1 diff -Nru fio-2.1.3/examples/libpmem.fio fio-3.16/examples/libpmem.fio --- fio-2.1.3/examples/libpmem.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/libpmem.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,73 @@ +[global] +bs=4k +size=8g +ioengine=libpmem +norandommap +time_based=1 +group_reporting +invalidate=1 +disable_lat=1 +disable_slat=1 +disable_clat=1 +clat_percentiles=0 + +iodepth=1 +iodepth_batch=1 +thread=1 +numjobs=1 + +# +# In case of 'scramble_buffers=1', the source buffer +# is rewritten with a random value every write operations. +# +# But when 'scramble_buffers=0' is set, the source buffer isn't +# rewritten. So it will be likely that the source buffer is in CPU +# cache and it seems to be high performance. 
+# +scramble_buffers=0 + +# +# direct=0: +# Using pmem_memcpy_nodrain() for write operation +# +# direct=1: +# Using pmem_memcpy_persist() for write operation +# +direct=0 + +# +# Setting for fio process's CPU Node and Memory Node +# +numa_cpu_nodes=0 +numa_mem_policy=bind:0 + +# +# split means that each job will get a unique CPU from the CPU set +# +cpus_allowed_policy=split + +# +# The pmemblk engine does IO to files in a DAX-mounted filesystem. +# The filesystem should be created on an NVDIMM (e.g /dev/pmem0) +# and then mounted with the '-o dax' option. Note that the engine +# accesses the underlying NVDIMM directly, bypassing the kernel block +# layer, so the usual filesystem/disk performance monitoring tools such +# as iostat will not provide useful data. +# +directory=/mnt/pmem0 + +[libpmem-seqwrite] +rw=write +stonewall + +#[libpmem-seqread] +#rw=read +#stonewall + +#[libpmem-randwrite] +#rw=randwrite +#stonewall + +#[libpmem-randread] +#rw=randread +#stonewall diff -Nru fio-2.1.3/examples/mtd.fio fio-3.16/examples/mtd.fio --- fio-2.1.3/examples/mtd.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/mtd.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,21 @@ +[global] +gtod_reduce=1 +filename=/dev/mtd0 +ioengine=mtd +ignore_error=,EIO +blocksize=512,512,16384 +skip_bad=1 + +[write] +stonewall +rw=trim + +[write] +stonewall +rw=write + +[write] +stonewall +block_error_percentiles=1 +rw=trimwrite +loops=4 diff -Nru fio-2.1.3/examples/nbd.fio fio-3.16/examples/nbd.fio --- fio-2.1.3/examples/nbd.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/nbd.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,35 @@ +# To use fio to test nbdkit: +# +# nbdkit -U - memory size=256M --run 'export unixsocket; fio examples/nbd.fio' +# +# To use fio to test qemu-nbd: +# +# rm -f /tmp/disk.img /tmp/socket +# truncate -s 256M /tmp/disk.img +# export unixsocket=/tmp/socket +# qemu-nbd -t -k $unixsocket -f raw /tmp/disk.img & +# fio examples/nbd.fio +# 
killall qemu-nbd + +[global] +ioengine=nbd +uri=nbd+unix:///?socket=${unixsocket} +# Starting from nbdkit 1.14 the following will work: +#uri=${uri} +rw=randrw +time_based +runtime=60 +group_reporting +iodepth=64 + +[job0] +offset=0 + +[job1] +offset=64m + +[job2] +offset=128m + +[job3] +offset=192m diff -Nru fio-2.1.3/examples/pmemblk.fio fio-3.16/examples/pmemblk.fio --- fio-2.1.3/examples/pmemblk.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/pmemblk.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,71 @@ +[global] +bs=1m +ioengine=pmemblk +norandommap +time_based=1 +runtime=30 +group_reporting +disable_lat=1 +disable_slat=1 +disable_clat=1 +clat_percentiles=0 +cpus_allowed_policy=split + +# For the pmemblk engine: +# +# IOs always complete immediately +# IOs are always direct +# Must use threads +# +iodepth=1 +direct=1 +thread=1 +numjobs=16 +# +# Unlink can be used to remove the files when done, but if you are +# using serial runs with stonewall, and you want the files to be created +# only once and unlinked only at the very end, then put the unlink=1 +# in the last group. This is the method demonstrated here. +# +# Note that if you have a read-only group and if the files will be +# newly created, then all of the data will read back as zero and the +# read will be optimized, yielding performance that is different from +# that of reading non-zero blocks (or unoptimized zero blocks). +# +unlink=0 +# +# The pmemblk engine does IO to files in a DAX-mounted filesystem. +# The filesystem should be created on an NVDIMM (e.g /dev/pmem0) +# and then mounted with the '-o dax' option. Note that the engine +# accesses the underlying NVDIMM directly, bypassing the kernel block +# layer, so the usual filesystem/disk performance monitoring tools such +# as iostat will not provide useful data. +# +# Here we specify a test file on each of two NVDIMMs. The first +# number after the file name is the block size in bytes (4096 bytes +# in this example). 
The second number is the size of the file to +# create in MiB (1 GiB in this example); note that the actual usable +# space available to fio will be less than this as libpmemblk requires +# some space for metadata. +# +# Currently, the minimum block size is 512 bytes and the minimum file +# size is about 17 MiB (these are libpmemblk requirements). +# +# While both files in this example have the same block size and file +# size, this is not required. +# +filename=/pmem0/fio-test,4096,1024 +filename=/pmem1/fio-test,4096,1024 + +[pmemblk-write] +rw=randwrite +stonewall + +[pmemblk-read] +rw=randread +stonewall +# +# We're done, so unlink the file: +# +unlink=1 + diff -Nru fio-2.1.3/examples/poisson-rate-submission.fio fio-3.16/examples/poisson-rate-submission.fio --- fio-2.1.3/examples/poisson-rate-submission.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/poisson-rate-submission.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,14 @@ +[poisson-rate-submit] +size=128m +rw=randread +ioengine=libaio +iodepth=32 +direct=1 +# by setting the submit mode to offload, we can guarantee a fixed rate of +# submission regardless of what the device completion rate is. +io_submit_mode=offload +rate_iops=50 +# Real world random request flow follows Poisson process. To give better +# insight on latency distribution, we simulate request flow under Poisson +# process. +rate_process=poisson diff -Nru fio-2.1.3/examples/rados.fio fio-3.16/examples/rados.fio --- fio-2.1.3/examples/rados.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/rados.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,24 @@ +###################################################################### +# Example test for the RADOS engine. +# +# Runs a 4k random write test against a RADOS via librados +# +# NOTE: Make sure you have either Ceph pool named 'rados' or change +# the pool parameter. 
+###################################################################### +[global] +#logging +#write_iops_log=write_iops_log +#write_bw_log=write_bw_log +#write_lat_log=write_lat_log +ioengine=rados +clientname=admin +pool=rados +busy_poll=0 +rw=randwrite +bs=4k + +[rbd_iodepth32] +iodepth=32 +size=128m +nr_files=32 diff -Nru fio-2.1.3/examples/rand-zones.fio fio-3.16/examples/rand-zones.fio --- fio-2.1.3/examples/rand-zones.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/rand-zones.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,26 @@ +# Sample job file demonstrating how to use zoned random distributionss +# to have skewed random accesses. This example has 50% of the accesses +# to the first 5% of the file (50/5), 30% to the next 15% (30/15), and +# finally 20% of the IO will end up in the remaining 80%. +[zones] +size=2g +direct=1 +bs=4k +rw=randread +norandommap +random_distribution=zoned:50/5:30/15:20/ + +# It's also possible to use zoned_abs to specify absolute sizes. For +# instance, if you do: +# +# random_distribution=zoned_abs:50/10G:30/100G:20/500G +# +# Then 50% of the access will be to the first 10G of the drive, 30% +# will be to the next 100G, and 20% will be to the next 500G. + +# The above applies to all of reads/writes/trims. If we wanted to do +# something differently for writes, let's say 50% for the first 10% +# and 50% for the remaining 90%, we could do it by adding a new section +# after a a comma. + +# random_distribution=zoned:50/5:30/15:20/,50/10:50/90 diff -Nru fio-2.1.3/examples/rbd.fio fio-3.16/examples/rbd.fio --- fio-2.1.3/examples/rbd.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/rbd.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,22 @@ +###################################################################### +# Example test for the RBD engine. +# +# Runs a 4k random write test against a RBD via librbd +# +# NOTE: Make sure you have either a RBD named 'fio_test' or change +# the rbdname parameter. 
+###################################################################### +[global] +#logging +#write_iops_log=write_iops_log +#write_bw_log=write_bw_log +#write_lat_log=write_lat_log +ioengine=rbd +clientname=admin +pool=rbd +rbdname=fio_test +rw=randwrite +bs=4k + +[rbd_iodepth32] +iodepth=32 diff -Nru fio-2.1.3/examples/rdmaio-client.fio fio-3.16/examples/rdmaio-client.fio --- fio-2.1.3/examples/rdmaio-client.fio 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/examples/rdmaio-client.fio 2019-09-20 01:01:52.000000000 +0000 @@ -1,11 +1,13 @@ # Example rdma client job [global] ioengine=rdma -filename=[ip_addr]/[port]/[RDMA_WRITE/RDMA_READ/SEND] +hostname=[hostname] +port=[port] +verb=[read/write/send/recv] bs=1m size=100g [sender] rw=write iodepth=1 -iodepth_batch_complete=1 \ No newline at end of file +iodepth_batch_complete=1 diff -Nru fio-2.1.3/examples/rdmaio-server.fio fio-3.16/examples/rdmaio-server.fio --- fio-2.1.3/examples/rdmaio-server.fio 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/examples/rdmaio-server.fio 2019-09-20 01:01:52.000000000 +0000 @@ -1,10 +1,10 @@ # Example rdma server job [global] ioengine=rdma -filename=[ip_addr]/[port] +port=[port] bs=1m size=100g [receiver] rw=read -iodepth=16 \ No newline at end of file +iodepth=16 diff -Nru fio-2.1.3/examples/ssd-test.fio fio-3.16/examples/ssd-test.fio --- fio-2.1.3/examples/ssd-test.fio 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/examples/ssd-test.fio 2019-09-20 01:01:52.000000000 +0000 @@ -14,7 +14,7 @@ bs=4k ioengine=libaio iodepth=4 -size=1g +size=10g direct=1 runtime=60 directory=/mount-point-of-ssd diff -Nru fio-2.1.3/examples/steadystate.fio fio-3.16/examples/steadystate.fio --- fio-2.1.3/examples/steadystate.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/steadystate.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,45 @@ +# +# Example job file for steady state job termination +# Use --output-format=json for detailed information +# +# For Windows, change the file names 
+# + +[global] +threads=1 +group_reporting=1 +time_based +size=128m + +[ss-write] +filename=/dev/null +rw=write +bs=128k +numjobs=4 +runtime=5m +ss=iops:10% +ss_dur=30s +ss_ramp=10s +# +# Begin ss detection 10s after job starts +# Terminate job when largest deviation from mean IOPS is 10% +# Use a rolling 30s window for deviations +# + + +[ss-read] +new_group +stonewall +filename=/dev/zero +rw=randread +bs=4k +numjobs=4 +runtime=5m +ss=bw_slope:1% +ss_dur=10s +ss_ramp=5s +# +# Begin ss detection 5s after job starts +# Terminate job when bandwidth slope is less than 1% of avg bw +# Use a rolling 10s window for bw measurements +# diff -Nru fio-2.1.3/examples/waitfor.fio fio-3.16/examples/waitfor.fio --- fio-2.1.3/examples/waitfor.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/examples/waitfor.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,35 @@ +[global] +threads=1 +group_reporting=1 +filename=/tmp/data +filesize=128m + +[writers] +rw=write +bs=128k +numjobs=4 +runtime=10 + +[readers] +new_group +wait_for=writers +rw=randread +bs=4k +numjobs=4 +runtime=10 + +[writers2] +new_group +wait_for=readers +rw=randwrite +bs=4k +numjobs=4 +runtime=10 + +[readers2] +new_group +wait_for=writers2 +rw=randread +bs=4k +numjobs=4 +runtime=10 diff -Nru fio-2.1.3/exp/expression-parser.l fio-3.16/exp/expression-parser.l --- fio-2.1.3/exp/expression-parser.l 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/exp/expression-parser.l 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,184 @@ +%{ + +/* + * (C) Copyright 2014, Stephen M. Cameron. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#include +#include +#include "y.tab.h" + +#define YYSTYPE PARSER_VALUE_TYPE + +extern int lexer_input(char *buffer, unsigned int *nbytes, int buffersize); + +#undef YY_INPUT +#define YY_INPUT(buffer, bytes_read, bytes_requested) \ +({ \ + int __ret; \ + unsigned int __bread = bytes_read; \ + __ret = lexer_input((buffer), &__bread, (bytes_requested)); \ + bytes_read = __bread; \ + __ret; \ +}) + +extern int yyerror(long long *result, double *dresult, + int *has_error, int *units_specified, const char *msg); + +static void __attribute__((unused)) yyunput(int c, char *buf_ptr); +static int __attribute__((unused)) input(void); + +/* set by parser -- this is another thing which makes the parser thread-unsafe :(. */ +int lexer_value_is_time = 0; /* for determining if "m" suffix means mega- or minutes */ + +#define set_suffix_value(yylval, i_val, d_val, has_d_val) \ + (yylval).v.dval = (d_val); \ + (yylval).v.ival = (i_val); \ + (yylval).v.has_dval = (has_d_val); \ + (yylval).v.has_error = 0; + +%} + +%% + + +[kK]|[kK][bB] { + set_suffix_value(yylval, 1024, 1024.0, 0); + return SUFFIX; + } +[Mm][bB] { + set_suffix_value(yylval, 1024 * 1024, 1024.0 * 1024.0, 0); + return SUFFIX; + } +[mM][sS] { + set_suffix_value(yylval, 1000, 1000.0, 1); + return SUFFIX; + } +[uU][sS] { + set_suffix_value(yylval, 1, 1.0, 1); + return SUFFIX; + } +[gG]|[Gg][Bb] { + set_suffix_value(yylval, 1024LL * 1024 * 1024, 1024.0 * 1024.0 * 1024, 0); + return SUFFIX; + } +[tT]|[tT][bB] { + set_suffix_value(yylval, 1024LL * 1024 * 1024 * 1024, + 1024.0 * 1024.0 * 1024.0 * 1024.0 * 1024, 0); + return SUFFIX; + } +[pP]|[pP][bB] { + set_suffix_value(yylval, 1024LL * 1024 * 1024 * 1024 * 1024, + 1024.0 * 1024.0 * 1024.0 * 1024.0 * 1024.0, 0); + return SUFFIX; + } +[kK][iI][Bb] 
{ + set_suffix_value(yylval, 1000LL, 1000.0, 0); + return SUFFIX; + } +[mM][Ii][bB] { + set_suffix_value(yylval, 1000000LL, 1000000.0 , 0); + return SUFFIX; + } +[gG][iI][Bb] { + set_suffix_value(yylval, 1000000000LL, 1000000000.0 , 0); + return SUFFIX; + } +[pP][iI][Bb] { + set_suffix_value(yylval, 1000000000000LL, 1000000000000.0 , 0); + return SUFFIX; + } +[sS] { + set_suffix_value(yylval, 1000000LL, 1000000.0 , 0); + return SUFFIX; + } +[mM] { + if (!lexer_value_is_time) { + set_suffix_value(yylval, 1024 * 1024, 1024.0 * 1024.0, 0); + } else { + set_suffix_value(yylval, 60LL * 1000000LL, 60.0 * 1000000.0, 0); + } + return SUFFIX; + } +[dD] { + set_suffix_value(yylval, 60LL * 60LL * 24LL * 1000000LL, + 60.0 * 60.0 * 24.0 * 1000000.0, 0); + return SUFFIX; + } +[hH] { + set_suffix_value(yylval, 60LL * 60LL * 1000000LL, + 60.0 * 60.0 * 1000000.0, 0); + return SUFFIX; + } +[ \t] ; /* ignore whitespace */ +[#:,].* ; /* ignore comments, and everything after colons and commas */ +[0-9]*[.][0-9]+|[0-9]*[.]?[0-9]+[eE][-+]*[0-9]+ { + int rc; + double dval; + + rc = sscanf(yytext, "%lf", &dval); + if (rc == 1) { + yylval.v.dval = dval; + yylval.v.ival = (long long) dval; + yylval.v.has_dval = 1; + yylval.v.has_error = 0; + return NUMBER; + } else { + yyerror(0, 0, 0, 0, "bad number\n"); + yylval.v.has_error = 1; + return NUMBER; + } + } +0x[0-9a-fA-F]+ { + int rc, intval; + rc = sscanf(yytext, "%x", &intval); + if (rc == 1) { + yylval.v.ival = intval; + yylval.v.dval = (double) intval; + yylval.v.has_dval = 0; + yylval.v.has_error = 0; + return NUMBER; + } else { + yyerror(0, 0, 0, 0, "bad number\n"); + yylval.v.has_error = 1; + return NUMBER; + } + } +[0-9]+ { + int rc, intval; + rc = sscanf(yytext, "%d", &intval); + if (rc == 1) { + yylval.v.ival = intval; + yylval.v.dval = (double) intval; + yylval.v.has_dval = 0; + yylval.v.has_error = 0; + return NUMBER; + } else { + yyerror(0, 0, 0, 0, "bad number\n"); + yylval.v.has_error = 1; + return NUMBER; + } + } +\n return 0; 
+[+-/*()^%] return yytext[0]; + +. { + yylval.v.has_error = 1; + return NUMBER; + } +%% + diff -Nru fio-2.1.3/exp/expression-parser.y fio-3.16/exp/expression-parser.y --- fio-2.1.3/exp/expression-parser.y 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/exp/expression-parser.y 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,247 @@ +%{ + +/* + * (C) Copyright 2014, Stephen M. Cameron. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + */ + +#include +#include +#include + +struct parser_value_type { + double dval; + long long ival; + int has_dval; + int has_error; +}; + +typedef union valtype { + struct parser_value_type v; +} PARSER_VALUE_TYPE; + +#define YYSTYPE PARSER_VALUE_TYPE + +int yyerror(__attribute__((unused)) long long *result, + __attribute__((unused)) double *dresult, + __attribute__((unused)) int *has_error, + __attribute__((unused)) int *units_specified, + __attribute__((unused)) const char *msg); + +extern int yylex(void); +extern void yyrestart(FILE *file); +extern int lexer_value_is_time; + +%} + +%union valtype { + struct parser_value_type { + double dval; + long long ival; + int has_dval; + int has_error; + } v; +}; + +%token NUMBER +%token BYE +%token SUFFIX +%left '-' '+' +%right SUFFIX +%left '*' '/' +%right '^' +%left '%' +%nonassoc UMINUS +%parse-param { long long *result } +%parse-param { double *dresult } +%parse-param { int *has_error } +%parse-param { int *units_specified } + +%type expression +%% + +top_level: expression { + *result = $1.ival; + *dresult = $1.dval; + *has_error = $1.has_error; + } + | expression error { + *result = $1.ival; + *dresult = $1.dval; + *has_error = 1; + } +expression: expression '+' expression { + if (!$1.has_dval && !$3.has_dval) + $$.ival = $1.ival + $3.ival; + else + $$.ival = (long long) ($1.dval + $3.dval); + $$.dval = $1.dval + $3.dval; + $$.has_error = $1.has_error || $3.has_error; + } + | expression '-' expression { + if (!$1.has_dval && !$3.has_dval) + $$.ival = $1.ival - $3.ival; + else + $$.ival = (long long) ($1.dval - $3.dval); + $$.dval = $1.dval - $3.dval; + $$.has_error = $1.has_error || $3.has_error; + } + | expression '*' expression { + if (!$1.has_dval && !$3.has_dval) + $$.ival = $1.ival * $3.ival; + else + $$.ival = (long long) ($1.dval * $3.dval); + $$.dval = $1.dval * $3.dval; + $$.has_error = $1.has_error || $3.has_error; + } + | expression '/' expression { + if ($3.ival == 0) + yyerror(0, 0, 0, 0, "divide by 
zero"); + else + $$.ival = $1.ival / $3.ival; + if ($3.dval < 1e-20 && $3.dval > -1e-20) + yyerror(0, 0, 0, 0, "divide by zero"); + else + $$.dval = $1.dval / $3.dval; + if ($3.has_dval || $1.has_dval) + $$.ival = (long long) $$.dval; + $$.has_error = $1.has_error || $3.has_error; + } + | '-' expression %prec UMINUS { + $$.ival = -$2.ival; + $$.dval = -$2.dval; + $$.has_error = $2.has_error; + } + | '(' expression ')' { $$ = $2; } + | expression SUFFIX { + if (!$1.has_dval && !$2.has_dval) + $$.ival = $1.ival * $2.ival; + else + $$.ival = (long long) $1.dval * $2.dval; + if ($1.has_dval || $2.has_dval) + $$.dval = $1.dval * $2.dval; + else + $$.dval = $1.ival * $2.ival; + $$.has_error = $1.has_error || $2.has_error; + *units_specified = 1; + } + | expression '%' expression { + if ($1.has_dval || $3.has_dval) + yyerror(0, 0, 0, 0, "modulo on floats"); + if ($3.ival == 0) + yyerror(0, 0, 0, 0, "divide by zero"); + else { + $$.ival = $1.ival % $3.ival; + $$.dval = $$.ival; + } + $$.has_error = $1.has_error || $3.has_error; + } + | expression '^' expression { + $$.has_error = $1.has_error || $3.has_error; + if (!$1.has_dval && !$3.has_dval) { + int i; + + if ($3.ival == 0) { + $$.ival = 1; + } else if ($3.ival > 0) { + long long tmp = $1.ival; + $$.ival = 1.0; + for (i = 0; i < $3.ival; i++) + $$.ival *= tmp; + } else { + /* integers, 2^-3, ok, we now have doubles */ + double tmp; + if ($1.ival == 0 && $3.ival == 0) { + tmp = 1.0; + $$.has_error = 1; + } else { + double x = (double) $1.ival; + double y = (double) $3.ival; + tmp = pow(x, y); + } + $$.ival = (long long) tmp; + } + $$.dval = pow($1.dval, $3.dval); + } else { + $$.dval = pow($1.dval, $3.dval); + $$.ival = (long long) $$.dval; + } + } + | NUMBER { $$ = $1; }; +%% +#include + +/* Urgh. yacc and lex are kind of horrible. This is not thread safe, obviously. 
*/ +static int lexer_read_offset = 0; +static char lexer_input_buffer[1000]; + +int lexer_input(char* buffer, unsigned int *bytes_read, int bytes_requested) +{ + int bytes_left = strlen(lexer_input_buffer) - lexer_read_offset; + + if (bytes_requested > bytes_left ) + bytes_requested = bytes_left; + memcpy(buffer, &lexer_input_buffer[lexer_read_offset], bytes_requested); + *bytes_read = bytes_requested; + lexer_read_offset += bytes_requested; + return 0; +} + +static void setup_to_parse_string(const char *string) +{ + unsigned int len; + + len = sizeof(lexer_input_buffer) - 3; + if (len > strlen(string)) + len = strlen(string); + + strncpy(lexer_input_buffer, string, len); + lexer_input_buffer[len] = '\0'; + lexer_input_buffer[len + 1] = '\0'; /* lex/yacc want string double null terminated! */ + lexer_read_offset = 0; +} + +int evaluate_arithmetic_expression(const char *buffer, long long *ival, double *dval, + double implied_units, int is_time) +{ + int rc, units_specified = 0, has_error = 0; + + lexer_value_is_time = is_time; + setup_to_parse_string(buffer); + rc = yyparse(ival, dval, &has_error, &units_specified); + yyrestart(NULL); + if (rc || has_error) { + *ival = 0; + *dval = 0; + has_error = 1; + } + if (!units_specified) { + *ival = (int) ((double) *ival * implied_units); + *dval = *dval * implied_units; + } + return has_error; +} + +int yyerror(__attribute__((unused)) long long *result, + __attribute__((unused)) double *dresult, + __attribute__((unused)) int *has_error, + __attribute__((unused)) int *units_specified, + __attribute__((unused)) const char *msg) +{ + /* We do not need to do anything here. */ + return 0; +} + diff -Nru fio-2.1.3/exp/test-expression-parser.c fio-3.16/exp/test-expression-parser.c --- fio-2.1.3/exp/test-expression-parser.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/exp/test-expression-parser.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,54 @@ +/* + * (C) Copyright 2014, Stephen M. Cameron. 
+ * + * The license below covers all files distributed with fio unless otherwise + * noted in the file itself. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#include +#include + +#include "../y.tab.h" + +extern int evaluate_arithmetic_expression(const char *buffer, long long *ival, + double *dval, double implied_units, int is_time); + +int main(int argc, char *argv[]) +{ + int rc, bye = 0; + long long result; + double dresult; + char buffer[100]; + + do { + if (fgets(buffer, 90, stdin) == NULL) + break; + rc = strlen(buffer); + if (rc > 0 && buffer[rc - 1] == '\n') + buffer[rc - 1] = '\0'; + rc = evaluate_arithmetic_expression(buffer, &result, &dresult, 1.0, 0); + if (!rc) { + printf("%lld (%20.20lf)\n", result, dresult); + } else { + fprintf(stderr, "Syntax error\n"); + result = 0; + dresult = 0; + } + } while (!bye); + return 0; +} + diff -Nru fio-2.1.3/fifo.c fio-3.16/fifo.c --- fio-2.1.3/fifo.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/fifo.c 2019-09-20 01:01:52.000000000 +0000 @@ -15,7 +15,7 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* */ @@ -24,6 +24,7 @@ #include #include "fifo.h" +#include "minmax.h" struct fifo *fifo_alloc(unsigned int size) { diff -Nru fio-2.1.3/fifo.h fio-3.16/fifo.h --- fio-2.1.3/fifo.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/fifo.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,3 +1,5 @@ +#ifndef FIO_FIFO_H +#define FIO_FIFO_H /* * A simple FIFO implementation. * @@ -15,9 +17,10 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * */ + struct fifo { unsigned char *buffer; /* the buffer holding the data */ unsigned int size; /* the size of the allocated buffer */ @@ -40,19 +43,4 @@ return fifo->size - fifo->in + fifo->out; } -#ifndef min -#define min(x,y) ({ \ - typeof(x) _x = (x); \ - typeof(y) _y = (y); \ - (void) (&_x == &_y); \ - _x < _y ? _x : _y; }) -#endif - -#ifndef max -#define max(x,y) ({ \ - typeof(x) _x = (x); \ - typeof(y) _y = (y); \ - (void) (&_x == &_y); \ - _x > _y ? 
_x : _y; }) - #endif diff -Nru fio-2.1.3/file.h fio-3.16/file.h --- fio-2.1.3/file.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/file.h 2019-09-20 01:01:52.000000000 +0000 @@ -8,13 +8,17 @@ #include "lib/zipf.h" #include "lib/axmap.h" #include "lib/lfsr.h" +#include "lib/gauss.h" + +/* Forward declarations */ +struct zoned_block_device_info; /* * The type of object we are working on */ enum fio_filetype { FIO_TYPE_FILE = 1, /* plain file */ - FIO_TYPE_BD, /* block device */ + FIO_TYPE_BLOCK, /* block device */ FIO_TYPE_CHAR, /* character device */ FIO_TYPE_PIPE, /* pipe */ }; @@ -27,6 +31,8 @@ FIO_FILE_size_known = 1 << 4, /* size has been set */ FIO_FILE_hashed = 1 << 5, /* file is on hash */ FIO_FILE_partial_mmap = 1 << 6, /* can't do full mmap */ + FIO_FILE_axmap = 1 << 7, /* uses axmap */ + FIO_FILE_lfsr = 1 << 8, /* lfsr is used */ }; enum file_lock_mode { @@ -36,13 +42,20 @@ }; /* - * roundrobin available files, or choose one at random, or do each one - * serially. + * How fio chooses what file to service next. Choice of uniformly random, or + * some skewed random variants, or just sequentially go through them or + * roundrobing. 
*/ enum { - FIO_FSERVICE_RANDOM = 1, - FIO_FSERVICE_RR = 2, - FIO_FSERVICE_SEQ = 3, + FIO_FSERVICE_RANDOM = 1, + FIO_FSERVICE_RR = 2, + FIO_FSERVICE_SEQ = 3, + __FIO_FSERVICE_NONUNIFORM = 0x100, + FIO_FSERVICE_ZIPF = __FIO_FSERVICE_NONUNIFORM | 4, + FIO_FSERVICE_PARETO = __FIO_FSERVICE_NONUNIFORM | 5, + FIO_FSERVICE_GAUSS = __FIO_FSERVICE_NONUNIFORM | 6, + + FIO_FSERVICE_SHIFT = 10, }; /* @@ -53,6 +66,7 @@ FIO_FALLOCATE_NONE = 1, FIO_FALLOCATE_POSIX = 2, FIO_FALLOCATE_KEEP_SIZE = 3, + FIO_FALLOCATE_NATIVE = 4, }; /* @@ -73,51 +87,70 @@ /* * filename and possible memory mapping */ - char *file_name; unsigned int major, minor; int fileno; - - void *mmap_ptr; - size_t mmap_sz; - off_t mmap_off; + char *file_name; /* * size of the file, offset into file, and io size from that offset + * (be aware io_size is different from thread_options::io_size) */ uint64_t real_file_size; uint64_t file_offset; uint64_t io_size; - uint64_t last_pos; - uint64_t last_start; + /* + * Zoned block device information. See also zonemode=zbd. 
+ */ + struct zoned_block_device_info *zbd_info; + + /* + * Track last end and last start of IO for a given data direction + */ + uint64_t last_pos[DDIR_RWDIR_CNT]; + uint64_t last_start[DDIR_RWDIR_CNT]; uint64_t first_write; uint64_t last_write; /* - * For use by the io engine + * Tracks the last iodepth number of completed writes, if data + * verification is enabled + */ + uint64_t *last_write_comp; + unsigned int last_write_idx; + + /* + * For use by the io engine for offset or private data storage */ - uint64_t engine_data; + union { + uint64_t engine_pos; + void *engine_data; + }; /* * if io is protected by a semaphore, this is set */ union { - struct fio_mutex *lock; + struct fio_sem *lock; struct fio_rwlock *rwlock; }; /* - * block map for random io + * block map or LFSR for random io */ - struct axmap *io_axmap; - - struct fio_lfsr lfsr; + union { + struct axmap *io_axmap; + struct fio_lfsr lfsr; + }; /* * Used for zipf random distribution */ - struct zipf_state zipf; + union { + struct zipf_state zipf; + struct gauss_state gauss; + }; int references; enum fio_file_flags flags; @@ -125,14 +158,17 @@ struct disk_util *du; }; +#define FILE_ENG_DATA(f) ((f)->engine_data) +#define FILE_SET_ENG_DATA(f, data) ((f)->engine_data = (data)) + #define FILE_FLAG_FNS(name) \ static inline void fio_file_set_##name(struct fio_file *f) \ { \ - (f)->flags |= FIO_FILE_##name; \ + (f)->flags = (enum fio_file_flags) ((f)->flags | FIO_FILE_##name); \ } \ static inline void fio_file_clear_##name(struct fio_file *f) \ { \ - (f)->flags &= ~FIO_FILE_##name; \ + (f)->flags = (enum fio_file_flags) ((f)->flags & ~FIO_FILE_##name); \ } \ static inline int fio_file_##name(struct fio_file *f) \ { \ @@ -146,6 +182,8 @@ FILE_FLAG_FNS(size_known); FILE_FLAG_FNS(hashed); FILE_FLAG_FNS(partial_mmap); +FILE_FLAG_FNS(axmap); +FILE_FLAG_FNS(lfsr); #undef FILE_FLAG_FNS /* @@ -154,15 +192,22 @@ struct thread_data; extern void close_files(struct thread_data *); extern void 
close_and_free_files(struct thread_data *); -extern uint64_t get_start_offset(struct thread_data *); +extern uint64_t get_start_offset(struct thread_data *, struct fio_file *); extern int __must_check setup_files(struct thread_data *); extern int __must_check file_invalidate_cache(struct thread_data *, struct fio_file *); +#ifdef __cplusplus +extern "C" { +#endif extern int __must_check generic_open_file(struct thread_data *, struct fio_file *); extern int __must_check generic_close_file(struct thread_data *, struct fio_file *); extern int __must_check generic_get_file_size(struct thread_data *, struct fio_file *); +#ifdef __cplusplus +} +#endif extern int __must_check file_lookup_open(struct fio_file *f, int flags); -extern int __must_check pre_read_files(struct thread_data *); -extern int add_file(struct thread_data *, const char *); +extern bool __must_check pre_read_files(struct thread_data *); +extern unsigned long long get_rand_file_size(struct thread_data *td); +extern int add_file(struct thread_data *, const char *, int, int); extern int add_file_exclusive(struct thread_data *, const char *); extern void get_file(struct fio_file *); extern int __must_check put_file(struct thread_data *, struct fio_file *); @@ -171,10 +216,14 @@ extern void unlock_file(struct thread_data *, struct fio_file *); extern void unlock_file_all(struct thread_data *, struct fio_file *); extern int add_dir_files(struct thread_data *, const char *); -extern int init_random_map(struct thread_data *); +extern bool init_random_map(struct thread_data *); extern void dup_files(struct thread_data *, struct thread_data *); extern int get_fileno(struct thread_data *, const char *); extern void free_release_files(struct thread_data *); -void fio_file_reset(struct thread_data *, struct fio_file *); +extern void filesetup_mem_free(void); +extern void fio_file_reset(struct thread_data *, struct fio_file *); +extern bool fio_files_done(struct thread_data *); +extern bool 
exists_and_not_regfile(const char *); +extern int fio_set_directio(struct thread_data *, struct fio_file *); #endif diff -Nru fio-2.1.3/filehash.c fio-3.16/filehash.c --- fio-2.1.3/filehash.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/filehash.c 2019-09-20 01:01:52.000000000 +0000 @@ -4,23 +4,41 @@ #include "fio.h" #include "flist.h" #include "hash.h" +#include "filehash.h" +#include "smalloc.h" +#include "lib/bloom.h" #define HASH_BUCKETS 512 #define HASH_MASK (HASH_BUCKETS - 1) -unsigned int file_hash_size = HASH_BUCKETS * sizeof(struct flist_head); +#define BLOOM_SIZE 16*1024*1024 + +static unsigned int file_hash_size = HASH_BUCKETS * sizeof(struct flist_head); static struct flist_head *file_hash; -static struct fio_mutex *hash_lock; +static struct fio_sem *hash_lock; +static struct bloom *file_bloom; static unsigned short hash(const char *name) { return jhash(name, strlen(name), 0) & HASH_MASK; } +void fio_file_hash_lock(void) +{ + if (hash_lock) + fio_sem_down(hash_lock); +} + +void fio_file_hash_unlock(void) +{ + if (hash_lock) + fio_sem_up(hash_lock); +} + void remove_file_hash(struct fio_file *f) { - fio_mutex_down(hash_lock); + fio_sem_down(hash_lock); if (fio_file_hashed(f)) { assert(!flist_empty(&f->hash_list)); @@ -28,7 +46,7 @@ fio_file_clear_hashed(f); } - fio_mutex_up(hash_lock); + fio_sem_up(hash_lock); } static struct fio_file *__lookup_file_hash(const char *name) @@ -55,9 +73,9 @@ { struct fio_file *f; - fio_mutex_down(hash_lock); + fio_sem_down(hash_lock); f = __lookup_file_hash(name); - fio_mutex_up(hash_lock); + fio_sem_up(hash_lock); return f; } @@ -70,7 +88,7 @@ INIT_FLIST_HEAD(&f->hash_list); - fio_mutex_down(hash_lock); + fio_sem_down(hash_lock); alias = __lookup_file_hash(f->file_name); if (!alias) { @@ -78,34 +96,44 @@ flist_add_tail(&f->hash_list, &file_hash[hash(f->file_name)]); } - fio_mutex_up(hash_lock); + fio_sem_up(hash_lock); return alias; } +bool file_bloom_exists(const char *fname, bool set) +{ + return 
bloom_string(file_bloom, fname, strlen(fname), set); +} + void file_hash_exit(void) { unsigned int i, has_entries = 0; - fio_mutex_down(hash_lock); + fio_sem_down(hash_lock); for (i = 0; i < HASH_BUCKETS; i++) has_entries += !flist_empty(&file_hash[i]); - fio_mutex_up(hash_lock); + fio_sem_up(hash_lock); if (has_entries) log_err("fio: file hash not empty on exit\n"); + sfree(file_hash); file_hash = NULL; - fio_mutex_remove(hash_lock); + fio_sem_remove(hash_lock); hash_lock = NULL; + bloom_free(file_bloom); + file_bloom = NULL; } -void file_hash_init(void *ptr) +void file_hash_init(void) { unsigned int i; - file_hash = ptr; + file_hash = smalloc(file_hash_size); + for (i = 0; i < HASH_BUCKETS; i++) INIT_FLIST_HEAD(&file_hash[i]); - hash_lock = fio_mutex_init(FIO_MUTEX_UNLOCKED); + hash_lock = fio_sem_init(FIO_SEM_UNLOCKED); + file_bloom = bloom_new(BLOOM_SIZE); } diff -Nru fio-2.1.3/filehash.h fio-3.16/filehash.h --- fio-2.1.3/filehash.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/filehash.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,12 +1,15 @@ #ifndef FIO_FILE_HASH_H #define FIO_FILE_HASH_H -extern unsigned int file_hash_size; +#include "lib/types.h" -extern void file_hash_init(void *); +extern void file_hash_init(void); extern void file_hash_exit(void); extern struct fio_file *lookup_file_hash(const char *); extern struct fio_file *add_file_hash(struct fio_file *); extern void remove_file_hash(struct fio_file *); +extern void fio_file_hash_lock(void); +extern void fio_file_hash_unlock(void); +extern bool file_bloom_exists(const char *, bool); #endif diff -Nru fio-2.1.3/filelock.c fio-3.16/filelock.c --- fio-2.1.3/filelock.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/filelock.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,246 @@ +/* + * Really simple exclusive file locking based on filename. + * No hash indexing, just a list, so only works well for < 100 files or + * so. But that's more than what fio needs, so should be fine. 
+ */ +#include +#include +#include +#include + +#include "flist.h" +#include "filelock.h" +#include "smalloc.h" +#include "fio_sem.h" +#include "hash.h" +#include "log.h" + +struct fio_filelock { + uint32_t hash; + struct fio_sem lock; + struct flist_head list; + unsigned int references; +}; + +#define MAX_FILELOCKS 128 + +static struct filelock_data { + struct flist_head list; + struct fio_sem lock; + + struct flist_head free_list; + struct fio_filelock ffs[MAX_FILELOCKS]; +} *fld; + +static void put_filelock(struct fio_filelock *ff) +{ + flist_add(&ff->list, &fld->free_list); +} + +static struct fio_filelock *__get_filelock(void) +{ + struct fio_filelock *ff; + + if (flist_empty(&fld->free_list)) + return NULL; + + ff = flist_first_entry(&fld->free_list, struct fio_filelock, list); + flist_del_init(&ff->list); + return ff; +} + +static struct fio_filelock *get_filelock(int trylock, int *retry) +{ + struct fio_filelock *ff; + + do { + ff = __get_filelock(); + if (ff || trylock) + break; + + fio_sem_up(&fld->lock); + usleep(1000); + fio_sem_down(&fld->lock); + *retry = 1; + } while (1); + + return ff; +} + +int fio_filelock_init(void) +{ + int i; + + fld = smalloc(sizeof(*fld)); + if (!fld) + return 1; + + INIT_FLIST_HEAD(&fld->list); + INIT_FLIST_HEAD(&fld->free_list); + + if (__fio_sem_init(&fld->lock, FIO_SEM_UNLOCKED)) + goto err; + + for (i = 0; i < MAX_FILELOCKS; i++) { + struct fio_filelock *ff = &fld->ffs[i]; + + if (__fio_sem_init(&ff->lock, FIO_SEM_UNLOCKED)) + goto err; + flist_add_tail(&ff->list, &fld->free_list); + } + + return 0; +err: + fio_filelock_exit(); + return 1; +} + +void fio_filelock_exit(void) +{ + if (!fld) + return; + + assert(flist_empty(&fld->list)); + __fio_sem_remove(&fld->lock); + + while (!flist_empty(&fld->free_list)) { + struct fio_filelock *ff; + + ff = flist_first_entry(&fld->free_list, struct fio_filelock, list); + + flist_del_init(&ff->list); + __fio_sem_remove(&ff->lock); + } + + sfree(fld); + fld = NULL; +} + +static struct 
fio_filelock *fio_hash_find(uint32_t hash) +{ + struct flist_head *entry; + struct fio_filelock *ff; + + flist_for_each(entry, &fld->list) { + ff = flist_entry(entry, struct fio_filelock, list); + if (ff->hash == hash) + return ff; + } + + return NULL; +} + +static struct fio_filelock *fio_hash_get(uint32_t hash, int trylock) +{ + struct fio_filelock *ff; + + ff = fio_hash_find(hash); + if (!ff) { + int retry = 0; + + ff = get_filelock(trylock, &retry); + if (!ff) + return NULL; + + /* + * If we dropped the main lock, re-lookup the hash in case + * someone else added it meanwhile. If it's now there, + * just return that. + */ + if (retry) { + struct fio_filelock *__ff; + + __ff = fio_hash_find(hash); + if (__ff) { + put_filelock(ff); + return __ff; + } + } + + ff->hash = hash; + ff->references = 0; + flist_add(&ff->list, &fld->list); + } + + return ff; +} + +static bool __fio_lock_file(const char *fname, int trylock) +{ + struct fio_filelock *ff; + uint32_t hash; + + hash = jhash(fname, strlen(fname), 0); + + fio_sem_down(&fld->lock); + ff = fio_hash_get(hash, trylock); + if (ff) + ff->references++; + fio_sem_up(&fld->lock); + + if (!ff) { + assert(!trylock); + return true; + } + + if (!trylock) { + fio_sem_down(&ff->lock); + return false; + } + + if (!fio_sem_down_trylock(&ff->lock)) + return false; + + fio_sem_down(&fld->lock); + + /* + * If we raced and the only reference to the lock is us, we can + * grab it + */ + if (ff->references != 1) { + ff->references--; + ff = NULL; + } + + fio_sem_up(&fld->lock); + + if (ff) { + fio_sem_down(&ff->lock); + return false; + } + + return true; +} + +bool fio_trylock_file(const char *fname) +{ + return __fio_lock_file(fname, 1); +} + +void fio_lock_file(const char *fname) +{ + __fio_lock_file(fname, 0); +} + +void fio_unlock_file(const char *fname) +{ + struct fio_filelock *ff; + uint32_t hash; + + hash = jhash(fname, strlen(fname), 0); + + fio_sem_down(&fld->lock); + + ff = fio_hash_find(hash); + if (ff) { + int refs = 
--ff->references; + fio_sem_up(&ff->lock); + if (!refs) { + flist_del_init(&ff->list); + put_filelock(ff); + } + } else + log_err("fio: file not found for unlocking\n"); + + fio_sem_up(&fld->lock); +} diff -Nru fio-2.1.3/filelock.h fio-3.16/filelock.h --- fio-2.1.3/filelock.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/filelock.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,13 @@ +#ifndef FIO_LOCK_FILE_H +#define FIO_LOCK_FILE_H + +#include "lib/types.h" + +extern void fio_lock_file(const char *); +extern bool fio_trylock_file(const char *); +extern void fio_unlock_file(const char *); + +extern int fio_filelock_init(void); +extern void fio_filelock_exit(void); + +#endif diff -Nru fio-2.1.3/filesetup.c fio-3.16/filesetup.c --- fio-2.1.3/filesetup.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/filesetup.c 2019-09-20 01:01:52.000000000 +0000 @@ -5,21 +5,30 @@ #include #include #include -#include -#include #include "fio.h" #include "smalloc.h" #include "filehash.h" +#include "options.h" #include "os/os.h" #include "hash.h" #include "lib/axmap.h" +#include "rwlock.h" +#include "zbd.h" #ifdef CONFIG_LINUX_FALLOCATE #include #endif -static int root_warn; +static FLIST_HEAD(filename_list); + +/* + * List entry for filename_list + */ +struct file_name { + struct flist_head list; + char *filename; +}; static inline void clear_error(struct thread_data *td) { @@ -27,15 +36,80 @@ td->verror[0] = '\0'; } +static int native_fallocate(struct thread_data *td, struct fio_file *f) +{ + bool success; + + success = fio_fallocate(f, 0, f->real_file_size); + dprint(FD_FILE, "native fallocate of file %s size %llu was " + "%ssuccessful\n", f->file_name, + (unsigned long long) f->real_file_size, + !success ? 
"un": ""); + + if (success) + return false; + + if (errno == ENOSYS) + dprint(FD_FILE, "native fallocate is not implemented\n"); + + return true; +} + +static void fallocate_file(struct thread_data *td, struct fio_file *f) +{ + if (td->o.fill_device) + return; + + switch (td->o.fallocate_mode) { + case FIO_FALLOCATE_NATIVE: + native_fallocate(td, f); + break; + case FIO_FALLOCATE_NONE: + break; +#ifdef CONFIG_POSIX_FALLOCATE + case FIO_FALLOCATE_POSIX: { + int r; + + dprint(FD_FILE, "posix_fallocate file %s size %llu\n", + f->file_name, + (unsigned long long) f->real_file_size); + + r = posix_fallocate(f->fd, 0, f->real_file_size); + if (r > 0) + log_err("fio: posix_fallocate fails: %s\n", strerror(r)); + break; + } +#endif /* CONFIG_POSIX_FALLOCATE */ +#ifdef CONFIG_LINUX_FALLOCATE + case FIO_FALLOCATE_KEEP_SIZE: { + int r; + + dprint(FD_FILE, "fallocate(FALLOC_FL_KEEP_SIZE) " + "file %s size %llu\n", f->file_name, + (unsigned long long) f->real_file_size); + + r = fallocate(f->fd, FALLOC_FL_KEEP_SIZE, 0, f->real_file_size); + if (r != 0) + td_verror(td, errno, "fallocate"); + + break; + } +#endif /* CONFIG_LINUX_FALLOCATE */ + default: + log_err("fio: unknown fallocate mode: %d\n", td->o.fallocate_mode); + assert(0); + } +} + /* * Leaves f->fd open on success, caller must close */ static int extend_file(struct thread_data *td, struct fio_file *f) { - int r, new_layout = 0, unlink_file = 0, flags; + int new_layout = 0, unlink_file = 0, flags; unsigned long long left; - unsigned int bs; - char *b; + unsigned long long bs; + char *b = NULL; if (read_only) { log_err("fio: refusing extend of file due to read-only\n"); @@ -47,69 +121,53 @@ * does that for operations involving reads, or for writes * where overwrite is set */ - if (td_read(td) || (td_write(td) && td->o.overwrite) || - (td_write(td) && td->io_ops->flags & FIO_NOEXTEND)) + if (td_read(td) || + (td_write(td) && td->o.overwrite && !td->o.file_append) || + (td_write(td) && td_ioengine_flagged(td, 
FIO_NOEXTEND))) new_layout = 1; - if (td_write(td) && !td->o.overwrite) + if (td_write(td) && !td->o.overwrite && !td->o.file_append) unlink_file = 1; if (unlink_file || new_layout) { + int ret; + dprint(FD_FILE, "layout unlink %s\n", f->file_name); - if ((unlink(f->file_name) < 0) && (errno != ENOENT)) { + + ret = td_io_unlink_file(td, f); + if (ret != 0 && ret != ENOENT) { td_verror(td, errno, "unlink"); return 1; } } - flags = O_WRONLY | O_CREAT; + flags = O_WRONLY; + if (td->o.allow_create) + flags |= O_CREAT; if (new_layout) flags |= O_TRUNC; +#ifdef WIN32 + flags |= _O_BINARY; +#endif + dprint(FD_FILE, "open file %s, flags %x\n", f->file_name, flags); f->fd = open(f->file_name, flags, 0644); if (f->fd < 0) { - td_verror(td, errno, "open"); + int err = errno; + + if (err == ENOENT && !td->o.allow_create) + log_err("fio: file creation disallowed by " + "allow_file_create=0\n"); + else + td_verror(td, err, "open"); return 1; } -#ifdef CONFIG_POSIX_FALLOCATE - if (!td->o.fill_device) { - switch (td->o.fallocate_mode) { - case FIO_FALLOCATE_NONE: - break; - case FIO_FALLOCATE_POSIX: - dprint(FD_FILE, "posix_fallocate file %s size %llu\n", - f->file_name, - (unsigned long long) f->real_file_size); - - r = posix_fallocate(f->fd, 0, f->real_file_size); - if (r > 0) { - log_err("fio: posix_fallocate fails: %s\n", - strerror(r)); - } - break; -#ifdef CONFIG_LINUX_FALLOCATE - case FIO_FALLOCATE_KEEP_SIZE: - dprint(FD_FILE, - "fallocate(FALLOC_FL_KEEP_SIZE) " - "file %s size %llu\n", f->file_name, - (unsigned long long) f->real_file_size); - - r = fallocate(f->fd, FALLOC_FL_KEEP_SIZE, 0, - f->real_file_size); - if (r != 0) - td_verror(td, errno, "fallocate"); - - break; -#endif /* CONFIG_LINUX_FALLOCATE */ - default: - log_err("fio: unknown fallocate mode: %d\n", - td->o.fallocate_mode); - assert(0); - } - } -#endif /* CONFIG_POSIX_FALLOCATE */ + fallocate_file(td, f); + /* + * If our jobs don't require regular files initially, we're done. 
+ */ if (!new_layout) goto done; @@ -121,16 +179,27 @@ dprint(FD_FILE, "truncate file %s, size %llu\n", f->file_name, (unsigned long long) f->real_file_size); if (ftruncate(f->fd, f->real_file_size) == -1) { - td_verror(td, errno, "ftruncate"); - goto err; + if (errno != EFBIG) { + td_verror(td, errno, "ftruncate"); + goto err; + } } } - b = malloc(td->o.max_bs[DDIR_WRITE]); - left = f->real_file_size; + bs = td->o.max_bs[DDIR_WRITE]; + if (bs > left) + bs = left; + + b = malloc(bs); + if (!b) { + td_verror(td, errno, "malloc"); + goto err; + } + while (left && !td->terminate) { - bs = td->o.max_bs[DDIR_WRITE]; + ssize_t r; + if (bs > left) bs = left; @@ -162,7 +231,7 @@ if (td->terminate) { dprint(FD_FILE, "terminate unlink %s\n", f->file_name); - unlink(f->file_name); + td_io_unlink_file(td, f); } else if (td->o.create_fsync) { if (fsync(f->fd) < 0) { td_verror(td, errno, "fsync"); @@ -183,36 +252,55 @@ err: close(f->fd); f->fd = -1; + if (b) + free(b); return 1; } -static int pre_read_file(struct thread_data *td, struct fio_file *f) +static bool pre_read_file(struct thread_data *td, struct fio_file *f) { int r, did_open = 0, old_runstate; unsigned long long left; - unsigned int bs; + unsigned long long bs; + bool ret = true; char *b; - if (td->io_ops->flags & FIO_PIPEIO) - return 0; + if (td_ioengine_flagged(td, FIO_PIPEIO) || + td_ioengine_flagged(td, FIO_NOIO)) + return true; + + if (f->filetype == FIO_TYPE_CHAR) + return true; if (!fio_file_open(f)) { if (td->io_ops->open_file(td, f)) { log_err("fio: cannot pre-read, failed to open file\n"); - return 1; + return false; } did_open = 1; } - old_runstate = td->runstate; - td_set_runstate(td, TD_PRE_READING); + old_runstate = td_bump_runstate(td, TD_PRE_READING); + left = f->io_size; bs = td->o.max_bs[DDIR_READ]; + if (bs > left) + bs = left; + b = malloc(bs); + if (!b) { + td_verror(td, errno, "malloc"); + ret = false; + goto error; + } memset(b, 0, bs); - lseek(f->fd, f->file_offset, SEEK_SET); - left = 
f->io_size; + if (lseek(f->fd, f->file_offset, SEEK_SET) < 0) { + td_verror(td, errno, "lseek"); + log_err("fio: failed to lseek pre-read file\n"); + ret = false; + goto error; + } while (left && !td->terminate) { if (bs > left) @@ -229,29 +317,26 @@ } } - td_set_runstate(td, old_runstate); +error: + td_restore_runstate(td, old_runstate); if (did_open) td->io_ops->close_file(td, f); + free(b); - return 0; + return ret; } -static unsigned long long get_rand_file_size(struct thread_data *td) +unsigned long long get_rand_file_size(struct thread_data *td) { unsigned long long ret, sized; - unsigned long r; - - if (td->o.use_os_rand) { - r = os_random_long(&td->file_size_state); - sized = td->o.file_size_high - td->o.file_size_low; - ret = (unsigned long long) ((double) sized * (r / (OS_RAND_MAX + 1.0))); - } else { - r = __rand(&td->__file_size_state); - sized = td->o.file_size_high - td->o.file_size_low; - ret = (unsigned long long) ((double) sized * (r / (FRAND_MAX + 1.0))); - } + uint64_t frand_max; + uint64_t r; + frand_max = rand_max(&td->file_size_state); + r = __rand(&td->file_size_state); + sized = td->o.file_size_high - td->o.file_size_low; + ret = (unsigned long long) ((double) sized * (r / (frand_max + 1.0))); ret += td->o.file_size_low; ret -= (ret % td->o.rw_min_bs); return ret; @@ -307,7 +392,7 @@ int r; if (td->io_ops->open_file(td, f)) { - log_err("fio: failed opening blockdev %s for size check\n", + log_err("fio: failed opening chardev %s for size check\n", f->file_name); return 1; } @@ -344,16 +429,30 @@ if (f->filetype == FIO_TYPE_FILE) ret = file_size(td, f); - else if (f->filetype == FIO_TYPE_BD) + else if (f->filetype == FIO_TYPE_BLOCK) ret = bdev_size(td, f); else if (f->filetype == FIO_TYPE_CHAR) ret = char_size(td, f); - else + else { f->real_file_size = -1; + log_info("%s: failed to get file size of %s\n", td->o.name, + f->file_name); + return 1; /* avoid offset extends end error message */ + } + /* + * Leave ->real_file_size with 0 since it 
could be expectation + * of initial setup for regular files. + */ if (ret) return ret; + /* + * ->file_offset normally hasn't been initialized yet, so this + * is basically always false unless ->real_file_size is -1, but + * if ->real_file_size is -1 this message doesn't make sense. + * As a result, this message is basically useless. + */ if (f->file_offset > f->real_file_size) { log_err("%s: offset extends end (%llu > %llu)\n", td->o.name, (unsigned long long) f->file_offset, @@ -369,7 +468,11 @@ unsigned long long off, unsigned long long len) { - int ret = 0; + int errval = 0, ret = 0; + +#ifdef CONFIG_ESX + return 0; +#endif if (len == -1ULL) len = f->io_size; @@ -379,42 +482,60 @@ if (len == -1ULL || off == -1ULL) return 0; - dprint(FD_IO, "invalidate cache %s: %llu/%llu\n", f->file_name, off, - len); - - /* - * FIXME: add blockdev flushing too - */ - if (f->mmap_ptr) { - ret = posix_madvise(f->mmap_ptr, f->mmap_sz, POSIX_MADV_DONTNEED); -#ifdef FIO_MADV_FREE - if (f->filetype == FIO_TYPE_BD) - (void) posix_madvise(f->mmap_ptr, f->mmap_sz, FIO_MADV_FREE); -#endif + if (td->io_ops->invalidate) { + dprint(FD_IO, "invalidate %s cache %s\n", td->io_ops->name, + f->file_name); + ret = td->io_ops->invalidate(td, f); + if (ret < 0) + errval = -ret; + } else if (td_ioengine_flagged(td, FIO_DISKLESSIO)) { + dprint(FD_IO, "invalidate not supported by ioengine %s\n", + td->io_ops->name); } else if (f->filetype == FIO_TYPE_FILE) { + dprint(FD_IO, "declare unneeded cache %s: %llu/%llu\n", + f->file_name, off, len); ret = posix_fadvise(f->fd, off, len, POSIX_FADV_DONTNEED); - } else if (f->filetype == FIO_TYPE_BD) { + if (ret) + errval = ret; + } else if (f->filetype == FIO_TYPE_BLOCK) { + int retry_count = 0; + + dprint(FD_IO, "drop page cache %s\n", f->file_name); ret = blockdev_invalidate_cache(f); + while (ret < 0 && errno == EAGAIN && retry_count++ < 25) { + /* + * Linux multipath devices reject ioctl while + * the maps are being updated. 
That window can + * last tens of milliseconds; we'll try up to + * a quarter of a second. + */ + usleep(10000); + ret = blockdev_invalidate_cache(f); + } if (ret < 0 && errno == EACCES && geteuid()) { - if (!root_warn) { + if (!fio_did_warn(FIO_WARN_ROOT_FLUSH)) { log_err("fio: only root may flush block " "devices. Cache flush bypassed!\n"); - root_warn = 1; } - ret = 0; } - } else if (f->filetype == FIO_TYPE_CHAR || f->filetype == FIO_TYPE_PIPE) - ret = 0; - - if (ret < 0) { - td_verror(td, errno, "invalidate_cache"); - return 1; - } else if (ret > 0) { - td_verror(td, ret, "invalidate_cache"); - return 1; + if (ret < 0) + errval = errno; + } else if (f->filetype == FIO_TYPE_CHAR || + f->filetype == FIO_TYPE_PIPE) { + dprint(FD_IO, "invalidate not supported %s\n", f->file_name); } - return ret; + /* + * Cache flushing isn't a fatal condition, and we know it will + * happen on some platforms where we don't have the proper + * function to flush eg block device caches. So just warn and + * continue on our way. 
+ */ + if (errval) + log_info("fio: cache invalidation of %s failed: %s\n", + f->file_name, strerror(errval)); + + return 0; } @@ -444,7 +565,7 @@ f->shadow_fd = -1; } - f->engine_data = 0; + f->engine_pos = 0; return ret; } @@ -456,9 +577,6 @@ __f = lookup_file_hash(f->file_name); if (__f) { dprint(FD_FILE, "found file in hash %s\n", f->file_name); - /* - * racy, need the __f->lock locked - */ f->lock = __f->lock; from_hash = 1; } else { @@ -466,6 +584,10 @@ from_hash = 0; } +#ifdef WIN32 + flags |= _O_BINARY; +#endif + f->fd = open(f->file_name, flags, 0600); return from_hash; } @@ -496,11 +618,6 @@ dprint(FD_FILE, "fd open %s\n", f->file_name); - if (td_trim(td) && f->filetype != FIO_TYPE_BD) { - log_err("fio: trim only applies to block device\n"); - return 1; - } - if (!strcmp(f->file_name, "-")) { if (td_rw(td)) { log_err("fio: can't read/write to stdin/out\n"); @@ -519,9 +636,16 @@ goto skip_flags; if (td->o.odirect) flags |= OS_O_DIRECT; + if (td->o.oatomic) { + if (!FIO_O_ATOMIC) { + td_verror(td, EINVAL, "OS does not support atomic IO"); + return 1; + } + flags |= OS_O_DIRECT | FIO_O_ATOMIC; + } if (td->o.sync_io) flags |= O_SYNC; - if (td->o.create_on_open) + if (td->o.create_on_open && td->o.allow_create) flags |= O_CREAT; skip_flags: if (f->filetype != FIO_TYPE_FILE) @@ -532,7 +656,7 @@ if (!read_only) flags |= O_RDWR; - if (f->filetype == FIO_TYPE_FILE) + if (f->filetype == FIO_TYPE_FILE && td->o.allow_create) flags |= O_CREAT; if (is_std) @@ -549,8 +673,10 @@ f->fd = dup(STDIN_FILENO); else from_hash = file_lookup_open(f, flags); - } else { //td trim - flags |= O_RDWR; + } else if (td_trim(td)) { + assert(!td_rw(td)); /* should have matched above */ + if (!read_only) + flags |= O_RDWR; from_hash = file_lookup_open(f, flags); } @@ -573,6 +699,7 @@ } td_verror(td, __e, buf); + return 1; } if (!from_hash && f->fd != -1) { @@ -584,7 +711,7 @@ * work-around a "feature" on Linux, where a close of * an fd that has been opened for write will trigger * udev to 
call blkid to check partitions, fs id, etc. - * That polutes the device cache, which can slow down + * That pollutes the device cache, which can slow down * unbuffered accesses. */ if (f->shadow_fd == -1) @@ -603,6 +730,10 @@ return 0; } +/* + * This function i.e. get_file_size() is the default .get_file_size + * implementation of majority of I/O engines. + */ int generic_get_file_size(struct thread_data *td, struct fio_file *f) { return get_file_size(td, f); @@ -618,17 +749,25 @@ int err = 0; for_each_file(td, f, i) { - dprint(FD_FILE, "get file size for %p/%d/%p\n", f, i, + dprint(FD_FILE, "get file size for %p/%d/%s\n", f, i, f->file_name); if (td_io_get_file_size(td, f)) { if (td->error != ENOENT) { log_err("%s\n", td->verror); err = 1; + break; } clear_error(td); } + /* + * There are corner cases where we end up with -1 for + * ->real_file_size due to unsupported file type, etc. + * We then just set to size option value divided by number + * of files, similar to the way file ->io_size is set. + * stat(2) failure doesn't set ->real_file_size to -1. 
+ */ if (f->real_file_size == -1ULL && td->o.size) f->real_file_size = td->o.size / td->o.nr_files; } @@ -659,14 +798,14 @@ struct stat sb; char buf[256]; - if (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_CHAR) { + if (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_CHAR) { if (f->real_file_size != -1ULL) ret += f->real_file_size; continue; } else if (f->filetype != FIO_TYPE_FILE) continue; - strcpy(buf, f->file_name); + snprintf(buf, ARRAY_SIZE(buf), "%s", f->file_name); if (stat(buf, &sb) < 0) { if (errno != ENOENT) @@ -688,8 +827,8 @@ if (fm) continue; - fm = malloc(sizeof(*fm)); - strcpy(fm->__base, buf); + fm = calloc(1, sizeof(*fm)); + snprintf(fm->__base, ARRAY_SIZE(fm->__base), "%s", buf); fm->base = basename(fm->__base); fm->key = sb.st_dev; flist_add(&fm->list, &list); @@ -701,7 +840,7 @@ fm = flist_entry(n, struct fio_mount, list); flist_del(&fm->list); - sz = get_fs_size(fm->base); + sz = get_fs_free_size(fm->base); if (sz && sz != -1ULL) ret += sz; @@ -711,10 +850,91 @@ return ret; } -uint64_t get_start_offset(struct thread_data *td) +uint64_t get_start_offset(struct thread_data *td, struct fio_file *f) { - return td->o.start_offset + - (td->thread_number - 1) * td->o.offset_increment; + bool align = false; + struct thread_options *o = &td->o; + unsigned long long align_bs; + unsigned long long offset; + unsigned long long increment; + + if (o->file_append && f->filetype == FIO_TYPE_FILE) + return f->real_file_size; + + if (o->offset_increment_percent) { + assert(!o->offset_increment); + increment = o->offset_increment_percent * f->real_file_size / 100; + align = true; + } else + increment = o->offset_increment; + + if (o->start_offset_percent > 0) { + /* calculate the raw offset */ + offset = (f->real_file_size * o->start_offset_percent / 100) + + (td->subjob_number * increment); + + align = true; + } else { + /* start_offset_percent not set */ + offset = o->start_offset + + td->subjob_number * increment; + } + + if (align) { + /* + 
* if offset_align is provided, use it + */ + if (fio_option_is_set(o, start_offset_align)) { + align_bs = o->start_offset_align; + } else { + /* else take the minimum block size */ + align_bs = td_min_bs(td); + } + + /* + * block align the offset at the next available boundary at + * ceiling(offset / align_bs) * align_bs + */ + offset = (offset / align_bs + (offset % align_bs != 0)) * align_bs; + } + + return offset; +} + +static bool create_work_dirs(struct thread_data *td, const char *fname) +{ + char path[PATH_MAX]; + char *start, *end; + + if (td->o.directory) { + snprintf(path, PATH_MAX, "%s%c%s", td->o.directory, + FIO_OS_PATH_SEPARATOR, fname); + start = strstr(path, fname); + } else { + snprintf(path, PATH_MAX, "%s", fname); + start = path; + } + + end = start; + while ((end = strchr(end, FIO_OS_PATH_SEPARATOR)) != NULL) { + if (end == start) + break; + *end = '\0'; + errno = 0; +#ifdef CONFIG_HAVE_MKDIR_TWO + if (mkdir(path, 0600) && errno != EEXIST) { +#else + if (mkdir(path) && errno != EEXIST) { +#endif + log_err("fio: failed to create dir (%s): %d\n", + start, errno); + return false; + } + *end = FIO_OS_PATH_SEPARATOR; + end++; + } + td->flags |= TD_F_DIRS_CREATED; + return true; } /* @@ -725,20 +945,28 @@ unsigned long long total_size, extend_size; struct thread_options *o = &td->o; struct fio_file *f; - unsigned int i; + unsigned int i, nr_fs_extra = 0; int err = 0, need_extend; int old_state; + const unsigned long long bs = td_min_bs(td); + uint64_t fs = 0; dprint(FD_FILE, "setup files\n"); - old_state = td->runstate; - td_set_runstate(td, TD_SETTING_UP); + old_state = td_bump_runstate(td, TD_SETTING_UP); - if (o->read_iolog_file) - goto done; + for_each_file(td, f, i) { + if (!td_ioengine_flagged(td, FIO_DISKLESSIO) && + strchr(f->file_name, FIO_OS_PATH_SEPARATOR) && + !(td->flags & TD_F_DIRS_CREATED) && + !create_work_dirs(td, f->file_name)) + goto err_out; + } /* - * if ioengine defines a setup() method, it's responsible for + * Find out physical 
size of files or devices for this thread, + * before we determine I/O size and range of our targets. + * If ioengine defines a setup() method, it's responsible for * opening the files and setting f->real_file_size to indicate * the valid range for that file. */ @@ -750,12 +978,16 @@ if (err) goto err_out; + if (o->read_iolog_file) + goto done; + /* * check sizes. if the files/devices do not exist and the size * isn't passed to fio, abort. */ total_size = 0; for_each_file(td, f, i) { + f->fileno = i; if (f->real_file_size == -1ULL) total_size = -1ULL; else @@ -769,7 +1001,7 @@ * device/file sizes are zero and no size given, punt */ if ((!total_size || total_size == -1ULL) && !o->size && - !(td->io_ops->flags & FIO_NOIO) && !o->fill_device && + !td_ioengine_flagged(td, FIO_NOIO) && !o->fill_device && !(o->nr_files && (o->file_size_low || o->file_size_high))) { log_err("%s: you need to specify size=\n", o->name); td_verror(td, EINVAL, "total_file_size"); @@ -777,6 +1009,20 @@ } /* + * Calculate per-file size and potential extra size for the + * first files, if needed (i.e. if we don't have a fixed size). + */ + if (!o->file_size_low && o->nr_files) { + uint64_t all_fs; + + fs = o->size / o->nr_files; + all_fs = fs * o->nr_files; + + if (all_fs < o->size) + nr_fs_extra = (o->size - all_fs) / bs; + } + + /* * now file sizes are known, so we can set ->io_size. if size= is * not given, ->io_size is just equal to ->real_file_size. if size * is given, ->io_size is size / nr_files. @@ -784,17 +1030,45 @@ extend_size = total_size = 0; need_extend = 0; for_each_file(td, f, i) { - f->file_offset = get_start_offset(td); + f->file_offset = get_start_offset(td, f); + /* + * Update ->io_size depending on options specified. + * ->file_size_low being 0 means filesize option isn't set. + * Non zero ->file_size_low equals ->file_size_high means + * filesize option is set in a fixed size format. 
+ * Non zero ->file_size_low not equals ->file_size_high means + * filesize option is set in a range format. + */ if (!o->file_size_low) { /* - * no file size range given, file size is equal to - * total size divided by number of files. if that is - * zero, set it to the real file size. + * no file size or range given, file size is equal to + * total size divided by number of files. If the size + * doesn't divide nicely with the min blocksize, + * make the first files bigger. */ - f->io_size = o->size / o->nr_files; - if (!f->io_size) + f->io_size = fs - f->file_offset; + if (nr_fs_extra) { + nr_fs_extra--; + f->io_size += bs; + } + + /* + * We normally don't come here for regular files, but + * if the result is 0 for a regular file, set it to the + * real file size. This could be size of the existing + * one if it already exists, but otherwise will be set + * to 0. A new file won't be created because + * ->io_size + ->file_offset equals ->real_file_size. + */ + if (!f->io_size) { + if (f->file_offset > f->real_file_size) + goto err_offset; f->io_size = f->real_file_size - f->file_offset; + if (!f->io_size) + log_info("fio: file %s may be ignored\n", + f->file_name); + } } else if (f->real_file_size < o->file_size_low || f->real_file_size > o->file_size_high) { if (f->file_offset > o->file_size_low) @@ -815,35 +1089,77 @@ if (f->io_size == -1ULL) total_size = -1ULL; else { - if (o->size_percent) - f->io_size = (f->io_size * o->size_percent) / 100; + if (o->size_percent && o->size_percent != 100) { + uint64_t file_size; + + file_size = f->io_size + f->file_offset; + f->io_size = (file_size * + o->size_percent) / 100; + if (f->io_size > (file_size - f->file_offset)) + f->io_size = file_size - f->file_offset; + + f->io_size -= (f->io_size % td_min_bs(td)); + } total_size += f->io_size; } if (f->filetype == FIO_TYPE_FILE && (f->io_size + f->file_offset) > f->real_file_size && - !(td->io_ops->flags & FIO_DISKLESSIO)) { + !td_ioengine_flagged(td, FIO_DISKLESSIO)) { if 
(!o->create_on_open) { need_extend++; extend_size += (f->io_size + f->file_offset); + fio_file_set_extend(f); } else f->real_file_size = f->io_size + f->file_offset; - fio_file_set_extend(f); } } - if (!o->size || o->size > total_size) + if (td->o.block_error_hist) { + int len; + + assert(td->o.nr_files == 1); /* checked in fixup_options */ + f = td->files[0]; + len = f->io_size / td->o.bs[DDIR_TRIM]; + if (len > MAX_NR_BLOCK_INFOS || len <= 0) { + log_err("fio: cannot calculate block histogram with " + "%d trim blocks, maximum %d\n", + len, MAX_NR_BLOCK_INFOS); + td_verror(td, EINVAL, "block_error_hist"); + goto err_out; + } + + td->ts.nr_block_infos = len; + for (i = 0; i < len; i++) + td->ts.block_infos[i] = + BLOCK_INFO(0, BLOCK_STATE_UNINIT); + } else + td->ts.nr_block_infos = 0; + + if (!o->size || (total_size && o->size > total_size)) o->size = total_size; + if (o->size < td_min_bs(td)) { + log_err("fio: blocksize too large for data set\n"); + goto err_out; + } + /* - * See if we need to extend some files + * See if we need to extend some files, typically needed when our + * target regular files don't exist yet, but our jobs require them + * initially due to read I/Os. */ if (need_extend) { temp_stall_ts = 1; - if (output_format == FIO_OUTPUT_NORMAL) - log_info("%s: Laying out IO file(s) (%u file(s) /" - " %lluMB)\n", o->name, need_extend, - extend_size >> 20); + if (output_format & FIO_OUTPUT_NORMAL) { + log_info("%s: Laying out IO file%s (%u file%s / %s%lluMiB)\n", + o->name, + need_extend > 1 ? "s" : "", + need_extend, + need_extend > 1 ? "s" : "", + need_extend > 1 ? 
"total " : "", + extend_size >> 20); + } for_each_file(td, f, i) { unsigned long long old_len = -1ULL, extend_len = -1ULL; @@ -865,7 +1181,13 @@ err = __file_invalidate_cache(td, f, old_len, extend_len); - close(f->fd); + + /* + * Shut up static checker + */ + if (f->fd != -1) + close(f->fd); + f->fd = -1; if (err) break; @@ -876,30 +1198,38 @@ if (err) goto err_out; - if (!o->zone_size) - o->zone_size = o->size; - /* * iolog already set the total io size, if we read back * stored entries. */ - if (!o->read_iolog_file) - td->total_io_size = o->size * o->loops; + if (!o->read_iolog_file) { + if (o->io_size) + td->total_io_size = o->io_size * o->loops; + else + td->total_io_size = o->size * o->loops; + } done: if (o->create_only) td->done = 1; - td_set_runstate(td, old_state); + td_restore_runstate(td, old_state); + + if (td->o.zone_mode == ZONE_MODE_ZBD) { + err = zbd_init(td); + if (err) + goto err_out; + } return 0; + err_offset: log_err("%s: you need to specify valid offset=\n", o->name); err_out: - td_set_runstate(td, old_state); + td_restore_runstate(td, old_state); return 1; } -int pre_read_files(struct thread_data *td) +bool pre_read_files(struct thread_data *td) { struct fio_file *f; unsigned int i; @@ -907,22 +1237,23 @@ dprint(FD_FILE, "pre_read files\n"); for_each_file(td, f, i) { - pre_read_file(td, f); + if (!pre_read_file(td, f)) + return false; } - return 1; + return true; } -static int __init_rand_distribution(struct thread_data *td, struct fio_file *f) +static void __init_rand_distribution(struct thread_data *td, struct fio_file *f) { unsigned int range_size, seed; - unsigned long nranges; - uint64_t file_size; + uint64_t nranges; + uint64_t fsize; range_size = min(td->o.min_bs[DDIR_READ], td->o.min_bs[DDIR_WRITE]); - file_size = min(f->real_file_size, f->io_size); + fsize = min(f->real_file_size, f->io_size); - nranges = (file_size + range_size - 1) / range_size; + nranges = (fsize + range_size - 1ULL) / range_size; seed = jhash(f->file_name, 
strlen(f->file_name), 0) * td->thread_number; if (!td->o.rand_repeatable) @@ -930,57 +1261,101 @@ if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, seed); - else + else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) pareto_init(&f->zipf, nranges, td->o.pareto_h.u.f, seed); - - return 1; + else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS) + gauss_init(&f->gauss, nranges, td->o.gauss_dev.u.f, seed); } -static int init_rand_distribution(struct thread_data *td) +static bool init_rand_distribution(struct thread_data *td) { struct fio_file *f; unsigned int i; int state; if (td->o.random_distribution == FIO_RAND_DIST_RANDOM) - return 0; + return false; + + state = td_bump_runstate(td, TD_SETTING_UP); - state = td->runstate; - td_set_runstate(td, TD_SETTING_UP); for_each_file(td, f, i) __init_rand_distribution(td, f); - td_set_runstate(td, state); - return 1; + td_restore_runstate(td, state); + return true; } -int init_random_map(struct thread_data *td) +/* + * Check if the number of blocks exceeds the randomness capability of + * the selected generator. Tausworthe is 32-bit, the others are fullly + * 64-bit capable. + */ +static int check_rand_gen_limits(struct thread_data *td, struct fio_file *f, + uint64_t blocks) +{ + if (blocks <= FRAND32_MAX) + return 0; + if (td->o.random_generator != FIO_RAND_GEN_TAUSWORTHE) + return 0; + + /* + * If the user hasn't specified a random generator, switch + * to tausworthe64 with informational warning. If the user did + * specify one, just warn. + */ + log_info("fio: file %s exceeds 32-bit tausworthe random generator.\n", + f->file_name); + + if (!fio_option_is_set(&td->o, random_generator)) { + log_info("fio: Switching to tausworthe64. Use the " + "random_generator= option to get rid of this " + "warning.\n"); + td->o.random_generator = FIO_RAND_GEN_TAUSWORTHE64; + return 0; + } + + /* + * Just make this information to avoid breaking scripts. 
+ */ + log_info("fio: Use the random_generator= option to switch to lfsr or " + "tausworthe64.\n"); + return 0; +} + +bool init_random_map(struct thread_data *td) { unsigned long long blocks; struct fio_file *f; unsigned int i; if (init_rand_distribution(td)) - return 0; + return true; if (!td_random(td)) - return 0; + return true; for_each_file(td, f, i) { - uint64_t file_size = min(f->real_file_size, f->io_size); + uint64_t fsize = min(f->real_file_size, f->io_size); - blocks = file_size / (unsigned long long) td->o.rw_min_bs; + blocks = fsize / (unsigned long long) td->o.rw_min_bs; + + if (check_rand_gen_limits(td, f, blocks)) + return false; if (td->o.random_generator == FIO_RAND_GEN_LFSR) { - unsigned long seed; + uint64_t seed; seed = td->rand_seeds[FIO_RAND_BLOCK_OFF]; - - if (!lfsr_init(&f->lfsr, blocks, seed, seed & 0xF)) + + if (!lfsr_init(&f->lfsr, blocks, seed, 0)) { + fio_file_set_lfsr(f); continue; + } } else if (!td->o.norandommap) { f->io_axmap = axmap_new(blocks); - if (f->io_axmap) + if (f->io_axmap) { + fio_file_set_axmap(f); continue; + } } else if (td->o.norandommap) continue; @@ -989,14 +1364,14 @@ " a large number of jobs, try the 'norandommap'" " option or set 'softrandommap'. Or give" " a larger --alloc-size to fio.\n"); - return 1; + return false; } log_info("fio: file %s failed allocating random map. 
Running " "job without.\n", f->file_name); } - return 0; + return true; } void close_files(struct thread_data *td) @@ -1014,13 +1389,14 @@ { struct fio_file *f; unsigned int i; + bool use_free = td_ioengine_flagged(td, FIO_NOFILEHASH); dprint(FD_FILE, "close files\n"); for_each_file(td, f, i) { if (td->o.unlink && f->filetype == FIO_TYPE_FILE) { dprint(FD_FILE, "free unlink %s\n", f->file_name); - unlink(f->file_name); + td_io_unlink_file(td, f); } if (fio_file_open(f)) @@ -1028,11 +1404,26 @@ remove_file_hash(f); - sfree(f->file_name); + if (td->o.unlink && f->filetype == FIO_TYPE_FILE) { + dprint(FD_FILE, "free unlink %s\n", f->file_name); + td_io_unlink_file(td, f); + } + + zbd_free_zone_info(f); + + if (use_free) + free(f->file_name); + else + sfree(f->file_name); f->file_name = NULL; - axmap_free(f->io_axmap); - f->io_axmap = NULL; - sfree(f); + if (fio_file_axmap(f)) { + axmap_free(f->io_axmap); + f->io_axmap = NULL; + } + if (use_free) + free(f); + else + sfree(f); } td->o.filename = NULL; @@ -1041,6 +1432,7 @@ td->files_index = 0; td->files = NULL; td->file_locks = NULL; + td->o.file_lock_mode = FILE_LOCK_NONE; td->o.nr_files = 0; } @@ -1053,14 +1445,16 @@ else f->filetype = FIO_TYPE_FILE; +#ifdef WIN32 /* \\.\ is the device namespace in Windows, where every file is * a block device */ if (strncmp(f->file_name, "\\\\.\\", 4) == 0) - f->filetype = FIO_TYPE_BD; + f->filetype = FIO_TYPE_BLOCK; +#endif if (!stat(f->file_name, &sb)) { if (S_ISBLK(sb.st_mode)) - f->filetype = FIO_TYPE_BD; + f->filetype = FIO_TYPE_BLOCK; else if (S_ISCHR(sb.st_mode)) f->filetype = FIO_TYPE_CHAR; else if (S_ISFIFO(sb.st_mode)) @@ -1068,24 +1462,137 @@ } } -int add_file(struct thread_data *td, const char *fname) +static bool __is_already_allocated(const char *fname, bool set) { - int cur_files = td->files_index; - char file_name[PATH_MAX]; - struct fio_file *f; - int len = 0; + struct flist_head *entry; + bool ret; - dprint(FD_FILE, "add file %s\n", fname); + ret = 
file_bloom_exists(fname, set); + if (!ret) + return ret; + + flist_for_each(entry, &filename_list) { + struct file_name *fn; + + fn = flist_entry(entry, struct file_name, list); + + if (!strcmp(fn->filename, fname)) + return true; + } + + return false; +} + +static bool is_already_allocated(const char *fname) +{ + bool ret; + + fio_file_hash_lock(); + ret = __is_already_allocated(fname, false); + fio_file_hash_unlock(); + + return ret; +} + +static void set_already_allocated(const char *fname) +{ + struct file_name *fn; + + fn = malloc(sizeof(struct file_name)); + fn->filename = strdup(fname); + + fio_file_hash_lock(); + if (!__is_already_allocated(fname, true)) { + flist_add_tail(&fn->list, &filename_list); + fn = NULL; + } + fio_file_hash_unlock(); + + if (fn) { + free(fn->filename); + free(fn); + } +} - f = smalloc(sizeof(*f)); +static void free_already_allocated(void) +{ + struct flist_head *entry, *tmp; + struct file_name *fn; + + if (flist_empty(&filename_list)) + return; + + fio_file_hash_lock(); + flist_for_each_safe(entry, tmp, &filename_list) { + fn = flist_entry(entry, struct file_name, list); + free(fn->filename); + flist_del(&fn->list); + free(fn); + } + + fio_file_hash_unlock(); +} + +static struct fio_file *alloc_new_file(struct thread_data *td) +{ + struct fio_file *f; + + if (td_ioengine_flagged(td, FIO_NOFILEHASH)) + f = calloc(1, sizeof(*f)); + else + f = scalloc(1, sizeof(*f)); if (!f) { - log_err("fio: smalloc OOM\n"); assert(0); + return NULL; } f->fd = -1; f->shadow_fd = -1; fio_file_reset(td, f); + return f; +} + +bool exists_and_not_regfile(const char *filename) +{ + struct stat sb; + + if (lstat(filename, &sb) == -1) + return false; + +#ifndef WIN32 /* NOT Windows */ + if (S_ISREG(sb.st_mode)) + return false; +#else + /* \\.\ is the device namespace in Windows, where every file + * is a device node */ + if (S_ISREG(sb.st_mode) && strncmp(filename, "\\\\.\\", 4) != 0) + return false; +#endif + + return true; +} + +int add_file(struct 
thread_data *td, const char *fname, int numjob, int inc) +{ + int cur_files = td->files_index; + char file_name[PATH_MAX]; + struct fio_file *f; + int len = 0; + + dprint(FD_FILE, "add file %s\n", fname); + + if (td->o.directory) + len = set_name_idx(file_name, PATH_MAX, td->o.directory, numjob, + td->o.unique_filename); + + sprintf(file_name + len, "%s", fname); + + /* clean cloned siblings using existing files */ + if (numjob && is_already_allocated(file_name) && + !exists_and_not_regfile(fname)) + return 0; + + f = alloc_new_file(td); if (td->files_size <= td->files_index) { unsigned int new_size = td->o.nr_files + 1; @@ -1113,18 +1620,16 @@ /* * init function, io engine may not be loaded yet */ - if (td->io_ops && (td->io_ops->flags & FIO_DISKLESSIO)) + if (td->io_ops && td_ioengine_flagged(td, FIO_DISKLESSIO)) f->real_file_size = -1ULL; - if (td->o.directory) - len = sprintf(file_name, "%s/", td->o.directory); + if (td_ioengine_flagged(td, FIO_NOFILEHASH)) + f->file_name = strdup(file_name); + else + f->file_name = smalloc_strdup(file_name); - sprintf(file_name + len, "%s", fname); - f->file_name = smalloc_strdup(file_name); - if (!f->file_name) { - log_err("fio: smalloc OOM\n"); - assert(0); - } + /* can't handle smalloc failure from here */ + assert(f->file_name); get_file_type(f); @@ -1135,7 +1640,7 @@ f->rwlock = fio_rwlock_init(); break; case FILE_LOCK_EXCLUSIVE: - f->lock = fio_mutex_init(FIO_MUTEX_UNLOCKED); + f->lock = fio_sem_init(FIO_SEM_UNLOCKED); break; default: log_err("fio: unknown lock mode: %d\n", td->o.file_lock_mode); @@ -1143,8 +1648,12 @@ } td->files_index++; - if (f->filetype == FIO_TYPE_FILE) - td->nr_normal_files++; + + if (td->o.numjobs > 1) + set_already_allocated(file_name); + + if (inc) + td->o.nr_files++; dprint(FD_FILE, "file %p \"%s\" added at %d\n", f, f->file_name, cur_files); @@ -1162,7 +1671,7 @@ return i; } - return add_file(td, fname); + return add_file(td, fname, 0, 1); } void get_file(struct fio_file *f) @@ -1187,8 
+1696,16 @@ if (--f->references) return 0; - if (should_fsync(td) && td->o.fsync_on_close) + disk_util_dec(f->du); + + if (td->o.file_lock_mode != FILE_LOCK_NONE) + unlock_file_all(td, f); + + if (should_fsync(td) && td->o.fsync_on_close) { f_ret = fsync(f->fd); + if (f_ret < 0) + f_ret = errno; + } if (td->io_ops->close_file) ret = td->io_ops->close_file(td, f); @@ -1197,6 +1714,7 @@ ret = f_ret; td->nr_open_files--; + fio_file_clear_closing(f); fio_file_clear_open(f); assert(f->fd == -1); return ret; @@ -1213,7 +1731,7 @@ else fio_rwlock_write(f->rwlock); } else if (td->o.file_lock_mode == FILE_LOCK_EXCLUSIVE) - fio_mutex_down(f->lock); + fio_sem_down(f->lock); td->file_locks[f->fileno] = td->o.file_lock_mode; } @@ -1226,21 +1744,23 @@ if (td->o.file_lock_mode == FILE_LOCK_READWRITE) fio_rwlock_unlock(f->rwlock); else if (td->o.file_lock_mode == FILE_LOCK_EXCLUSIVE) - fio_mutex_up(f->lock); + fio_sem_up(f->lock); td->file_locks[f->fileno] = FILE_LOCK_NONE; } void unlock_file_all(struct thread_data *td, struct fio_file *f) { + if (td->o.file_lock_mode == FILE_LOCK_NONE || !td->file_locks) + return; if (td->file_locks[f->fileno] != FILE_LOCK_NONE) unlock_file(td, f); } -static int recurse_dir(struct thread_data *td, const char *dirname) +static bool recurse_dir(struct thread_data *td, const char *dirname) { struct dirent *dir; - int ret = 0; + bool ret = false; DIR *D; D = opendir(dirname); @@ -1249,7 +1769,7 @@ snprintf(buf, FIO_VERROR_SIZE, "opendir(%s)", dirname); td_verror(td, errno, buf); - return 1; + return true; } while ((dir = readdir(D)) != NULL) { @@ -1259,18 +1779,18 @@ if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) continue; - sprintf(full_path, "%s%s%s", dirname, FIO_OS_PATH_SEPARATOR, dir->d_name); + sprintf(full_path, "%s%c%s", dirname, FIO_OS_PATH_SEPARATOR, dir->d_name); if (lstat(full_path, &sb) == -1) { if (errno != ENOENT) { td_verror(td, errno, "stat"); - return 1; + ret = true; + break; } } if (S_ISREG(sb.st_mode)) { - 
add_file(td, full_path); - td->o.nr_files++; + add_file(td, full_path, 0, 1); continue; } if (!S_ISDIR(sb.st_mode)) @@ -1313,24 +1833,24 @@ for_each_file(org, f, i) { struct fio_file *__f; - __f = smalloc(sizeof(*__f)); - if (!__f) { - log_err("fio: smalloc OOM\n"); - assert(0); - } - __f->fd = -1; - fio_file_reset(td, __f); + __f = alloc_new_file(td); if (f->file_name) { - __f->file_name = smalloc_strdup(f->file_name); - if (!__f->file_name) { - log_err("fio: smalloc OOM\n"); - assert(0); - } + if (td_ioengine_flagged(td, FIO_NOFILEHASH)) + __f->file_name = strdup(f->file_name); + else + __f->file_name = smalloc_strdup(f->file_name); + /* can't handle smalloc failure from here */ + assert(__f->file_name); __f->filetype = f->filetype; } + if (td->o.file_lock_mode == FILE_LOCK_EXCLUSIVE) + __f->lock = f->lock; + else if (td->o.file_lock_mode == FILE_LOCK_READWRITE) + __f->rwlock = f->rwlock; + td->files[i] = __f; } } @@ -1356,16 +1876,71 @@ void free_release_files(struct thread_data *td) { close_files(td); + td->o.nr_files = 0; + td->o.open_files = 0; td->files_index = 0; - td->nr_normal_files = 0; } void fio_file_reset(struct thread_data *td, struct fio_file *f) { - f->last_pos = f->file_offset; - f->last_start = -1ULL; - if (f->io_axmap) + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + f->last_pos[i] = f->file_offset; + f->last_start[i] = -1ULL; + } + + if (fio_file_axmap(f)) axmap_reset(f->io_axmap); - if (td->o.random_generator == FIO_RAND_GEN_LFSR) + else if (fio_file_lfsr(f)) lfsr_reset(&f->lfsr, td->rand_seeds[FIO_RAND_BLOCK_OFF]); + + zbd_file_reset(td, f); +} + +bool fio_files_done(struct thread_data *td) +{ + struct fio_file *f; + unsigned int i; + + for_each_file(td, f, i) + if (!fio_file_done(f)) + return false; + + return true; +} + +/* free memory used in initialization phase only */ +void filesetup_mem_free(void) +{ + free_already_allocated(); +} + +/* + * This function is for platforms which support direct I/O but not O_DIRECT. 
+ */ +int fio_set_directio(struct thread_data *td, struct fio_file *f) +{ +#ifdef FIO_OS_DIRECTIO + int ret = fio_set_odirect(f); + + if (ret) { + td_verror(td, ret, "fio_set_directio"); +#if defined(__sun__) + if (ret == ENOTTY) { /* ENOTTY suggests RAW device or ZFS */ + log_err("fio: doing directIO to RAW devices or ZFS not supported\n"); + } else { + log_err("fio: the file system does not seem to support direct IO\n"); + } +#else + log_err("fio: the file system does not seem to support direct IO\n"); +#endif + return -1; + } + + return 0; +#else + log_err("fio: direct IO is not supported on this host operating system\n"); + return -1; +#endif } diff -Nru fio-2.1.3/fio.1 fio-3.16/fio.1 --- fio-2.1.3/fio.1 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/fio.1 2019-09-20 01:01:52.000000000 +0000 @@ -1,4 +1,4 @@ -.TH fio 1 "September 2007" "User Manual" +.TH fio 1 "August 2017" "User Manual" .SH NAME fio \- flexible I/O tester .SH SYNOPSIS @@ -13,155 +13,569 @@ .SH OPTIONS .TP .BI \-\-debug \fR=\fPtype -Enable verbose tracing of various fio actions. May be `all' for all types -or individual types separated by a comma (eg \-\-debug=io,file). `help' will -list all available tracing options. +Enable verbose tracing \fItype\fR of various fio actions. May be `all' for all \fItype\fRs +or individual types separated by a comma (e.g. `\-\-debug=file,mem' will enable +file and memory debugging). `help' will list all available tracing options. +.TP +.BI \-\-parse\-only +Parse options only, don't start any I/O. +.TP +.BI \-\-merge\-blktrace\-only +Merge blktraces only, don't start any I/O. .TP .BI \-\-output \fR=\fPfilename Write output to \fIfilename\fR. .TP -.BI \-\-runtime \fR=\fPruntime -Limit run time to \fIruntime\fR seconds. +.BI \-\-output\-format \fR=\fPformat +Set the reporting \fIformat\fR to `normal', `terse', `json', or +`json+'. Multiple formats can be selected, separate by a comma. `terse' +is a CSV based format. 
`json+' is like `json', except it adds a full +dump of the latency buckets. +.TP +.BI \-\-bandwidth\-log +Generate aggregate bandwidth logs. +.TP +.BI \-\-minimal +Print statistics in a terse, semicolon\-delimited format. +.TP +.BI \-\-append\-terse +Print statistics in selected mode AND terse, semicolon\-delimited format. +\fBDeprecated\fR, use \fB\-\-output\-format\fR instead to select multiple formats. .TP -.B \-\-latency\-log -Generate per-job latency logs. -.TP -.B \-\-bandwidth\-log -Generate per-job bandwidth logs. +.BI \-\-terse\-version \fR=\fPversion +Set terse \fIversion\fR output format (default `3', or `2', `4', `5'). .TP -.B \-\-minimal -Print statistics in a terse, semicolon-delimited format. +.BI \-\-version +Print version information and exit. .TP -.B \-\-version -Display version information and exit. +.BI \-\-help +Print a summary of the command line options and exit. .TP -.BI \-\-terse\-version \fR=\fPversion -Set terse version output format (Current version 3, or older version 2). +.BI \-\-cpuclock\-test +Perform test and validation of internal CPU clock. .TP -.B \-\-help -Display usage information and exit. +.BI \-\-crctest \fR=\fP[test] +Test the speed of the built\-in checksumming functions. If no argument is given, +all of them are tested. Alternatively, a comma separated list can be passed, in which +case the given ones are tested. .TP .BI \-\-cmdhelp \fR=\fPcommand -Print help information for \fIcommand\fR. May be `all' for all commands. +Print help information for \fIcommand\fR. May be `all' for all commands. .TP -.BI \-\-enghelp \fR=\fPioengine[,command] -List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR defined by \fIioengine\fR. +.BI \-\-enghelp \fR=\fP[ioengine[,command]] +List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR +defined by \fIioengine\fR. If no \fIioengine\fR is given, list all +available ioengines. 
.TP .BI \-\-showcmd \fR=\fPjobfile -Convert \fIjobfile\fR to a set of command-line options. +Convert \fIjobfile\fR to a set of command\-line options. +.TP +.BI \-\-readonly +Turn on safety read\-only checks, preventing writes and trims. The \fB\-\-readonly\fR +option is an extra safety guard to prevent users from accidentally starting +a write or trim workload when that is not desired. Fio will only modify the +device under test if `rw=write/randwrite/rw/randrw/trim/randtrim/trimwrite' +is given. This safety net can be used as an extra precaution. .TP .BI \-\-eta \fR=\fPwhen -Specifies when real-time ETA estimate should be printed. \fIwhen\fR may -be one of `always', `never' or `auto'. +Specifies when real\-time ETA estimate should be printed. \fIwhen\fR may +be `always', `never' or `auto'. `auto' is the default, it prints ETA when +requested if the output is a TTY. `always' disregards the output type, and +prints ETA when requested. `never' never prints ETA. +.TP +.BI \-\-eta\-interval \fR=\fPtime +By default, fio requests client ETA status roughly every second. With this +option, the interval is configurable. Fio imposes a minimum allowed time to +avoid flooding the console, less than 250 msec is not supported. .TP .BI \-\-eta\-newline \fR=\fPtime -Force an ETA newline for every `time` period passed. +Force a new line for every \fItime\fR period passed. When the unit is omitted, +the value is interpreted in seconds. .TP .BI \-\-status\-interval \fR=\fPtime -Report full output status every `time` period passed. -.TP -.BI \-\-readonly -Turn on safety read-only checks, preventing any attempted write. -.TP -.BI \-\-section \fR=\fPsec -Only run section \fIsec\fR from job file. Multiple of these options can be given, adding more sections to run. +Force a full status dump of cumulative (from job start) values at \fItime\fR +intervals. This option does *not* provide per-period measurements. So +values such as bandwidth are running averages. 
When the time unit is omitted, +\fItime\fR is interpreted in seconds. Note that using this option with +`\-\-output-format=json' will yield output that technically isn't valid json, +since the output will be collated sets of valid json. It will need to be split +into valid sets of json after the run. +.TP +.BI \-\-section \fR=\fPname +Only run specified section \fIname\fR in job file. Multiple sections can be specified. +The \fB\-\-section\fR option allows one to combine related jobs into one file. +E.g. one job file could define light, moderate, and heavy sections. Tell +fio to run only the "heavy" section by giving `\-\-section=heavy' +command line option. One can also specify the "write" operations in one +section and "verify" operation in another section. The \fB\-\-section\fR option +only applies to job sections. The reserved *global* section is always +parsed and used. .TP .BI \-\-alloc\-size \fR=\fPkb -Set the internal smalloc pool size to \fIkb\fP kilobytes. +Allocate additional internal smalloc pools of size \fIkb\fR in KiB. The +\fB\-\-alloc\-size\fR option increases shared memory set aside for use by fio. +If running large jobs with randommap enabled, fio can run out of memory. +Smalloc is an internal allocator for shared structures from a fixed size +memory pool and can grow to 16 pools. The pool size defaults to 16MiB. +NOTE: While running `.fio_smalloc.*' backing store files are visible +in `/tmp'. .TP .BI \-\-warnings\-fatal All fio parser warnings are fatal, causing fio to exit with an error. .TP .BI \-\-max\-jobs \fR=\fPnr -Set the maximum allowed number of jobs (threads/processes) to support. +Set the maximum number of threads/processes to support to \fInr\fR. +NOTE: On Linux, it may be necessary to increase the shared-memory limit +(`/proc/sys/kernel/shmmax') if fio runs into errors while creating jobs. .TP .BI \-\-server \fR=\fPargs -Start a backend server, with \fIargs\fP specifying what to listen to. See client/server section. 
+Start a backend server, with \fIargs\fR specifying what to listen to. +See \fBCLIENT/SERVER\fR section. .TP .BI \-\-daemonize \fR=\fPpidfile -Background a fio server, writing the pid to the given pid file. +Background a fio server, writing the pid to the given \fIpidfile\fR file. .TP -.BI \-\-client \fR=\fPhost -Instead of running the jobs locally, send and run them on the given host. +.BI \-\-client \fR=\fPhostname +Instead of running the jobs locally, send and run them on the given \fIhostname\fR +or set of \fIhostname\fRs. See \fBCLIENT/SERVER\fR section. +.TP +.BI \-\-remote\-config \fR=\fPfile +Tell fio server to load this local \fIfile\fR. .TP .BI \-\-idle\-prof \fR=\fPoption -Report cpu idleness on a system or percpu basis (\fIoption\fP=system,percpu) or run unit work calibration only (\fIoption\fP=calibrate). +Report CPU idleness. \fIoption\fR is one of the following: +.RS +.RS +.TP +.B calibrate +Run unit work calibration only and exit. +.TP +.B system +Show aggregate system idleness and unit work. +.TP +.B percpu +As \fBsystem\fR but also show per CPU idleness. +.RE +.RE +.TP +.BI \-\-inflate\-log \fR=\fPlog +Inflate and output compressed \fIlog\fR. +.TP +.BI \-\-trigger\-file \fR=\fPfile +Execute trigger command when \fIfile\fR exists. +.TP +.BI \-\-trigger\-timeout \fR=\fPtime +Execute trigger at this \fItime\fR. +.TP +.BI \-\-trigger \fR=\fPcommand +Set this \fIcommand\fR as local trigger. +.TP +.BI \-\-trigger\-remote \fR=\fPcommand +Set this \fIcommand\fR as remote trigger. +.TP +.BI \-\-aux\-path \fR=\fPpath +Use the directory specified by \fIpath\fP for generated state files instead +of the current working directory. .SH "JOB FILE FORMAT" -Job files are in `ini' format. They consist of one or more -job definitions, which begin with a job name in square brackets and -extend to the next job name. The job name can be any ASCII string -except `global', which has a special meaning. 
Following the job name is -a sequence of zero or more parameters, one per line, that define the -behavior of the job. Any line starting with a `;' or `#' character is -considered a comment and ignored. -.P -If \fIjobfile\fR is specified as `-', the job file will be read from -standard input. -.SS "Global Section" -The global section contains default parameters for jobs specified in the -job file. A job is only affected by global sections residing above it, -and there may be any number of global sections. Specific job definitions -may override any parameter set in global sections. -.SH "JOB PARAMETERS" -.SS Types -Some parameters may take arguments of a specific type. The types used are: +Any parameters following the options will be assumed to be job files, unless +they match a job file parameter. Multiple job files can be listed and each job +file will be regarded as a separate group. Fio will \fBstonewall\fR execution +between each group. + +Fio accepts one or more job files describing what it is +supposed to do. The job file format is the classic ini file, where the names +enclosed in [] brackets define the job name. You are free to use any ASCII name +you want, except *global* which has special meaning. Following the job name is +a sequence of zero or more parameters, one per line, that define the behavior of +the job. If the first character in a line is a ';' or a '#', the entire line is +discarded as a comment. + +A *global* section sets defaults for the jobs described in that file. A job may +override a *global* section parameter, and a job file may even have several +*global* sections if so desired. A job is only affected by a *global* section +residing above it. + +The \fB\-\-cmdhelp\fR option also lists all options. If used with an \fIcommand\fR +argument, \fB\-\-cmdhelp\fR will detail the given \fIcommand\fR. + +See the `examples/' directory for inspiration on how to write job files. 
Note +the copyright and license requirements currently apply to +`examples/' files. + +Note that the maximum length of a line in the job file is 8192 bytes. +.SH "JOB FILE PARAMETERS" +Some parameters take an option of a given type, such as an integer or a +string. Anywhere a numeric value is required, an arithmetic expression may be +used, provided it is surrounded by parentheses. Supported operators are: +.RS +.P +.B addition (+) +.P +.B subtraction (\-) +.P +.B multiplication (*) +.P +.B division (/) +.P +.B modulus (%) +.P +.B exponentiation (^) +.RE +.P +For time values in expressions, units are microseconds by default. This is +different than for time values not in expressions (not enclosed in +parentheses). +.SH "PARAMETER TYPES" +The following parameter types are used. .TP .I str -String: a sequence of alphanumeric characters. +String. A sequence of alphanumeric characters. +.TP +.I time +Integer with possible time suffix. Without a unit value is interpreted as +seconds unless otherwise specified. Accepts a suffix of 'd' for days, 'h' for +hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and 'us' +(or 'usec') for microseconds. For example, use 10m for 10 minutes. .TP .I int -SI integer: a whole number, possibly containing a suffix denoting the base unit -of the value. Accepted suffixes are `k', 'M', 'G', 'T', and 'P', denoting -kilo (1024), mega (1024^2), giga (1024^3), tera (1024^4), and peta (1024^5) -respectively. The suffix is not case sensitive. If prefixed with '0x', the -value is assumed to be base 16 (hexadecimal). A suffix may include a trailing 'b', -for instance 'kb' is identical to 'k'. You can specify a base 10 value -by using 'KiB', 'MiB', 'GiB', etc. This is useful for disk drives where -values are often given in base 10 values. Specifying '30GiB' will get you -30*1000^3 bytes. +Integer. A whole number value, which may contain an integer prefix +and an integer suffix. 
+.RS +.RS +.P +[*integer prefix*] **number** [*integer suffix*] +.RE +.P +The optional *integer prefix* specifies the number's base. The default +is decimal. *0x* specifies hexadecimal. +.P +The optional *integer suffix* specifies the number's units, and includes an +optional unit prefix and an optional unit. For quantities of data, the +default unit is bytes. For quantities of time, the default unit is seconds +unless otherwise specified. +.P +With `kb_base=1000', fio follows international standards for unit +prefixes. To specify power\-of\-10 decimal values defined in the +International System of Units (SI): +.RS +.P +.PD 0 +K means kilo (K) or 1000 +.P +M means mega (M) or 1000**2 +.P +G means giga (G) or 1000**3 +.P +T means tera (T) or 1000**4 +.P +P means peta (P) or 1000**5 +.PD +.RE +.P +To specify power\-of\-2 binary values defined in IEC 80000\-13: +.RS +.P +.PD 0 +Ki means kibi (Ki) or 1024 +.P +Mi means mebi (Mi) or 1024**2 +.P +Gi means gibi (Gi) or 1024**3 +.P +Ti means tebi (Ti) or 1024**4 +.P +Pi means pebi (Pi) or 1024**5 +.PD +.RE +.P +With `kb_base=1024' (the default), the unit prefixes are opposite +from those specified in the SI and IEC 80000\-13 standards to provide +compatibility with old scripts. For example, 4k means 4096. +.P +For quantities of data, an optional unit of 'B' may be included +(e.g., 'kB' is the same as 'k'). +.P +The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega, +not milli). 'b' and 'B' both mean byte, not bit. 
+.P +Examples with `kb_base=1000': +.RS +.P +.PD 0 +4 KiB: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB +.P +1 MiB: 1048576, 1m, 1024k +.P +1 MB: 1000000, 1mi, 1000ki +.P +1 TiB: 1073741824, 1t, 1024m, 1048576k +.P +1 TB: 1000000000, 1ti, 1000mi, 1000000ki +.PD +.RE +.P +Examples with `kb_base=1024' (default): +.RS +.P +.PD 0 +4 KiB: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB +.P +1 MiB: 1048576, 1m, 1024k +.P +1 MB: 1000000, 1mi, 1000ki +.P +1 TiB: 1073741824, 1t, 1024m, 1048576k +.P +1 TB: 1000000000, 1ti, 1000mi, 1000000ki +.PD +.RE +.P +To specify times (units are not case sensitive): +.RS +.P +.PD 0 +D means days +.P +H means hours +.P +M mean minutes +.P +s or sec means seconds (default) +.P +ms or msec means milliseconds +.P +us or usec means microseconds +.PD +.RE +.P +If the option accepts an upper and lower range, use a colon ':' or +minus '\-' to separate such values. See \fIirange\fR parameter type. +If the lower value specified happens to be larger than the upper value +the two values are swapped. +.RE .TP .I bool -Boolean: a true or false value. `0' denotes false, `1' denotes true. +Boolean. Usually parsed as an integer, however only defined for +true and false (1 and 0). .TP .I irange -Integer range: a range of integers specified in the format -\fIlower\fR:\fIupper\fR or \fIlower\fR\-\fIupper\fR. \fIlower\fR and -\fIupper\fR may contain a suffix as described above. If an option allows two -sets of ranges, they are separated with a `,' or `/' character. For example: -`8\-8k/8M\-4G'. +Integer range with suffix. Allows value range to be given, such as +1024\-4096. A colon may also be used as the separator, e.g. 1k:4k. If the +option allows two sets of ranges, they can be specified with a ',' or '/' +delimiter: 1k\-4k/8k\-32k. Also see \fIint\fR parameter type. .TP .I float_list -List of floating numbers: A list of floating numbers, separated by -a ':' charcater. -.SS "Parameter List" +A list of floating point numbers, separated by a ':' character. 
+.SH "JOB PARAMETERS" +With the above in mind, here follows the complete list of fio job parameters. +.SS "Units" +.TP +.BI kb_base \fR=\fPint +Select the interpretation of unit prefixes in input parameters. +.RS +.RS +.TP +.B 1000 +Inputs comply with IEC 80000\-13 and the International +System of Units (SI). Use: +.RS +.P +.PD 0 +\- power\-of\-2 values with IEC prefixes (e.g., KiB) +.P +\- power\-of\-10 values with SI prefixes (e.g., kB) +.PD +.RE +.TP +.B 1024 +Compatibility mode (default). To avoid breaking old scripts: +.P +.RS +.PD 0 +\- power\-of\-2 values with SI prefixes +.P +\- power\-of\-10 values with IEC prefixes +.PD +.RE +.RE +.P +See \fBbs\fR for more details on input parameters. +.P +Outputs always use correct prefixes. Most outputs include both +side\-by\-side, like: +.P +.RS +bw=2383.3kB/s (2327.4KiB/s) +.RE +.P +If only one value is reported, then kb_base selects the one to use: +.P +.RS +.PD 0 +1000 \-\- SI prefixes +.P +1024 \-\- IEC prefixes +.PD +.RE +.RE +.TP +.BI unit_base \fR=\fPint +Base unit for reporting. Allowed values are: +.RS +.RS +.TP +.B 0 +Use auto\-detection (default). +.TP +.B 8 +Byte based. +.TP +.B 1 +Bit based. +.RE +.RE +.SS "Job description" .TP .BI name \fR=\fPstr -May be used to override the job name. On the command line, this parameter -has the special purpose of signalling the start of a new job. +ASCII name of the job. This may be used to override the name printed by fio +for this job. Otherwise the job name is used. On the command line this +parameter has the special purpose of also signaling the start of a new job. .TP .BI description \fR=\fPstr -Human-readable description of the job. It is printed when the job is run, but -otherwise has no special purpose. +Text description of the job. Doesn't do anything except dump this text +description when this job is run. It's not parsed. +.TP +.BI loops \fR=\fPint +Run the specified number of iterations of this job. Used to repeat the same +workload a given number of times. 
Defaults to 1. +.TP +.BI numjobs \fR=\fPint +Create the specified number of clones of this job. Each clone of job +is spawned as an independent thread or process. May be used to setup a +larger number of threads/processes doing the same thing. Each thread is +reported separately; to see statistics for all clones as a whole, use +\fBgroup_reporting\fR in conjunction with \fBnew_group\fR. +See \fB\-\-max\-jobs\fR. Default: 1. +.SS "Time related parameters" +.TP +.BI runtime \fR=\fPtime +Tell fio to terminate processing after the specified period of time. It +can be quite hard to determine for how long a specified job will run, so +this parameter is handy to cap the total runtime to a given time. When +the unit is omitted, the value is interpreted in seconds. +.TP +.BI time_based +If set, fio will run for the duration of the \fBruntime\fR specified +even if the file(s) are completely read or written. It will simply loop over +the same workload as many times as the \fBruntime\fR allows. +.TP +.BI startdelay \fR=\fPirange(int) +Delay the start of job for the specified amount of time. Can be a single +value or a range. When given as a range, each thread will choose a value +randomly from within the range. Value is in seconds if a unit is omitted. +.TP +.BI ramp_time \fR=\fPtime +If set, fio will run the specified workload for this amount of time before +logging any performance numbers. Useful for letting performance settle +before logging results, thus minimizing the runtime required for stable +results. Note that the \fBramp_time\fR is considered lead in time for a job, +thus it will increase the total runtime if a special timeout or +\fBruntime\fR is specified. When the unit is omitted, the value is +given in seconds. +.TP +.BI clocksource \fR=\fPstr +Use the given clocksource as the base of timing. 
The supported options are: +.RS +.RS +.TP +.B gettimeofday +\fBgettimeofday\fR\|(2) +.TP +.B clock_gettime +\fBclock_gettime\fR\|(2) +.TP +.B cpu +Internal CPU clock source +.RE +.P +\fBcpu\fR is the preferred clocksource if it is reliable, as it is very fast (and +fio is heavy on time calls). Fio will automatically use this clocksource if +it's supported and considered reliable on the system it is running on, +unless another clocksource is specifically set. For x86/x86\-64 CPUs, this +means supporting TSC Invariant. +.RE +.TP +.BI gtod_reduce \fR=\fPbool +Enable all of the \fBgettimeofday\fR\|(2) reducing options +(\fBdisable_clat\fR, \fBdisable_slat\fR, \fBdisable_bw_measurement\fR) plus +reduce precision of the timeout somewhat to really shrink the +\fBgettimeofday\fR\|(2) call count. With this option enabled, we only do +about 0.4% of the \fBgettimeofday\fR\|(2) calls we would have done if all +time keeping was enabled. +.TP +.BI gtod_cpu \fR=\fPint +Sometimes it's cheaper to dedicate a single thread of execution to just +getting the current time. Fio (and databases, for instance) are very +intensive on \fBgettimeofday\fR\|(2) calls. With this option, you can set +one CPU aside for doing nothing but logging current time to a shared memory +location. Then the other threads/processes that run I/O workloads need only +copy that segment, instead of entering the kernel with a +\fBgettimeofday\fR\|(2) call. The CPU set aside for doing these time +calls will be excluded from other uses. Fio will manually clear it from the +CPU mask of other jobs. +.SS "Target file/device" .TP .BI directory \fR=\fPstr -Prefix filenames with this directory. Used to place files in a location other -than `./'. +Prefix \fBfilename\fRs with this directory. Used to place files in a different +location than `./'. You can specify a number of directories by +separating the names with a ':' character. 
These directories will be +assigned equally distributed to job clones created by \fBnumjobs\fR as +long as they are using generated filenames. If specific \fBfilename\fR(s) are +set fio will use the first listed directory, and thereby matching the +\fBfilename\fR semantic (which generates a file for each clone if not +specified, but lets all clones use the same file if set). +.RS +.P +See the \fBfilename\fR option for information on how to escape ':' and '\\' +characters within the directory path itself. +.P +Note: To control the directory fio will use for internal state files +use \fB\-\-aux\-path\fR. +.RE .TP .BI filename \fR=\fPstr -.B fio -normally makes up a file name based on the job name, thread number, and file -number. If you want to share files between threads in a job or several jobs, -specify a \fIfilename\fR for each of them to override the default. -If the I/O engine is file-based, you can specify -a number of files by separating the names with a `:' character. `\-' is a -reserved name, meaning stdin or stdout, depending on the read/write direction -set. +Fio normally makes up a \fBfilename\fR based on the job name, thread number, and +file number (see \fBfilename_format\fR). If you want to share files +between threads in a job or several +jobs with fixed file paths, specify a \fBfilename\fR for each of them to override +the default. If the ioengine is file based, you can specify a number of files +by separating the names with a ':' colon. So if you wanted a job to open +`/dev/sda' and `/dev/sdb' as the two working files, you would use +`filename=/dev/sda:/dev/sdb'. This also means that whenever this option is +specified, \fBnrfiles\fR is ignored. The size of regular files specified +by this option will be \fBsize\fR divided by number of files unless an +explicit size is specified by \fBfilesize\fR. +.RS +.P +Each colon and backslash in the wanted path must be escaped with a '\\' +character. 
For instance, if the path is `/dev/dsk/foo@3,0:c' then you +would use `filename=/dev/dsk/foo@3,0\\:c' and if the path is +`F:\\filename' then you would use `filename=F\\:\\\\filename'. +.P +On Windows, disk devices are accessed as `\\\\.\\PhysicalDrive0' for +the first device, `\\\\.\\PhysicalDrive1' for the second etc. +Note: Windows and FreeBSD prevent write access to areas +of the disk containing in\-use data (e.g. filesystems). +.P +The filename `\-' is a reserved name, meaning *stdin* or *stdout*. Which +of the two depends on the read/write direction set. +.RE .TP .BI filename_format \fR=\fPstr -If sharing multiple files between jobs, it is usually necessary to have -fio generate the exact names that you want. By default, fio will name a file +If sharing multiple files between jobs, it is usually necessary to have fio +generate the exact names that you want. By default, fio will name a file based on the default file format specification of -\fBjobname.jobnumber.filenumber\fP. With this option, that can be +`jobname.jobnumber.filenumber'. With this option, that can be customized. Fio will recognize and replace the following keywords in this string: .RS @@ -177,39 +591,250 @@ The incremental number of the file for that worker thread or process. .RE .P -To have dependent jobs share a set of files, this option can be set to -have fio generate filenames that are shared between the two. For instance, -if \fBtestfiles.$filenum\fR is specified, file number 4 for any job will -be named \fBtestfiles.4\fR. The default of \fB$jobname.$jobnum.$filenum\fR +To have dependent jobs share a set of files, this option can be set to have +fio generate filenames that are shared between the two. For instance, if +`testfiles.$filenum' is specified, file number 4 for any job will be +named `testfiles.4'. The default of `$jobname.$jobnum.$filenum' will be used if no other format specifier is given. 
-.RE .P +If you specify a path then the directories will be created up to the main +directory for the file. So for example if you specify `a/b/c/$jobnum` then the +directories a/b/c will be created before the file setup part of the job. If you +specify \fBdirectory\fR then the path will be relative that directory, otherwise +it is treated as the absolute path. +.RE +.TP +.BI unique_filename \fR=\fPbool +To avoid collisions between networked clients, fio defaults to prefixing any +generated filenames (with a directory specified) with the source of the +client connecting. To disable this behavior, set this option to 0. +.TP +.BI opendir \fR=\fPstr +Recursively open any files below directory \fIstr\fR. .TP .BI lockfile \fR=\fPstr -Fio defaults to not locking any files before it does IO to them. If a file or -file descriptor is shared, fio can serialize IO to that file to make the end -result consistent. This is usual for emulating real workloads that share files. -The lock modes are: +Fio defaults to not locking any files before it does I/O to them. If a file +or file descriptor is shared, fio can serialize I/O to that file to make the +end result consistent. This is usual for emulating real workloads that share +files. The lock modes are: .RS .RS .TP .B none -No locking. This is the default. +No locking. The default. .TP .B exclusive -Only one thread or process may do IO at the time, excluding all others. +Only one thread or process may do I/O at a time, excluding all others. .TP .B readwrite -Read-write locking on the file. Many readers may access the file at the same -time, but writes get exclusive access. +Read\-write locking on the file. Many readers may +access the file at the same time, but writes get exclusive access. .RE .RE +.TP +.BI nrfiles \fR=\fPint +Number of files to use for this job. Defaults to 1. The size of files +will be \fBsize\fR divided by this unless explicit size is specified by +\fBfilesize\fR. 
Files are created for each thread separately, and each +file will have a file number within its name by default, as explained in +\fBfilename\fR section. +.TP +.BI openfiles \fR=\fPint +Number of files to keep open at the same time. Defaults to the same as +\fBnrfiles\fR, can be set smaller to limit the number simultaneous +opens. +.TP +.BI file_service_type \fR=\fPstr +Defines how fio decides which file from a job to service next. The following +types are defined: +.RS +.RS +.TP +.B random +Choose a file at random. +.TP +.B roundrobin +Round robin over opened files. This is the default. +.TP +.B sequential +Finish one file before moving on to the next. Multiple files can +still be open depending on \fBopenfiles\fR. +.TP +.B zipf +Use a Zipf distribution to decide what file to access. +.TP +.B pareto +Use a Pareto distribution to decide what file to access. +.TP +.B normal +Use a Gaussian (normal) distribution to decide what file to access. +.TP +.B gauss +Alias for normal. +.RE .P -.BI opendir \fR=\fPstr -Recursively open any files below directory \fIstr\fR. +For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be appended to +tell fio how many I/Os to issue before switching to a new file. For example, +specifying `file_service_type=random:8' would cause fio to issue +8 I/Os before selecting a new file at random. For the non\-uniform +distributions, a floating point postfix can be given to influence how the +distribution is skewed. See \fBrandom_distribution\fR for a description +of how that would work. +.RE +.TP +.BI ioscheduler \fR=\fPstr +Attempt to switch the device hosting the file to the specified I/O scheduler +before running. +.TP +.BI create_serialize \fR=\fPbool +If true, serialize the file creation for the jobs. This may be handy to +avoid interleaving of data files, which may greatly depend on the filesystem +used and even the number of processors in the system. Default: true. 
+.TP +.BI create_fsync \fR=\fPbool +\fBfsync\fR\|(2) the data file after creation. This is the default. +.TP +.BI create_on_open \fR=\fPbool +If true, don't pre\-create files but allow the job's open() to create a file +when it's time to do I/O. Default: false \-\- pre\-create all necessary files +when the job starts. +.TP +.BI create_only \fR=\fPbool +If true, fio will only run the setup phase of the job. If files need to be +laid out or updated on disk, only that will be done \-\- the actual job contents +are not executed. Default: false. +.TP +.BI allow_file_create \fR=\fPbool +If true, fio is permitted to create files as part of its workload. If this +option is false, then fio will error out if +the files it needs to use don't already exist. Default: true. +.TP +.BI allow_mounted_write \fR=\fPbool +If this isn't set, fio will abort jobs that are destructive (e.g. that write) +to what appears to be a mounted device or partition. This should help catch +creating inadvertently destructive tests, not realizing that the test will +destroy data on the mounted file system. Note that some platforms don't allow +writing against a mounted device regardless of this option. Default: false. +.TP +.BI pre_read \fR=\fPbool +If this is given, files will be pre\-read into memory before starting the +given I/O operation. This will also clear the \fBinvalidate\fR flag, +since it is pointless to pre\-read and then drop the cache. This will only +work for I/O engines that are seek\-able, since they allow you to read the +same data multiple times. Thus it will not work on non\-seekable I/O engines +(e.g. network, splice). Default: false. +.TP +.BI unlink \fR=\fPbool +Unlink the job files when done. Not the default, as repeated runs of that +job would then waste time recreating the file set again and again. Default: +false. +.TP +.BI unlink_each_loop \fR=\fPbool +Unlink job files after each iteration or loop. Default: false. 
+.TP +.BI zonemode \fR=\fPstr +Accepted values are: +.RS +.RS +.TP +.B none +The \fBzonerange\fR, \fBzonesize\fR and \fBzoneskip\fR parameters are ignored. +.TP +.B strided +I/O happens in a single zone until \fBzonesize\fR bytes have been transferred. +After that number of bytes has been transferred processing of the next zone +starts. +.TP +.B zbd +Zoned block device mode. I/O happens sequentially in each zone, even if random +I/O has been selected. Random I/O happens across all zones instead of being +restricted to a single zone. +.RE +.RE +.TP +.BI zonerange \fR=\fPint +For \fBzonemode\fR=strided, this is the size of a single zone. See also +\fBzonesize\fR and \fBzoneskip\fR. + +For \fBzonemode\fR=zbd, this parameter is ignored. +.TP +.BI zonesize \fR=\fPint +For \fBzonemode\fR=strided, this is the number of bytes to transfer before +skipping \fBzoneskip\fR bytes. If this parameter is smaller than +\fBzonerange\fR then only a fraction of each zone with \fBzonerange\fR bytes +will be accessed. If this parameter is larger than \fBzonerange\fR then each +zone will be accessed multiple times before skipping to the next zone. + +For \fBzonemode\fR=zbd, this is the size of a single zone. The +\fBzonerange\fR parameter is ignored in this mode. For a job accessing a +zoned block device, the specified \fBzonesize\fR must be 0 or equal to the +device zone size. For a regular block device or file, the specified +\fBzonesize\fR must be at least 512B. +.TP +.BI zoneskip \fR=\fPint +For \fBzonemode\fR=strided, the number of bytes to skip after \fBzonesize\fR +bytes of data have been transferred. + +For \fBzonemode\fR=zbd, the \fBzonesize\fR aligned number of bytes to skip +once a zone is fully written (write workloads) or all written data in the +zone have been read (read workloads). This parameter is valid only for +sequential workloads and ignored for random workloads. For read workloads, +see also \fBread_beyond_wp\fR. 
+ +.TP +.BI read_beyond_wp \fR=\fPbool +This parameter applies to \fBzonemode=zbd\fR only. + +Zoned block devices are block devices that consist of multiple zones. Each +zone has a type, e.g. conventional or sequential. A conventional zone can be +written at any offset that is a multiple of the block size. Sequential zones +must be written sequentially. The position at which a write must occur is +called the write pointer. A zoned block device can be either host managed or +host aware. For host managed devices the host must ensure that writes happen +sequentially. Fio recognizes host managed devices and serializes writes to +sequential zones for these devices. + +If a read occurs in a sequential zone beyond the write pointer then the zoned +block device will complete the read without reading any data from the storage +medium. Since such reads lead to unrealistically high bandwidth and IOPS +numbers fio only reads beyond the write pointer if explicitly told to do +so. Default: false. +.TP +.BI max_open_zones \fR=\fPint +When running a random write test across an entire drive many more zones will be +open than in a typical application workload. Hence this command line option +that allows to limit the number of open zones. The number of open zones is +defined as the number of zones to which write commands are issued. +.TP +.BI zone_reset_threshold \fR=\fPfloat +A number between zero and one that indicates the ratio of logical blocks with +data to the total number of logical blocks in the test above which zones +should be reset periodically. +.TP +.BI zone_reset_frequency \fR=\fPfloat +A number between zero and one that indicates how often a zone reset should be +issued if the zone reset threshold has been exceeded. A zone reset is +submitted after each (1 / zone_reset_frequency) write requests. This and the +previous parameter can be used to simulate garbage collection activity. + +.SS "I/O type" +.TP +.BI direct \fR=\fPbool +If value is true, use non\-buffered I/O. 
This is usually O_DIRECT. Note that +OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous +ioengines don't support direct I/O. Default: false. +.TP +.BI atomic \fR=\fPbool +If value is true, attempt to use atomic direct I/O. Atomic writes are +guaranteed to be stable once acknowledged by the operating system. Only +Linux supports O_ATOMIC right now. +.TP +.BI buffered \fR=\fPbool +If value is true, use buffered I/O. This is the opposite of the +\fBdirect\fR option. Defaults to true. .TP .BI readwrite \fR=\fPstr "\fR,\fP rw" \fR=\fPstr -Type of I/O pattern. Accepted values are: +Type of I/O pattern. Accepted values are: .RS .RS .TP @@ -219,382 +844,225 @@ .B write Sequential writes. .TP +.B trim +Sequential trims (Linux block devices and SCSI character devices only). +.TP .B randread Random reads. .TP .B randwrite Random writes. .TP -.B rw, readwrite -Mixed sequential reads and writes. +.B randtrim +Random trims (Linux block devices and SCSI character devices only). .TP -.B randrw -Mixed random reads and writes. -.RE -.P -For mixed I/O, the default split is 50/50. For certain types of io the result -may still be skewed a bit, since the speed may be different. It is possible to -specify a number of IO's to do before getting a new offset, this is done by -appending a `:\fI\fR to the end of the string given. For a random read, it -would look like \fBrw=randread:8\fR for passing in an offset modifier with a -value of 8. If the postfix is used with a sequential IO pattern, then the value -specified will be added to the generated offset for each IO. For instance, -using \fBrw=write:4k\fR will skip 4k for every write. It turns sequential IO -into sequential IO with holes. See the \fBrw_sequencer\fR option. +.B rw,readwrite +Sequential mixed reads and writes. +.TP +.B randrw +Random mixed reads and writes. +.TP +.B trimwrite +Sequential trim+write sequences. Blocks will be trimmed first, +then the same blocks will be written to. 
+.RE +.P +Fio defaults to read if the option is not specified. For the mixed I/O +types, the default is to split them 50/50. For certain types of I/O the +result may still be skewed a bit, since the speed may be different. +.P +It is possible to specify the number of I/Os to do before getting a new +offset by appending `:<nr>' to the end of the string given. For a +random read, it would look like `rw=randread:8' for passing in an offset +modifier with a value of 8. If the suffix is used with a sequential I/O +pattern, then the `<nr>' value specified will be added to the generated +offset for each I/O turning sequential I/O into sequential I/O with holes. +For instance, using `rw=write:4k' will skip 4k for every write. Also see +the \fBrw_sequencer\fR option. .RE .TP .BI rw_sequencer \fR=\fPstr -If an offset modifier is given by appending a number to the \fBrw=\fR line, -then this option controls how that number modifies the IO offset being -generated. Accepted values are: +If an offset modifier is given by appending a number to the `rw=\fIstr\fR' +line, then this option controls how that number modifies the I/O offset +being generated. Accepted values are: .RS .RS .TP .B sequential -Generate sequential offset +Generate sequential offset. .TP .B identical -Generate the same offset +Generate the same offset. .RE .P -\fBsequential\fR is only useful for random IO, where fio would normally -generate a new random offset for every IO. If you append eg 8 to randread, you -would get a new random offset for every 8 IO's. The result would be a seek for -only every 8 IO's, instead of for every IO. Use \fBrw=randread:8\fR to specify -that. As sequential IO is already sequential, setting \fBsequential\fR for that -would not result in any differences. \fBidentical\fR behaves in a similar -fashion, except it sends the same offset 8 number of times before generating a -new offset. 
+\fBsequential\fR is only useful for random I/O, where fio would normally +generate a new random offset for every I/O. If you append e.g. 8 to randread, +you would get a new random offset for every 8 I/Os. The result would be a +seek for only every 8 I/Os, instead of for every I/O. Use `rw=randread:8' +to specify that. As sequential I/O is already sequential, setting +\fBsequential\fR for that would not result in any differences. \fBidentical\fR +behaves in a similar fashion, except it sends the same offset 8 number of +times before generating a new offset. .RE -.P -.TP -.BI kb_base \fR=\fPint -The base unit for a kilobyte. The defacto base is 2^10, 1024. Storage -manufacturers like to use 10^3 or 1000 as a base ten unit instead, for obvious -reasons. Allow values are 1024 or 1000, with 1024 being the default. .TP .BI unified_rw_reporting \fR=\fPbool Fio normally reports statistics on a per data direction basis, meaning that -read, write, and trim are accounted and reported separately. If this option is -set, the fio will sum the results and report them as "mixed" instead. +reads, writes, and trims are accounted and reported separately. If this +option is set fio sums the results and report them as "mixed" instead. .TP .BI randrepeat \fR=\fPbool -Seed the random number generator in a predictable way so results are repeatable -across runs. Default: true. +Seed the random number generator used for random I/O patterns in a +predictable way so the pattern is repeatable across runs. Default: true. .TP -.BI use_os_rand \fR=\fPbool -Fio can either use the random generator supplied by the OS to generator random -offsets, or it can use it's own internal generator (based on Tausworthe). -Default is to use the internal generator, which is often of better quality and -faster. Default: false. +.BI allrandrepeat \fR=\fPbool +Seed all random number generators in a predictable way so results are +repeatable across runs. Default: false. 
+.TP +.BI randseed \fR=\fPint +Seed the random number generators based on this seed value, to be able to +control what sequence of output is being generated. If not set, the random +sequence depends on the \fBrandrepeat\fR setting. .TP .BI fallocate \fR=\fPstr -Whether pre-allocation is performed when laying down files. Accepted values -are: +Whether pre\-allocation is performed when laying down files. +Accepted values are: .RS .RS .TP .B none -Do not pre-allocate space. +Do not pre\-allocate space. +.TP +.B native +Use a platform's native pre\-allocation call but fall back to +\fBnone\fR behavior if it fails/is not implemented. .TP .B posix -Pre-allocate via posix_fallocate(). +Pre\-allocate via \fBposix_fallocate\fR\|(3). .TP .B keep -Pre-allocate via fallocate() with FALLOC_FL_KEEP_SIZE set. +Pre\-allocate via \fBfallocate\fR\|(2) with +FALLOC_FL_KEEP_SIZE set. .TP .B 0 -Backward-compatible alias for 'none'. +Backward\-compatible alias for \fBnone\fR. .TP .B 1 -Backward-compatible alias for 'posix'. +Backward\-compatible alias for \fBposix\fR. .RE .P -May not be available on all supported platforms. 'keep' is only -available on Linux. If using ZFS on Solaris this must be set to 'none' -because ZFS doesn't support it. Default: 'posix'. +May not be available on all supported platforms. \fBkeep\fR is only available +on Linux. If using ZFS on Solaris this cannot be set to \fBposix\fR +because ZFS doesn't support pre\-allocation. Default: \fBnative\fR if any +pre\-allocation methods are available, \fBnone\fR if not. .RE .TP -.BI fadvise_hint \fR=\fPbool -Use of \fIposix_fadvise\fR\|(2) to advise the kernel what I/O patterns -are likely to be issued. Default: true. -.TP -.BI size \fR=\fPint -Total size of I/O for this job. \fBfio\fR will run until this many bytes have -been transferred, unless limited by other options (\fBruntime\fR, for instance). 
-Unless \fBnrfiles\fR and \fBfilesize\fR options are given, this amount will be -divided between the available files for the job. If not set, fio will use the -full size of the given files or devices. If the the files do not exist, size -must be given. It is also possible to give size as a percentage between 1 and -100. If size=20% is given, fio will use 20% of the full size of the given files -or devices. -.TP -.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool -Sets size to something really large and waits for ENOSPC (no space left on -device) as the terminating condition. Only makes sense with sequential write. -For a read workload, the mount point will be filled first then IO started on -the result. This option doesn't make sense if operating on a raw device node, -since the size of that is already known by the file system. Additionally, -writing beyond end-of-device will not return ENOSPC there. -.TP -.BI filesize \fR=\fPirange -Individual file sizes. May be a range, in which case \fBfio\fR will select sizes -for files at random within the given range, limited to \fBsize\fR in total (if -that is given). If \fBfilesize\fR is not specified, each created file is the -same size. -.TP -.BI blocksize \fR=\fPint[,int] "\fR,\fB bs" \fR=\fPint[,int] -Block size for I/O units. Default: 4k. Values for reads, writes, and trims -can be specified separately in the format \fIread\fR,\fIwrite\fR,\fItrim\fR -either of which may be empty to leave that value at its default. If a trailing -comma isn't given, the remainder will inherit the last value set. -.TP -.BI blocksize_range \fR=\fPirange[,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange] -Specify a range of I/O block sizes. The issued I/O unit will always be a -multiple of the minimum size, unless \fBblocksize_unaligned\fR is set. Applies -to both reads and writes if only one range is given, but can be specified -separately with a comma seperating the values. Example: bsrange=1k-4k,2k-8k. -Also (see \fBblocksize\fR). 
-.TP -.BI bssplit \fR=\fPstr -This option allows even finer grained control of the block sizes issued, -not just even splits between them. With this option, you can weight various -block sizes for exact control of the issued IO for a job that has mixed -block sizes. The format of the option is bssplit=blocksize/percentage, -optionally adding as many definitions as needed separated by a colon. -Example: bssplit=4k/10:64k/50:32k/40 would issue 50% 64k blocks, 10% 4k -blocks and 40% 32k blocks. \fBbssplit\fR also supports giving separate -splits to reads and writes. The format is identical to what the -\fBbs\fR option accepts, the read and write parts are separated with a -comma. -.TP -.B blocksize_unaligned\fR,\fP bs_unaligned -If set, any size in \fBblocksize_range\fR may be used. This typically won't -work with direct I/O, as that normally requires sector alignment. -.TP -.BI blockalign \fR=\fPint[,int] "\fR,\fB ba" \fR=\fPint[,int] -At what boundary to align random IO offsets. Defaults to the same as 'blocksize' -the minimum blocksize given. Minimum alignment is typically 512b -for using direct IO, though it usually depends on the hardware block size. -This option is mutually exclusive with using a random map for files, so it -will turn off that option. -.TP -.BI bs_is_seq_rand \fR=\fPbool -If this option is set, fio will use the normal read,write blocksize settings as -sequential,random instead. Any random read or write will use the WRITE -blocksize settings, and any sequential read or write will use the READ -blocksize setting. -.TP -.B zero_buffers -Initialise buffers with all zeros. Default: fill buffers with random data. -.TP -.B refill_buffers -If this option is given, fio will refill the IO buffers on every submit. The -default is to only fill it at init time and reuse that data. Only makes sense -if zero_buffers isn't specified, naturally. If data verification is enabled, -refill_buffers is also automatically enabled. 
-.TP -.BI scramble_buffers \fR=\fPbool -If \fBrefill_buffers\fR is too costly and the target is using data -deduplication, then setting this option will slightly modify the IO buffer -contents to defeat normal de-dupe attempts. This is not enough to defeat -more clever block compression attempts, but it will stop naive dedupe -of blocks. Default: true. -.TP -.BI buffer_compress_percentage \fR=\fPint -If this is set, then fio will attempt to provide IO buffer content (on WRITEs) -that compress to the specified level. Fio does this by providing a mix of -random data and zeroes. Note that this is per block size unit, for file/disk -wide compression level that matches this setting, you'll also want to set -\fBrefill_buffers\fR. -.TP -.BI buffer_compress_chunk \fR=\fPint -See \fBbuffer_compress_percentage\fR. This setting allows fio to manage how -big the ranges of random data and zeroed data is. Without this set, fio will -provide \fBbuffer_compress_percentage\fR of blocksize random data, followed by -the remaining zeroed. With this set to some chunk size smaller than the block -size, fio can alternate random and zeroed data throughout the IO buffer. -.TP -.BI nrfiles \fR=\fPint -Number of files to use for this job. Default: 1. -.TP -.BI openfiles \fR=\fPint -Number of files to keep open at the same time. Default: \fBnrfiles\fR. -.TP -.BI file_service_type \fR=\fPstr -Defines how files to service are selected. The following types are defined: +.BI fadvise_hint \fR=\fPstr +Use \fBposix_fadvise\fR\|(2) or \fBposix_madvise\fR\|(2) to advise the kernel +what I/O patterns are likely to be issued. Accepted values are: .RS .RS .TP -.B random -Choose a file at random +.B 0 +Backwards compatible hint for "no hint". +.TP +.B 1 +Backwards compatible hint for "advise with fio workload type". This +uses FADV_RANDOM for a random workload, and FADV_SEQUENTIAL +for a sequential workload. .TP -.B roundrobin -Round robin over open files (default). 
.B sequential -Do each file in the set sequentially. +Advise using FADV_SEQUENTIAL. +.TP +.B random +Advise using FADV_RANDOM. .RE -.P -The number of I/Os to issue before switching a new file can be specified by -appending `:\fIint\fR' to the service type. .RE .TP -.BI ioengine \fR=\fPstr -Defines how the job issues I/O. The following types are defined: +.BI write_hint \fR=\fPstr +Use \fBfcntl\fR\|(2) to advise the kernel what life time to expect +from a write. Only supported on Linux, as of version 4.13. Accepted +values are: .RS .RS .TP -.B sync -Basic \fIread\fR\|(2) or \fIwrite\fR\|(2) I/O. \fIfseek\fR\|(2) is used to -position the I/O location. -.TP -.B psync -Basic \fIpread\fR\|(2) or \fIpwrite\fR\|(2) I/O. -.TP -.B vsync -Basic \fIreadv\fR\|(2) or \fIwritev\fR\|(2) I/O. Will emulate queuing by -coalescing adjacents IOs into a single submission. -.TP -.B pvsync -Basic \fIpreadv\fR\|(2) or \fIpwritev\fR\|(2) I/O. -.TP -.B libaio -Linux native asynchronous I/O. This ioengine defines engine specific options. -.TP -.B posixaio -POSIX asynchronous I/O using \fIaio_read\fR\|(3) and \fIaio_write\fR\|(3). -.TP -.B solarisaio -Solaris native asynchronous I/O. -.TP -.B windowsaio -Windows native asynchronous I/O. -.TP -.B mmap -File is memory mapped with \fImmap\fR\|(2) and data copied using -\fImemcpy\fR\|(3). -.TP -.B splice -\fIsplice\fR\|(2) is used to transfer the data and \fIvmsplice\fR\|(2) to -transfer data from user-space to the kernel. -.TP -.B syslet-rw -Use the syslet system calls to make regular read/write asynchronous. -.TP -.B sg -SCSI generic sg v3 I/O. May be either synchronous using the SG_IO ioctl, or if -the target is an sg character device, we use \fIread\fR\|(2) and -\fIwrite\fR\|(2) for asynchronous I/O. -.TP -.B null -Doesn't transfer any data, just pretends to. Mainly used to exercise \fBfio\fR -itself and for debugging and testing purposes. -.TP -.B net -Transfer over the network. 
The protocol to be used can be defined with the -\fBprotocol\fR parameter. Depending on the protocol, \fBfilename\fR, -\fBhostname\fR, \fBport\fR, or \fBlisten\fR must be specified. -This ioengine defines engine specific options. -.TP -.B netsplice -Like \fBnet\fR, but uses \fIsplice\fR\|(2) and \fIvmsplice\fR\|(2) to map data -and send/receive. This ioengine defines engine specific options. -.TP -.B cpuio -Doesn't transfer any data, but burns CPU cycles according to \fBcpuload\fR and -\fBcpucycles\fR parameters. -.TP -.B guasi -The GUASI I/O engine is the Generic Userspace Asynchronous Syscall Interface -approach to asycnronous I/O. -.br -See . +.B none +No particular life time associated with this file. .TP -.B rdma -The RDMA I/O engine supports both RDMA memory semantics (RDMA_WRITE/RDMA_READ) -and channel semantics (Send/Recv) for the InfiniBand, RoCE and iWARP protocols. +.B short +Data written to this file has a short life time. .TP -.B external -Loads an external I/O engine object file. Append the engine filename as -`:\fIenginepath\fR'. +.B medium +Data written to this file has a medium life time. .TP -.B falloc - IO engine that does regular linux native fallocate callt to simulate data -transfer as fio ioengine -.br - DDIR_READ does fallocate(,mode = FALLOC_FL_KEEP_SIZE,) -.br - DIR_WRITE does fallocate(,mode = 0) -.br - DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE) +.B long +Data written to this file has a long life time. .TP -.B e4defrag -IO engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate defragment activity -request to DDIR_WRITE event +.B extreme +Data written to this file has a very long life time. .RE .P +The values are all relative to each other, and no absolute meaning +should be associated with them. .RE .TP -.BI iodepth \fR=\fPint -Number of I/O units to keep in flight against the file. 
Note that increasing -iodepth beyond 1 will not affect synchronous ioengines (except for small -degress when verify_async is in use). Even async engines my impose OS -restrictions causing the desired depth not to be achieved. This may happen on -Linux when using libaio and not setting \fBdirect\fR=1, since buffered IO is -not async on that OS. Keep an eye on the IO depth distribution in the -fio output to verify that the achieved depth is as expected. Default: 1. -.TP -.BI iodepth_batch \fR=\fPint -Number of I/Os to submit at once. Default: \fBiodepth\fR. -.TP -.BI iodepth_batch_complete \fR=\fPint -This defines how many pieces of IO to retrieve at once. It defaults to 1 which - means that we'll ask for a minimum of 1 IO in the retrieval process from the -kernel. The IO retrieval will go on until we hit the limit set by -\fBiodepth_low\fR. If this variable is set to 0, then fio will always check for -completed events before queuing more IO. This helps reduce IO latency, at the -cost of more retrieval system calls. -.TP -.BI iodepth_low \fR=\fPint -Low watermark indicating when to start filling the queue again. Default: -\fBiodepth\fR. -.TP -.BI direct \fR=\fPbool -If true, use non-buffered I/O (usually O_DIRECT). Default: false. -.TP -.BI buffered \fR=\fPbool -If true, use buffered I/O. This is the opposite of the \fBdirect\fR parameter. -Default: true. -.TP .BI offset \fR=\fPint -Offset in the file to start I/O. Data before the offset will not be touched. +Start I/O at the provided offset in the file, given as either a fixed size in +bytes or a percentage. If a percentage is given, the generated offset will be +aligned to the minimum \fBblocksize\fR or to the value of \fBoffset_align\fR if +provided. Data before the given offset will not be touched. This +effectively caps the file size at `real_size \- offset'. Can be combined with +\fBsize\fR to constrain the start and end range of the I/O workload. 
+A percentage can be specified by a number between 1 and 100 followed by '%', +for example, `offset=20%' to specify 20%. +.TP +.BI offset_align \fR=\fPint +If set to non-zero value, the byte offset generated by a percentage \fBoffset\fR +is aligned upwards to this value. Defaults to 0 meaning that a percentage +offset is aligned to the minimum block size. .TP .BI offset_increment \fR=\fPint -If this is provided, then the real offset becomes the -offset + offset_increment * thread_number, where the thread number is a counter -that starts at 0 and is incremented for each job. This option is useful if -there are several jobs which are intended to operate on a file in parallel in -disjoint segments, with even spacing between the starting points. +If this is provided, then the real offset becomes `\fBoffset\fR + \fBoffset_increment\fR +* thread_number', where the thread number is a counter that starts at 0 and +is incremented for each sub\-job (i.e. when \fBnumjobs\fR option is +specified). This option is useful if there are several jobs which are +intended to operate on a file in parallel disjoint segments, with even +spacing between the starting points. Percentages can be used for this option. +If a percentage is given, the generated offset will be aligned to the minimum +\fBblocksize\fR or to the value of \fBoffset_align\fR if provided. .TP .BI number_ios \fR=\fPint -Fio will normally perform IOs until it has exhausted the size of the region +Fio will normally perform I/Os until it has exhausted the size of the region set by \fBsize\fR, or if it exhaust the allocated time (or hits an error condition). With this setting, the range/size can be set independently of -the number of IOs to perform. When fio reaches this number, it will exit -normally and report status. +the number of I/Os to perform. When fio reaches this number, it will exit +normally and report status. 
Note that this does not extend the amount of I/O +that will be done, it will only stop fio if this condition is met before +other end\-of\-job criteria. .TP .BI fsync \fR=\fPint -How many I/Os to perform before issuing an \fBfsync\fR\|(2) of dirty data. If -0, don't sync. Default: 0. +If writing to a file, issue an \fBfsync\fR\|(2) (or its equivalent) of +the dirty data for every number of blocks given. For example, if you give 32 +as a parameter, fio will sync the file after every 32 writes issued. If fio is +using non\-buffered I/O, we may not sync the file. The exception is the sg +I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which +means fio does not periodically issue and wait for a sync to complete. Also +see \fBend_fsync\fR and \fBfsync_on_close\fR. .TP .BI fdatasync \fR=\fPint -Like \fBfsync\fR, but uses \fBfdatasync\fR\|(2) instead to only sync the -data parts of the file. Default: 0. +Like \fBfsync\fR but uses \fBfdatasync\fR\|(2) to only sync data and +not metadata blocks. In Windows, FreeBSD, and DragonFlyBSD there is no +\fBfdatasync\fR\|(2) so this falls back to using \fBfsync\fR\|(2). +Defaults to 0, which means fio does not periodically issue and wait for a +data\-only sync to complete. +.TP +.BI write_barrier \fR=\fPint +Make every N\-th write a barrier write. .TP .BI sync_file_range \fR=\fPstr:int -Use sync_file_range() for every \fRval\fP number of write operations. Fio will -track range of writes that have happened since the last sync_file_range() call. -\fRstr\fP can currently be one or more of: +Use \fBsync_file_range\fR\|(2) for every \fIint\fR number of write +operations. Fio will track range of writes that have happened since the last +\fBsync_file_range\fR\|(2) call. 
\fIstr\fR can currently be one or more of: +.RS .RS .TP .B wait_before @@ -604,39 +1072,47 @@ SYNC_FILE_RANGE_WRITE .TP .B wait_after -SYNC_FILE_RANGE_WRITE -.TP +SYNC_FILE_RANGE_WAIT_AFTER .RE .P -So if you do sync_file_range=wait_before,write:8, fio would use -\fBSYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE\fP for every 8 writes. -Also see the sync_file_range(2) man page. This option is Linux specific. +So if you do `sync_file_range=wait_before,write:8', fio would use +`SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE' for every 8 +writes. Also see the \fBsync_file_range\fR\|(2) man page. This option is +Linux specific. +.RE .TP .BI overwrite \fR=\fPbool -If writing, setup the file first and do overwrites. Default: false. +If true, writes to a file will always overwrite existing data. If the file +doesn't already exist, it will be created before the write phase begins. If +the file exists and is large enough for the specified write phase, nothing +will be done. Default: false. .TP .BI end_fsync \fR=\fPbool -Sync file contents when a write stage has completed. Default: false. +If true, \fBfsync\fR\|(2) file contents when a write stage has completed. +Default: false. .TP .BI fsync_on_close \fR=\fPbool -If true, sync file contents on close. This differs from \fBend_fsync\fR in that -it will happen on every close, not just at the end of the job. Default: false. +If true, fio will \fBfsync\fR\|(2) a dirty file on close. This differs +from \fBend_fsync\fR in that it will happen on every file close, not +just at the end of the job. Default: false. .TP .BI rwmixread \fR=\fPint Percentage of a mixed workload that should be reads. Default: 50. .TP .BI rwmixwrite \fR=\fPint -Percentage of a mixed workload that should be writes. If \fBrwmixread\fR and -\fBrwmixwrite\fR are given and do not sum to 100%, the latter of the two -overrides the first. This may interfere with a given rate setting, if fio is -asked to limit reads or writes to a certain rate. 
If that is the case, then -the distribution may be skewed. Default: 50. +Percentage of a mixed workload that should be writes. If both +\fBrwmixread\fR and \fBrwmixwrite\fR is given and the values do not +add up to 100%, the latter of the two will be used to override the +first. This may interfere with a given rate setting, if fio is asked to +limit reads or writes to a certain rate. If that is the case, then the +distribution may be skewed. Default: 50. .TP -.BI random_distribution \fR=\fPstr:float +.BI random_distribution \fR=\fPstr:float[,str:float][,str:float] By default, fio will use a completely uniform random distribution when asked -to perform random IO. Sometimes it is useful to skew the distribution in +to perform random I/O. Sometimes it is useful to skew the distribution in specific ways, ensuring that some parts of the data is more hot than others. -Fio includes the following distribution models: +fio includes the following distribution models: +.RS .RS .TP .B random @@ -648,672 +1124,2017 @@ .B pareto Pareto distribution .TP +.B normal +Normal (Gaussian) distribution +.TP +.B zoned +Zoned random distribution +.B zoned_abs +Zoned absolute random distribution +.RE +.P +When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also +needed to define the access pattern. For \fBzipf\fR, this is the `Zipf theta'. +For \fBpareto\fR, it's the `Pareto power'. Fio includes a test +program, \fBfio\-genzipf\fR, that can be used visualize what the given input +values will yield in terms of hit rates. If you wanted to use \fBzipf\fR with +a `theta' of 1.2, you would use `random_distribution=zipf:1.2' as the +option. If a non\-uniform model is used, fio will disable use of the random +map. For the \fBnormal\fR distribution, a normal (Gaussian) deviation is +supplied as a value between 0 and 100. +.P +For a \fBzoned\fR distribution, fio supports specifying percentages of I/O +access that should fall within what range of the file or device. 
For
+example, given a criteria of:
+.RS
+.P
+.PD 0
+60% of accesses should be to the first 10%
+.P
+30% of accesses should be to the next 20%
+.P
+8% of accesses should be to the next 30%
+.P
+2% of accesses should be to the next 40%
+.PD
+.RE
+.P
+we can define that through zoning of the random accesses. For the above
+example, the user would do:
+.RS
+.P
+random_distribution=zoned:60/10:30/20:8/30:2/40
+.RE
+.P
+A \fBzoned_abs\fR distribution works exactly like the \fBzoned\fR, except that
+it takes absolute sizes. For example, let's say you wanted to define access
+according to the following criteria:
+.RS
+.P
+.PD 0
+60% of accesses should be to the first 20G
+.P
+30% of accesses should be to the next 100G
+.P
+10% of accesses should be to the next 500G
+.PD
+.RE
+.P
+we can define an absolute zoning distribution with:
+.RS
+.P
+random_distribution=zoned_abs:60/20G:30/100G:10/500G
.RE
.P
-When using a zipf or pareto distribution, an input value is also needed to
-define the access pattern. For zipf, this is the zipf theta. For pareto,
-it's the pareto power. Fio includes a test program, genzipf, that can be
-used visualize what the given input values will yield in terms of hit rates.
-If you wanted to use zipf with a theta of 1.2, you would use
-random_distribution=zipf:1.2 as the option. If a non-uniform model is used,
-fio will disable use of the random map.
-.TP
-.BI percentage_random \fR=\fPint
-For a random workload, set how big a percentage should be random. This defaults
-to 100%, in which case the workload is fully random. It can be set from
-anywhere from 0 to 100. Setting it to 0 would make the workload fully
-sequential. It is possible to set different values for reads, writes, and
-trim. To do so, simply use a comma separated list. See \fBblocksize\fR.
-.TP
-.B norandommap
-Normally \fBfio\fR will cover every block of the file when doing random I/O. If
-this parameter is given, a new offset will be chosen without looking at past
-I/O history.
This parameter is mutually exclusive with \fBverify\fR. +For both \fBzoned\fR and \fBzoned_abs\fR, fio supports defining up to 256 +separate zones. +.P +Similarly to how \fBbssplit\fR works for setting ranges and percentages +of block sizes. Like \fBbssplit\fR, it's possible to specify separate +zones for reads, writes, and trims. If just one set is given, it'll apply to +all of them. +.RE +.TP +.BI percentage_random \fR=\fPint[,int][,int] +For a random workload, set how big a percentage should be random. This +defaults to 100%, in which case the workload is fully random. It can be set +from anywhere from 0 to 100. Setting it to 0 would make the workload fully +sequential. Any setting in between will result in a random mix of sequential +and random I/O, at the given percentages. Comma\-separated values may be +specified for reads, writes, and trims as described in \fBblocksize\fR. +.TP +.BI norandommap +Normally fio will cover every block of the file when doing random I/O. If +this option is given, fio will just get a new random offset without looking +at past I/O history. This means that some blocks may not be read or written, +and that some blocks may be read/written more than once. If this option is +used with \fBverify\fR and multiple blocksizes (via \fBbsrange\fR), +only intact blocks are verified, i.e., partially\-overwritten blocks are +ignored. With an async I/O engine and an I/O depth > 1, it is possible for +the same block to be overwritten, which can cause verification errors. Either +do not use norandommap in this case, or also use the lfsr random generator. .TP .BI softrandommap \fR=\fPbool -See \fBnorandommap\fR. If fio runs with the random block map enabled and it -fails to allocate the map, if this option is set it will continue without a -random block map. As coverage will not be as complete as with random maps, this -option is disabled by default. +See \fBnorandommap\fR. 
If fio runs with the random block map enabled and +it fails to allocate the map, if this option is set it will continue without +a random block map. As coverage will not be as complete as with random maps, +this option is disabled by default. .TP .BI random_generator \fR=\fPstr -Fio supports the following engines for generating IO offsets for random IO: +Fio supports the following engines for generating I/O offsets for random I/O: +.RS .RS .TP .B tausworthe -Strong 2^88 cycle random number generator +Strong 2^88 cycle random number generator. .TP .B lfsr -Linear feedback shift register generator +Linear feedback shift register generator. .TP +.B tausworthe64 +Strong 64\-bit 2^258 cycle random number generator. .RE .P -Tausworthe is a strong random number generator, but it requires tracking on the -side if we want to ensure that blocks are only read or written once. LFSR -guarantees that we never generate the same offset twice, and it's also less -computationally expensive. It's not a true random generator, however, though -for IO purposes it's typically good enough. LFSR only works with single block -sizes, not with workloads that use multiple block sizes. If used with such a -workload, fio may read or write some blocks multiple times. -.TP -.BI nice \fR=\fPint -Run job with given nice value. See \fInice\fR\|(2). -.TP -.BI prio \fR=\fPint -Set I/O priority value of this job between 0 (highest) and 7 (lowest). See -\fIionice\fR\|(1). -.TP -.BI prioclass \fR=\fPint -Set I/O priority class. See \fIionice\fR\|(1). +\fBtausworthe\fR is a strong random number generator, but it requires tracking +on the side if we want to ensure that blocks are only read or written +once. \fBlfsr\fR guarantees that we never generate the same offset twice, and +it's also less computationally expensive. It's not a true random generator, +however, though for I/O purposes it's typically good enough. 
\fBlfsr\fR only +works with single block sizes, not with workloads that use multiple block +sizes. If used with such a workload, fio may read or write some blocks +multiple times. The default value is \fBtausworthe\fR, unless the required +space exceeds 2^32 blocks. If it does, then \fBtausworthe64\fR is +selected automatically. +.RE +.SS "Block size" +.TP +.BI blocksize \fR=\fPint[,int][,int] "\fR,\fB bs" \fR=\fPint[,int][,int] +The block size in bytes used for I/O units. Default: 4096. A single value +applies to reads, writes, and trims. Comma\-separated values may be +specified for reads, writes, and trims. A value not terminated in a comma +applies to subsequent types. Examples: +.RS +.RS +.P +.PD 0 +bs=256k means 256k for reads, writes and trims. +.P +bs=8k,32k means 8k for reads, 32k for writes and trims. +.P +bs=8k,32k, means 8k for reads, 32k for writes, and default for trims. +.P +bs=,8k means default for reads, 8k for writes and trims. +.P +bs=,8k, means default for reads, 8k for writes, and default for trims. +.PD +.RE +.RE .TP -.BI thinktime \fR=\fPint -Stall job for given number of microseconds between issuing I/Os. +.BI blocksize_range \fR=\fPirange[,irange][,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange][,irange] +A range of block sizes in bytes for I/O units. The issued I/O unit will +always be a multiple of the minimum size, unless +\fBblocksize_unaligned\fR is set. +Comma\-separated ranges may be specified for reads, writes, and trims as +described in \fBblocksize\fR. Example: +.RS +.RS +.P +bsrange=1k\-4k,2k\-8k +.RE +.RE .TP -.BI thinktime_spin \fR=\fPint -Pretend to spend CPU time for given number of microseconds, sleeping the rest -of the time specified by \fBthinktime\fR. Only valid if \fBthinktime\fR is set. +.BI bssplit \fR=\fPstr[,str][,str] +Sometimes you want even finer grained control of the block sizes issued, not +just an even split between them. 
This option allows you to weight various +block sizes, so that you are able to define a specific amount of block sizes +issued. The format for this option is: +.RS +.RS +.P +bssplit=blocksize/percentage:blocksize/percentage +.RE +.P +for as many block sizes as needed. So if you want to define a workload that +has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would write: +.RS +.P +bssplit=4k/10:64k/50:32k/40 +.RE +.P +Ordering does not matter. If the percentage is left blank, fio will fill in +the remaining values evenly. So a bssplit option like this one: +.RS +.P +bssplit=4k/50:1k/:32k/ +.RE +.P +would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always add up +to 100, if bssplit is given a range that adds up to more, it will error out. +.P +Comma\-separated values may be specified for reads, writes, and trims as +described in \fBblocksize\fR. +.P +If you want a workload that has 50% 2k reads and 50% 4k reads, while having +90% 4k writes and 10% 8k writes, you would specify: +.RS +.P +bssplit=2k/50:4k/50,4k/90:8k/10 +.RE +.P +Fio supports defining up to 64 different weights for each data direction. +.RE .TP -.BI thinktime_blocks \fR=\fPint -Only valid if thinktime is set - control how many blocks to issue, before -waiting \fBthinktime\fR microseconds. If not set, defaults to 1 which will -make fio wait \fBthinktime\fR microseconds after every block. This -effectively makes any queue depth setting redundant, since no more than 1 IO -will be queued before we have to complete it and do our thinktime. In other -words, this setting effectively caps the queue depth if the latter is larger. -Default: 1. -.TP -.BI rate \fR=\fPint -Cap bandwidth used by this job. The number is in bytes/sec, the normal postfix -rules apply. You can use \fBrate\fR=500k to limit reads and writes to 500k each, -or you can specify read and writes separately. Using \fBrate\fR=1m,500k would -limit reads to 1MB/sec and writes to 500KB/sec. 
Capping only reads or writes -can be done with \fBrate\fR=,500k or \fBrate\fR=500k,. The former will only -limit writes (to 500KB/sec), the latter will only limit reads. -.TP -.BI ratemin \fR=\fPint -Tell \fBfio\fR to do whatever it can to maintain at least the given bandwidth. -Failing to meet this requirement will cause the job to exit. The same format -as \fBrate\fR is used for read vs write separation. -.TP -.BI rate_iops \fR=\fPint -Cap the bandwidth to this number of IOPS. Basically the same as rate, just -specified independently of bandwidth. The same format as \fBrate\fR is used for -read vs write seperation. If \fBblocksize\fR is a range, the smallest block -size is used as the metric. -.TP -.BI rate_iops_min \fR=\fPint -If this rate of I/O is not met, the job will exit. The same format as \fBrate\fR -is used for read vs write seperation. -.TP -.BI ratecycle \fR=\fPint -Average bandwidth for \fBrate\fR and \fBratemin\fR over this number of -milliseconds. Default: 1000ms. -.TP -.BI max_latency \fR=\fPint -If set, fio will exit the job if it exceeds this maximum latency. It will exit -with an ETIME error. +.BI blocksize_unaligned "\fR,\fB bs_unaligned" +If set, fio will issue I/O units with any size within +\fBblocksize_range\fR, not just multiples of the minimum size. This +typically won't work with direct I/O, as that normally requires sector +alignment. .TP -.BI cpumask \fR=\fPint -Set CPU affinity for this job. \fIint\fR is a bitmask of allowed CPUs the job -may run on. See \fBsched_setaffinity\fR\|(2). +.BI bs_is_seq_rand \fR=\fPbool +If this option is set, fio will use the normal read,write blocksize settings +as sequential,random blocksize settings instead. Any random read or write +will use the WRITE blocksize settings, and any sequential read or write will +use the READ blocksize settings. +.TP +.BI blockalign \fR=\fPint[,int][,int] "\fR,\fB ba" \fR=\fPint[,int][,int] +Boundary to which fio will align random I/O units. Default: +\fBblocksize\fR. 
Minimum alignment is typically 512b for using direct +I/O, though it usually depends on the hardware block size. This option is +mutually exclusive with using a random map for files, so it will turn off +that option. Comma\-separated values may be specified for reads, writes, and +trims as described in \fBblocksize\fR. +.SS "Buffers and memory" +.TP +.BI zero_buffers +Initialize buffers with all zeros. Default: fill buffers with random data. +.TP +.BI refill_buffers +If this option is given, fio will refill the I/O buffers on every +submit. The default is to only fill it at init time and reuse that +data. Only makes sense if zero_buffers isn't specified, naturally. If data +verification is enabled, \fBrefill_buffers\fR is also automatically enabled. .TP -.BI cpus_allowed \fR=\fPstr -Same as \fBcpumask\fR, but allows a comma-delimited list of CPU numbers. +.BI scramble_buffers \fR=\fPbool +If \fBrefill_buffers\fR is too costly and the target is using data +deduplication, then setting this option will slightly modify the I/O buffer +contents to defeat normal de\-dupe attempts. This is not enough to defeat +more clever block compression attempts, but it will stop naive dedupe of +blocks. Default: true. .TP -.BI numa_cpu_nodes \fR=\fPstr -Set this job running on spcified NUMA nodes' CPUs. The arguments allow -comma delimited list of cpu numbers, A-B ranges, or 'all'. +.BI buffer_compress_percentage \fR=\fPint +If this is set, then fio will attempt to provide I/O buffer content +(on WRITEs) that compresses to the specified level. Fio does this by +providing a mix of random data followed by fixed pattern data. The +fixed pattern is either zeros, or the pattern specified by +\fBbuffer_pattern\fR. If the \fBbuffer_pattern\fR option is used, it +might skew the compression ratio slightly. 
Setting
+\fBbuffer_compress_percentage\fR to a value other than 100 will also
+enable \fBrefill_buffers\fR in order to reduce the likelihood that
+adjacent blocks are so similar that they over compress when seen
+together. See \fBbuffer_compress_chunk\fR for how to set a finer or
+coarser granularity of the random/fixed data regions. Defaults to unset
+i.e., buffer data will not adhere to any compression level.
.TP
-.BI numa_mem_policy \fR=\fPstr
-Set this job's memory policy and corresponding NUMA nodes. Format of
-the argements:
+.BI buffer_compress_chunk \fR=\fPint
+This setting allows fio to manage how big the random/fixed data region
+is when using \fBbuffer_compress_percentage\fR. When
+\fBbuffer_compress_chunk\fR is set to some non-zero value smaller than the
+block size, fio can repeat the random/fixed region throughout the I/O
+buffer at the specified interval (which is particularly useful when
+bigger block sizes are used for a job). When set to 0, fio will use a
+chunk size that matches the block size resulting in a single
+random/fixed region within the I/O buffer. Defaults to 512. When the
+unit is omitted, the value is interpreted in bytes.
+.TP
+.BI buffer_pattern \fR=\fPstr
+If set, fio will fill the I/O buffers with this pattern or with the contents
+of a file. If not set, the contents of I/O buffers are defined by the other
+options related to buffer contents. The setting can be any pattern of bytes,
+and can be prefixed with 0x for hex values. It may also be a string, where
+the string must then be wrapped with "". Or it may also be a filename,
+where the filename must be wrapped with '' in which case the file is
+opened and read. Note that not all the file contents will be read if that
+would cause the buffers to overflow.
So, for example: .RS -.TP -.B [:] -.TP -.B mode -is one of the following memory policy: -.TP -.B default, prefer, bind, interleave, local -.TP +.RS +.P +.PD 0 +buffer_pattern='filename' +.P +or: +.P +buffer_pattern="abcd" +.P +or: +.P +buffer_pattern=\-12 +.P +or: +.P +buffer_pattern=0xdeadface +.PD +.RE +.P +Also you can combine everything together in any order: +.RS +.P +buffer_pattern=0xdeadface"abcd"\-12'filename' +.RE .RE -For \fBdefault\fR and \fBlocal\fR memory policy, no \fBnodelist\fR is -needed to be specified. For \fBprefer\fR, only one node is -allowed. For \fBbind\fR and \fBinterleave\fR, \fBnodelist\fR allows -comma delimited list of numbers, A-B ranges, or 'all'. -.TP -.BI startdelay \fR=\fPint -Delay start of job for the specified number of seconds. -.TP -.BI runtime \fR=\fPint -Terminate processing after the specified number of seconds. -.TP -.B time_based -If given, run for the specified \fBruntime\fR duration even if the files are -completely read or written. The same workload will be repeated as many times -as \fBruntime\fR allows. .TP -.BI ramp_time \fR=\fPint -If set, fio will run the specified workload for this amount of time before -logging any performance numbers. Useful for letting performance settle before -logging results, thus minimizing the runtime required for stable results. Note -that the \fBramp_time\fR is considered lead in time for a job, thus it will -increase the total runtime if a special timeout or runtime is specified. +.BI dedupe_percentage \fR=\fPint +If set, fio will generate this percentage of identical buffers when +writing. These buffers will be naturally dedupable. The contents of the +buffers depend on what other buffer compression settings have been set. It's +possible to have the individual buffers either fully compressible, or not at +all \-\- this option only controls the distribution of unique buffers. Setting +this option will also enable \fBrefill_buffers\fR to prevent every buffer +being identical. 
.TP .BI invalidate \fR=\fPbool -Invalidate buffer-cache for the file prior to starting I/O. Default: true. +Invalidate the buffer/page cache parts of the files to be used prior to +starting I/O if the platform and file type support it. Defaults to true. +This will be ignored if \fBpre_read\fR is also specified for the +same job. .TP .BI sync \fR=\fPbool -Use synchronous I/O for buffered writes. For the majority of I/O engines, -this means using O_SYNC. Default: false. +Use synchronous I/O for buffered writes. For the majority of I/O engines, +this means using O_SYNC. Default: false. .TP .BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr -Allocation method for I/O unit buffer. Allowed values are: +Fio can use various types of memory as the I/O unit buffer. The allowed +values are: .RS .RS .TP .B malloc -Allocate memory with \fImalloc\fR\|(3). +Use memory from \fBmalloc\fR\|(3) as the buffers. Default memory type. .TP .B shm -Use shared memory buffers allocated through \fIshmget\fR\|(2). +Use shared memory as the buffers. Allocated through \fBshmget\fR\|(2). .TP .B shmhuge Same as \fBshm\fR, but use huge pages as backing. .TP .B mmap -Use \fImmap\fR\|(2) for allocation. Uses anonymous memory unless a filename -is given after the option in the format `:\fIfile\fR'. +Use \fBmmap\fR\|(2) to allocate buffers. May either be anonymous memory, or can +be file backed if a filename is given after the option. The format +is `mem=mmap:/path/to/file'. .TP .B mmaphuge -Same as \fBmmap\fR, but use huge files as backing. -.RE -.P -The amount of memory allocated is the maximum allowed \fBblocksize\fR for the -job multiplied by \fBiodepth\fR. For \fBshmhuge\fR or \fBmmaphuge\fR to work, -the system must have free huge pages allocated. \fBmmaphuge\fR also needs to -have hugetlbfs mounted, and \fIfile\fR must point there. At least on Linux, -huge pages must be manually allocated. See \fB/proc/sys/vm/nr_hugehages\fR -and the documentation for that. 
Normally you just need to echo an appropriate -number, eg echoing 8 will ensure that the OS has 8 huge pages ready for -use. +Use a memory mapped huge file as the buffer backing. Append filename +after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file'. +.TP +.B mmapshared +Same as \fBmmap\fR, but use a MMAP_SHARED mapping. +.TP +.B cudamalloc +Use GPU memory as the buffers for GPUDirect RDMA benchmark. +The \fBioengine\fR must be \fBrdma\fR. +.RE +.P +The area allocated is a function of the maximum allowed bs size for the job, +multiplied by the I/O depth given. Note that for \fBshmhuge\fR and +\fBmmaphuge\fR to work, the system must have free huge pages allocated. This +can normally be checked and set by reading/writing +`/proc/sys/vm/nr_hugepages' on a Linux system. Fio assumes a huge page +is 4MiB in size. So to calculate the number of huge pages you need for a +given job file, add up the I/O depth of all jobs (normally one unless +\fBiodepth\fR is used) and multiply by the maximum bs set. Then divide +that number by the huge page size. You can see the size of the huge pages in +`/proc/meminfo'. If no huge pages are allocated by having a non\-zero +number in `nr_hugepages', using \fBmmaphuge\fR or \fBshmhuge\fR will fail. Also +see \fBhugepage\-size\fR. +.P +\fBmmaphuge\fR also needs to have hugetlbfs mounted and the file location +should point there. So if it's mounted in `/huge', you would use +`mem=mmaphuge:/huge/somefile'. .RE .TP .BI iomem_align \fR=\fPint "\fR,\fP mem_align" \fR=\fPint -This indiciates the memory alignment of the IO memory buffers. Note that the -given alignment is applied to the first IO unit buffer, if using \fBiodepth\fR -the alignment of the following buffers are given by the \fBbs\fR used. In -other words, if using a \fBbs\fR that is a multiple of the page sized in the -system, all buffers will be aligned to this value. 
If using a \fBbs\fR that -is not page aligned, the alignment of subsequent IO memory buffers is the -sum of the \fBiomem_align\fR and \fBbs\fR used. +This indicates the memory alignment of the I/O memory buffers. Note that +the given alignment is applied to the first I/O unit buffer, if using +\fBiodepth\fR the alignment of the following buffers are given by the +\fBbs\fR used. In other words, if using a \fBbs\fR that is a +multiple of the page sized in the system, all buffers will be aligned to +this value. If using a \fBbs\fR that is not page aligned, the alignment +of subsequent I/O memory buffers is the sum of the \fBiomem_align\fR and +\fBbs\fR used. .TP .BI hugepage\-size \fR=\fPint -Defines the size of a huge page. Must be at least equal to the system setting. -Should be a multiple of 1MB. Default: 4MB. +Defines the size of a huge page. Must at least be equal to the system +setting, see `/proc/meminfo'. Defaults to 4MiB. Should probably +always be a multiple of megabytes, so using `hugepage\-size=Xm' is the +preferred way to set this to avoid setting a non\-pow\-2 bad value. .TP -.B exitall -Terminate all jobs when one finishes. Default: wait for each job to finish. +.BI lockmem \fR=\fPint +Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to +simulate a smaller amount of memory. The amount specified is per worker. +.SS "I/O size" .TP -.BI bwavgtime \fR=\fPint -Average bandwidth calculations over the given time in milliseconds. Default: -500ms. +.BI size \fR=\fPint +The total size of file I/O for each thread of this job. Fio will run until +this many bytes has been transferred, unless runtime is limited by other options +(such as \fBruntime\fR, for instance, or increased/decreased by \fBio_size\fR). +Fio will divide this size between the available files determined by options +such as \fBnrfiles\fR, \fBfilename\fR, unless \fBfilesize\fR is +specified by the job. 
If the result of division happens to be 0, the size is +set to the physical size of the given files or devices if they exist. +If this option is not specified, fio will use the full size of the given +files or devices. If the files do not exist, size must be given. It is also +possible to give size as a percentage between 1 and 100. If `size=20%' is +given, fio will use 20% of the full size of the given files or devices. +Can be combined with \fBoffset\fR to constrain the start and end range +that I/O will be done within. +.TP +.BI io_size \fR=\fPint "\fR,\fB io_limit" \fR=\fPint +Normally fio operates within the region set by \fBsize\fR, which means +that the \fBsize\fR option sets both the region and size of I/O to be +performed. Sometimes that is not what you want. With this option, it is +possible to define just the amount of I/O that fio should do. For instance, +if \fBsize\fR is set to 20GiB and \fBio_size\fR is set to 5GiB, fio +will perform I/O within the first 20GiB but exit when 5GiB have been +done. The opposite is also possible \-\- if \fBsize\fR is set to 20GiB, +and \fBio_size\fR is set to 40GiB, then fio will do 40GiB of I/O within +the 0..20GiB region. +.TP +.BI filesize \fR=\fPirange(int) +Individual file sizes. May be a range, in which case fio will select sizes +for files at random within the given range and limited to \fBsize\fR in +total (if that is given). If not given, each created file is the same size. +This option overrides \fBsize\fR in terms of file size, which means +this value is used as a fixed size or possible range of each file. +.TP +.BI file_append \fR=\fPbool +Perform I/O after the end of the file. Normally fio will operate within the +size of a file. If this option is set, then fio will append to the file +instead. This has identical behavior to setting \fBoffset\fR to the size +of a file. This option is ignored on non\-regular files. .TP -.BI iopsavgtime \fR=\fPint -Average IOPS calculations over the given time in milliseconds. 
Default: -500ms. +.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool +Sets size to something really large and waits for ENOSPC (no space left on +device) as the terminating condition. Only makes sense with sequential +write. For a read workload, the mount point will be filled first then I/O +started on the result. This option doesn't make sense if operating on a raw +device node, since the size of that is already known by the file system. +Additionally, writing beyond end\-of\-device will not return ENOSPC there. +.SS "I/O engine" .TP -.BI create_serialize \fR=\fPbool -If true, serialize file creation for the jobs. Default: true. +.BI ioengine \fR=\fPstr +Defines how the job issues I/O to the file. The following types are defined: +.RS +.RS .TP -.BI create_fsync \fR=\fPbool -\fIfsync\fR\|(2) data file after creation. Default: true. +.B sync +Basic \fBread\fR\|(2) or \fBwrite\fR\|(2) +I/O. \fBlseek\fR\|(2) is used to position the I/O location. +See \fBfsync\fR and \fBfdatasync\fR for syncing write I/Os. .TP -.BI create_on_open \fR=\fPbool -If true, the files are not created until they are opened for IO by the job. +.B psync +Basic \fBpread\fR\|(2) or \fBpwrite\fR\|(2) I/O. Default on +all supported operating systems except for Windows. .TP -.BI create_only \fR=\fPbool -If true, fio will only run the setup phase of the job. If files need to be -laid out or updated on disk, only that will be done. The actual job contents -are not executed. +.B vsync +Basic \fBreadv\fR\|(2) or \fBwritev\fR\|(2) I/O. Will emulate +queuing by coalescing adjacent I/Os into a single submission. .TP -.BI pre_read \fR=\fPbool -If this is given, files will be pre-read into memory before starting the given -IO operation. This will also clear the \fR \fBinvalidate\fR flag, since it is -pointless to pre-read and then drop the cache. This will only work for IO -engines that are seekable, since they allow you to read the same data -multiple times. 
Thus it will not work on eg network or splice IO. +.B pvsync +Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O. .TP -.BI unlink \fR=\fPbool -Unlink job files when done. Default: false. +.B pvsync2 +Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O. .TP -.BI loops \fR=\fPint -Specifies the number of iterations (runs of the same workload) of this job. -Default: 1. +.B libaio +Linux native asynchronous I/O. Note that Linux may only support +queued behavior with non\-buffered I/O (set `direct=1' or +`buffered=0'). +This engine defines engine specific options. .TP -.BI do_verify \fR=\fPbool -Run the verify phase after a write phase. Only valid if \fBverify\fR is set. -Default: true. +.B posixaio +POSIX asynchronous I/O using \fBaio_read\fR\|(3) and +\fBaio_write\fR\|(3). .TP -.BI verify \fR=\fPstr -Method of verifying file contents after each iteration of the job. Allowed -values are: -.RS -.RS +.B solarisaio +Solaris native asynchronous I/O. .TP -.B md5 crc16 crc32 crc32c crc32c-intel crc64 crc7 sha256 sha512 sha1 -Store appropriate checksum in the header of each block. crc32c-intel is -hardware accelerated SSE4.2 driven, falls back to regular crc32c if -not supported by the system. +.B windowsaio +Windows native asynchronous I/O. Default on Windows. .TP -.B meta -Write extra information about each I/O (timestamp, block number, etc.). The -block number is verified. See \fBverify_pattern\fR as well. +.B mmap +File is memory mapped with \fBmmap\fR\|(2) and data copied +to/from using \fBmemcpy\fR\|(3). +.TP +.B splice +\fBsplice\fR\|(2) is used to transfer the data and +\fBvmsplice\fR\|(2) to transfer data from user space to the +kernel. +.TP +.B sg +SCSI generic sg v3 I/O. May either be synchronous using the SG_IO +ioctl, or if the target is an sg character device we use +\fBread\fR\|(2) and \fBwrite\fR\|(2) for asynchronous +I/O. Requires \fBfilename\fR option to specify either block or +character devices. This engine supports trim operations. 
The +sg engine includes engine specific options. .TP .B null -Pretend to verify. Used for testing internals. -.RE - -This option can be used for repeated burn-in tests of a system to make sure -that the written data is also correctly read back. If the data direction given -is a read or random read, fio will assume that it should verify a previously -written file. If the data direction includes any form of write, the verify will -be of the newly written data. -.RE +Doesn't transfer any data, just pretends to. This is mainly used to +exercise fio itself and for debugging/testing purposes. .TP -.BI verify_sort \fR=\fPbool -If true, written verify blocks are sorted if \fBfio\fR deems it to be faster to -read them back in a sorted manner. Default: true. +.B net +Transfer over the network to given `host:port'. Depending on the +\fBprotocol\fR used, the \fBhostname\fR, \fBport\fR, +\fBlisten\fR and \fBfilename\fR options are used to specify +what sort of connection to make, while the \fBprotocol\fR option +determines which protocol will be used. This engine defines engine +specific options. .TP -.BI verify_offset \fR=\fPint -Swap the verification header with data somewhere else in the block before -writing. It is swapped back before verifying. +.B netsplice +Like \fBnet\fR, but uses \fBsplice\fR\|(2) and +\fBvmsplice\fR\|(2) to map data and send/receive. +This engine defines engine specific options. .TP -.BI verify_interval \fR=\fPint -Write the verification header for this number of bytes, which should divide -\fBblocksize\fR. Default: \fBblocksize\fR. +.B cpuio +Doesn't transfer any data, but burns CPU cycles according to the +\fBcpuload\fR and \fBcpuchunks\fR options. Setting +\fBcpuload\fR\=85 will cause that job to do nothing but burn 85% +of the CPU. In case of SMP machines, use `numjobs=' +to get desired CPU usage, as the cpuload only loads a +single CPU at the desired rate. A job never finishes unless there is +at least one non\-cpuio job. 
.TP
-.BI verify_pattern \fR=\fPstr
-If set, fio will fill the io buffers with this pattern. Fio defaults to filling
-with totally random bytes, but sometimes it's interesting to fill with a known
-pattern for io verification purposes. Depending on the width of the pattern,
-fio will fill 1/2/3/4 bytes of the buffer at the time(it can be either a
-decimal or a hex number). The verify_pattern if larger than a 32-bit quantity
-has to be a hex number that starts with either "0x" or "0X". Use with
-\fBverify\fP=meta.
+.B guasi
+The GUASI I/O engine is the Generic Userspace Asynchronous Syscall
+Interface approach to async I/O. See \fIhttp://www.xmailserver.org/guasi\-lib.html\fR
+for more info on GUASI.
.TP
-.BI verify_fatal \fR=\fPbool
-If true, exit the job on the first observed verification failure. Default:
-false.
+.B rdma
+The RDMA I/O engine supports both RDMA memory semantics
+(RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
+InfiniBand, RoCE and iWARP protocols. This engine defines engine
+specific options.
.TP
-.BI verify_dump \fR=\fPbool
-If set, dump the contents of both the original data block and the data block we
-read off disk to files. This allows later analysis to inspect just what kind of
-data corruption occurred. Off by default.
+.B falloc
+I/O engine that does regular fallocate to simulate data transfer as
+fio ioengine.
+.RS
+.P
+.PD 0
+DDIR_READ does fallocate(,mode = FALLOC_FL_KEEP_SIZE,).
+.P
+DDIR_WRITE does fallocate(,mode = 0).
+.P
+DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE).
+.PD
+.RE
.TP
-.BI verify_async \fR=\fPint
-Fio will normally verify IO inline from the submitting thread. This option
-takes an integer describing how many async offload threads to create for IO
-verification instead, causing fio to offload the duty of verifying IO contents
-to one or more separate threads.
If using this offload option, even sync IO -engines can benefit from using an \fBiodepth\fR setting higher than 1, as it -allows them to have IO in flight while verifies are running. +.B ftruncate +I/O engine that sends \fBftruncate\fR\|(2) operations in response +to write (DDIR_WRITE) events. Each ftruncate issued sets the file's +size to the current block offset. \fBblocksize\fR is ignored. .TP -.BI verify_async_cpus \fR=\fPstr -Tell fio to set the given CPU affinity on the async IO verification threads. -See \fBcpus_allowed\fP for the format used. +.B e4defrag +I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate +defragment activity in request to DDIR_WRITE event. .TP -.BI verify_backlog \fR=\fPint -Fio will normally verify the written contents of a job that utilizes verify -once that job has completed. In other words, everything is written then -everything is read back and verified. You may want to verify continually -instead for a variety of reasons. Fio stores the meta data associated with an -IO block in memory, so for large verify workloads, quite a bit of memory would -be used up holding this meta data. If this option is enabled, fio will write -only N blocks before verifying these blocks. +.B rados +I/O engine supporting direct access to Ceph Reliable Autonomic Distributed +Object Store (RADOS) via librados. This ioengine defines engine specific +options. +.TP +.B rbd +I/O engine supporting direct access to Ceph Rados Block Devices +(RBD) via librbd without the need to use the kernel rbd driver. This +ioengine defines engine specific options. +.TP +.B http +I/O engine supporting GET/PUT requests over HTTP(S) with libcurl to +a WebDAV or S3 endpoint. This ioengine defines engine specific options. + +This engine only supports direct IO of iodepth=1; you need to scale this +via numjobs. blocksize defines the size of the objects to be created. + +TRIM is translated to object deletion. 
+.TP +.B gfapi +Using GlusterFS libgfapi sync interface to direct access to +GlusterFS volumes without having to go through FUSE. This ioengine +defines engine specific options. +.TP +.B gfapi_async +Using GlusterFS libgfapi async interface to direct access to +GlusterFS volumes without having to go through FUSE. This ioengine +defines engine specific options. +.TP +.B libhdfs +Read and write through Hadoop (HDFS). The \fBfilename\fR option +is used to specify host,port of the hdfs name\-node to connect. This +engine interprets offsets a little differently. In HDFS, files once +created cannot be modified so random writes are not possible. To +imitate this the libhdfs engine expects a bunch of small files to be +created over HDFS and will randomly pick a file from them +based on the offset generated by fio backend (see the example +job file to create such files, use `rw=write' option). Please +note, it may be necessary to set environment variables to work +with HDFS/libhdfs properly. Each job uses its own connection to +HDFS. +.TP +.B mtd +Read, write and erase an MTD character device (e.g., +`/dev/mtd0'). Discards are treated as erases. Depending on the +underlying device type, the I/O may have to go in a certain pattern, +e.g., on NAND, writing sequentially to erase blocks and discarding +before overwriting. The \fBtrimwrite\fR mode works well for this +constraint. +.TP +.B pmemblk +Read and write using filesystem DAX to a file on a filesystem +mounted with DAX on a persistent memory device through the PMDK +libpmemblk library. +.TP +.B dev\-dax +Read and write using device DAX to a persistent memory device (e.g., +/dev/dax0.0) through the PMDK libpmem library. .TP -.BI verify_backlog_batch \fR=\fPint -Control how many blocks fio will verify if verify_backlog is set. If not set, -will default to the value of \fBverify_backlog\fR (meaning the entire queue is -read back and verified). 
If \fBverify_backlog_batch\fR is less than -\fBverify_backlog\fR then not all blocks will be verified, if -\fBverify_backlog_batch\fR is larger than \fBverify_backlog\fR, some blocks -will be verified more than once. -.TP -.B stonewall "\fR,\fP wait_for_previous" -Wait for preceding jobs in the job file to exit before starting this one. -\fBstonewall\fR implies \fBnew_group\fR. -.TP -.B new_group -Start a new reporting group. If not given, all jobs in a file will be part -of the same reporting group, unless separated by a stonewall. +.B external +Prefix to specify loading an external I/O engine object file. Append +the engine filename, e.g. `ioengine=external:/tmp/foo.o' to load +ioengine `foo.o' in `/tmp'. The path can be either +absolute or relative. See `engines/skeleton_external.c' in the fio source for +details of writing an external I/O engine. +.TP +.B filecreate +Simply create the files and do no I/O to them. You still need to set +\fBfilesize\fR so that all the accounting still occurs, but no actual I/O will be +done other than creating the file. +.TP +.B libpmem +Read and write using mmap I/O to a file on a filesystem +mounted with DAX on a persistent memory device through the PMDK +libpmem library. +.TP +.B ime_psync +Synchronous read and write using DDN's Infinite Memory Engine (IME). This +engine is very basic and issues calls to IME whenever an IO is queued. +.TP +.B ime_psyncv +Synchronous read and write using DDN's Infinite Memory Engine (IME). This +engine uses iovecs and will try to stack as much IOs as possible (if the IOs +are "contiguous" and the IO depth is not exceeded) before issuing a call to IME. +.TP +.B ime_aio +Asynchronous read and write using DDN's Infinite Memory Engine (IME). This +engine will try to stack as much IOs as possible by creating requests for IME. +FIO will then decide when to commit these requests. +.TP +.B libiscsi +Read and write iscsi lun with libiscsi. 
+.TP +.B nbd +Synchronous read and write a Network Block Device (NBD). +.SS "I/O engine specific parameters" +In addition, there are some parameters which are only valid when a specific +\fBioengine\fR is in use. These are used identically to normal parameters, +with the caveat that when used on the command line, they must come after the +\fBioengine\fR that defines them is selected. +.TP +.BI (io_uring)hipri +If this option is set, fio will attempt to use polled IO completions. Normal IO +completions generate interrupts to signal the completion of IO, polled +completions do not. Hence they are require active reaping by the application. +The benefits are more efficient IO for high IOPS scenarios, and lower latencies +for low queue depth IO. +.TP +.BI (io_uring)fixedbufs +If fio is asked to do direct IO, then Linux will map pages for each IO call, and +release them when IO is done. If this option is set, the pages are pre-mapped +before IO is started. This eliminates the need to map and release for each IO. +This is more efficient, and reduces the IO latency as well. +.TP +.BI (io_uring)registerfiles +With this option, fio registers the set of files being used with the kernel. +This avoids the overhead of managing file counts in the kernel, making the +submission and completion part more lightweight. Required for the below +sqthread_poll option. +.TP +.BI (io_uring)sqthread_poll +Normally fio will submit IO by issuing a system call to notify the kernel of +available items in the SQ ring. If this option is set, the act of submitting IO +will be done by a polling thread in the kernel. This frees up cycles for fio, at +the cost of using more CPU in the system. +.TP +.BI (io_uring)sqthread_poll_cpu +When `sqthread_poll` is set, this option provides a way to define which CPU +should be used for the polling thread. .TP -.BI numjobs \fR=\fPint -Number of clones (processes/threads performing the same workload) of this job. -Default: 1. 
+.BI (libaio)userspace_reap
+Normally, with the libaio engine in use, fio will use the
+\fBio_getevents\fR\|(3) system call to reap newly returned events. With
+this flag turned on, the AIO ring will be read directly from user\-space to
+reap events. The reaping mode is only enabled when polling for a minimum of
+0 events (e.g. when `iodepth_batch_complete=0').
+.TP
+.BI (pvsync2)hipri
+Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
+than normal.
+.TP
+.BI (pvsync2)hipri_percentage
+When hipri is set this determines the probability of a pvsync2 I/O being high
+priority. The default is 100%.
+.TP
+.BI (cpuio)cpuload \fR=\fPint
+Attempt to use the specified percentage of CPU cycles. This is a mandatory
+option when using cpuio I/O engine.
.TP
-.B group_reporting
-If set, display per-group reports instead of per-job when \fBnumjobs\fR is
-specified.
-.TP
-.B thread
-Use threads created with \fBpthread_create\fR\|(3) instead of processes created
-with \fBfork\fR\|(2).
+.BI (cpuio)cpuchunks \fR=\fPint
+Split the load into cycles of the given time. In microseconds.
.TP
-.BI zonesize \fR=\fPint
-Divide file into zones of the specified size in bytes. See \fBzoneskip\fR.
+.BI (cpuio)exit_on_io_done \fR=\fPbool
+Detect when I/O threads are done, then exit.
.TP
-.BI zoneskip \fR=\fPint
-Skip the specified number of bytes when \fBzonesize\fR bytes of data have been
-read.
+.BI (libhdfs)namenode \fR=\fPstr
+The hostname or IP address of a HDFS cluster namenode to contact.
.TP
-.BI write_iolog \fR=\fPstr
-Write the issued I/O patterns to the specified file. Specify a separate file
-for each job, otherwise the iologs will be interspersed and the file may be
-corrupt.
+.BI (libhdfs)port
+The listening port of the HDFS cluster namenode.
.TP
-.BI read_iolog \fR=\fPstr
-Replay the I/O patterns contained in the specified file generated by
-\fBwrite_iolog\fR, or may be a \fBblktrace\fR binary file.
+.BI (netsplice,net)port +The TCP or UDP port to bind to or connect to. If this is used with +\fBnumjobs\fR to spawn multiple instances of the same job type, then +this will be the starting port number since fio will use a range of +ports. +.TP +.BI (rdma)port +The port to use for RDMA-CM communication. This should be the same +value on the client and the server side. +.TP +.BI (netsplice,net, rdma)hostname \fR=\fPstr +The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O. +If the job is a TCP listener or UDP reader, the hostname is not used +and must be omitted unless it is a valid UDP multicast address. +.TP +.BI (netsplice,net)interface \fR=\fPstr +The IP address of the network interface used to send or receive UDP +multicast. .TP -.BI replay_no_stall \fR=\fPint -While replaying I/O patterns using \fBread_iolog\fR the default behavior -attempts to respect timing information between I/Os. Enabling -\fBreplay_no_stall\fR causes I/Os to be replayed as fast as possible while -still respecting ordering. +.BI (netsplice,net)ttl \fR=\fPint +Time\-to\-live value for outgoing UDP multicast packets. Default: 1. .TP -.BI replay_redirect \fR=\fPstr -While replaying I/O patterns using \fBread_iolog\fR the default behavior -is to replay the IOPS onto the major/minor device that each IOP was recorded -from. Setting \fBreplay_redirect\fR causes all IOPS to be replayed onto the -single specified device regardless of the device it was recorded from. +.BI (netsplice,net)nodelay \fR=\fPbool +Set TCP_NODELAY on TCP connections. .TP -.BI write_bw_log \fR=\fPstr -If given, write a bandwidth log of the jobs in this job file. Can be used to -store data of the bandwidth of the jobs in their lifetime. The included -fio_generate_plots script uses gnuplot to turn these text files into nice -graphs. See \fBwrite_log_log\fR for behaviour of given filename. For this -option, the postfix is _bw.log. 
+.BI (netsplice,net)protocol \fR=\fPstr "\fR,\fP proto" \fR=\fPstr +The network protocol to use. Accepted values are: +.RS +.RS .TP -.BI write_lat_log \fR=\fPstr -Same as \fBwrite_bw_log\fR, but writes I/O completion latencies. If no -filename is given with this option, the default filename of "jobname_type.log" -is used. Even if the filename is given, fio will still append the type of log. +.B tcp +Transmission control protocol. .TP -.BI write_iops_log \fR=\fPstr -Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given with this -option, the default filename of "jobname_type.log" is used. Even if the -filename is given, fio will still append the type of log. +.B tcpv6 +Transmission control protocol V6. .TP -.BI log_avg_msec \fR=\fPint -By default, fio will log an entry in the iops, latency, or bw log for every -IO that completes. When writing to the disk log, that can quickly grow to a -very large size. Setting this option makes fio average the each log entry -over the specified period of time, reducing the resolution of the log. -Defaults to 0. +.B udp +User datagram protocol. .TP -.BI disable_lat \fR=\fPbool -Disable measurements of total latency numbers. Useful only for cutting -back the number of calls to gettimeofday, as that does impact performance at -really high IOPS rates. Note that to really get rid of a large amount of these -calls, this option must be used with disable_slat and disable_bw as well. +.B udpv6 +User datagram protocol V6. .TP -.BI disable_clat \fR=\fPbool -Disable measurements of completion latency numbers. See \fBdisable_lat\fR. +.B unix +UNIX domain socket. +.RE +.P +When the protocol is TCP or UDP, the port must also be given, as well as the +hostname if the job is a TCP listener or UDP reader. For unix sockets, the +normal \fBfilename\fR option should be used and the port is invalid. +.RE .TP -.BI disable_slat \fR=\fPbool -Disable measurements of submission latency numbers. See \fBdisable_lat\fR. 
+.BI (netsplice,net)listen +For TCP network connections, tell fio to listen for incoming connections +rather than initiating an outgoing connection. The \fBhostname\fR must +be omitted if this option is used. .TP -.BI disable_bw_measurement \fR=\fPbool -Disable measurements of throughput/bandwidth numbers. See \fBdisable_lat\fR. +.BI (netsplice,net)pingpong +Normally a network writer will just continue writing data, and a network +reader will just consume packages. If `pingpong=1' is set, a writer will +send its normal payload to the reader, then wait for the reader to send the +same payload back. This allows fio to measure network latencies. The +submission and completion latencies then measure local time spent sending or +receiving, and the completion latency measures how long it took for the +other end to receive and send back. For UDP multicast traffic +`pingpong=1' should only be set for a single reader when multiple readers +are listening to the same address. .TP -.BI lockmem \fR=\fPint -Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to -simulate a smaller amount of memory. The amount specified is per worker. +.BI (netsplice,net)window_size \fR=\fPint +Set the desired socket buffer size for the connection. .TP -.BI exec_prerun \fR=\fPstr -Before running the job, execute the specified command with \fBsystem\fR\|(3). +.BI (netsplice,net)mss \fR=\fPint +Set the TCP maximum segment size (TCP_MAXSEG). +.TP +.BI (e4defrag)donorname \fR=\fPstr +File will be used as a block donor (swap extents between files). +.TP +.BI (e4defrag)inplace \fR=\fPint +Configure donor file blocks allocation strategy: +.RS .RS -Output is redirected in a file called \fBjobname.prerun.txt\fR +.TP +.B 0 +Default. Preallocate donor's file on init. +.TP +.B 1 +Allocate space immediately inside defragment event, and free right +after event. +.RE .RE .TP -.BI exec_postrun \fR=\fPstr -Same as \fBexec_prerun\fR, but the command is executed after the job completes. 
+.BI (rbd,rados)clustername \fR=\fPstr +Specifies the name of the Ceph cluster. +.TP +.BI (rbd)rbdname \fR=\fPstr +Specifies the name of the RBD. +.TP +.BI (rbd,rados)pool \fR=\fPstr +Specifies the name of the Ceph pool containing RBD or RADOS data. +.TP +.BI (rbd,rados)clientname \fR=\fPstr +Specifies the username (without the 'client.' prefix) used to access the +Ceph cluster. If the \fBclustername\fR is specified, the \fBclientname\fR shall be +the full *type.id* string. If no type. prefix is given, fio will add 'client.' +by default. +.TP +.BI (rbd,rados)busy_poll \fR=\fPbool +Poll store instead of waiting for completion. Usually this provides better +throughput at cost of higher(up to 100%) CPU utilization. +.TP +.BI (http)http_host \fR=\fPstr +Hostname to connect to. For S3, this could be the bucket name. Default +is \fBlocalhost\fR +.TP +.BI (http)http_user \fR=\fPstr +Username for HTTP authentication. +.TP +.BI (http)http_pass \fR=\fPstr +Password for HTTP authentication. +.TP +.BI (http)https \fR=\fPstr +Whether to use HTTPS instead of plain HTTP. \fRon\fP enables HTTPS; +\fRinsecure\fP will enable HTTPS, but disable SSL peer verification (use +with caution!). Default is \fBoff\fR. +.TP +.BI (http)http_mode \fR=\fPstr +Which HTTP access mode to use: webdav, swift, or s3. Default is +\fBwebdav\fR. +.TP +.BI (http)http_s3_region \fR=\fPstr +The S3 region/zone to include in the request. Default is \fBus-east-1\fR. +.TP +.BI (http)http_s3_key \fR=\fPstr +The S3 secret key. +.TP +.BI (http)http_s3_keyid \fR=\fPstr +The S3 key/access id. +.TP +.BI (http)http_swift_auth_token \fR=\fPstr +The Swift auth token. See the example configuration file on how to +retrieve this. +.TP +.BI (http)http_verbose \fR=\fPint +Enable verbose requests from libcurl. Useful for debugging. 1 turns on +verbose logging from libcurl, 2 additionally enables HTTP IO tracing. +Default is \fB0\fR +.TP +.BI (mtd)skip_bad \fR=\fPbool +Skip operations against known bad blocks. 
+.TP +.BI (libhdfs)hdfsdirectory +libhdfs will create chunk in this HDFS directory. +.TP +.BI (libhdfs)chunk_size +The size of the chunk to use for each file. +.TP +.BI (rdma)verb \fR=\fPstr +The RDMA verb to use on this side of the RDMA ioengine +connection. Valid values are write, read, send and recv. These +correspond to the equivalent RDMA verbs (e.g. write = rdma_write +etc.). Note that this only needs to be specified on the client side of +the connection. See the examples folder. +.TP +.BI (rdma)bindname \fR=\fPstr +The name to use to bind the local RDMA-CM connection to a local RDMA +device. This could be a hostname or an IPv4 or IPv6 address. On the +server side this will be passed into the rdma_bind_addr() function and +on the client site it will be used in the rdma_resolve_add() +function. This can be useful when multiple paths exist between the +client and the server or in certain loopback configurations. +.TP +.BI (sg)readfua \fR=\fPbool +With readfua option set to 1, read operations include the force +unit access (fua) flag. Default: 0. +.TP +.BI (sg)writefua \fR=\fPbool +With writefua option set to 1, write operations include the force +unit access (fua) flag. Default: 0. +.TP +.BI (sg)sg_write_mode \fR=\fPstr +Specify the type of write commands to issue. This option can take three +values: +.RS .RS -Output is redirected in a file called \fBjobname.postrun.txt\fR -.RE .TP -.BI ioscheduler \fR=\fPstr -Attempt to switch the device hosting the file to the specified I/O scheduler. +.B write (default) +Write opcodes are issued as usual .TP -.BI cpuload \fR=\fPint -If the job is a CPU cycle-eater, attempt to use the specified percentage of -CPU cycles. -.TP -.BI cpuchunks \fR=\fPint -If the job is a CPU cycle-eater, split the load into cycles of the -given time in milliseconds. +.B verify +Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This +directs the device to carry out a medium verification with no data +comparison. 
The writefua option is ignored with this selection.
.TP
-.BI disk_util \fR=\fPbool
-Generate disk utilization statistics if the platform supports it. Default: true.
+.B same
+Issue WRITE SAME commands. This transfers a single block to the device
+and writes this same block of data to a contiguous sequence of LBAs
+beginning at the specified offset. fio's block size parameter
+specifies the amount of data written with each command. However, the
+amount of data actually transferred to the device is equal to the
+device's block (sector) size. For a device with 512 byte sectors,
+blocksize=8k will write 16 sectors with each command. fio will still
+generate 8k of data for each command but only the first 512 bytes will
+be used and transferred to the device. The writefua option is ignored
+with this selection.
+.RE
+.RE
.TP
-.BI clocksource \fR=\fPstr
-Use the given clocksource as the base of timing. The supported options are:
+.BI (nbd)uri \fR=\fPstr
+Specify the NBD URI of the server to test.
+The string is a standard NBD URI (see
+\fIhttps://github.com/NetworkBlockDevice/nbd/tree/master/doc\fR).
+Example URIs:
+.RS
.RS
.TP
-.B gettimeofday
-gettimeofday(2)
+\fInbd://localhost:10809\fR
.TP
-.B clock_gettime
-clock_gettime(2)
+\fInbd+unix:///?socket=/tmp/socket\fR
.TP
-.B cpu
-Internal CPU clock source
+\fInbds://tlshost/exportname\fR
+
+.SS "I/O depth"
.TP
+.BI iodepth \fR=\fPint
+Number of I/O units to keep in flight against the file. Note that
+increasing \fBiodepth\fR beyond 1 will not affect synchronous ioengines (except
+for small degrees when \fBverify_async\fR is in use). Even async
+engines may impose OS restrictions causing the desired depth not to be
+achieved. This may happen on Linux when using libaio and not setting
+`direct=1', since buffered I/O is not async on that OS. Keep an
+eye on the I/O depth distribution in the fio output to verify that the
+achieved depth is as expected. Default: 1.
+.TP +.BI iodepth_batch_submit \fR=\fPint "\fR,\fP iodepth_batch" \fR=\fPint +This defines how many pieces of I/O to submit at once. It defaults to 1 +which means that we submit each I/O as soon as it is available, but can be +raised to submit bigger batches of I/O at the time. If it is set to 0 the +\fBiodepth\fR value will be used. +.TP +.BI iodepth_batch_complete_min \fR=\fPint "\fR,\fP iodepth_batch_complete" \fR=\fPint +This defines how many pieces of I/O to retrieve at once. It defaults to 1 +which means that we'll ask for a minimum of 1 I/O in the retrieval process +from the kernel. The I/O retrieval will go on until we hit the limit set by +\fBiodepth_low\fR. If this variable is set to 0, then fio will always +check for completed events before queuing more I/O. This helps reduce I/O +latency, at the cost of more retrieval system calls. +.TP +.BI iodepth_batch_complete_max \fR=\fPint +This defines maximum pieces of I/O to retrieve at once. This variable should +be used along with \fBiodepth_batch_complete_min\fR=\fIint\fR variable, +specifying the range of min and max amount of I/O which should be +retrieved. By default it is equal to \fBiodepth_batch_complete_min\fR +value. Example #1: +.RS +.RS +.P +.PD 0 +iodepth_batch_complete_min=1 +.P +iodepth_batch_complete_max= +.PD .RE .P -\fBcpu\fR is the preferred clocksource if it is reliable, as it is very fast -(and fio is heavy on time calls). Fio will automatically use this clocksource -if it's supported and considered reliable on the system it is running on, -unless another clocksource is specifically set. For x86/x86-64 CPUs, this -means supporting TSC Invariant. +which means that we will retrieve at least 1 I/O and up to the whole +submitted queue depth. If none of I/O has been completed yet, we will wait. 
+Example #2: +.RS +.P +.PD 0 +iodepth_batch_complete_min=0 +.P +iodepth_batch_complete_max= +.PD +.RE +.P +which means that we can retrieve up to the whole submitted queue depth, but +if none of I/O has been completed yet, we will NOT wait and immediately exit +the system call. In this example we simply do polling. +.RE .TP -.BI gtod_reduce \fR=\fPbool -Enable all of the gettimeofday() reducing options (disable_clat, disable_slat, -disable_bw) plus reduce precision of the timeout somewhat to really shrink the -gettimeofday() call count. With this option enabled, we only do about 0.4% of -the gtod() calls we would have done if all time keeping was enabled. +.BI iodepth_low \fR=\fPint +The low water mark indicating when to start filling the queue +again. Defaults to the same as \fBiodepth\fR, meaning that fio will +attempt to keep the queue full at all times. If \fBiodepth\fR is set to +e.g. 16 and \fBiodepth_low\fR is set to 4, then after fio has filled the queue of +16 requests, it will let the depth drain down to 4 before starting to fill +it again. +.TP +.BI serialize_overlap \fR=\fPbool +Serialize in-flight I/Os that might otherwise cause or suffer from data races. +When two or more I/Os are submitted simultaneously, there is no guarantee that +the I/Os will be processed or completed in the submitted order. Further, if +two or more of those I/Os are writes, any overlapping region between them can +become indeterminate/undefined on certain storage. These issues can cause +verification to fail erratically when at least one of the racing I/Os is +changing data and the overlapping region has a non-zero size. Setting +\fBserialize_overlap\fR tells fio to avoid provoking this behavior by explicitly +serializing in-flight I/Os that have a non-zero overlap. Note that setting +this option can reduce both performance and the \fBiodepth\fR achieved. 
+.RS +.P +This option only applies to I/Os issued for a single job except when it is +enabled along with \fBio_submit_mode\fR=offload. In offload mode, fio +will check for overlap among all I/Os submitted by offload jobs with \fBserialize_overlap\fR +enabled. +.P +Default: false. +.RE +.TP +.BI io_submit_mode \fR=\fPstr +This option controls how fio submits the I/O to the I/O engine. The default +is `inline', which means that the fio job threads submit and reap I/O +directly. If set to `offload', the job threads will offload I/O submission +to a dedicated pool of I/O threads. This requires some coordination and thus +has a bit of extra overhead, especially for lower queue depth I/O where it +can increase latencies. The benefit is that fio can manage submission rates +independently of the device completion rates. This avoids skewed latency +reporting if I/O gets backed up on the device side (the coordinated omission +problem). +.SS "I/O rate" +.TP +.BI thinktime \fR=\fPtime +Stall the job for the specified period of time after an I/O has completed before issuing the +next. May be used to simulate processing being done by an application. +When the unit is omitted, the value is interpreted in microseconds. See +\fBthinktime_blocks\fR and \fBthinktime_spin\fR. +.TP +.BI thinktime_spin \fR=\fPtime +Only valid if \fBthinktime\fR is set \- pretend to spend CPU time doing +something with the data received, before falling back to sleeping for the +rest of the period specified by \fBthinktime\fR. When the unit is +omitted, the value is interpreted in microseconds. .TP -.BI gtod_cpu \fR=\fPint -Sometimes it's cheaper to dedicate a single thread of execution to just getting -the current time. Fio (and databases, for instance) are very intensive on -gettimeofday() calls. With this option, you can set one CPU aside for doing -nothing but logging current time to a shared memory location. 
Then the other -threads/processes that run IO workloads need only copy that segment, instead of -entering the kernel with a gettimeofday() call. The CPU set aside for doing -these time calls will be excluded from other uses. Fio will manually clear it -from the CPU mask of other jobs. +.BI thinktime_blocks \fR=\fPint +Only valid if \fBthinktime\fR is set \- control how many blocks to issue, +before waiting \fBthinktime\fR usecs. If not set, defaults to 1 which will make +fio wait \fBthinktime\fR usecs after every block. This effectively makes any +queue depth setting redundant, since no more than 1 I/O will be queued +before we have to complete it and do our \fBthinktime\fR. In other words, this +setting effectively caps the queue depth if the latter is larger. +.TP +.BI rate \fR=\fPint[,int][,int] +Cap the bandwidth used by this job. The number is in bytes/sec, the normal +suffix rules apply. Comma\-separated values may be specified for reads, +writes, and trims as described in \fBblocksize\fR. +.RS +.P +For example, using `rate=1m,500k' would limit reads to 1MiB/sec and writes to +500KiB/sec. Capping only reads or writes can be done with `rate=,500k' or +`rate=500k,' where the former will only limit writes (to 500KiB/sec) and the +latter will only limit reads. +.RE +.TP +.BI rate_min \fR=\fPint[,int][,int] +Tell fio to do whatever it can to maintain at least this bandwidth. Failing +to meet this requirement will cause the job to exit. Comma\-separated values +may be specified for reads, writes, and trims as described in +\fBblocksize\fR. +.TP +.BI rate_iops \fR=\fPint[,int][,int] +Cap the bandwidth to this number of IOPS. Basically the same as +\fBrate\fR, just specified independently of bandwidth. If the job is +given a block size range instead of a fixed value, the smallest block size +is used as the metric. Comma\-separated values may be specified for reads, +writes, and trims as described in \fBblocksize\fR. 
+.TP +.BI rate_iops_min \fR=\fPint[,int][,int] +If fio doesn't meet this rate of I/O, it will cause the job to exit. +Comma\-separated values may be specified for reads, writes, and trims as +described in \fBblocksize\fR. +.TP +.BI rate_process \fR=\fPstr +This option controls how fio manages rated I/O submissions. The default is +`linear', which submits I/O in a linear fashion with fixed delays between +I/Os that gets adjusted based on I/O completion rates. If this is set to +`poisson', fio will submit I/O based on a more real world random request +flow, known as the Poisson process +(\fIhttps://en.wikipedia.org/wiki/Poisson_point_process\fR). The lambda will be +10^6 / IOPS for the given workload. +.TP +.BI rate_ignore_thinktime \fR=\fPbool +By default, fio will attempt to catch up to the specified rate setting, if any +kind of thinktime setting was used. If this option is set, then fio will +ignore the thinktime and continue doing IO at the specified rate, instead of +entering a catch-up mode after thinktime is done. +.SS "I/O latency" +.TP +.BI latency_target \fR=\fPtime +If set, fio will attempt to find the max performance point that the given +workload will run at while maintaining a latency below this target. When +the unit is omitted, the value is interpreted in microseconds. See +\fBlatency_window\fR and \fBlatency_percentile\fR. +.TP +.BI latency_window \fR=\fPtime +Used with \fBlatency_target\fR to specify the sample window that the job +is run at varying queue depths to test the performance. When the unit is +omitted, the value is interpreted in microseconds. +.TP +.BI latency_percentile \fR=\fPfloat +The percentage of I/Os that must fall within the criteria specified by +\fBlatency_target\fR and \fBlatency_window\fR. If not set, this +defaults to 100.0, meaning that all I/Os must be equal or below to the value +set by \fBlatency_target\fR. 
+.TP +.BI max_latency \fR=\fPtime +If set, fio will exit the job with an ETIMEDOUT error if it exceeds this +maximum latency. When the unit is omitted, the value is interpreted in +microseconds. +.TP +.BI rate_cycle \fR=\fPint +Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number +of milliseconds. Defaults to 1000. +.SS "I/O replay" .TP -.BI ignore_error \fR=\fPstr -Sometimes you want to ignore some errors during test in that case you can specify -error list for each error type. -.br -ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST -.br -errors for given error type is separated with ':'. -Error may be symbol ('ENOSPC', 'ENOMEM') or an integer. -.br -Example: ignore_error=EAGAIN,ENOSPC:122 . -.br -This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from WRITE. +.BI write_iolog \fR=\fPstr +Write the issued I/O patterns to the specified file. See +\fBread_iolog\fR. Specify a separate file for each job, otherwise the +iologs will be interspersed and the file may be corrupt. .TP -.BI error_dump \fR=\fPbool -If set dump every error even if it is non fatal, true by default. If disabled -only fatal error will be dumped +.BI read_iolog \fR=\fPstr +Open an iolog with the specified filename and replay the I/O patterns it +contains. This can be used to store a workload and replay it sometime +later. The iolog given may also be a blktrace binary file, which allows fio +to replay a workload captured by blktrace. See +\fBblktrace\fR\|(8) for how to capture such logging data. For blktrace +replay, the file needs to be turned into a blkparse binary data file first +(`blkparse \-o /dev/null \-d file_for_fio.bin'). +You can specify a number of files by separating the names with a ':' character. +See the \fBfilename\fR option for information on how to escape ':' and '\' +characters within the file names. These files will be sequentially assigned to +job clones created by \fBnumjobs\fR. 
+.TP +.BI read_iolog_chunked \fR=\fPbool +Determines how iolog is read. If false (default) entire \fBread_iolog\fR will +be read at once. If selected true, input from iolog will be read gradually. +Useful when iolog is very large, or it is generated. +.TP +.BI merge_blktrace_file \fR=\fPstr +When specified, rather than replaying the logs passed to \fBread_iolog\fR, +the logs go through a merge phase which aggregates them into a single blktrace. +The resulting file is then passed on as the \fBread_iolog\fR parameter. The +intention here is to make the order of events consistent. This limits the +influence of the scheduler compared to replaying multiple blktraces via +concurrent jobs. +.TP +.BI merge_blktrace_scalars \fR=\fPfloat_list +This is a percentage based option that is index paired with the list of files +passed to \fBread_iolog\fR. When merging is performed, scale the time of each +event by the corresponding amount. For example, +`\-\-merge_blktrace_scalars="50:100"' runs the first trace in halftime and the +second trace in realtime. This knob is separately tunable from +\fBreplay_time_scale\fR which scales the trace during runtime and will not +change the output of the merge unlike this option. +.TP +.BI merge_blktrace_iters \fR=\fPfloat_list +This is a whole number option that is index paired with the list of files +passed to \fBread_iolog\fR. When merging is performed, run each trace for +the specified number of iterations. For example, +`\-\-merge_blktrace_iters="2:1"' runs the first trace for two iterations +and the second trace for one iteration. +.TP +.BI replay_no_stall \fR=\fPbool +When replaying I/O with \fBread_iolog\fR the default behavior is to +attempt to respect the timestamps within the log and replay them with the +appropriate delay between IOPS. By setting this variable fio will not +respect the timestamps and attempt to replay them as fast as possible while +still respecting ordering. 
The result is the same I/O pattern to a given +device, but different timings. +.TP +.BI replay_time_scale \fR=\fPint +When replaying I/O with \fBread_iolog\fR, fio will honor the original timing +in the trace. With this option, it's possible to scale the time. It's a +percentage option, if set to 50 it means run at 50% the original IO rate in +the trace. If set to 200, run at twice the original IO rate. Defaults to 100. +.TP +.BI replay_redirect \fR=\fPstr +While replaying I/O patterns using \fBread_iolog\fR the default behavior +is to replay the IOPS onto the major/minor device that each IOP was recorded +from. This is sometimes undesirable because on a different machine those +major/minor numbers can map to a different device. Changing hardware on the +same system can also result in a different major/minor mapping. +\fBreplay_redirect\fR causes all I/Os to be replayed onto the single specified +device regardless of the device it was recorded +from. i.e. `replay_redirect=/dev/sdc' would cause all I/O +in the blktrace or iolog to be replayed onto `/dev/sdc'. This means +multiple devices will be replayed onto a single device, if the trace +contains multiple devices. If you want multiple devices to be replayed +concurrently to multiple redirected devices you must blkparse your trace +into separate traces and replay them with independent fio invocations. +Unfortunately this also breaks the strict time ordering between multiple +device accesses. +.TP +.BI replay_align \fR=\fPint +Force alignment of the byte offsets in a trace to this value. The value +must be a power of 2. +.TP +.BI replay_scale \fR=\fPint +Scale byte offsets down by this factor when replaying traces. Should most +likely use \fBreplay_align\fR as well. +.SS "Threads, processes and job synchronization" +.TP +.BI replay_skip \fR=\fPstr +Sometimes it's useful to skip certain IO types in a replay trace. This could +be, for instance, eliminating the writes in the trace. 
Or not replaying the +trims/discards, if you are redirecting to a device that doesn't support them. +This option takes a comma separated list of read, write, trim, sync. +.TP +.BI thread +Fio defaults to creating jobs by using fork, however if this option is +given, fio will create jobs by using POSIX Threads' function +\fBpthread_create\fR\|(3) to create threads instead. +.TP +.BI wait_for \fR=\fPstr +If set, the current job won't be started until all workers of the specified +waitee job are done. +.\" ignore blank line here from HOWTO as it looks normal without it +\fBwait_for\fR operates on the job name basis, so there are a few +limitations. First, the waitee must be defined prior to the waiter job +(meaning no forward references). Second, if a job is being referenced as a +waitee, it must have a unique name (no duplicate waitees). +.TP +.BI nice \fR=\fPint +Run the job with the given nice value. See man \fBnice\fR\|(2). +.\" ignore blank line here from HOWTO as it looks normal without it +On Windows, values less than \-15 set the process class to "High"; \-1 through +\-15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle" +priority class. +.TP +.BI prio \fR=\fPint +Set the I/O priority value of this job. Linux limits us to a positive value +between 0 and 7, with 0 being the highest. See man +\fBionice\fR\|(1). Refer to an appropriate manpage for other operating +systems since meaning of priority may differ. +.TP +.BI prioclass \fR=\fPint +Set the I/O priority class. See man \fBionice\fR\|(1). +.TP +.BI cpus_allowed \fR=\fPstr +Controls the same options as \fBcpumask\fR, but accepts a textual +specification of the permitted CPUs instead and CPUs are indexed from 0. So +to use CPUs 0 and 5 you would specify `cpus_allowed=0,5'. This option also +allows a range of CPUs to be specified \-\- say you wanted a binding to CPUs +0, 5, and 8 to 15, you would set `cpus_allowed=0,5,8\-15'. 
+.RS +.P +On Windows, when `cpus_allowed' is unset only CPUs from fio's current +processor group will be used and affinity settings are inherited from the +system. An fio build configured to target Windows 7 makes options that set +CPUs processor group aware and values will set both the processor group +and a CPU from within that group. For example, on a system where processor +group 0 has 40 CPUs and processor group 1 has 32 CPUs, `cpus_allowed' +values between 0 and 39 will bind CPUs from processor group 0 and +`cpus_allowed' values between 40 and 71 will bind CPUs from processor +group 1. When using `cpus_allowed_policy=shared' all CPUs specified by a +single `cpus_allowed' option must be from the same processor group. For +Windows fio builds not built for Windows 7, CPUs will only be selected from +(and be relative to) whatever processor group fio happens to be running in +and CPUs from other processor groups cannot be used. +.RE +.TP +.BI cpus_allowed_policy \fR=\fPstr +Set the policy of how fio distributes the CPUs specified by +\fBcpus_allowed\fR or \fBcpumask\fR. Two policies are supported: +.RS +.RS +.TP +.B shared +All jobs will share the CPU set specified. +.TP +.B split +Each job will get a unique CPU from the CPU set. +.RE +.P +\fBshared\fR is the default behavior, if the option isn't specified. If +\fBsplit\fR is specified, then fio will assign one cpu per job. If not +enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs +in the set. +.RE +.TP +.BI cpumask \fR=\fPint +Set the CPU affinity of this job. The parameter given is a bit mask of +allowed CPUs the job may run on. So if you want the allowed CPUs to be 1 +and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man +\fBsched_setaffinity\fR\|(2). This may not work on all supported +operating systems or kernel versions. This option doesn't work well for a +higher CPU count than what you can store in an integer mask, so it can only +control cpus 1\-32. 
For boxes with larger CPU counts, use +\fBcpus_allowed\fR. +.TP +.BI numa_cpu_nodes \fR=\fPstr +Set this job running on specified NUMA nodes' CPUs. The arguments allow +comma delimited list of cpu numbers, A\-B ranges, or `all'. Note, to enable +NUMA options support, fio must be built on a system with libnuma\-dev(el) +installed. +.TP +.BI numa_mem_policy \fR=\fPstr +Set this job's memory policy and corresponding NUMA nodes. Format of the +arguments: +.RS +.RS +.P +<mode>[:<nodelist>] +.RE +.P +`mode' is one of the following memory policies: `default', `prefer', +`bind', `interleave' or `local'. For `default' and `local' memory +policies, no node needs to be specified. For `prefer', only one node is +allowed. For `bind' and `interleave' the `nodelist' may be as +follows: a comma delimited list of numbers, A\-B ranges, or `all'. +.RE .TP .BI cgroup \fR=\fPstr -Add job to this control group. If it doesn't exist, it will be created. -The system must have a mounted cgroup blkio mount point for this to work. If +Add job to this control group. If it doesn't exist, it will be created. The +system must have a mounted cgroup blkio mount point for this to work. If your system doesn't have it mounted, you can do so with: - +.RS +.RS +.P # mount \-t cgroup \-o blkio none /cgroup +.RE +.RE .TP .BI cgroup_weight \fR=\fPint Set the weight of the cgroup to this value. See the documentation that comes with the kernel, allowed values are in the range of 100..1000. .TP .BI cgroup_nodelete \fR=\fPbool -Normally fio will delete the cgroups it has created after the job completion. -To override this behavior and to leave cgroups around after the job completion, -set cgroup_nodelete=1. This can be useful if one wants to inspect various -cgroup files after job completion. Default: false -.TP -.BI uid \fR=\fPint -Instead of running as the invoking user, set the user ID to this value before -the thread/process does any work. -.TP -.BI gid \fR=\fPint -Set group ID, see \fBuid\fR. 
+Normally fio will delete the cgroups it has created after the job +completion. To override this behavior and to leave cgroups around after the +job completion, set `cgroup_nodelete=1'. This can be useful if one wants +to inspect various cgroup files after job completion. Default: false. .TP .BI flow_id \fR=\fPint -The ID of the flow. If not specified, it defaults to being a global flow. See -\fBflow\fR. +The ID of the flow. If not specified, it defaults to being a global +flow. See \fBflow\fR. .TP .BI flow \fR=\fPint -Weight in token-based flow control. If this value is used, then there is a -\fBflow counter\fR which is used to regulate the proportion of activity between -two or more jobs. fio attempts to keep this flow counter near zero. The +Weight in token\-based flow control. If this value is used, then there is +a 'flow counter' which is used to regulate the proportion of activity between +two or more jobs. Fio attempts to keep this flow counter near zero. The \fBflow\fR parameter stands for how much should be added or subtracted to the flow counter on each iteration of the main I/O loop. That is, if one job has -\fBflow=8\fR and another job has \fBflow=-1\fR, then there will be a roughly -1:8 ratio in how much one runs vs the other. +`flow=8' and another job has `flow=\-1', then there will be a roughly 1:8 +ratio in how much one runs vs the other. .TP .BI flow_watermark \fR=\fPint The maximum value that the absolute value of the flow counter is allowed to reach before the job must wait for a lower value of the counter. .TP .BI flow_sleep \fR=\fPint -The period of time, in microseconds, to wait after the flow watermark has been -exceeded before retrying operations +The period of time, in microseconds, to wait after the flow watermark has +been exceeded before retrying operations. +.TP +.BI stonewall "\fR,\fB wait_for_previous" +Wait for preceding jobs in the job file to exit, before starting this +one. 
Can be used to insert serialization points in the job file. A stone +wall also implies starting a new reporting group, see +\fBgroup_reporting\fR. +.TP +.BI exitall +By default, fio will continue running all other jobs when one job finishes +but sometimes this is not the desired action. Setting \fBexitall\fR will +instead make fio terminate all other jobs when one job finishes. +.TP +.BI exec_prerun \fR=\fPstr +Before running this job, issue the command specified through +\fBsystem\fR\|(3). Output is redirected in a file called `jobname.prerun.txt'. +.TP +.BI exec_postrun \fR=\fPstr +After the job completes, issue the command specified through +\fBsystem\fR\|(3). Output is redirected in a file called `jobname.postrun.txt'. +.TP +.BI uid \fR=\fPint +Instead of running as the invoking user, set the user ID to this value +before the thread/process does any work. +.TP +.BI gid \fR=\fPint +Set group ID, see \fBuid\fR. +.SS "Verification" +.TP +.BI verify_only +Do not perform specified workload, only verify data still matches previous +invocation of this workload. This option allows one to check data multiple +times at a later date without overwriting it. This option makes sense only +for workloads that write data, and does not support workloads with the +\fBtime_based\fR option set. +.TP +.BI do_verify \fR=\fPbool +Run the verify phase after a write phase. Only valid if \fBverify\fR is +set. Default: true. +.TP +.BI verify \fR=\fPstr +If writing to a file, fio can verify the file contents after each iteration +of the job. Each verification method also implies verification of special +header, which is written to the beginning of each block. This header also +includes meta information, like offset of the block, block number, timestamp +when block was written, etc. \fBverify\fR can be combined with +\fBverify_pattern\fR option. The allowed values are: +.RS +.RS +.TP +.B md5 +Use an md5 sum of the data area and store it in the header of +each block. 
+.TP +.B crc64 +Use an experimental crc64 sum of the data area and store it in the +header of each block. +.TP +.B crc32c +Use a crc32c sum of the data area and store it in the header of +each block. This will automatically use hardware acceleration +(e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will +fall back to software crc32c if none is found. Generally the +fastest checksum fio supports when hardware accelerated. +.TP +.B crc32c\-intel +Synonym for crc32c. +.TP +.B crc32 +Use a crc32 sum of the data area and store it in the header of each +block. +.TP +.B crc16 +Use a crc16 sum of the data area and store it in the header of each +block. +.TP +.B crc7 +Use a crc7 sum of the data area and store it in the header of each +block. +.TP +.B xxhash +Use xxhash as the checksum function. Generally the fastest software +checksum that fio supports. +.TP +.B sha512 +Use sha512 as the checksum function. +.TP +.B sha256 +Use sha256 as the checksum function. +.TP +.B sha1 +Use optimized sha1 as the checksum function. +.TP +.B sha3\-224 +Use optimized sha3\-224 as the checksum function. +.TP +.B sha3\-256 +Use optimized sha3\-256 as the checksum function. +.TP +.B sha3\-384 +Use optimized sha3\-384 as the checksum function. +.TP +.B sha3\-512 +Use optimized sha3\-512 as the checksum function. +.TP +.B meta +This option is deprecated, since now meta information is included in +generic verification header and meta verification happens by +default. For detailed information see the description of the +\fBverify\fR setting. This option is kept because of +compatibility's sake with old configurations. Do not use it. +.TP +.B pattern +Verify a strict pattern. Normally fio includes a header with some +basic information and checksumming, but if this option is set, only +the specific pattern set with \fBverify_pattern\fR is verified. +.TP +.B null +Only pretend to verify. Useful for testing internals with +`ioengine=null', not for much else. 
+.RE +.P +This option can be used for repeated burn\-in tests of a system to make sure +that the written data is also correctly read back. If the data direction +given is a read or random read, fio will assume that it should verify a +previously written file. If the data direction includes any form of write, +the verify will be of the newly written data. +.P +To avoid false verification errors, do not use the norandommap option when +verifying data with async I/O engines and I/O depths > 1. Or use the +norandommap and the lfsr random generator together to avoid writing to the +same offset with multiple outstanding I/Os. +.RE +.TP +.BI verify_offset \fR=\fPint +Swap the verification header with data somewhere else in the block before +writing. It is swapped back before verifying. +.TP +.BI verify_interval \fR=\fPint +Write the verification header at a finer granularity than the +\fBblocksize\fR. It will be written for chunks the size of +\fBverify_interval\fR. \fBblocksize\fR should divide this evenly. +.TP +.BI verify_pattern \fR=\fPstr +If set, fio will fill the I/O buffers with this pattern. Fio defaults to +filling with totally random bytes, but sometimes it's interesting to fill +with a known pattern for I/O verification purposes. Depending on the width +of the pattern, fio will fill 1/2/3/4 bytes of the buffer at a time (it can +be either a decimal or a hex number). The \fBverify_pattern\fR if larger than +a 32\-bit quantity has to be a hex number that starts with either "0x" or +"0X". Use with \fBverify\fR. Also, \fBverify_pattern\fR supports %o +format, which means that for each block offset will be written and then +verified back, e.g.: +.RS +.RS +.P +verify_pattern=%o +.RE +.P +Or use combination of everything: +.RS +.P +verify_pattern=0xff%o"abcd"\-12 +.RE +.RE +.TP +.BI verify_fatal \fR=\fPbool +Normally fio will keep checking the entire contents before quitting on a +block verification failure. 
If this option is set, fio will exit the job on +the first observed failure. Default: false. +.TP +.BI verify_dump \fR=\fPbool +If set, dump the contents of both the original data block and the data block +we read off disk to files. This allows later analysis to inspect just what +kind of data corruption occurred. Off by default. +.TP +.BI verify_async \fR=\fPint +Fio will normally verify I/O inline from the submitting thread. This option +takes an integer describing how many async offload threads to create for I/O +verification instead, causing fio to offload the duty of verifying I/O +contents to one or more separate threads. If using this offload option, even +sync I/O engines can benefit from using an \fBiodepth\fR setting higher +than 1, as it allows them to have I/O in flight while verifies are running. +Defaults to 0 async threads, i.e. verification is not asynchronous. +.TP +.BI verify_async_cpus \fR=\fPstr +Tell fio to set the given CPU affinity on the async I/O verification +threads. See \fBcpus_allowed\fR for the format used. +.TP +.BI verify_backlog \fR=\fPint +Fio will normally verify the written contents of a job that utilizes verify +once that job has completed. In other words, everything is written then +everything is read back and verified. You may want to verify continually +instead for a variety of reasons. Fio stores the meta data associated with +an I/O block in memory, so for large verify workloads, quite a bit of memory +would be used up holding this meta data. If this option is enabled, fio will +write only N blocks before verifying these blocks. +.TP +.BI verify_backlog_batch \fR=\fPint +Control how many blocks fio will verify if \fBverify_backlog\fR is +set. If not set, will default to the value of \fBverify_backlog\fR +(meaning the entire queue is read back and verified). 
If +\fBverify_backlog_batch\fR is less than \fBverify_backlog\fR then not all +blocks will be verified, if \fBverify_backlog_batch\fR is larger than +\fBverify_backlog\fR, some blocks will be verified more than once. +.TP +.BI verify_state_save \fR=\fPbool +When a job exits during the write phase of a verify workload, save its +current state. This allows fio to replay up until that point, if the verify +state is loaded for the verify read phase. The format of the filename is, +roughly: +.RS +.RS +.P +<type>\-<jobname>\-<jobindex>\-verify.state. +.RE +.P +<type> is "local" for a local run, "sock" for a client/server socket +connection, and "ip" (192.168.0.1, for instance) for a networked +client/server connection. Defaults to true. +.RE +.TP +.BI verify_state_load \fR=\fPbool +If a verify termination trigger was used, fio stores the current write state +of each thread. This can be used at verification time so that fio knows how +far it should verify. Without this information, fio will run a full +verification pass, according to the settings in the job file used. Default +false. +.TP +.BI trim_percentage \fR=\fPint +Number of verify blocks to discard/trim. +.TP +.BI trim_verify_zero \fR=\fPbool +Verify that trim/discarded blocks are returned as zeros. +.TP +.BI trim_backlog \fR=\fPint +Trim after this number of blocks are written, they will be discarded/trimmed. +.TP +.BI trim_backlog_batch \fR=\fPint +Trim this number of I/O blocks. +.TP +.BI experimental_verify \fR=\fPbool +Enable experimental verification. +.SS "Steady state" +.TP +.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float +Define the criterion and limit for assessing steady state performance. The +first parameter designates the criterion whereas the second parameter sets +the threshold. When the criterion falls below the threshold for the +specified duration, the job will stop. For example, `iops_slope:0.1%' will +direct fio to terminate the job when the least squares regression slope +falls below 0.1% of the mean IOPS. 
If \fBgroup_reporting\fR is enabled +this will apply to all jobs in the group. Below is the list of available +steady state assessment criteria. All assessments are carried out using only +data from the rolling collection window. Threshold limits can be expressed +as a fixed value or as a percentage of the mean in the collection window. +.RS +.P +When using this feature, most jobs should include the \fBtime_based\fR +and \fBruntime\fR options or the \fBloops\fR option so that fio does not +stop running after it has covered the full size of the specified file(s) +or device(s). +.RS +.RS +.TP +.B iops +Collect IOPS data. Stop the job if all individual IOPS measurements +are within the specified limit of the mean IOPS (e.g., `iops:2' +means that all individual IOPS values must be within 2 of the mean, +whereas `iops:0.2%' means that all individual IOPS values must be +within 0.2% of the mean IOPS to terminate the job). +.TP +.B iops_slope +Collect IOPS data and calculate the least squares regression +slope. Stop the job if the slope falls below the specified limit. +.TP +.B bw +Collect bandwidth data. Stop the job if all individual bandwidth +measurements are within the specified limit of the mean bandwidth. +.TP +.B bw_slope +Collect bandwidth data and calculate the least squares regression +slope. Stop the job if the slope falls below the specified limit. +.RE +.RE +.TP +.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime +A rolling window of this duration will be used to judge whether steady state +has been reached. Data will be collected once per second. The default is 0 +which disables steady state detection. When the unit is omitted, the +value is interpreted in seconds. +.TP +.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime +Allow the job to run for the specified duration before beginning data +collection for checking the steady state job termination criterion. The +default is 0. 
When the unit is omitted, the value is interpreted in seconds. +.SS "Measurements and reporting" +.TP +.BI per_job_logs \fR=\fPbool +If set, this generates bw/clat/iops log with per file private filenames. If +not set, jobs with identical names will share the log filename. Default: +true. +.TP +.BI group_reporting +It may sometimes be interesting to display statistics for groups of jobs as +a whole instead of for each individual job. This is especially true if +\fBnumjobs\fR is used; looking at individual thread/process output +quickly becomes unwieldy. To see the final report per\-group instead of +per\-job, use \fBgroup_reporting\fR. Jobs in a file will be part of the +same reporting group, unless separated by a \fBstonewall\fR, or by +using \fBnew_group\fR. +.TP +.BI new_group +Start a new reporting group. See: \fBgroup_reporting\fR. If not given, +all jobs in a file will be part of the same reporting group, unless +separated by a \fBstonewall\fR. +.TP +.BI stats \fR=\fPbool +By default, fio collects and shows final output results for all jobs +that run. If this option is set to 0, then fio will ignore it in +the final stat output. +.TP +.BI write_bw_log \fR=\fPstr +If given, write a bandwidth log for this job. Can be used to store data of +the bandwidth of the jobs in their lifetime. +.RS +.P +If no str argument is given, the default filename of +`jobname_type.x.log' is used. Even when the argument is given, fio +will still append the type of log. So if one specifies: +.RS +.P +write_bw_log=foo +.RE +.P +The actual log name will be `foo_bw.x.log' where `x' is the index +of the job (1..N, where N is the number of jobs). If +\fBper_job_logs\fR is false, then the filename will not include the +`.x' job index. +.P +The included \fBfio_generate_plots\fR script uses gnuplot to turn these +text files into nice graphs. See the \fBLOG FILE FORMATS\fR section for how data is +structured within the file. 
+.RE +.TP +.BI write_lat_log \fR=\fPstr +Same as \fBwrite_bw_log\fR, except this option creates I/O +submission (e.g., `name_slat.x.log'), completion (e.g., +`name_clat.x.log'), and total (e.g., `name_lat.x.log') latency +files instead. See \fBwrite_bw_log\fR for details about the +filename format and the \fBLOG FILE FORMATS\fR section for how data is structured +within the files. +.TP +.BI write_hist_log \fR=\fPstr +Same as \fBwrite_bw_log\fR but writes an I/O completion latency +histogram file (e.g., `name_hist.x.log') instead. Note that this +file will be empty unless \fBlog_hist_msec\fR has also been set. +See \fBwrite_bw_log\fR for details about the filename format and +the \fBLOG FILE FORMATS\fR section for how data is structured +within the file. +.TP +.BI write_iops_log \fR=\fPstr +Same as \fBwrite_bw_log\fR, but writes an IOPS file (e.g. +`name_iops.x.log') instead. Because fio defaults to individual +I/O logging, the value entry in the IOPS log will be 1 unless windowed +logging (see \fBlog_avg_msec\fR) has been enabled. See +\fBwrite_bw_log\fR for details about the filename format and \fBLOG +FILE FORMATS\fR for how data is structured within the file. +.TP +.BI log_avg_msec \fR=\fPint +By default, fio will log an entry in the iops, latency, or bw log for every +I/O that completes. When writing to the disk log, that can quickly grow to a +very large size. Setting this option makes fio average each log entry +over the specified period of time, reducing the resolution of the log. See +\fBlog_max_value\fR as well. Defaults to 0, logging all entries. +Also see \fBLOG FILE FORMATS\fR section. +.TP +.BI log_hist_msec \fR=\fPint +Same as \fBlog_avg_msec\fR, but logs entries for completion latency +histograms. Computing latency percentiles from averages of intervals using +\fBlog_avg_msec\fR is inaccurate. 
Setting this option makes fio log +histogram entries over the specified period of time, reducing log sizes for +high IOPS devices while retaining percentile accuracy. See +\fBlog_hist_coarseness\fR and \fBwrite_hist_log\fR as well. +Defaults to 0, meaning histogram logging is disabled. +.TP +.BI log_hist_coarseness \fR=\fPint +Integer ranging from 0 to 6, defining the coarseness of the resolution of +the histogram logs enabled with \fBlog_hist_msec\fR. For each increment +in coarseness, fio outputs half as many bins. Defaults to 0, for which +histogram logs contain 1216 latency bins. See \fBLOG FILE FORMATS\fR section. +.TP +.BI log_max_value \fR=\fPbool +If \fBlog_avg_msec\fR is set, fio logs the average over that window. If +you instead want to log the maximum value, set this option to 1. Defaults to +0, meaning that averaged values are logged. +.TP +.BI log_offset \fR=\fPbool +If this is set, the iolog options will include the byte offset for the I/O +entry as well as the other data values. Defaults to 0 meaning that +offsets are not present in logs. Also see \fBLOG FILE FORMATS\fR section. +.TP +.BI log_compression \fR=\fPint +If this is set, fio will compress the I/O logs as it goes, to keep the +memory footprint lower. When a log reaches the specified size, that chunk is +removed and compressed in the background. Given that I/O logs are fairly +highly compressible, this yields a nice memory savings for longer runs. The +downside is that the compression will consume some background CPU cycles, so +it may impact the run. This, however, is also true if the logging ends up +consuming most of the system memory. So pick your poison. The I/O logs are +saved normally at the end of a run, by decompressing the chunks and storing +them in the specified log file. This feature depends on the availability of +zlib. +.TP +.BI log_compression_cpus \fR=\fPstr +Define the set of CPUs that are allowed to handle online log compression for +the I/O jobs. 
This can provide better isolation between performance +sensitive jobs, and background compression work. See \fBcpus_allowed\fR for +the format used. +.TP +.BI log_store_compressed \fR=\fPbool +If set, fio will store the log files in a compressed format. They can be +decompressed with fio, using the \fB\-\-inflate\-log\fR command line +parameter. The files will be stored with a `.fz' suffix. +.TP +.BI log_unix_epoch \fR=\fPbool +If set, fio will log Unix timestamps to the log files produced by enabling +write_type_log for each log type, instead of the default zero\-based +timestamps. +.TP +.BI block_error_percentiles \fR=\fPbool +If set, record errors in trim block\-sized units from writes and trims and +output a histogram of how many trims it took to get to errors, and what kind +of error was encountered. +.TP +.BI bwavgtime \fR=\fPint +Average the calculated bandwidth over the given time. Value is specified in +milliseconds. If the job also does bandwidth logging through +\fBwrite_bw_log\fR, then the minimum of this option and +\fBlog_avg_msec\fR will be used. Default: 500ms. +.TP +.BI iopsavgtime \fR=\fPint +Average the calculated IOPS over the given time. Value is specified in +milliseconds. If the job also does IOPS logging through +\fBwrite_iops_log\fR, then the minimum of this option and +\fBlog_avg_msec\fR will be used. Default: 500ms. +.TP +.BI disk_util \fR=\fPbool +Generate disk utilization statistics, if the platform supports it. +Default: true. +.TP +.BI disable_lat \fR=\fPbool +Disable measurements of total latency numbers. Useful only for cutting back +the number of calls to \fBgettimeofday\fR\|(2), as that does impact +performance at really high IOPS rates. Note that to really get rid of a +large amount of these calls, this option must be used with +\fBdisable_slat\fR and \fBdisable_bw_measurement\fR as well. +.TP +.BI disable_clat \fR=\fPbool +Disable measurements of completion latency numbers. See +\fBdisable_lat\fR. 
+.TP +.BI disable_slat \fR=\fPbool +Disable measurements of submission latency numbers. See +\fBdisable_lat\fR. +.TP +.BI disable_bw_measurement \fR=\fPbool "\fR,\fP disable_bw" \fR=\fPbool +Disable measurements of throughput/bandwidth numbers. See +\fBdisable_lat\fR. .TP .BI clat_percentiles \fR=\fPbool -Enable the reporting of percentiles of completion latencies. +Enable the reporting of percentiles of completion latencies. This option is +mutually exclusive with \fBlat_percentiles\fR. +.TP +.BI lat_percentiles \fR=\fPbool +Enable the reporting of percentiles of I/O latencies. This is similar to +\fBclat_percentiles\fR, except that this includes the submission latency. +This option is mutually exclusive with \fBclat_percentiles\fR. .TP .BI percentile_list \fR=\fPfloat_list -Overwrite the default list of percentiles for completion -latencies. Each number is a floating number in the range (0,100], and -the maximum length of the list is 20. Use ':' to separate the -numbers. For example, \-\-percentile_list=99.5:99.9 will cause fio to -report the values of completion latency below which 99.5% and 99.9% of -the observed latencies fell, respectively. -.SS "Ioengine Parameters List" -Some parameters are only valid when a specific ioengine is in use. These are -used identically to normal parameters, with the caveat that when used on the -command line, the must come after the ioengine that defines them is selected. +Overwrite the default list of percentiles for completion latencies and the +block error histogram. Each number is a floating number in the range +(0,100], and the maximum length of the list is 20. Use ':' to separate the +numbers, and list the numbers in ascending order. For example, +`\-\-percentile_list=99.5:99.9' will cause fio to report the values of +completion latency below which 99.5% and 99.9% of the observed latencies +fell, respectively. 
+.TP +.BI significant_figures \fR=\fPint +If using \fB\-\-output\-format\fR of `normal', set the significant figures +to this value. Higher values will yield more precise IOPS and throughput +units, while lower values will round. Requires a minimum value of 1 and a +maximum value of 10. Defaults to 4. +.SS "Error handling" +.TP +.BI exitall_on_error +When one job finishes in error, terminate the rest. The default is to wait +for each job to finish. +.TP +.BI continue_on_error \fR=\fPstr +Normally fio will exit the job on the first observed failure. If this option +is set, fio will continue the job when there is a 'non\-fatal error' (EIO or +EILSEQ) until the runtime is exceeded or the I/O size specified is +completed. If this option is used, there are two more stats that are +appended, the total error count and the first error. The error field given +in the stats is the first error that was hit during the run. +The allowed values are: +.RS +.RS .TP -.BI (cpu)cpuload \fR=\fPint -Attempt to use the specified percentage of CPU cycles. +.B none +Exit on any I/O or verify errors. +.TP +.B read +Continue on read errors, exit on all others. +.TP +.B write +Continue on write errors, exit on all others. +.TP +.B io +Continue on any I/O error, exit on all others. .TP -.BI (cpu)cpuchunks \fR=\fPint -Split the load into cycles of the given time. In microseconds. +.B verify +Continue on verify errors, exit on all others. .TP -.BI (libaio)userspace_reap -Normally, with the libaio engine in use, fio will use -the io_getevents system call to reap newly returned events. -With this flag turned on, the AIO ring will be read directly -from user-space to reap events. The reaping mode is only -enabled when polling for a minimum of 0 events (eg when -iodepth_batch_complete=0). -.TP -.BI (net,netsplice)hostname \fR=\fPstr -The host name or IP address to use for TCP or UDP based IO. 
-If the job is a TCP listener or UDP reader, the hostname is not -used and must be omitted unless it is a valid UDP multicast address. -.TP -.BI (net,netsplice)port \fR=\fPint -The TCP or UDP port to bind to or connect to. -.TP -.BI (net,netsplice)interface \fR=\fPstr -The IP address of the network interface used to send or receive UDP multicast -packets. +.B all +Continue on all errors. .TP -.BI (net,netsplice)ttl \fR=\fPint -Time-to-live value for outgoing UDP multicast packets. Default: 1 +.B 0 +Backward\-compatible alias for 'none'. .TP -.BI (net,netsplice)nodelay \fR=\fPbool -Set TCP_NODELAY on TCP connections. +.B 1 +Backward\-compatible alias for 'all'. +.RE +.RE .TP -.BI (net,netsplice)protocol \fR=\fPstr "\fR,\fP proto" \fR=\fPstr -The network protocol to use. Accepted values are: +.BI ignore_error \fR=\fPstr +Sometimes you want to ignore some errors during test in that case you can +specify error list for each error type, instead of only being able to +ignore the default 'non\-fatal error' using \fBcontinue_on_error\fR. +`ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST' errors for +given error type is separated with ':'. Error may be symbol ('ENOSPC', 'ENOMEM') +or integer. Example: .RS .RS +.P +ignore_error=EAGAIN,ENOSPC:122 +.RE +.P +This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from +WRITE. This option works by overriding \fBcontinue_on_error\fR with +the list of errors for each error type if any. +.RE .TP -.B tcp -Transmission control protocol +.BI error_dump \fR=\fPbool +If set dump every error even if it is non fatal, true by default. If +disabled only fatal error will be dumped. +.SS "Running predefined workloads" +Fio includes predefined profiles that mimic the I/O workloads generated by +other tools. .TP -.B udp -User datagram protocol +.BI profile \fR=\fPstr +The predefined workload to run. 
Current profiles are: +.RS +.RS .TP -.B unix -UNIX domain socket +.B tiobench +Threaded I/O bench (tiotest/tiobench) like workload. +.TP +.B act +Aerospike Certification Tool (ACT) like workload. +.RE .RE .P -When the protocol is TCP or UDP, the port must also be given, -as well as the hostname if the job is a TCP listener or UDP -reader. For unix sockets, the normal filename option should be -used and the port is invalid. +To view a profile's additional options use \fB\-\-cmdhelp\fR after specifying +the profile. For example: +.RS +.TP +$ fio \-\-profile=act \-\-cmdhelp .RE +.SS "Act profile options" .TP -.BI (net,netsplice)listen -For TCP network connections, tell fio to listen for incoming -connections rather than initiating an outgoing connection. The -hostname must be omitted if this option is used. +.BI device\-names \fR=\fPstr +Devices to use. .TP -.BI (net, pingpong) \fR=\fPbool -Normaly a network writer will just continue writing data, and a network reader -will just consume packages. If pingpong=1 is set, a writer will send its normal -payload to the reader, then wait for the reader to send the same payload back. -This allows fio to measure network latencies. The submission and completion -latencies then measure local time spent sending or receiving, and the -completion latency measures how long it took for the other end to receive and -send back. For UDP multicast traffic pingpong=1 should only be set for a single -reader when multiple readers are listening to the same address. +.BI load \fR=\fPint +ACT load multiplier. Default: 1. .TP -.BI (e4defrag,donorname) \fR=\fPstr -File will be used as a block donor (swap extents between files) +.BI test\-duration\fR=\fPtime +How long the entire test takes to run. When the unit is omitted, the value +is given in seconds. Default: 24h. 
.TP -.BI (e4defrag,inplace) \fR=\fPint -Configure donor file block allocation strategy -.RS -.BI 0(default) : -Preallocate donor's file on init +.BI threads\-per\-queue\fR=\fPint +Number of read I/O threads per device. Default: 8. .TP -.BI 1: -allocate space immidietly inside defragment event, and free right after event -.RE +.BI read\-req\-num\-512\-blocks\fR=\fPint +Number of 512B blocks to read at the time. Default: 3. +.TP +.BI large\-block\-op\-kbytes\fR=\fPint +Size of large block ops in KiB (writes). Default: 131072. +.TP +.BI prep +Set to run ACT prep phase. +.SS "Tiobench profile options" +.TP +.BI size\fR=\fPstr +Size in MiB. .TP +.BI block\fR=\fPint +Block size in bytes. Default: 4096. +.TP +.BI numruns\fR=\fPint +Number of runs. +.TP +.BI dir\fR=\fPstr +Test directory. +.TP +.BI threads\fR=\fPint +Number of threads. .SH OUTPUT -While running, \fBfio\fR will display the status of the created jobs. For -example: -.RS +Fio spits out a lot of output. While running, fio will display the status of the +jobs created. An example of that would be: .P -Threads: 1: [_r] [24.8% done] [ 13509/ 8334 kb/s] [eta 00h:01m:31s] -.RE -.P -The characters in the first set of brackets denote the current status of each -threads. The possible values are: -.P -.PD 0 +.nf + Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s] +.fi +.P +The characters inside the first set of square brackets denote the current status of +each thread. The first character is the first job defined in the job file, and so +forth. The possible values (in typical life cycle order) are: .RS .TP +.PD 0 .B P -Setup but not started. +Thread setup, but not started. .TP .B C Thread created. .TP .B I -Initialized, waiting. +Thread initialized, waiting or generating necessary data. +.TP +.B p +Thread running pre\-reading file(s). +.TP +.B / +Thread is in ramp period. .TP .B R Running, doing sequential reads. 
@@ -1333,279 +3154,805 @@ .B m Running, doing mixed random reads/writes. .TP +.B D +Running, doing sequential trims. +.TP +.B d +Running, doing random trims. +.TP .B F Running, currently waiting for \fBfsync\fR\|(2). .TP .B V -Running, verifying written data. +Running, doing verification of written data. +.TP +.B f +Thread finishing. .TP .B E -Exited, not reaped by main thread. +Thread exited, not reaped by main thread yet. .TP .B \- -Exited, thread reaped. -.RE -.PD -.P -The second set of brackets shows the estimated completion percentage of -the current group. The third set shows the read and write I/O rate, -respectively. Finally, the estimated run time of the job is displayed. -.P -When \fBfio\fR completes (or is interrupted by Ctrl-C), it will show data -for each thread, each group of threads, and each disk, in that order. -.P -Per-thread statistics first show the threads client number, group-id, and -error code. The remaining figures are as follows: -.RS +Thread reaped. .TP -.B io -Number of megabytes of I/O performed. -.TP -.B bw -Average data rate (bandwidth). +.B X +Thread reaped, exited with an error. .TP -.B runt -Threads run time. +.B K +Thread reaped, exited due to signal. +.PD +.RE +.P +Fio will condense the thread string as not to take up more space on the command +line than needed. For instance, if you have 10 readers and 10 writers running, +the output would look like this: +.P +.nf + Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s] +.fi +.P +Note that the status string is displayed in order, so it's possible to tell which of +the jobs are currently doing what. In the example above this means that jobs 1\-\-10 +are readers and 11\-\-20 are writers. 
+.P +The other values are fairly self explanatory \-\- number of threads currently +running and doing I/O, the number of currently open files (f=), the estimated +completion percentage, the rate of I/O since last check (read speed listed first, +then write speed and optionally trim speed) in terms of bandwidth and IOPS, +and time to completion for the current running group. It's impossible to estimate +runtime of the following groups (if any). +.P +When fio is done (or interrupted by Ctrl\-C), it will show the data for +each thread, group of threads, and disks in that order. For each overall thread (or +group) the output looks like: +.P +.nf + Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017 + write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec) + slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50 + clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31 + lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79 + clat percentiles (usec): + | 1.00th=[ 302], 5.00th=[ 326], 10.00th=[ 343], 20.00th=[ 363], + | 30.00th=[ 392], 40.00th=[ 404], 50.00th=[ 416], 60.00th=[ 445], + | 70.00th=[ 816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627], + | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877], + | 99.99th=[78119] + bw ( KiB/s): min= 532, max= 686, per=0.10%, avg=622.87, stdev=24.82, samples= 100 + iops : min= 76, max= 98, avg=88.98, stdev= 3.54, samples= 100 + lat (usec) : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79% + lat (msec) : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37% + lat (msec) : 100=0.65% + cpu : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21 + IO depths : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0% + submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0 + latency : target=0, window=0, percentile=100.00%, depth=8 +.fi +.P 
+The job name (or first job's name when using \fBgroup_reporting\fR) is printed, +along with the group id, count of jobs being aggregated, last error id seen (which +is 0 when there are no errors), pid/tid of that thread and the time the job/group +completed. Below are the I/O statistics for each data direction performed (showing +writes in the example above). In the order listed, they denote: +.RS +.TP +.B read/write/trim +The string before the colon shows the I/O direction the statistics +are for. \fIIOPS\fR is the average I/Os performed per second. \fIBW\fR +is the average bandwidth rate shown as: value in power of 2 format +(value in power of 10 format). The last two values show: (total +I/O performed in power of 2 format / \fIruntime\fR of that thread). .TP .B slat -Submission latency minimum, maximum, average and standard deviation. This is -the time it took to submit the I/O. +Submission latency (\fImin\fR being the minimum, \fImax\fR being the +maximum, \fIavg\fR being the average, \fIstdev\fR being the standard +deviation). This is the time it took to submit the I/O. For +sync I/O this row is not displayed as the slat is really the +completion latency (since queue/complete is one operation there). +This value can be in nanoseconds, microseconds or milliseconds \-\-\- +fio will choose the most appropriate base and print that (in the +example above nanoseconds was the best scale). Note: in \fB\-\-minimal\fR mode +latencies are always expressed in microseconds. .TP .B clat -Completion latency minimum, maximum, average and standard deviation. This -is the time between submission and completion. +Completion latency. Same names as slat, this denotes the time from +submission to completion of the I/O pieces. For sync I/O, clat will +usually be equal (or very close) to 0, as the time from submit to +complete is basically just CPU time (I/O has already been done, see slat +explanation). +.TP +.B lat +Total latency. 
Same names as slat and clat, this denotes the time from +when fio created the I/O unit to completion of the I/O operation. .TP .B bw -Bandwidth minimum, maximum, percentage of aggregate bandwidth received, average -and standard deviation. +Bandwidth statistics based on samples. Same names as the xlat stats, +but also includes the number of samples taken (\fIsamples\fR) and an +approximate percentage of total aggregate bandwidth this thread +received in its group (\fIper\fR). This last value is only really +useful if the threads in this group are on the same disk, since they +are then competing for disk access. +.TP +.B iops +IOPS statistics based on samples. Same names as \fBbw\fR. +.TP +.B lat (nsec/usec/msec) +The distribution of I/O completion latencies. This is the time from when +I/O leaves fio and when it gets completed. Unlike the separate +read/write/trim sections above, the data here and in the remaining +sections apply to all I/Os for the reporting group. 250=0.04% means that +0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11% +of the I/Os required 250 to 499us for completion. .TP .B cpu -CPU usage statistics. Includes user and system time, number of context switches -this thread went through and number of major and minor page faults. +CPU usage. User and system time, along with the number of context +switches this thread went through, usage of system and user time, and +finally the number of major and minor page faults. The CPU utilization +numbers are averages for the jobs in that reporting group, while the +context and fault counters are summed. .TP .B IO depths -Distribution of I/O depths. Each depth includes everything less than (or equal) -to it, but greater than the previous depth. -.TP -.B IO issued -Number of read/write requests issued, and number of short read/write requests. -.TP -.B IO latencies -Distribution of I/O completion latencies. The numbers follow the same pattern -as \fBIO depths\fR. 
-.RE +The distribution of I/O depths over the job lifetime. The numbers are +divided into powers of 2 and each entry covers depths from that value +up to those that are lower than the next entry \-\- e.g., 16= covers +depths from 16 to 31. Note that the range covered by a depth +distribution entry can be different to the range covered by the +equivalent \fBsubmit\fR/\fBcomplete\fR distribution entry. +.TP +.B IO submit +How many pieces of I/O were submitted in a single submit call. Each +entry denotes that amount and below, until the previous entry \-\- e.g., +16=100% means that we submitted anywhere between 9 and 16 I/Os per submit +call. Note that the range covered by a \fBsubmit\fR distribution entry can +be different to the range covered by the equivalent depth distribution +entry. +.TP +.B IO complete +Like the above \fBsubmit\fR number, but for completions instead. +.TP +.B IO issued rwt +The number of \fBread/write/trim\fR requests issued, and how many of them were +short or dropped. +.TP +.B IO latency +These values are for \fBlatency_target\fR and related options. When +these options are engaged, this section describes the I/O depth required +to meet the specified latency target. +.RE +.P +After each client has been listed, the group statistics are printed. They +will look like this: +.P +.nf + Run status group 0 (all jobs): + READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s\-10.8MiB/s (10.9MB/s\-11.3MB/s), io=64.0MiB (67.1MB), run=2973\-3069msec + WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s\-621KiB/s (630kB/s\-636kB/s), io=64.0MiB (67.1MB), run=52747\-53223msec +.fi .P -The group statistics show: -.PD 0 +For each data direction it prints: .RS .TP -.B io -Number of megabytes I/O performed. -.TP -.B aggrb -Aggregate bandwidth of threads in the group. -.TP -.B minb -Minimum average bandwidth a thread saw. -.TP -.B maxb -Maximum average bandwidth a thread saw.
+.B bw +Aggregate bandwidth of threads in this group followed by the +minimum and maximum bandwidth of all the threads in this group. +Values outside of brackets are power\-of\-2 format and those +within are the equivalent value in a power\-of\-10 format. .TP -.B mint -Shortest runtime of threads in the group. +.B io +Aggregate I/O performed of all threads in this group. The +format is the same as \fBbw\fR. .TP -.B maxt -Longest runtime of threads in the group. +.B run +The smallest and longest runtimes of the threads in this group. .RE -.PD .P -Finally, disk statistics are printed with reads first: -.PD 0 +And finally, the disk statistics are printed. This is Linux specific. +They will look like this: +.P +.nf + Disk stats (read/write): + sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% +.fi +.P +Each value is printed for both reads and writes, with reads first. The +numbers denote: .RS .TP .B ios Number of I/Os performed by all groups. .TP .B merge -Number of merges in the I/O scheduler. +Number of merges performed by the I/O scheduler. .TP .B ticks Number of ticks we kept the disk busy. .TP -.B io_queue +.B in_queue Total time spent in the disk queue. .TP .B util -Disk utilization. +The disk utilization. A value of 100% means we kept the disk +busy constantly, 50% would be a disk idling half of the time. .RE -.PD .P -It is also possible to get fio to dump the current output while it is -running, without terminating the job. To do that, send fio the \fBUSR1\fR -signal. +It is also possible to get fio to dump the current output while it is running, +without terminating the job. To do that, send fio the USR1 signal. You can +also get regularly timed dumps by using the \fB\-\-status\-interval\fR +parameter, or by creating a file in `/tmp' named +`fio\-dump\-status'. If fio sees this file, it will unlink it and dump the +current output status. 
.SH TERSE OUTPUT -If the \fB\-\-minimal\fR option is given, the results will be printed in a -semicolon-delimited format suitable for scripted use - a job description -(if provided) follows on a new line. Note that the first -number in the line is the version number. If the output has to be changed -for some reason, this number will be incremented by 1 to signify that -change. The fields are: +For scripted usage where you typically want to generate tables or graphs of the +results, fio can output the results in a semicolon separated format. The format +is one long line of values, such as: +.P +.nf + 2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00% + A description of this job goes here. +.fi +.P +The job description (if provided) follows on a second line for terse v2. +It appears on the same line for other terse versions. +.P +To enable terse output, use the \fB\-\-minimal\fR or +`\-\-output\-format=terse' command line options. The +first value is the version of the terse output format. If the output has to be +changed for some reason, this number will be incremented by 1 to signify that +change. 
.P +Split up, the format is as follows (comments in brackets denote when a +field was introduced or whether it's specific to some terse version): +.P +.nf + terse version, fio version [v3], jobname, groupid, error +.fi .RS -.B terse version, fio version, jobname, groupid, error .P -Read status: +.B +READ status: +.RE +.P +.nf + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples +.fi .RS -.B Total I/O \fR(KB)\fP, bandwidth \fR(KB/s)\fP, IOPS, runtime \fR(ms)\fP .P -Submission latency: +.B +WRITE status: +.RE +.P +.nf + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples +.fi .RS -.B min, max, mean, standard deviation +.P +.B +TRIM status [all but version 3]: .RE -Completion latency: +.P +.nf + Fields are similar to \fBREAD/WRITE\fR status. 
+.fi .RS -.B min, max, mean, standard deviation +.P +.B +CPU usage: .RE -Completion latency percentiles (20 fields): +.P +.nf + user, system, context switches, major faults, minor faults +.fi .RS -.B Xth percentile=usec +.P +.B +I/O depths: .RE -Total latency: +.P +.nf + <=1, 2, 4, 8, 16, 32, >=64 +.fi .RS -.B min, max, mean, standard deviation +.P +.B +I/O latencies microseconds: .RE -Bandwidth: +.P +.nf + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000 +.fi .RS -.B min, max, aggregate percentage of total, mean, standard deviation +.P +.B +I/O latencies milliseconds: .RE +.P +.nf + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 +.fi +.RS +.P +.B +Disk utilization [v3]: .RE .P -Write status: +.nf + disk name, read ios, write ios, read merges, write merges, read ticks, write ticks, time spent in queue, disk utilization percentage +.fi .RS -.B Total I/O \fR(KB)\fP, bandwidth \fR(KB/s)\fP, IOPS, runtime \fR(ms)\fP .P -Submission latency: +.B +Additional Info (dependent on continue_on_error, default off): +.RE +.P +.nf + total # errors, first error code +.fi .RS -.B min, max, mean, standard deviation +.P +.B +Additional Info (dependent on description being set): .RE -Completion latency: +.P +.nf + Text description +.fi +.P +Completion latency percentiles can be a grouping of up to 20 sets, so for the +terse output fio writes all of them. Each field will look like this: +.P +.nf + 1.00%=6112 +.fi +.P +which is the Xth percentile, and the `usec' latency associated with it. +.P +For \fBDisk utilization\fR, all disks used by fio are shown. So for each disk there +will be a disk utilization section. 
+.P +Below is a single line containing short names for each of the fields in the +minimal output v3, separated by semicolons: +.P +.nf + terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util +.fi +.P +In client/server mode terse output differs from what appears when jobs are run +locally. 
Disk utilization data is omitted from the standard terse output and +for v3 and later appears on its own separate line at the end of each terse +reporting cycle. +.SH JSON OUTPUT +The \fBjson\fR output format is intended to be both human readable and convenient +for automated parsing. For the most part its sections mirror those of the +\fBnormal\fR output. The \fBruntime\fR value is reported in msec and the \fBbw\fR value is +reported in 1024 bytes per second units. +.fi +.SH JSON+ OUTPUT +The \fBjson+\fR output format is identical to the \fBjson\fR output format except that it +adds a full dump of the completion latency bins. Each \fBbins\fR object contains a +set of (key, value) pairs where keys are latency durations and values count how +many I/Os had completion latencies of the corresponding duration. For example, +consider: .RS -.B min, max, mean, standard deviation +.P +"bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... } .RE -Completion latency percentiles (20 fields): +.P +This data indicates that one I/O required 87,552ns to complete, two I/Os required +100,864ns to complete, and 7529 I/Os required 107,008ns to complete. +.P +Also included with fio is a Python script \fBfio_jsonplus_clat2csv\fR that takes +json+ output and generates CSV\-formatted latency data suitable for plotting. +.P +The latency durations actually represent the midpoints of latency intervals. +For details refer to `stat.h' in the fio source. +.SH TRACE FILE FORMAT +There are two trace file format that you can encounter. The older (v1) format is +unsupported since version 1.20\-rc3 (March 2008). It will still be described +below in case that you get an old trace and want to understand it. +.P +In any case the trace is a simple text file with a single action per line. 
+.TP +.B Trace file format v1 +Each line represents a single I/O action in the following format: +.RS .RS -.B Xth percentile=usec +.P +rw, offset, length +.RE +.P +where `rw=0/1' for read/write, and the `offset' and `length' entries being in bytes. +.P +This format is not supported in fio versions >= 1.20\-rc3. .RE -Total latency: +.TP +.B Trace file format v2 +The second version of the trace file format was added in fio version 1.17. It +allows to access more then one file per trace and has a bigger set of possible +file actions. +.RS +.P +The first line of the trace file has to be: .RS -.B min, max, mean, standard deviation +.P +"fio version 2 iolog" .RE -Bandwidth: +.P +Following this can be lines in two different formats, which are described below. +.P +.B +The file management format: +.RS +filename action +.P +The `filename' is given as an absolute path. The `action' can be one of these: .RS -.B min, max, aggregate percentage of total, mean, standard deviation +.TP +.B add +Add the given `filename' to the trace. +.TP +.B open +Open the file with the given `filename'. The `filename' has to have +been added with the \fBadd\fR action before. +.TP +.B close +Close the file with the given `filename'. The file has to have been +\fBopen\fRed before. .RE .RE .P -CPU usage: +.B +The file I/O action format: +.RS +filename action offset length +.P +The `filename' is given as an absolute path, and has to have been \fBadd\fRed and +\fBopen\fRed before it can be used with this format. The `offset' and `length' are +given in bytes. The `action' can be one of these: +.RS +.TP +.B wait +Wait for `offset' microseconds. Everything below 100 is discarded. +The time is relative to the previous `wait' statement. +.TP +.B read +Read `length' bytes beginning from `offset'. +.TP +.B write +Write `length' bytes beginning from `offset'. +.TP +.B sync +\fBfsync\fR\|(2) the file. +.TP +.B datasync +\fBfdatasync\fR\|(2) the file. 
+.TP +.B trim +Trim the given file from the given `offset' for `length' bytes. +.RE +.RE +.SH I/O REPLAY \- MERGING TRACES +Colocation is a common practice used to get the most out of a machine. +Knowing which workloads play nicely with each other and which ones don't is +a much harder task. While fio can replay workloads concurrently via multiple +jobs, it leaves some variability up to the scheduler making results harder to +reproduce. Merging is a way to make the order of events consistent. +.P +Merging is integrated into I/O replay and done when a \fBmerge_blktrace_file\fR +is specified. The list of files passed to \fBread_iolog\fR goes through the merge +process and outputs a single file stored to the specified file. The output file is +passed on as if it were the only file passed to \fBread_iolog\fR. An example would +look like: +.RS +.P +$ fio \-\-read_iolog="<file1>:<file2>" \-\-merge_blktrace_file="<merged_file>" +.RE +.P +Creating only the merged file can be done by passing the command line argument +\fBmerge-blktrace-only\fR. +.P +Scaling traces can be done to see the relative impact of any particular trace +being slowed down or sped up. \fBmerge_blktrace_scalars\fR takes in a colon +separated list of percentage scalars. It is index paired with the files passed +to \fBread_iolog\fR. +.P +With scaling, it may be desirable to match the running time of all traces. +This can be done with \fBmerge_blktrace_iters\fR. It is index paired with +\fBread_iolog\fR just like \fBmerge_blktrace_scalars\fR. +.P +In an example, given two traces, A and B, each 60s long. If we want to see +the impact of trace A issuing IOs twice as fast and repeat trace A over the +runtime of trace B, the following can be done: +.RS +.P +$ fio \-\-read_iolog="<file1>:<file2>" \-\-merge_blktrace_file="<merged_file>" \-\-merge_blktrace_scalars="50:100" \-\-merge_blktrace_iters="2:1" +.RE +.P +This runs trace A at 2x the speed twice for approximately the same runtime as +a single run of trace B.
+.SH CPU IDLENESS PROFILING +In some cases, we want to understand CPU overhead in a test. For example, we +test patches for the specific goodness of whether they reduce CPU usage. +Fio implements a balloon approach to create a thread per CPU that runs at idle +priority, meaning that it only runs when nobody else needs the cpu. +By measuring the amount of work completed by the thread, idleness of each CPU +can be derived accordingly. +.P +An unit work is defined as touching a full page of unsigned characters. Mean and +standard deviation of time to complete an unit work is reported in "unit work" +section. Options can be chosen to report detailed percpu idleness or overall +system idleness by aggregating percpu stats. +.SH VERIFICATION AND TRIGGERS +Fio is usually run in one of two ways, when data verification is done. The first +is a normal write job of some sort with verify enabled. When the write phase has +completed, fio switches to reads and verifies everything it wrote. The second +model is running just the write phase, and then later on running the same job +(but with reads instead of writes) to repeat the same I/O patterns and verify +the contents. Both of these methods depend on the write phase being completed, +as fio otherwise has no idea how much data was written. +.P +With verification triggers, fio supports dumping the current write state to +local files. Then a subsequent read verify workload can load this state and know +exactly where to stop. This is useful for testing cases where power is cut to a +server in a managed fashion, for instance. +.P +A verification trigger consists of two things: .RS -.B user, system, context switches, major page faults, minor page faults +.P +1) Storing the write state of each job. +.P +2) Executing a trigger command. +.RE +.P +The write state is relatively small, on the order of hundreds of bytes to single +kilobytes. It contains information on the number of completions done, the last X +completions, etc. 
+.P
+A trigger is invoked either through creation ('touch') of a specified file in
+the system, or through a timeout setting. If fio is run with
+`\-\-trigger\-file=/tmp/trigger\-file', then it will continually
+check for the existence of `/tmp/trigger\-file'. When it sees this file, it
+will fire off the trigger (thus saving state, and executing the trigger
+command).
+.P
+For client/server runs, there's both a local and remote trigger. If fio is
+running as a server backend, it will send the job states back to the client for
+safe storage, then execute the remote trigger, if specified. If a local trigger
+is specified, the server will still send back the write state, but the client
+will then execute the trigger.
+.RE
+.P
-IO depth distribution:
+.B Verification trigger example
+.RS
+Let's say we want to run a powercut test on the remote Linux machine 'server'.
+Our write workload is in `write\-test.fio'. We want to cut power to 'server' at
+some point during the run, and we'll run this test from the safety of our local
+machine, 'localbox'. On the server, we'll start the fio backend normally:
.RS
-.B <=1, 2, 4, 8, 16, 32, >=64
+.P
+server# fio \-\-server
.RE
.P
-IO latency distribution:
+and on the client, we'll fire off the workload:
.RS
-Microseconds:
+.P
+localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger\-remote="bash \-c \"echo b > /proc/sysrq\-trigger\""
+.RE
+.P
+We set `/tmp/my\-trigger' as the trigger file, and we tell fio to execute:
.RS
-.B <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
+.P
+echo b > /proc/sysrq\-trigger
.RE
-Milliseconds:
+.P
+on the server once it has received the trigger and sent us the write state. This
+will work, but it's not really cutting power to the server, it's merely
+abruptly rebooting it. If we have a remote way of cutting power to the server
+through IPMI or similar, we could do that through a local trigger command
+instead.
Let's assume we have a script that does IPMI reboot of a given hostname, +ipmi\-reboot. On localbox, we could then have run fio with a local trigger +instead: .RS -.B <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 +.P +localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi\-reboot server" .RE +.P +For this case, fio would wait for the server to send us the write state, then +execute `ipmi\-reboot server' when that happened. .RE .P -Disk utilization (1 for each disk used): +.B Loading verify state .RS -.B name, read ios, write ios, read merges, write merges, read ticks, write ticks, read in-queue time, write in-queue time, disk utilization percentage +To load stored write state, a read verification job file must contain the +\fBverify_state_load\fR option. If that is set, fio will load the previously +stored state. For a local fio run this is done by loading the files directly, +and on a client/server run, the server backend will ask the client to send the +files over and load them from there. +.RE +.SH LOG FILE FORMATS +Fio supports a variety of log file formats, for logging latencies, bandwidth, +and IOPS. The logs share a common format, which looks like this: +.RS +.P +time (msec), value, data direction, block size (bytes), offset (bytes) .RE .P -Error Info (dependent on continue_on_error, default off): +`Time' for the log entry is always in milliseconds. The `value' logged depends +on the type of log, it will be one of the following: .RS -.B total # errors, first error code +.TP +.B Latency log +Value is latency in nsecs +.TP +.B Bandwidth log +Value is in KiB/sec +.TP +.B IOPS log +Value is IOPS .RE .P -.B text description (if provided in config - appears on newline) +`Data direction' is one of the following: +.RS +.TP +.B 0 +I/O is a READ +.TP +.B 1 +I/O is a WRITE +.TP +.B 2 +I/O is a TRIM .RE +.P +The entry's `block size' is always in bytes. 
The `offset' is the position in bytes +from the start of the file for that particular I/O. The logging of the offset can be +toggled with \fBlog_offset\fR. +.P +Fio defaults to logging every individual I/O but when windowed logging is set +through \fBlog_avg_msec\fR, either the average (by default) or the maximum +(\fBlog_max_value\fR is set) `value' seen over the specified period of time +is recorded. Each `data direction' seen within the window period will aggregate +its values in a separate row. Further, when using windowed logging the `block +size' and `offset' entries will always contain 0. .SH CLIENT / SERVER -Normally you would run fio as a stand-alone application on the machine -where the IO workload should be generated. However, it is also possible to -run the frontend and backend of fio separately. This makes it possible to -have a fio server running on the machine(s) where the IO workload should -be running, while controlling it from another machine. - -To start the server, you would do: - -\fBfio \-\-server=args\fR - -on that machine, where args defines what fio listens to. The arguments -are of the form 'type:hostname or IP:port'. 'type' is either 'ip' (or ip4) -for TCP/IP v4, 'ip6' for TCP/IP v6, or 'sock' for a local unix domain -socket. 'hostname' is either a hostname or IP address, and 'port' is the port to -listen to (only valid for TCP/IP, not a local socket). Some examples: - -1) fio \-\-server - - Start a fio server, listening on all interfaces on the default port (8765). - -2) fio \-\-server=ip:hostname,4444 - - Start a fio server, listening on IP belonging to hostname and on port 4444. - -3) fio \-\-server=ip6:::1,4444 - - Start a fio server, listening on IPv6 localhost ::1 and on port 4444. - -4) fio \-\-server=,4444 - - Start a fio server, listening on all interfaces on port 4444. - -5) fio \-\-server=1.2.3.4 - - Start a fio server, listening on IP 1.2.3.4 on the default port. 
- -6) fio \-\-server=sock:/tmp/fio.sock - - Start a fio server, listening on the local socket /tmp/fio.sock. - -When a server is running, you can connect to it from a client. The client -is run with: - -fio \-\-local-args \-\-client=server \-\-remote-args - -where \-\-local-args are arguments that are local to the client where it is -running, 'server' is the connect string, and \-\-remote-args and -are sent to the server. The 'server' string follows the same format as it -does on the server side, to allow IP/hostname/socket and port strings. -You can connect to multiple clients as well, to do that you could run: - -fio \-\-client=server2 \-\-client=server2 +Normally fio is invoked as a stand\-alone application on the machine where the +I/O workload should be generated. However, the backend and frontend of fio can +be run separately i.e., the fio server can generate an I/O workload on the "Device +Under Test" while being controlled by a client on another machine. +.P +Start the server on the machine which has access to the storage DUT: +.RS +.P +$ fio \-\-server=args +.RE +.P +where `args' defines what fio listens to. The arguments are of the form +`type,hostname' or `IP,port'. `type' is either `ip' (or ip4) for TCP/IP +v4, `ip6' for TCP/IP v6, or `sock' for a local unix domain socket. +`hostname' is either a hostname or IP address, and `port' is the port to listen +to (only valid for TCP/IP, not a local socket). Some examples: +.RS +.TP +1) \fBfio \-\-server\fR +Start a fio server, listening on all interfaces on the default port (8765). +.TP +2) \fBfio \-\-server=ip:hostname,4444\fR +Start a fio server, listening on IP belonging to hostname and on port 4444. +.TP +3) \fBfio \-\-server=ip6:::1,4444\fR +Start a fio server, listening on IPv6 localhost ::1 and on port 4444. +.TP +4) \fBfio \-\-server=,4444\fR +Start a fio server, listening on all interfaces on port 4444. 
+.TP
+5) \fBfio \-\-server=1.2.3.4\fR
+Start a fio server, listening on IP 1.2.3.4 on the default port.
+.TP
+6) \fBfio \-\-server=sock:/tmp/fio.sock\fR
+Start a fio server, listening on the local socket `/tmp/fio.sock'.
+.RE
+.P
+Once a server is running, a "client" can connect to the fio server with:
+.RS
+.P
+$ fio <local\-args> \-\-client=<server> <remote\-args> <job file(s)>
+.RE
+.P
+where `local\-args' are arguments for the client where it is running, `server'
+is the connect string, and `remote\-args' and `job file(s)' are sent to the
+server. The `server' string follows the same format as it does on the server
+side, to allow IP/hostname/socket and port strings.
+.P
+Fio can connect to multiple servers this way:
+.RS
+.P
+$ fio \-\-client=<server1> <local\-args> \-\-client=<server2> <remote\-args> <job file(s)>
+.RE
+.P
+If the job file is located on the fio server, then you can tell the server to
+load a local file as well. This is done by using \fB\-\-remote\-config\fR:
+.RS
+.P
+$ fio \-\-client=server \-\-remote\-config /path/to/file.fio
+.RE
+.P
+Then fio will open this local (to the server) job file instead of being passed
+one from the client.
+.P
+If you have many servers (example: 100 VMs/containers), you can input a pathname
+of a file containing host IPs/names as the parameter value for the
+\fB\-\-client\fR option. For example, here is an example `host.list'
+file containing 2 hostnames:
+.RS
+.P
+.PD 0
+host1.your.dns.domain
+.P
+host2.your.dns.domain
+.PD
+.RE
+.P
+The fio command would then be:
+.RS
+.P
+$ fio \-\-client=host.list
+.RE
+.P
+In this mode, you cannot input server\-specific parameters or job files \-\- all
+servers receive the same job file.
+.P
+In order to let `fio \-\-client' runs use a shared filesystem from multiple
+hosts, `fio \-\-client' now prepends the IP address of the server to the
+filename.
For example, if fio is using the directory `/mnt/nfs/fio' and is +writing filename `fileio.tmp', with a \fB\-\-client\fR `hostfile' +containing two hostnames `h1' and `h2' with IP addresses 192.168.10.120 and +192.168.10.121, then fio will create two files: +.RS +.P +.PD 0 +/mnt/nfs/fio/192.168.10.120.fileio.tmp +.P +/mnt/nfs/fio/192.168.10.121.fileio.tmp +.PD +.RE +.P +Terse output in client/server mode will differ slightly from what is produced +when fio is run in stand-alone mode. See the terse output section for details. .SH AUTHORS - .B fio -was written by Jens Axboe , -now Jens Axboe . +was written by Jens Axboe . .br This man page was written by Aaron Carroll based on documentation by Jens Axboe. +.br +This man page was rewritten by Tomohiro Kusumi based +on documentation by Jens Axboe. .SH "REPORTING BUGS" Report bugs to the \fBfio\fR mailing list . -See \fBREADME\fR. +.br +See \fBREPORTING\-BUGS\fR. +.P +\fBREPORTING\-BUGS\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/REPORTING\-BUGS\fR .SH "SEE ALSO" For further documentation see \fBHOWTO\fR and \fBREADME\fR. .br -Sample jobfiles are available in the \fBexamples\fR directory. - +Sample jobfiles are available in the `examples/' directory. +.br +These are typically located under `/usr/share/doc/fio'. +.P +\fBHOWTO\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/HOWTO\fR +.br +\fBREADME\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/README\fR diff -Nru fio-2.1.3/fio.c fio-3.16/fio.c --- fio-2.1.3/fio.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/fio.c 2019-09-20 01:01:52.000000000 +0000 @@ -18,18 +18,17 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* */ -#include -#include -#include - #include "fio.h" -#include "smalloc.h" int main(int argc, char *argv[], char *envp[]) { + int ret = 1; + + compiletime_assert(TD_NR <= TD_ENG_FLAG_SHIFT, "TD_ENG_FLAG_SHIFT"); + if (initialize_fio(envp)) return 1; @@ -37,15 +36,32 @@ #error "No available clock source!" #endif + if (fio_server_create_sk_key()) + goto done; + if (parse_options(argc, argv)) - return 1; + goto done_key; + + /* + * line buffer stdout to avoid output lines from multiple + * threads getting mixed + */ + setvbuf(stdout, NULL, _IOLBF, 0); fio_time_init(); if (nr_clients) { + set_genesis_time(); + if (fio_start_all_clients()) - return 1; - return fio_handle_clients(&fio_client_ops); + goto done_key; + ret = fio_handle_clients(&fio_client_ops); } else - return fio_backend(); + ret = fio_backend(NULL); + +done_key: + fio_server_destroy_sk_key(); +done: + deinitialize_fio(); + return ret; } diff -Nru fio-2.1.3/fio.h fio-3.16/fio.h --- fio-2.1.3/fio.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/fio.h 2019-09-20 01:01:52.000000000 +0000 @@ -20,26 +20,31 @@ #include "fifo.h" #include "arch/arch.h" #include "os/os.h" -#include "mutex.h" #include "log.h" #include "debug.h" #include "file.h" #include "io_ddir.h" -#include "ioengine.h" +#include "ioengines.h" #include "iolog.h" #include "helpers.h" +#include "minmax.h" #include "options.h" #include "profile.h" -#include "time.h" +#include "fio_time.h" #include "gettime.h" -#include "lib/getopt.h" +#include "oslib/getopt.h" #include "lib/rand.h" #include "lib/rbtree.h" +#include "lib/num2str.h" #include "client.h" #include "server.h" #include "stat.h" #include "flow.h" +#include "io_u.h" #include "io_u_queue.h" +#include "workqueue.h" +#include "steadystate.h" +#include "lib/nowarn_snprintf.h" #ifdef CONFIG_SOLARISAIO #include @@ -52,8 +57,16 @@ /* * "local" is pseudo-policy */ -#define MPOL_LOCAL MPOL_MAX +#ifndef MPOL_LOCAL +#define MPOL_LOCAL 4 #endif +#endif + +#ifdef CONFIG_CUDA +#include +#endif + +struct 
fio_sem; /* * offset generator types @@ -64,17 +77,50 @@ }; enum { - TD_F_VER_BACKLOG = 1, - TD_F_TRIM_BACKLOG = 2, - TD_F_READ_IOLOG = 4, - TD_F_REFILL_BUFFERS = 8, - TD_F_SCRAMBLE_BUFFERS = 16, - TD_F_VER_NONE = 32, - TD_F_PROFILE_OPS = 64, + __TD_F_VER_BACKLOG = 0, + __TD_F_TRIM_BACKLOG, + __TD_F_READ_IOLOG, + __TD_F_REFILL_BUFFERS, + __TD_F_SCRAMBLE_BUFFERS, + __TD_F_DO_VERIFY, + __TD_F_PROFILE_OPS, + __TD_F_COMPRESS, + __TD_F_COMPRESS_LOG, + __TD_F_VSTATE_SAVED, + __TD_F_NEED_LOCK, + __TD_F_CHILD, + __TD_F_NO_PROGRESS, + __TD_F_REGROW_LOGS, + __TD_F_MMAP_KEEP, + __TD_F_DIRS_CREATED, + __TD_F_CHECK_RATE, + __TD_F_LAST, /* not a real bit, keep last */ +}; + +enum { + TD_F_VER_BACKLOG = 1U << __TD_F_VER_BACKLOG, + TD_F_TRIM_BACKLOG = 1U << __TD_F_TRIM_BACKLOG, + TD_F_READ_IOLOG = 1U << __TD_F_READ_IOLOG, + TD_F_REFILL_BUFFERS = 1U << __TD_F_REFILL_BUFFERS, + TD_F_SCRAMBLE_BUFFERS = 1U << __TD_F_SCRAMBLE_BUFFERS, + TD_F_DO_VERIFY = 1U << __TD_F_DO_VERIFY, + TD_F_PROFILE_OPS = 1U << __TD_F_PROFILE_OPS, + TD_F_COMPRESS = 1U << __TD_F_COMPRESS, + TD_F_COMPRESS_LOG = 1U << __TD_F_COMPRESS_LOG, + TD_F_VSTATE_SAVED = 1U << __TD_F_VSTATE_SAVED, + TD_F_NEED_LOCK = 1U << __TD_F_NEED_LOCK, + TD_F_CHILD = 1U << __TD_F_CHILD, + TD_F_NO_PROGRESS = 1U << __TD_F_NO_PROGRESS, + TD_F_REGROW_LOGS = 1U << __TD_F_REGROW_LOGS, + TD_F_MMAP_KEEP = 1U << __TD_F_MMAP_KEEP, + TD_F_DIRS_CREATED = 1U << __TD_F_DIRS_CREATED, + TD_F_CHECK_RATE = 1U << __TD_F_CHECK_RATE, }; enum { FIO_RAND_BS_OFF = 0, + FIO_RAND_BS1_OFF, + FIO_RAND_BS2_OFF, FIO_RAND_VER_OFF, FIO_RAND_MIX_OFF, FIO_RAND_FILE_OFF, @@ -85,38 +131,81 @@ FIO_RAND_SEQ_RAND_READ_OFF, FIO_RAND_SEQ_RAND_WRITE_OFF, FIO_RAND_SEQ_RAND_TRIM_OFF, + FIO_RAND_START_DELAY, + FIO_DEDUPE_OFF, + FIO_RAND_POISSON_OFF, + FIO_RAND_ZONE_OFF, + FIO_RAND_POISSON2_OFF, + FIO_RAND_POISSON3_OFF, FIO_RAND_NR_OFFS, }; +enum { + IO_MODE_INLINE = 0, + IO_MODE_OFFLOAD = 1, + + RATE_PROCESS_LINEAR = 0, + RATE_PROCESS_POISSON = 1, +}; + +enum { + F_ADV_NONE = 0, + 
F_ADV_TYPE, + F_ADV_RANDOM, + F_ADV_SEQUENTIAL, +}; + +/* + * Per-thread/process specific data. Only used for the network client + * for now. + */ +void sk_out_assign(struct sk_out *); +void sk_out_drop(void); + +struct zone_split_index { + uint8_t size_perc; + uint8_t size_perc_prev; + uint64_t size; + uint64_t size_prev; +}; + +#define FIO_MAX_OPEN_ZBD_ZONES 128 + /* * This describes a single thread/process executing a fio job. */ struct thread_data { - struct thread_options o; + struct flist_head opt_list; unsigned long flags; + struct thread_options o; void *eo; - char verror[FIO_VERROR_SIZE]; pthread_t thread; unsigned int thread_number; + unsigned int subjob_number; unsigned int groupid; - struct thread_stat ts; + struct thread_stat ts __attribute__ ((aligned(8))); int client_type; struct io_log *slat_log; struct io_log *clat_log; + struct io_log *clat_hist_log; struct io_log *lat_log; struct io_log *bw_log; struct io_log *iops_log; + struct workqueue log_compress_wq; + + struct thread_data *parent; + uint64_t stat_io_bytes[DDIR_RWDIR_CNT]; - struct timeval bw_sample_time; + struct timespec bw_sample_time; uint64_t stat_io_blocks[DDIR_RWDIR_CNT]; - struct timeval iops_sample_time; + struct timespec iops_sample_time; volatile int update_rusage; - struct fio_mutex *rusage_sem; + struct fio_sem *rusage_sem; struct rusage ru_start; struct rusage ru_end; @@ -126,21 +215,29 @@ unsigned int files_index; unsigned int nr_open_files; unsigned int nr_done_files; - unsigned int nr_normal_files; union { unsigned int next_file; - os_random_state_t next_file_state; - struct frand_state __next_file_state; + struct frand_state next_file_state; + }; + union { + struct zipf_state next_file_zipf; + struct gauss_state next_file_gauss; + }; + union { + double zipf_theta; + double pareto_h; + double gauss_dev; }; int error; int sig; int done; + int stop_io; pid_t pid; char *orig_buffer; size_t orig_buffer_size; - volatile int terminate; volatile int runstate; - unsigned int 
last_was_sync; + volatile bool terminate; + bool last_was_sync; enum fio_ddir last_ddir; int mmapfd; @@ -148,28 +245,25 @@ void *iolog_buf; FILE *iolog_f; - char *sysfs_root; - - unsigned long rand_seeds[FIO_RAND_NR_OFFS]; + uint64_t rand_seeds[FIO_RAND_NR_OFFS]; - union { - os_random_state_t bsrange_state; - struct frand_state __bsrange_state; - }; - union { - os_random_state_t verify_state; - struct frand_state __verify_state; - }; - union { - os_random_state_t trim_state; - struct frand_state __trim_state; - }; + struct frand_state bsrange_state[DDIR_RWDIR_CNT]; + struct frand_state verify_state; + struct frand_state trim_state; + struct frand_state delay_state; struct frand_state buf_state; + struct frand_state buf_state_prev; + struct frand_state dedupe_state; + struct frand_state zone_state; + + struct zone_split_index **zone_state_index; unsigned int verify_batch; unsigned int trim_batch; + struct thread_io_list *vstate; + int shm_id; /* @@ -177,6 +271,13 @@ * to any of the available IO engines. */ struct ioengine_ops *io_ops; + int io_ops_init; + + /* + * IO engine private data and dlhandle. 
+ */ + void *io_ops_data; + void *io_ops_dlhandle; /* * Queue depth of io_u's that fio MIGHT do @@ -215,46 +316,72 @@ * Rate state */ uint64_t rate_bps[DDIR_RWDIR_CNT]; - long rate_pending_usleep[DDIR_RWDIR_CNT]; + uint64_t rate_next_io_time[DDIR_RWDIR_CNT]; unsigned long rate_bytes[DDIR_RWDIR_CNT]; unsigned long rate_blocks[DDIR_RWDIR_CNT]; - struct timeval lastrate[DDIR_RWDIR_CNT]; + unsigned long long rate_io_issue_bytes[DDIR_RWDIR_CNT]; + struct timespec lastrate[DDIR_RWDIR_CNT]; + int64_t last_usec[DDIR_RWDIR_CNT]; + struct frand_state poisson_state[DDIR_RWDIR_CNT]; + + /* + * Enforced rate submission/completion workqueue + */ + struct workqueue io_wq; uint64_t total_io_size; uint64_t fill_device_size; - unsigned long io_issues[DDIR_RWDIR_CNT]; + /* + * Issue side + */ + uint64_t io_issues[DDIR_RWDIR_CNT]; + uint64_t io_issue_bytes[DDIR_RWDIR_CNT]; + uint64_t loops; + + /* + * Completions + */ uint64_t io_blocks[DDIR_RWDIR_CNT]; uint64_t this_io_blocks[DDIR_RWDIR_CNT]; uint64_t io_bytes[DDIR_RWDIR_CNT]; - uint64_t io_skip_bytes; uint64_t this_io_bytes[DDIR_RWDIR_CNT]; + uint64_t io_skip_bytes; uint64_t zone_bytes; - struct fio_mutex *mutex; + struct fio_sem *sem; + uint64_t bytes_done[DDIR_RWDIR_CNT]; /* * State for random io, a bitmap of blocks done vs not done */ - union { - os_random_state_t random_state; - struct frand_state __random_state; - }; + struct frand_state random_state; - struct timeval start; /* start of this loop */ - struct timeval epoch; /* time job was started */ - struct timeval last_issue; - struct timeval tv_cache; - unsigned int tv_cache_nr; - unsigned int tv_cache_mask; - unsigned int ramp_time_over; + struct timespec start; /* start of this loop */ + struct timespec epoch; /* time job was started */ + unsigned long long unix_epoch; /* Time job was started, unix epoch based. 
*/ + struct timespec last_issue; + long time_offset; + struct timespec ts_cache; + struct timespec terminate_time; + unsigned int ts_cache_nr; + unsigned int ts_cache_mask; + bool ramp_time_over; + + /* + * Time since last latency_window was started + */ + struct timespec latency_ts; + unsigned int latency_qd; + unsigned int latency_qd_high; + unsigned int latency_qd_low; + unsigned int latency_failed; + uint64_t latency_ios; + int latency_end_run; /* * read/write mixed workload state */ - union { - os_random_state_t rwmix_state; - struct frand_state __rwmix_state; - }; + struct frand_state rwmix_state; unsigned long rwmix_issues; enum fio_ddir rwmix_ddir; unsigned int ddir_seq_nr; @@ -262,10 +389,7 @@ /* * rand/seq mixed workload state */ - union { - os_random_state_t seq_rand_state[DDIR_RWDIR_CNT]; - struct frand_state __seq_rand_state[DDIR_RWDIR_CNT]; - }; + struct frand_state seq_rand_state[DDIR_RWDIR_CNT]; /* * IO history logs for verification. We use a tree for sorting, @@ -279,6 +403,11 @@ * For IO replaying */ struct flist_head io_log_list; + FILE *io_log_rfile; + unsigned int io_log_current; + unsigned int io_log_checkmark; + unsigned int io_log_highmark; + struct timespec io_log_highmark_time; /* * For tracking/handling discards @@ -286,8 +415,6 @@ struct flist_head trim_list; unsigned long trim_entries; - struct flist_head next_rand_list; - /* * for fileservice, how often to switch to a new file */ @@ -300,10 +427,7 @@ /* * For generating file sizes */ - union { - os_random_state_t file_size_state; - struct frand_state __file_size_state; - }; + struct frand_state file_size_state; /* * Error counts @@ -320,6 +444,22 @@ void *prof_data; void *pinned_mem; + + struct steadystate_data ss; + + char verror[FIO_VERROR_SIZE]; + +#ifdef CONFIG_CUDA + /* + * for GPU memory management + */ + int gpu_dev_cnt; + int gpu_dev_id; + CUdevice cu_dev; + CUcontext cu_ctx; + CUdeviceptr dev_mem_ptr; +#endif + }; /* @@ -333,67 +473,95 @@ #define __td_verror(td, err, msg, 
func) \ do { \ - int e = (err); \ + unsigned int ____e = (err); \ if ((td)->error) \ break; \ - (td)->error = e; \ + (td)->error = ____e; \ if (!(td)->first_error) \ - snprintf(td->verror, sizeof(td->verror), "file:%s:%d, func=%s, error=%s", __FILE__, __LINE__, (func), (msg)); \ + nowarn_snprintf(td->verror, sizeof(td->verror), \ + "file:%s:%d, func=%s, error=%s", \ + __FILE__, __LINE__, (func), (msg)); \ } while (0) -#define td_clear_error(td) \ - (td)->error = 0; -#define td_verror(td, err, func) \ - __td_verror((td), (err), strerror((err)), (func)) -#define td_vmsg(td, err, msg, func) \ - __td_verror((td), (err), (msg), (func)) +#define td_clear_error(td) do { \ + (td)->error = 0; \ + if ((td)->parent) \ + (td)->parent->error = 0; \ +} while (0) + +#define td_verror(td, err, func) do { \ + __td_verror((td), (err), strerror((err)), (func)); \ + if ((td)->parent) \ + __td_verror((td)->parent, (err), strerror((err)), (func)); \ +} while (0) + +#define td_vmsg(td, err, msg, func) do { \ + __td_verror((td), (err), (msg), (func)); \ + if ((td)->parent) \ + __td_verror((td)->parent, (err), (msg), (func)); \ +} while (0) #define __fio_stringify_1(x) #x #define __fio_stringify(x) __fio_stringify_1(x) -extern int exitall_on_terminate; +extern bool exitall_on_terminate; extern unsigned int thread_number; extern unsigned int stat_number; extern int shm_id; extern int groupid; extern int output_format; +extern int append_terse_output; extern int temp_stall_ts; extern uintptr_t page_mask, page_size; -extern int read_only; +extern bool read_only; extern int eta_print; extern int eta_new_line; +extern unsigned int eta_interval_msec; extern unsigned long done_secs; -extern char *job_section; extern int fio_gtod_offload; extern int fio_gtod_cpu; extern enum fio_cs fio_clock_source; extern int fio_clock_source_set; extern int warnings_fatal; extern int terse_version; -extern int is_backend; +extern bool is_backend; +extern bool is_local_backend; extern int nr_clients; -extern int 
log_syslog; +extern bool log_syslog; extern int status_interval; extern const char fio_version_string[]; +extern char *trigger_file; +extern char *trigger_cmd; +extern char *trigger_remote_cmd; +extern long long trigger_timeout; +extern char *aux_path; extern struct thread_data *threads; -static inline void fio_ro_check(struct thread_data *td, struct io_u *io_u) +static inline bool is_running_backend(void) +{ + return is_backend || is_local_backend; +} + +extern bool eta_time_within_slack(unsigned int time); + +static inline void fio_ro_check(const struct thread_data *td, struct io_u *io_u) { - assert(!(io_u->ddir == DDIR_WRITE && !td_write(td))); + assert(!(io_u->ddir == DDIR_WRITE && !td_write(td)) && + !(io_u->ddir == DDIR_TRIM && !td_trim(td))); } -#define REAL_MAX_JOBS 2048 +#define REAL_MAX_JOBS 4096 -static inline int should_fsync(struct thread_data *td) +static inline bool should_fsync(struct thread_data *td) { if (td->last_was_sync) - return 0; - if (td_write(td) || td_rw(td) || td->o.override_sync) - return 1; + return false; + if (td_write(td) || td->o.override_sync) + return true; - return 0; + return false; } /* @@ -403,28 +571,34 @@ extern int __must_check parse_options(int, char **); extern int parse_jobs_ini(char *, int, int, int); extern int parse_cmd_line(int, char **, int); -extern int fio_backend(void); +extern int fio_backend(struct sk_out *); extern void reset_fio_state(void); -extern void clear_io_state(struct thread_data *); +extern void clear_io_state(struct thread_data *, int); extern int fio_options_parse(struct thread_data *, char **, int); extern void fio_keywords_init(void); +extern void fio_keywords_exit(void); extern int fio_cmd_option_parse(struct thread_data *, const char *, char *); extern int fio_cmd_ioengine_option_parse(struct thread_data *, const char *, char *); extern void fio_fill_default_options(struct thread_data *); extern int fio_show_option_help(const char *); extern void fio_options_set_ioengine_opts(struct option 
*long_options, struct thread_data *td); extern void fio_options_dup_and_init(struct option *); +extern char *fio_option_dup_subs(const char *); extern void fio_options_mem_dupe(struct thread_data *); -extern void options_mem_dupe(void *data, struct fio_option *options); extern void td_fill_rand_seeds(struct thread_data *); +extern void td_fill_verify_state_seed(struct thread_data *); extern void add_job_opts(const char **, int); -extern char *num2str(unsigned long, int, int, int, int); extern int ioengine_load(struct thread_data *); -extern int parse_dryrun(void); +extern bool parse_dryrun(void); +extern int fio_running_or_pending_io_threads(void); +extern int fio_set_fd_nonblocking(int, const char *); +extern void sig_show_status(int sig); +extern struct thread_data *get_global_options(void); extern uintptr_t page_mask; extern uintptr_t page_size; extern int initialize_fio(char *envp[]); +extern void deinitialize_fio(void); #define FIO_GETOPT_JOB 0x89000000 #define FIO_GETOPT_IOENGINE 0x98000000 @@ -453,13 +627,42 @@ TD_PRE_READING, TD_VERIFYING, TD_FSYNCING, + TD_FINISHING, TD_EXITED, TD_REAPED, + TD_LAST, + TD_NR, }; +#define TD_ENG_FLAG_SHIFT 17 +#define TD_ENG_FLAG_MASK ((1U << 17) - 1) + +static inline void td_set_ioengine_flags(struct thread_data *td) +{ + td->flags = (~(TD_ENG_FLAG_MASK << TD_ENG_FLAG_SHIFT) & td->flags) | + (td->io_ops->flags << TD_ENG_FLAG_SHIFT); +} + +static inline bool td_ioengine_flagged(struct thread_data *td, + enum fio_ioengine_flags flags) +{ + return ((td->flags >> TD_ENG_FLAG_SHIFT) & flags) != 0; +} + extern void td_set_runstate(struct thread_data *, int); -#define TERMINATE_ALL (-1) -extern void fio_terminate_threads(int); +extern int td_bump_runstate(struct thread_data *, int); +extern void td_restore_runstate(struct thread_data *, int); +extern const char *runstate_to_name(int runstate); + +/* + * Allow 60 seconds for a job to quit on its own, otherwise reap with + * a vengeance. 
+ */ +#define FIO_REAP_TIMEOUT 300 + +#define TERMINATE_ALL (-1U) +extern void fio_terminate_threads(unsigned int); +extern void fio_mark_td_terminate(struct thread_data *); /* * Memory helpers @@ -470,19 +673,30 @@ extern void free_io_mem(struct thread_data *); extern void free_threads_shm(void); +#ifdef FIO_INTERNAL +#define PTR_ALIGN(ptr, mask) \ + (char *) (((uintptr_t) (ptr) + (mask)) & ~(mask)) +#endif + /* * Reset stats after ramp time completes */ extern void reset_all_stats(struct thread_data *); +extern int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret, + enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify, + struct timespec *comp_time); + /* - * blktrace support + * Latency target helpers */ -#ifdef FIO_HAVE_BLKTRACE -extern int is_blktrace(const char *); -extern int load_blktrace(struct thread_data *, const char *); -#endif +extern void lat_target_check(struct thread_data *); +extern void lat_target_init(struct thread_data *); +extern void lat_target_reset(struct thread_data *); +/* + * Iterates all threads/processes within all the defined jobs + */ #define for_each_td(td, i) \ for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++) #define for_each_file(td, f, i) \ @@ -491,27 +705,16 @@ (i) < (td)->o.nr_files && ((f) = (td)->files[i]) != NULL; \ (i)++) -#define fio_assert(td, cond) do { \ - if (!(cond)) { \ - int *__foo = NULL; \ - fprintf(stderr, "file:%s:%d, assert %s failed\n", __FILE__, __LINE__, #cond); \ - td_set_runstate((td), TD_EXITED); \ - (td)->error = EFAULT; \ - *__foo = 0; \ - } \ -} while (0) - -static inline int fio_fill_issue_time(struct thread_data *td) +static inline bool fio_fill_issue_time(struct thread_data *td) { if (td->o.read_iolog_file || !td->o.disable_clat || !td->o.disable_slat || !td->o.disable_bw) - return 1; + return true; - return 0; + return false; } -static inline int __should_check_rate(struct thread_data *td, - enum fio_ddir ddir) +static inline bool 
option_check_rate(struct thread_data *td, enum fio_ddir ddir) { struct thread_options *o = &td->o; @@ -520,89 +723,131 @@ */ if (o->rate[ddir] || o->ratemin[ddir] || o->rate_iops[ddir] || o->rate_iops_min[ddir]) - return 1; + return true; - return 0; + return false; } -static inline int should_check_rate(struct thread_data *td, - uint64_t *bytes_done) +static inline bool __should_check_rate(struct thread_data *td) { - int ret = 0; + return (td->flags & TD_F_CHECK_RATE) != 0; +} - if (bytes_done[DDIR_READ]) - ret |= __should_check_rate(td, DDIR_READ); - if (bytes_done[DDIR_WRITE]) - ret |= __should_check_rate(td, DDIR_WRITE); - if (bytes_done[DDIR_TRIM]) - ret |= __should_check_rate(td, DDIR_TRIM); +static inline bool should_check_rate(struct thread_data *td) +{ + if (!__should_check_rate(td)) + return false; - return ret; + return ddir_rw_sum(td->bytes_done) != 0; } -static inline unsigned int td_max_bs(struct thread_data *td) +static inline unsigned long long td_max_bs(struct thread_data *td) { - unsigned int max_bs; + unsigned long long max_bs; max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]); return max(td->o.max_bs[DDIR_TRIM], max_bs); } -static inline unsigned int td_min_bs(struct thread_data *td) +static inline unsigned long long td_min_bs(struct thread_data *td) { - unsigned int min_bs; + unsigned long long min_bs; min_bs = min(td->o.min_bs[DDIR_READ], td->o.min_bs[DDIR_WRITE]); return min(td->o.min_bs[DDIR_TRIM], min_bs); } -static inline int is_power_of_2(unsigned int val) +static inline bool td_async_processing(struct thread_data *td) { - return (val != 0 && ((val & (val - 1)) == 0)); + return (td->flags & TD_F_NEED_LOCK) != 0; +} + +static inline bool td_offload_overlap(struct thread_data *td) +{ + return td->o.serialize_overlap && td->o.io_submit_mode == IO_MODE_OFFLOAD; } /* * We currently only need to do locking if we have verifier threads * accessing our internal structures too */ -static inline void td_io_u_lock(struct thread_data 
*td) +static inline void __td_io_u_lock(struct thread_data *td) { - if (td->o.verify_async) - pthread_mutex_lock(&td->io_u_lock); + pthread_mutex_lock(&td->io_u_lock); } -static inline void td_io_u_unlock(struct thread_data *td) +static inline void __td_io_u_unlock(struct thread_data *td) { - if (td->o.verify_async) - pthread_mutex_unlock(&td->io_u_lock); + pthread_mutex_unlock(&td->io_u_lock); } static inline void td_io_u_free_notify(struct thread_data *td) { - if (td->o.verify_async) + if (td_async_processing(td)) pthread_cond_signal(&td->free_cond); } +static inline void td_flags_clear(struct thread_data *td, unsigned int *flags, + unsigned int value) +{ + if (!td_async_processing(td)) + *flags &= ~value; + else + __sync_fetch_and_and(flags, ~value); +} + +static inline void td_flags_set(struct thread_data *td, unsigned int *flags, + unsigned int value) +{ + if (!td_async_processing(td)) + *flags |= value; + else + __sync_fetch_and_or(flags, value); +} + extern const char *fio_get_arch_string(int); extern const char *fio_get_os_string(int); -#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0]))) - enum { - FIO_OUTPUT_TERSE = 0, - FIO_OUTPUT_JSON, - FIO_OUTPUT_NORMAL, + __FIO_OUTPUT_TERSE = 0, + __FIO_OUTPUT_JSON = 1, + __FIO_OUTPUT_NORMAL = 2, + __FIO_OUTPUT_JSON_PLUS = 3, + FIO_OUTPUT_NR = 4, + + FIO_OUTPUT_TERSE = 1U << __FIO_OUTPUT_TERSE, + FIO_OUTPUT_JSON = 1U << __FIO_OUTPUT_JSON, + FIO_OUTPUT_NORMAL = 1U << __FIO_OUTPUT_NORMAL, + FIO_OUTPUT_JSON_PLUS = 1U << __FIO_OUTPUT_JSON_PLUS, }; enum { FIO_RAND_DIST_RANDOM = 0, FIO_RAND_DIST_ZIPF, FIO_RAND_DIST_PARETO, + FIO_RAND_DIST_GAUSS, + FIO_RAND_DIST_ZONED, + FIO_RAND_DIST_ZONED_ABS, }; +#define FIO_DEF_ZIPF 1.1 +#define FIO_DEF_PARETO 0.2 + enum { FIO_RAND_GEN_TAUSWORTHE = 0, FIO_RAND_GEN_LFSR, + FIO_RAND_GEN_TAUSWORTHE64, +}; + +enum { + FIO_CPUS_SHARED = 0, + FIO_CPUS_SPLIT, }; +extern void exec_trigger(const char *); +extern void check_trigger_file(void); + +extern bool in_flight_overlap(struct io_u_queue 
*q, struct io_u *io_u); +extern pthread_mutex_t overlap_check; + #endif diff -Nru fio-2.1.3/fio_sem.c fio-3.16/fio_sem.c --- fio-2.1.3/fio_sem.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/fio_sem.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#ifdef CONFIG_VALGRIND_DEV +#include +#else +#define RUNNING_ON_VALGRIND 0 +#endif + +#include "fio_sem.h" +#include "pshared.h" +#include "os/os.h" +#include "fio_time.h" +#include "gettime.h" + +void __fio_sem_remove(struct fio_sem *sem) +{ + assert(sem->magic == FIO_SEM_MAGIC); + pthread_mutex_destroy(&sem->lock); + pthread_cond_destroy(&sem->cond); + + /* + * When not running on Valgrind, ensure any subsequent attempt to grab + * this semaphore will fail with an assert, instead of just silently + * hanging. When running on Valgrind, let Valgrind detect + * use-after-free. + */ + if (!RUNNING_ON_VALGRIND) + memset(sem, 0, sizeof(*sem)); +} + +void fio_sem_remove(struct fio_sem *sem) +{ + __fio_sem_remove(sem); + munmap((void *) sem, sizeof(*sem)); +} + +int __fio_sem_init(struct fio_sem *sem, int value) +{ + int ret; + + sem->value = value; + /* Initialize .waiters explicitly for Valgrind. 
*/ + sem->waiters = 0; + sem->magic = FIO_SEM_MAGIC; + + ret = mutex_cond_init_pshared(&sem->lock, &sem->cond); + if (ret) + return ret; + + return 0; +} + +struct fio_sem *fio_sem_init(int value) +{ + struct fio_sem *sem = NULL; + + sem = (void *) mmap(NULL, sizeof(struct fio_sem), + PROT_READ | PROT_WRITE, + OS_MAP_ANON | MAP_SHARED, -1, 0); + if (sem == MAP_FAILED) { + perror("mmap semaphore"); + return NULL; + } + + if (!__fio_sem_init(sem, value)) + return sem; + + fio_sem_remove(sem); + return NULL; +} + +static bool sem_timed_out(struct timespec *t, unsigned int msecs) +{ + struct timeval tv; + struct timespec now; + + gettimeofday(&tv, NULL); + now.tv_sec = tv.tv_sec; + now.tv_nsec = tv.tv_usec * 1000; + + return mtime_since(t, &now) >= msecs; +} + +int fio_sem_down_timeout(struct fio_sem *sem, unsigned int msecs) +{ + struct timeval tv_s; + struct timespec base; + struct timespec t; + int ret = 0; + + assert(sem->magic == FIO_SEM_MAGIC); + + gettimeofday(&tv_s, NULL); + base.tv_sec = t.tv_sec = tv_s.tv_sec; + base.tv_nsec = t.tv_nsec = tv_s.tv_usec * 1000; + + t.tv_sec += msecs / 1000; + t.tv_nsec += ((msecs * 1000000ULL) % 1000000000); + if (t.tv_nsec >= 1000000000) { + t.tv_nsec -= 1000000000; + t.tv_sec++; + } + + pthread_mutex_lock(&sem->lock); + + sem->waiters++; + while (!sem->value && !ret) { + /* + * Some platforms (FreeBSD 9?) seems to return timed out + * way too early, double check. 
+ */ + ret = pthread_cond_timedwait(&sem->cond, &sem->lock, &t); + if (ret == ETIMEDOUT && !sem_timed_out(&base, msecs)) + ret = 0; + } + sem->waiters--; + + if (!ret) { + sem->value--; + pthread_mutex_unlock(&sem->lock); + return 0; + } + + pthread_mutex_unlock(&sem->lock); + return ret; +} + +bool fio_sem_down_trylock(struct fio_sem *sem) +{ + bool ret = true; + + assert(sem->magic == FIO_SEM_MAGIC); + + pthread_mutex_lock(&sem->lock); + if (sem->value) { + sem->value--; + ret = false; + } + pthread_mutex_unlock(&sem->lock); + + return ret; +} + +void fio_sem_down(struct fio_sem *sem) +{ + assert(sem->magic == FIO_SEM_MAGIC); + + pthread_mutex_lock(&sem->lock); + + while (!sem->value) { + sem->waiters++; + pthread_cond_wait(&sem->cond, &sem->lock); + sem->waiters--; + } + + sem->value--; + pthread_mutex_unlock(&sem->lock); +} + +void fio_sem_up(struct fio_sem *sem) +{ + int do_wake = 0; + + assert(sem->magic == FIO_SEM_MAGIC); + + pthread_mutex_lock(&sem->lock); + read_barrier(); + if (!sem->value && sem->waiters) + do_wake = 1; + sem->value++; + + if (do_wake) + pthread_cond_signal(&sem->cond); + + pthread_mutex_unlock(&sem->lock); +} diff -Nru fio-2.1.3/fio_sem.h fio-3.16/fio_sem.h --- fio-2.1.3/fio_sem.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/fio_sem.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,31 @@ +#ifndef FIO_SEM_H +#define FIO_SEM_H + +#include +#include "lib/types.h" + +#define FIO_SEM_MAGIC 0x4d555445U + +struct fio_sem { + pthread_mutex_t lock; + pthread_cond_t cond; + int value; + int waiters; + int magic; +}; + +enum { + FIO_SEM_LOCKED = 0, + FIO_SEM_UNLOCKED = 1, +}; + +extern int __fio_sem_init(struct fio_sem *, int); +extern struct fio_sem *fio_sem_init(int); +extern void __fio_sem_remove(struct fio_sem *); +extern void fio_sem_remove(struct fio_sem *); +extern void fio_sem_up(struct fio_sem *); +extern void fio_sem_down(struct fio_sem *); +extern bool fio_sem_down_trylock(struct fio_sem *); +extern int fio_sem_down_timeout(struct 
fio_sem *, unsigned int); + +#endif diff -Nru fio-2.1.3/fio_time.h fio-3.16/fio_time.h --- fio-2.1.3/fio_time.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/fio_time.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,33 @@ +#ifndef FIO_TIME_H +#define FIO_TIME_H + +#include +/* IWYU pragma: begin_exports */ +#include +#include +/* IWYU pragma: end_exports */ +#include "lib/types.h" + +struct thread_data; +extern uint64_t ntime_since(const struct timespec *, const struct timespec *); +extern uint64_t ntime_since_now(const struct timespec *); +extern uint64_t utime_since(const struct timespec *, const struct timespec *); +extern uint64_t utime_since_now(const struct timespec *); +extern uint64_t mtime_since(const struct timespec *, const struct timespec *); +extern uint64_t mtime_since_now(const struct timespec *); +extern uint64_t mtime_since_tv(const struct timeval *, const struct timeval *); +extern uint64_t time_since_now(const struct timespec *); +extern uint64_t time_since_genesis(void); +extern uint64_t mtime_since_genesis(void); +extern uint64_t utime_since_genesis(void); +extern uint64_t usec_spin(unsigned int); +extern uint64_t usec_sleep(struct thread_data *, unsigned long); +extern void fill_start_time(struct timespec *); +extern void set_genesis_time(void); +extern bool ramp_time_over(struct thread_data *); +extern bool in_ramp_time(struct thread_data *); +extern void fio_time_init(void); +extern void timespec_add_msec(struct timespec *, unsigned int); +extern void set_epoch_time(struct thread_data *, int); + +#endif diff -Nru fio-2.1.3/FIO-VERSION-GEN fio-3.16/FIO-VERSION-GEN --- fio-2.1.3/FIO-VERSION-GEN 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/FIO-VERSION-GEN 2019-09-20 01:01:52.000000000 +0000 @@ -1,7 +1,7 @@ #!/bin/sh GVF=FIO-VERSION-FILE -DEF_VER=fio-2.1.3 +DEF_VER=fio-3.16 LF=' ' @@ -15,7 +15,7 @@ VN=`git describe --match "fio-[0-9]*" --abbrev=4 HEAD 2>/dev/null` && case "$VN" in *$LF*) (exit 1) ;; - v[0-9]*) + fio-[0-9]*) git 
update-index -q --refresh test -z "`git diff-index --name-only HEAD --`" || VN="$VN-dirty" ;; @@ -38,5 +38,3 @@ echo >&2 "FIO_VERSION = $VN" echo "FIO_VERSION = $VN" >$GVF } - - diff -Nru fio-2.1.3/flist.h fio-3.16/flist.h --- fio-2.1.3/flist.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/flist.h 2019-09-20 01:01:52.000000000 +0000 @@ -2,16 +2,10 @@ #define _LINUX_FLIST_H #include +#include -#undef offsetof -#ifdef __compiler_offsetof -#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER) -#else -#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) -#endif - -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ +#define container_of(ptr, type, member) ({ \ + const __typeof__( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) /* @@ -140,6 +134,22 @@ __flist_splice(list, head, head->next); } +static inline void flist_splice_tail(struct flist_head *list, + struct flist_head *head) +{ + if (!flist_empty(list)) + __flist_splice(list, head->prev, head); +} + +static inline void flist_splice_tail_init(struct flist_head *list, + struct flist_head *head) +{ + if (!flist_empty(list)) { + __flist_splice(list, head->prev, head); + INIT_FLIST_HEAD(list); + } +} + static inline void flist_splice_init(struct flist_head *list, struct flist_head *head) { @@ -158,6 +168,12 @@ #define flist_entry(ptr, type, member) \ container_of(ptr, type, member) +#define flist_first_entry(ptr, type, member) \ + flist_entry((ptr)->next, type, member) + +#define flist_last_entry(ptr, type, member) \ + flist_entry((ptr)->prev, type, member) + /** * flist_for_each - iterate over a list * @pos: the &struct flist_head to use as a loop counter. 
diff -Nru fio-2.1.3/flow.c fio-3.16/flow.c --- fio-2.1.3/flow.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/flow.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,5 +1,5 @@ #include "fio.h" -#include "mutex.h" +#include "fio_sem.h" #include "smalloc.h" #include "flist.h" @@ -11,18 +11,22 @@ }; static struct flist_head *flow_list; -static struct fio_mutex *flow_lock; +static struct fio_sem *flow_lock; int flow_threshold_exceeded(struct thread_data *td) { struct fio_flow *flow = td->flow; - int sign; + long long flow_counter; if (!flow) return 0; - sign = td->o.flow > 0 ? 1 : -1; - if (sign * flow->flow_counter > td->o.flow_watermark) { + if (td->o.flow > 0) + flow_counter = flow->flow_counter; + else + flow_counter = -flow->flow_counter; + + if (flow_counter > td->o.flow_watermark) { if (td->o.flow_sleep) { io_u_quiesce(td); usleep(td->o.flow_sleep); @@ -45,7 +49,7 @@ if (!flow_lock) return NULL; - fio_mutex_down(flow_lock); + fio_sem_down(flow_lock); flist_for_each(n, flow_list) { flow = flist_entry(n, struct fio_flow, list); @@ -58,7 +62,7 @@ if (!flow) { flow = smalloc(sizeof(*flow)); if (!flow) { - log_err("fio: smalloc pool exhausted\n"); + fio_sem_up(flow_lock); return NULL; } flow->refs = 0; @@ -70,7 +74,7 @@ } flow->refs++; - fio_mutex_up(flow_lock); + fio_sem_up(flow_lock); return flow; } @@ -79,14 +83,14 @@ if (!flow_lock) return; - fio_mutex_down(flow_lock); + fio_sem_down(flow_lock); if (!--flow->refs) { flist_del(&flow->list); sfree(flow); } - fio_mutex_up(flow_lock); + fio_sem_up(flow_lock); } void flow_init_job(struct thread_data *td) @@ -111,7 +115,7 @@ return; } - flow_lock = fio_mutex_init(FIO_MUTEX_UNLOCKED); + flow_lock = fio_sem_init(FIO_SEM_UNLOCKED); if (!flow_lock) { log_err("fio: failed to allocate flow lock\n"); sfree(flow_list); @@ -124,7 +128,7 @@ void flow_exit(void) { if (flow_lock) - fio_mutex_remove(flow_lock); + fio_sem_remove(flow_lock); if (flow_list) sfree(flow_list); } diff -Nru fio-2.1.3/gclient.c fio-3.16/gclient.c --- 
fio-2.1.3/gclient.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/gclient.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -13,6 +13,7 @@ #include "graph.h" #include "gclient.h" #include "printing.h" +#include "lib/pow2.h" static void gfio_display_ts(struct fio_client *client, struct thread_stat *ts, struct group_run_stats *rs); @@ -47,7 +48,7 @@ { "PrintFile", GTK_STOCK_PRINT, "Print", "P", NULL, G_CALLBACK(results_print) }, { "CloseFile", GTK_STOCK_CLOSE, "Close", "W", NULL, G_CALLBACK(results_close) }, }; -static gint results_nmenu_items = sizeof(results_menu_items) / sizeof(results_menu_items[0]); +static gint results_nmenu_items = ARRAY_SIZE(results_menu_items); static const gchar *results_ui_string = " \ \ @@ -120,7 +121,7 @@ GtkTreeIter iter; struct tm *tm; time_t sec; - char tmp[64], timebuf[80]; + char tmp[64], timebuf[96]; sec = p->log_sec; tm = localtime(&sec); @@ -195,39 +196,39 @@ vbox = gtk_hbox_new(TRUE, 3); gtk_container_add(GTK_CONTAINER(frame), vbox); entry = new_info_entry_in_frame(vbox, "IOs"); - entry_set_int_value(entry, p->dus.ios[0]); + entry_set_int_value(entry, p->dus.s.ios[0]); entry = new_info_entry_in_frame(vbox, "Merges"); - entry_set_int_value(entry, p->dus.merges[0]); + entry_set_int_value(entry, p->dus.s.merges[0]); entry = new_info_entry_in_frame(vbox, "Sectors"); - entry_set_int_value(entry, p->dus.sectors[0]); + entry_set_int_value(entry, p->dus.s.sectors[0]); entry = new_info_entry_in_frame(vbox, "Ticks"); - entry_set_int_value(entry, p->dus.ticks[0]); + entry_set_int_value(entry, p->dus.s.ticks[0]); frame = gtk_frame_new("Write"); gtk_box_pack_start(GTK_BOX(box), frame, FALSE, FALSE, 2); vbox = gtk_hbox_new(TRUE, 3); gtk_container_add(GTK_CONTAINER(frame), vbox); entry = new_info_entry_in_frame(vbox, "IOs"); - entry_set_int_value(entry, p->dus.ios[1]); + entry_set_int_value(entry, p->dus.s.ios[1]); entry = new_info_entry_in_frame(vbox, "Merges"); - entry_set_int_value(entry, 
p->dus.merges[1]); + entry_set_int_value(entry, p->dus.s.merges[1]); entry = new_info_entry_in_frame(vbox, "Sectors"); - entry_set_int_value(entry, p->dus.sectors[1]); + entry_set_int_value(entry, p->dus.s.sectors[1]); entry = new_info_entry_in_frame(vbox, "Ticks"); - entry_set_int_value(entry, p->dus.ticks[1]); + entry_set_int_value(entry, p->dus.s.ticks[1]); frame = gtk_frame_new("Shared"); gtk_box_pack_start(GTK_BOX(box), frame, FALSE, FALSE, 2); vbox = gtk_hbox_new(TRUE, 3); gtk_container_add(GTK_CONTAINER(frame), vbox); entry = new_info_entry_in_frame(vbox, "IO ticks"); - entry_set_int_value(entry, p->dus.io_ticks); + entry_set_int_value(entry, p->dus.s.io_ticks); entry = new_info_entry_in_frame(vbox, "Time in queue"); - entry_set_int_value(entry, p->dus.time_in_queue); + entry_set_int_value(entry, p->dus.s.time_in_queue); util = 0.0; - if (p->dus.msec) - util = (double) 100 * p->dus.io_ticks / (double) p->dus.msec; + if (p->dus.s.msec) + util = (double) 100 * p->dus.s.io_ticks / (double) p->dus.s.msec; if (util > 100.0) util = 100.0; @@ -279,10 +280,6 @@ gdk_threads_leave(); } -extern int sum_stat_clients; -extern struct thread_stat client_ts; -extern struct group_run_stats client_gs; - static int sum_stat_nr; static void gfio_thread_status_op(struct fio_client *client, @@ -295,12 +292,13 @@ if (sum_stat_clients == 1) return; - sum_thread_stats(&client_ts, &p->ts, sum_stat_nr); + sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1); sum_group_stats(&client_gs, &p->rs); client_ts.members++; client_ts.thread_number = p->ts.thread_number; client_ts.groupid = p->ts.groupid; + client_ts.sig_figs = p->ts.sig_figs; if (++sum_stat_nr == sum_stat_clients) { strcpy(client_ts.name, "All clients"); @@ -320,7 +318,7 @@ static char message[100]; const char *m = message; - strncpy(message, status_message, sizeof(message) - 1); + snprintf(message, sizeof(message), "%s", status_message); gtk_progress_bar_set_text(GTK_PROGRESS_BAR(ge->thread_status_pb), m); 
gtk_progress_bar_set_fraction(GTK_PROGRESS_BAR(ge->thread_status_pb), perc / 100.0); gtk_widget_queue_draw(ge->ui->window); @@ -332,7 +330,7 @@ static char message[100]; const char *m = message; - strncpy(message, status_message, sizeof(message) - 1); + strncpy(message, sizeof(message), "%s", status_message); gtk_progress_bar_set_text(GTK_PROGRESS_BAR(ui->thread_status_pb), m); gtk_progress_bar_set_fraction(GTK_PROGRESS_BAR(ui->thread_status_pb), perc / 100.0); gtk_widget_queue_draw(ui->window); @@ -367,29 +365,11 @@ sprintf(tmp, "%u", je->files_open); gtk_entry_set_text(GTK_ENTRY(ge->eta.files), tmp); -#if 0 - if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) { - if (je->m_rate || je->t_rate) { - char *tr, *mr; - - mr = num2str(je->m_rate, 4, 0, i2p); - tr = num2str(je->t_rate, 4, 0, i2p); - gtk_entry_set_text(GTK_ENTRY(ge->eta); - p += sprintf(p, ", CR=%s/%s KB/s", tr, mr); - free(tr); - free(mr); - } else if (je->m_iops || je->t_iops) - p += sprintf(p, ", CR=%d/%d IOPS", je->t_iops, je->m_iops); - - gtk_entry_set_text(GTK_ENTRY(ge->eta.cr_bw), "---"); - gtk_entry_set_text(GTK_ENTRY(ge->eta.cr_iops), "---"); - gtk_entry_set_text(GTK_ENTRY(ge->eta.cw_bw), "---"); - gtk_entry_set_text(GTK_ENTRY(ge->eta.cw_iops), "---"); -#endif - if (je->eta_sec != INT_MAX && je->nr_running) { char *iops_str[DDIR_RWDIR_CNT]; char *rate_str[DDIR_RWDIR_CNT]; + char *rate_alt[DDIR_RWDIR_CNT]; + char tmp[128]; int i; if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running) @@ -400,19 +380,26 @@ sprintf(output, "%3.1f%% done", perc); } - rate_str[0] = num2str(je->rate[0], 5, 10, i2p, 0); - rate_str[1] = num2str(je->rate[1], 5, 10, i2p, 0); - rate_str[2] = num2str(je->rate[2], 5, 10, i2p, 0); - - iops_str[0] = num2str(je->iops[0], 4, 1, 0, 0); - iops_str[1] = num2str(je->iops[1], 4, 1, 0, 0); - iops_str[2] = num2str(je->iops[2], 4, 1, 0, 0); - - gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), rate_str[0]); + iops_str[0] = num2str(je->iops[0], je->sig_figs, 1, 
0, N2S_PERSEC); + iops_str[1] = num2str(je->iops[1], je->sig_figs, 1, 0, N2S_PERSEC); + iops_str[2] = num2str(je->iops[2], je->sig_figs, 1, 0, N2S_PERSEC); + + rate_str[0] = num2str(je->rate[0], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[0] = num2str(je->rate[0], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[0], rate_alt[0]); + gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ge->eta.read_iops), iops_str[0]); - gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), rate_str[1]); + + rate_str[1] = num2str(je->rate[1], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[1] = num2str(je->rate[1], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]); + gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ge->eta.write_iops), iops_str[1]); - gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_bw), rate_str[2]); + + rate_str[2] = num2str(je->rate[2], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[2] = num2str(je->rate[2], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]); + gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_iops), iops_str[2]); graph_add_xy_data(ge->graphs.iops_graph, ge->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]); @@ -424,6 +411,7 @@ for (i = 0; i < DDIR_RWDIR_CNT; i++) { free(rate_str[i]); + free(rate_alt[i]); free(iops_str[i]); } } @@ -460,31 +448,13 @@ eta_to_str(eta_str, je->eta_sec); } -#if 0 - if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) { - if (je->m_rate || je->t_rate) { - char *tr, *mr; - - mr = num2str(je->m_rate, 4, 0, i2p); - tr = num2str(je->t_rate, 4, 0, i2p); - gtk_entry_set_text(GTK_ENTRY(ui->eta); - p += sprintf(p, ", CR=%s/%s KB/s", tr, mr); - free(tr); - free(mr); - } else if (je->m_iops || je->t_iops) - p += sprintf(p, ", CR=%d/%d IOPS", 
je->t_iops, je->m_iops); - - gtk_entry_set_text(GTK_ENTRY(ui->eta.cr_bw), "---"); - gtk_entry_set_text(GTK_ENTRY(ui->eta.cr_iops), "---"); - gtk_entry_set_text(GTK_ENTRY(ui->eta.cw_bw), "---"); - gtk_entry_set_text(GTK_ENTRY(ui->eta.cw_iops), "---"); -#endif - entry_set_int_value(ui->eta.jobs, je->nr_running); if (je->eta_sec != INT_MAX && je->nr_running) { - char *iops_str[3]; - char *rate_str[3]; + char *iops_str[DDIR_RWDIR_CNT]; + char *rate_str[DDIR_RWDIR_CNT]; + char *rate_alt[DDIR_RWDIR_CNT]; + char tmp[128]; if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running) strcpy(output, "-.-% done"); @@ -494,19 +464,26 @@ sprintf(output, "%3.1f%% done", perc); } - rate_str[0] = num2str(je->rate[0], 5, 10, i2p, 0); - rate_str[1] = num2str(je->rate[1], 5, 10, i2p, 0); - rate_str[2] = num2str(je->rate[2], 5, 10, i2p, 0); - - iops_str[0] = num2str(je->iops[0], 4, 1, 0, 0); - iops_str[1] = num2str(je->iops[1], 4, 1, 0, 0); - iops_str[2] = num2str(je->iops[2], 4, 1, 0, 0); - - gtk_entry_set_text(GTK_ENTRY(ui->eta.read_bw), rate_str[0]); + iops_str[0] = num2str(je->iops[0], je->sig_figs, 1, 0, N2S_PERSEC); + iops_str[1] = num2str(je->iops[1], je->sig_figs, 1, 0, N2S_PERSEC); + iops_str[2] = num2str(je->iops[2], je->sig_figs, 1, 0, N2S_PERSEC); + + rate_str[0] = num2str(je->rate[0], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[0] = num2str(je->rate[0], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[0], rate_alt[0]); + gtk_entry_set_text(GTK_ENTRY(ui->eta.read_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ui->eta.read_iops), iops_str[0]); - gtk_entry_set_text(GTK_ENTRY(ui->eta.write_bw), rate_str[1]); + + rate_str[1] = num2str(je->rate[1], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[1] = num2str(je->rate[1], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]); + gtk_entry_set_text(GTK_ENTRY(ui->eta.write_bw), tmp); 
gtk_entry_set_text(GTK_ENTRY(ui->eta.write_iops), iops_str[1]); - gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_bw), rate_str[2]); + + rate_str[2] = num2str(je->rate[2], je->sig_figs, 10, i2p, N2S_BYTEPERSEC); + rate_alt[2] = num2str(je->rate[2], je->sig_figs, 10, !i2p, N2S_BYTEPERSEC); + snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]); + gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_bw), tmp); gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_iops), iops_str[2]); graph_add_xy_data(ui->graphs.iops_graph, ui->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]); @@ -518,6 +495,7 @@ for (i = 0; i < DDIR_RWDIR_CNT; i++) { free(rate_str[i]); + free(rate_alt[i]); free(iops_str[i]); } } @@ -595,6 +573,7 @@ struct thread_options *o; char *c1, *c2, *c3, *c4; char tmp[80]; + int i2p; p->thread_number = le32_to_cpu(p->thread_number); p->groupid = le32_to_cpu(p->groupid); @@ -608,11 +587,13 @@ sprintf(tmp, "%s %s", o->odirect ? "direct" : "buffered", ddir_str(o->td_ddir)); multitext_add_entry(&ge->eta.iotype, tmp); - c1 = fio_uint_to_kmg(o->min_bs[DDIR_READ]); - c2 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]); - c3 = fio_uint_to_kmg(o->min_bs[DDIR_READ]); - c4 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]); - sprintf(tmp, "%s-%s/%s-%s", c1, c2, c3, c4); + i2p = is_power_of_2(o->kb_base); + c1 = num2str(o->min_bs[DDIR_READ], o->sig_figs, 1, i2p, N2S_BYTE); + c2 = num2str(o->max_bs[DDIR_READ], o->sig_figs, 1, i2p, N2S_BYTE); + c3 = num2str(o->min_bs[DDIR_WRITE], o->sig_figs, 1, i2p, N2S_BYTE); + c4 = num2str(o->max_bs[DDIR_WRITE], o->sig_figs, 1, i2p, N2S_BYTE); + + sprintf(tmp, "%s-%s,%s-%s", c1, c2, c3, c4); free(c1); free(c2); free(c3); @@ -660,7 +641,7 @@ gdk_threads_leave(); } -static void gfio_client_stop(struct fio_client *client, struct fio_net_cmd *cmd) +static void gfio_client_stop(struct fio_client *client) { struct gfio_client *gc = client->client_data; @@ -692,12 +673,6 @@ gdk_threads_leave(); } -static void gfio_client_iolog(struct fio_client *client, struct 
cmd_iolog_pdu *pdu) -{ - printf("got iolog: name=%s, type=%u, entries=%u\n", pdu->name, pdu->log_type, pdu->nr_samples); - free(pdu); -} - static void gfio_add_total_depths_tree(GtkListStore *model, struct thread_stat *ts, unsigned int len) { @@ -956,18 +931,21 @@ static void gfio_show_latency_buckets(struct gfio_client *gc, GtkWidget *vbox, struct thread_stat *ts) { - double io_u_lat[FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR]; - const char *ranges[] = { "2u", "4u", "10u", "20u", "50u", "100u", - "250u", "500u", "750u", "1m", "2m", - "4m", "10m", "20m", "50m", "100m", - "250m", "500m", "750m", "1s", "2s", ">= 2s" }; + double io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR]; + const char *ranges[] = { "2ns", "4ns", "10ns", "20ns", "50ns", "100ns", + "250ns", "500ns", "750ns", "1000ns", "2us", + "4us", "10us", "20us", "50us", "100us", + "250us", "500us", "750us", "1ms", "2ms", + "4ms", "10ms", "20ms", "50ms", "100ms", + "250ms", "500ms", "750ms", "1s", "2s", ">= 2s" }; int start, end, i; const int total = FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR; GtkWidget *frame, *tree_view, *hbox, *completion_vbox, *drawing_area; struct gui_entry *ge = gc->ge; - stat_calc_lat_u(ts, io_u_lat); - stat_calc_lat_m(ts, &io_u_lat[FIO_IO_U_LAT_U_NR]); + stat_calc_lat_n(ts, io_u_lat); + stat_calc_lat_u(ts, &io_u_lat[FIO_IO_U_LAT_N_NR]); + stat_calc_lat_m(ts, &io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR]); /* * Found out which first bucket has entries, and which last bucket @@ -989,7 +967,7 @@ return; tree_view = gfio_output_lat_buckets(&io_u_lat[start], &ranges[start], end - start + 1); - ge->lat_bucket_graph = setup_lat_bucket_graph("Latency Buckets", &io_u_lat[start], &ranges[start], end - start + 1, 700.0, 300.0); + ge->lat_bucket_graph = setup_lat_bucket_graph("Latency buckets", &io_u_lat[start], &ranges[start], end - start + 1, 700.0, 300.0); frame = gtk_frame_new("Latency buckets"); gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5); @@ -1009,19 +987,21 @@ 
gtk_box_pack_start(GTK_BOX(hbox), tree_view, TRUE, TRUE, 3); } -static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long min, - unsigned long max, double mean, double dev) +static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long long min, + unsigned long long max, double mean, double dev) { - const char *base = "(usec)"; + const char *base = "(nsec)"; GtkWidget *hbox, *label, *frame; char *minp, *maxp; char tmp[64]; - if (!usec_to_msec(&min, &max, &mean, &dev)) + if (nsec_to_msec(&min, &max, &mean, &dev)) base = "(msec)"; + else if (nsec_to_usec(&min, &max, &mean, &dev)) + base = "(usec)"; - minp = num2str(min, 6, 1, 0, 0); - maxp = num2str(max, 6, 1, 0, 0); + minp = num2str(min, 6, 1, 0, N2S_NONE); + maxp = num2str(max, 6, 1, 0, N2S_NONE); sprintf(tmp, "%s %s", name, base); frame = gtk_frame_new(tmp); @@ -1045,7 +1025,7 @@ free(maxp); } -static GtkWidget *gfio_output_clat_percentiles(unsigned int *ovals, +static GtkWidget *gfio_output_clat_percentiles(unsigned long long *ovals, fio_fp64_t *plist, unsigned int len, const char *base, @@ -1056,10 +1036,10 @@ GtkTreeSelection *selection; GtkListStore *model; GtkTreeIter iter; - int i; + int i, j; for (i = 0; i < len; i++) - types[i] = G_TYPE_INT; + types[i] = G_TYPE_ULONG; model = gtk_list_store_newv(len, types); @@ -1082,15 +1062,15 @@ gtk_list_store_append(model, &iter); for (i = 0; i < len; i++) { - if (scale) + for (j = 0; j < scale; j++) ovals[i] = (ovals[i] + 999) / 1000; - gtk_list_store_set(model, &iter, i, ovals[i], -1); + gtk_list_store_set(model, &iter, i, (unsigned long) ovals[i], -1); } return tree_view; } -static struct graph *setup_clat_graph(char *title, unsigned int *ovals, +static struct graph *setup_clat_graph(char *title, unsigned long long *ovals, fio_fp64_t *plist, unsigned int len, double xdim, double ydim) @@ -1119,10 +1099,11 @@ GtkWidget *vbox, struct thread_stat *ts, int ddir) { - unsigned int *io_u_plat = ts->io_u_plat[ddir]; - unsigned long nr = 
ts->clat_stat[ddir].samples; + uint64_t *io_u_plat = ts->io_u_plat[ddir]; + unsigned long long nr = ts->clat_stat[ddir].samples; fio_fp64_t *plist = ts->percentile_list; - unsigned int *ovals, len, minv, maxv, scale_down; + unsigned int len, scale_down; + unsigned long long *ovals, minv, maxv; const char *base; GtkWidget *tree_view, *frame, *hbox, *drawing_area, *completion_vbox; struct gui_entry *ge = gc->ge; @@ -1133,18 +1114,25 @@ goto out; /* - * We default to usecs, but if the value range is such that we - * should scale down to msecs, do that. + * We default to nsecs, but if the value range is such that we + * should scale down to usecs or msecs, do that. */ - if (minv > 2000 && maxv > 99999) { - scale_down = 1; + if (minv > 2000000 && maxv > 99999999ULL) { + scale_down = 2; base = "msec"; - } else { - scale_down = 0; + } else if (minv > 2000 && maxv > 99999) { + scale_down = 1; base = "usec"; - } + } else { + scale_down = 0; + base = "nsec"; + } + + if (ts->clat_percentiles) + sprintf(tmp, "Completion percentiles (%s)", base); + else + sprintf(tmp, "Latency percentiles (%s)", base); - sprintf(tmp, "Completion percentiles (%s)", base); tree_view = gfio_output_clat_percentiles(ovals, plist, len, base, scale_down); ge->clat_graph = setup_clat_graph(tmp, ovals, plist, len, 700.0, 300.0); @@ -1178,11 +1166,13 @@ { const char *ddir_label[3] = { "Read", "Write", "Trim" }; GtkWidget *frame, *label, *box, *vbox, *main_vbox; - unsigned long min[3], max[3], runt; + unsigned long long min[3], max[3]; + unsigned long runt; unsigned long long bw, iops; unsigned int flags = 0; double mean[3], dev[3]; - char *io_p, *bw_p, *iops_p; + char *io_p, *io_palt, *bw_p, *bw_palt, *iops_p; + char tmp[128]; int i2p; if (!ts->runtime[ddir]) @@ -1192,11 +1182,9 @@ runt = ts->runtime[ddir]; bw = (1000 * ts->io_bytes[ddir]) / runt; - io_p = num2str(ts->io_bytes[ddir], 6, 1, i2p, 8); - bw_p = num2str(bw, 6, 1, i2p, ts->unit_base); iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt; - 
iops_p = num2str(iops, 6, 1, 0, 0); + iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_PERSEC); box = gtk_hbox_new(FALSE, 3); gtk_box_pack_start(GTK_BOX(mbox), box, TRUE, FALSE, 3); @@ -1211,9 +1199,17 @@ gtk_box_pack_start(GTK_BOX(main_vbox), box, TRUE, FALSE, 3); label = new_info_label_in_frame(box, "IO"); - gtk_label_set_text(GTK_LABEL(label), io_p); + io_p = num2str(ts->io_bytes[ddir], ts->sig_figs, 1, i2p, N2S_BYTE); + io_palt = num2str(ts->io_bytes[ddir], ts->sig_figs, 1, !i2p, N2S_BYTE); + snprintf(tmp, sizeof(tmp), "%s (%s)", io_p, io_palt); + gtk_label_set_text(GTK_LABEL(label), tmp); + label = new_info_label_in_frame(box, "Bandwidth"); - gtk_label_set_text(GTK_LABEL(label), bw_p); + bw_p = num2str(bw, ts->sig_figs, 1, i2p, ts->unit_base); + bw_palt = num2str(bw, ts->sig_figs, 1, !i2p, ts->unit_base); + snprintf(tmp, sizeof(tmp), "%s (%s)", bw_p, bw_palt); + gtk_label_set_text(GTK_LABEL(label), tmp); + label = new_info_label_in_frame(box, "IOPS"); gtk_label_set_text(GTK_LABEL(label), iops_p); label = new_info_label_in_frame(box, "Runtime (msec)"); @@ -1221,7 +1217,7 @@ if (calc_lat(&ts->bw_stat[ddir], &min[0], &max[0], &mean[0], &dev[0])) { double p_of_agg = 100.0; - const char *bw_str = "KB"; + const char *bw_str = "KiB/s"; char tmp[32]; if (rs->agg[ddir]) { @@ -1230,14 +1226,21 @@ p_of_agg = 100.0; } - if (mean[0] > 999999.9) { - min[0] /= 1000.0; - max[0] /= 1000.0; - mean[0] /= 1000.0; - dev[0] /= 1000.0; - bw_str = "MB"; + if (mean[0] > 1073741824.9) { + min[0] /= 1048576.0; + max[0] /= 1048576.0; + mean[0] /= 1048576.0; + dev[0] /= 1048576.0; + bw_str = "GiB/s"; } + if (mean[0] > 1047575.9) { + min[0] /= 1024.0; + max[0] /= 1024.0; + mean[0] /= 1024.0; + dev[0] /= 1024.0; + bw_str = "MiB/s"; + } sprintf(tmp, "Bandwidth (%s)", bw_str); frame = gtk_frame_new(tmp); gtk_box_pack_start(GTK_BOX(main_vbox), frame, FALSE, FALSE, 5); @@ -1287,6 +1290,8 @@ free(io_p); free(bw_p); + free(io_palt); + free(bw_palt); free(iops_p); } @@ -1393,7 +1398,6 @@ .stop = 
gfio_client_stop, .start = gfio_client_start, .job_start = gfio_client_job_start, - .iolog = gfio_client_iolog, .removed = gfio_client_removed, .eta_msec = FIO_CLIENT_DEF_ETA_MSEC, .stay_connected = 1, diff -Nru fio-2.1.3/gerror.c fio-3.16/gerror.c --- fio-2.1.3/gerror.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/gerror.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,5 +1,5 @@ #include -#include +#include #include #include diff -Nru fio-2.1.3/gettime.c fio-3.16/gettime.c --- fio-2.1.3/gettime.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/gettime.c 2019-09-20 01:01:52.000000000 +0000 @@ -2,37 +2,42 @@ * Clock functions */ -#include #include -#include -#include #include "fio.h" -#include "smalloc.h" - -#include "hash.h" #include "os/os.h" -#ifdef ARCH_HAVE_CPU_CLOCK -static unsigned long cycles_per_usec; -static unsigned long inv_cycles_per_usec; +#if defined(ARCH_HAVE_CPU_CLOCK) +#ifndef ARCH_CPU_CLOCK_CYCLES_PER_USEC +static unsigned long long cycles_per_msec; +static unsigned long long cycles_start; +static unsigned long long clock_mult; +static unsigned long long max_cycles_mask; +static unsigned long long nsecs_for_max_cycles; +static unsigned int clock_shift; +static unsigned int max_cycles_shift; +#define MAX_CLOCK_SEC 60*60 +#endif +#ifdef ARCH_CPU_CLOCK_WRAPS +static unsigned int cycles_wrap; +#endif #endif -int tsc_reliable = 0; +bool tsc_reliable = false; struct tv_valid { - struct timeval last_tv; - uint64_t last_cycles; - int last_tv_valid; + int warned; }; +#ifdef ARCH_HAVE_CPU_CLOCK #ifdef CONFIG_TLS_THREAD static __thread struct tv_valid static_tv_valid; #else static pthread_key_t tv_tls_key; #endif +#endif enum fio_cs fio_clock_source = FIO_PREFERRED_CLOCK_SOURCE; int fio_clock_source_set = 0; -enum fio_cs fio_clock_source_inited = CS_INVAL; +static enum fio_cs fio_clock_source_inited = CS_INVAL; #ifdef FIO_DEBUG_TIME @@ -64,7 +69,7 @@ return NULL; } -static struct gtod_log *find_log(void *caller) +static void inc_caller(void *caller) { struct 
gtod_log *log = find_hash(caller); @@ -80,16 +85,13 @@ flist_add_tail(&log->list, &hash[h]); } - return log; + log->calls++; } static void gtod_log_caller(void *caller) { - if (gtod_inited) { - struct gtod_log *log = find_log(caller); - - log->calls++; - } + if (gtod_inited) + inc_caller(caller); } static void fio_exit fio_dump_gtod(void) @@ -128,7 +130,9 @@ #ifdef CONFIG_CLOCK_GETTIME static int fill_clock_gettime(struct timespec *ts) { -#ifdef CONFIG_CLOCK_MONOTONIC +#if defined(CONFIG_CLOCK_MONOTONIC_RAW) + return clock_gettime(CLOCK_MONOTONIC_RAW, ts); +#elif defined(CONFIG_CLOCK_MONOTONIC) return clock_gettime(CLOCK_MONOTONIC, ts); #else return clock_gettime(CLOCK_REALTIME, ts); @@ -136,50 +140,58 @@ } #endif -static void *__fio_gettime(struct timeval *tp) +static void __fio_gettime(struct timespec *tp) { - struct tv_valid *tv; - -#ifdef CONFIG_TLS_THREAD - tv = &static_tv_valid; -#else - tv = pthread_getspecific(tv_tls_key); -#endif - switch (fio_clock_source) { #ifdef CONFIG_GETTIMEOFDAY - case CS_GTOD: - gettimeofday(tp, NULL); + case CS_GTOD: { + struct timeval tv; + gettimeofday(&tv, NULL); + + tp->tv_sec = tv.tv_sec; + tp->tv_nsec = tv.tv_usec * 1000; break; + } #endif #ifdef CONFIG_CLOCK_GETTIME case CS_CGETTIME: { - struct timespec ts; - - if (fill_clock_gettime(&ts) < 0) { + if (fill_clock_gettime(tp) < 0) { log_err("fio: clock_gettime fails\n"); assert(0); } - - tp->tv_sec = ts.tv_sec; - tp->tv_usec = ts.tv_nsec / 1000; break; } #endif #ifdef ARCH_HAVE_CPU_CLOCK case CS_CPUCLOCK: { - uint64_t usecs, t; + uint64_t nsecs, t, multiples; + struct tv_valid *tv; + +#ifdef CONFIG_TLS_THREAD + tv = &static_tv_valid; +#else + tv = pthread_getspecific(tv_tls_key); +#endif t = get_cpu_clock(); - if (tv && t < tv->last_cycles) { - dprint(FD_TIME, "CPU clock going back in time\n"); - t = tv->last_cycles; - } else if (tv) - tv->last_cycles = t; - - usecs = (t * inv_cycles_per_usec) / 16777216UL; - tp->tv_sec = usecs / 1000000; - tp->tv_usec = usecs % 1000000; 
+#ifdef ARCH_CPU_CLOCK_WRAPS + if (t < cycles_start && !cycles_wrap) + cycles_wrap = 1; + else if (cycles_wrap && t >= cycles_start && !tv->warned) { + log_err("fio: double CPU clock wrap\n"); + tv->warned = 1; + } +#endif +#ifdef ARCH_CPU_CLOCK_CYCLES_PER_USEC + nsecs = t / ARCH_CPU_CLOCK_CYCLES_PER_USEC * 1000; +#else + t -= cycles_start; + multiples = t >> max_cycles_shift; + nsecs = multiples * nsecs_for_max_cycles; + nsecs += ((t & max_cycles_mask) * clock_mult) >> clock_shift; +#endif + tp->tv_sec = nsecs / 1000000000ULL; + tp->tv_nsec = nsecs % 1000000000ULL; break; } #endif @@ -187,54 +199,33 @@ log_err("fio: invalid clock source %d\n", fio_clock_source); break; } - - return tv; } #ifdef FIO_DEBUG_TIME -void fio_gettime(struct timeval *tp, void *caller) +void fio_gettime(struct timespec *tp, void *caller) #else -void fio_gettime(struct timeval *tp, void fio_unused *caller) +void fio_gettime(struct timespec *tp, void fio_unused *caller) #endif { - struct tv_valid *tv; - #ifdef FIO_DEBUG_TIME if (!caller) caller = __builtin_return_address(0); gtod_log_caller(caller); #endif - if (fio_tv) { - memcpy(tp, fio_tv, sizeof(*tp)); + if (fio_unlikely(fio_gettime_offload(tp))) return; - } - - tv = __fio_gettime(tp); - /* - * If Linux is using the tsc clock on non-synced processors, - * sometimes time can appear to drift backwards. Fix that up. 
- */ - if (tv) { - if (tv->last_tv_valid) { - if (tp->tv_sec < tv->last_tv.tv_sec) - tp->tv_sec = tv->last_tv.tv_sec; - else if (tv->last_tv.tv_sec == tp->tv_sec && - tp->tv_usec < tv->last_tv.tv_usec) - tp->tv_usec = tv->last_tv.tv_usec; - } - tv->last_tv_valid = 1; - memcpy(&tv->last_tv, tp, sizeof(*tp)); - } + __fio_gettime(tp); } -#ifdef ARCH_HAVE_CPU_CLOCK -static unsigned long get_cycles_per_usec(void) +#if defined(ARCH_HAVE_CPU_CLOCK) && !defined(ARCH_CPU_CLOCK_CYCLES_PER_USEC) +static unsigned long get_cycles_per_msec(void) { - struct timeval s, e; + struct timespec s, e; uint64_t c_s, c_e; enum fio_cs old_cs = fio_clock_source; + uint64_t elapsed; #ifdef CONFIG_CLOCK_GETTIME fio_clock_source = CS_CGETTIME; @@ -245,19 +236,16 @@ c_s = get_cpu_clock(); do { - uint64_t elapsed; - __fio_gettime(&e); + c_e = get_cpu_clock(); - elapsed = utime_since(&s, &e); - if (elapsed >= 1280) { - c_e = get_cpu_clock(); + elapsed = ntime_since(&s, &e); + if (elapsed >= 1280000) break; - } } while (1); fio_clock_source = old_cs; - return (c_e - c_s + 127) >> 7; + return (c_e - c_s) * 1000000 / elapsed; } #define NR_TIME_ITERS 50 @@ -265,13 +253,14 @@ static int calibrate_cpu_clock(void) { double delta, mean, S; - uint64_t avg, cycles[NR_TIME_ITERS]; - int i, samples; + uint64_t minc, maxc, avg, cycles[NR_TIME_ITERS]; + int i, samples, sft = 0; + unsigned long long tmp, max_ticks, max_mult; - cycles[0] = get_cycles_per_usec(); + cycles[0] = get_cycles_per_msec(); S = delta = mean = 0.0; for (i = 0; i < NR_TIME_ITERS; i++) { - cycles[i] = get_cycles_per_usec(); + cycles[i] = get_cycles_per_msec(); delta = cycles[i] - mean; if (delta) { mean += delta / (i + 1.0); @@ -288,10 +277,14 @@ S = sqrt(S / (NR_TIME_ITERS - 1.0)); - samples = avg = 0; + minc = -1ULL; + maxc = samples = avg = 0; for (i = 0; i < NR_TIME_ITERS; i++) { double this = cycles[i]; + minc = min(cycles[i], minc); + maxc = max(cycles[i], maxc); + if ((fmax(this, mean) - fmin(this, mean)) > S) continue; samples++; @@ 
-299,37 +292,95 @@ } S /= (double) NR_TIME_ITERS; - mean /= 10.0; for (i = 0; i < NR_TIME_ITERS; i++) - dprint(FD_TIME, "cycles[%d]=%llu\n", i, - (unsigned long long) cycles[i] / 10); + dprint(FD_TIME, "cycles[%d]=%llu\n", i, (unsigned long long) cycles[i]); avg /= samples; - avg = (avg + 5) / 10; - dprint(FD_TIME, "avg: %llu\n", (unsigned long long) avg); - dprint(FD_TIME, "mean=%f, S=%f\n", mean, S); - - cycles_per_usec = avg; - inv_cycles_per_usec = 16777216UL / cycles_per_usec; - dprint(FD_TIME, "inv_cycles_per_usec=%lu\n", inv_cycles_per_usec); + cycles_per_msec = avg; + dprint(FD_TIME, "min=%llu, max=%llu, mean=%f, S=%f, N=%d\n", + (unsigned long long) minc, + (unsigned long long) maxc, mean, S, NR_TIME_ITERS); + dprint(FD_TIME, "trimmed mean=%llu, N=%d\n", (unsigned long long) avg, samples); + + max_ticks = MAX_CLOCK_SEC * cycles_per_msec * 1000ULL; + max_mult = ULLONG_MAX / max_ticks; + dprint(FD_TIME, "\n\nmax_ticks=%llu, __builtin_clzll=%d, " + "max_mult=%llu\n", max_ticks, + __builtin_clzll(max_ticks), max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier that does not exceed max_mult + */ + tmp = max_mult * cycles_per_msec / 1000000; + while (tmp > 1) { + tmp >>= 1; + sft++; + dprint(FD_TIME, "tmp=%llu, sft=%u\n", tmp, sft); + } + + clock_shift = sft; + clock_mult = (1ULL << sft) * 1000000 / cycles_per_msec; + dprint(FD_TIME, "clock_shift=%u, clock_mult=%llu\n", clock_shift, + clock_mult); + + /* + * Find the greatest power of 2 clock ticks that is less than the + * ticks in MAX_CLOCK_SEC_2STAGE + */ + max_cycles_shift = max_cycles_mask = 0; + tmp = MAX_CLOCK_SEC * 1000ULL * cycles_per_msec; + dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp, + max_cycles_shift); + while (tmp > 1) { + tmp >>= 1; + max_cycles_shift++; + dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + } + /* + * if use use (1ULL << max_cycles_shift) * 1000 / cycles_per_msec + * here we will have a discontinuity every + * 
(1ULL << max_cycles_shift) cycles + */ + nsecs_for_max_cycles = ((1ULL << max_cycles_shift) * clock_mult) + >> clock_shift; + + /* Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) */ + for (tmp = 0; tmp < max_cycles_shift; tmp++) + max_cycles_mask |= 1ULL << tmp; + + dprint(FD_TIME, "max_cycles_shift=%u, 2^max_cycles_shift=%llu, " + "nsecs_for_max_cycles=%llu, " + "max_cycles_mask=%016llx\n", + max_cycles_shift, (1ULL << max_cycles_shift), + nsecs_for_max_cycles, max_cycles_mask); + + cycles_start = get_cpu_clock(); + dprint(FD_TIME, "cycles_start=%llu\n", cycles_start); return 0; } #else static int calibrate_cpu_clock(void) { +#ifdef ARCH_CPU_CLOCK_CYCLES_PER_USEC + return 0; +#else return 1; -} #endif +} +#endif // ARCH_HAVE_CPU_CLOCK #ifndef CONFIG_TLS_THREAD -void fio_local_clock_init(int is_thread) +void fio_local_clock_init(void) { struct tv_valid *t; - t = calloc(sizeof(*t), 1); - if (pthread_setspecific(tv_tls_key, t)) + t = calloc(1, sizeof(*t)); + if (pthread_setspecific(tv_tls_key, t)) { log_err("fio: can't set TLS key\n"); + assert(0); + } } static void kill_tv_tls_key(void *data) @@ -337,7 +388,7 @@ free(data); } #else -void fio_local_clock_init(int is_thread) +void fio_local_clock_init(void) { } #endif @@ -355,7 +406,7 @@ fio_clock_source_inited = fio_clock_source; if (calibrate_cpu_clock()) - tsc_reliable = 0; + tsc_reliable = false; /* * If the arch sets tsc_reliable != 0, then it must be good enough @@ -363,19 +414,47 @@ * runs at a constant rate and is synced across CPU cores. 
*/ if (tsc_reliable) { - if (!fio_clock_source_set) + if (!fio_clock_source_set && !fio_monotonic_clocktest(0)) fio_clock_source = CS_CPUCLOCK; } else if (fio_clock_source == CS_CPUCLOCK) log_info("fio: clocksource=cpu may not be reliable\n"); + dprint(FD_TIME, "gettime: clocksource=%d\n", (int) fio_clock_source); +} + +uint64_t ntime_since(const struct timespec *s, const struct timespec *e) +{ + int64_t sec, nsec; + + sec = e->tv_sec - s->tv_sec; + nsec = e->tv_nsec - s->tv_nsec; + if (sec > 0 && nsec < 0) { + sec--; + nsec += 1000000000LL; + } + + /* + * time warp bug on some kernels? + */ + if (sec < 0 || (sec == 0 && nsec < 0)) + return 0; + + return nsec + (sec * 1000000000LL); } -uint64_t utime_since(struct timeval *s, struct timeval *e) +uint64_t ntime_since_now(const struct timespec *s) { - long sec, usec; - uint64_t ret; + struct timespec now; + + fio_gettime(&now, NULL); + return ntime_since(s, &now); +} + +uint64_t utime_since(const struct timespec *s, const struct timespec *e) +{ + int64_t sec, usec; sec = e->tv_sec - s->tv_sec; - usec = e->tv_usec - s->tv_usec; + usec = (e->tv_nsec - s->tv_nsec) / 1000; if (sec > 0 && usec < 0) { sec--; usec += 1000000; @@ -387,25 +466,29 @@ if (sec < 0 || (sec == 0 && usec < 0)) return 0; - ret = sec * 1000000ULL + usec; - - return ret; + return usec + (sec * 1000000); } -uint64_t utime_since_now(struct timeval *s) +uint64_t utime_since_now(const struct timespec *s) { - struct timeval t; + struct timespec t; +#ifdef FIO_DEBUG_TIME + void *p = __builtin_return_address(0); + fio_gettime(&t, p); +#else fio_gettime(&t, NULL); +#endif + return utime_since(s, &t); } -uint64_t mtime_since(struct timeval *s, struct timeval *e) +uint64_t mtime_since_tv(const struct timeval *s, const struct timeval *e) { - long sec, usec, ret; + int64_t sec, usec; sec = e->tv_sec - s->tv_sec; - usec = e->tv_usec - s->tv_usec; + usec = (e->tv_usec - s->tv_usec); if (sec > 0 && usec < 0) { sec--; usec += 1000000; @@ -414,31 +497,54 @@ if (sec < 0 
|| (sec == 0 && usec < 0)) return 0; - sec *= 1000UL; - usec /= 1000UL; - ret = sec + usec; - - return ret; + sec *= 1000; + usec /= 1000; + return sec + usec; } -uint64_t mtime_since_now(struct timeval *s) +uint64_t mtime_since_now(const struct timespec *s) { - struct timeval t; + struct timespec t; +#ifdef FIO_DEBUG_TIME void *p = __builtin_return_address(0); fio_gettime(&t, p); +#else + fio_gettime(&t, NULL); +#endif + return mtime_since(s, &t); } -uint64_t time_since_now(struct timeval *s) +uint64_t mtime_since(const struct timespec *s, const struct timespec *e) +{ + int64_t sec, usec; + + sec = e->tv_sec - s->tv_sec; + usec = (e->tv_nsec - s->tv_nsec) / 1000; + if (sec > 0 && usec < 0) { + sec--; + usec += 1000000; + } + + if (sec < 0 || (sec == 0 && usec < 0)) + return 0; + + sec *= 1000; + usec /= 1000; + return sec + usec; +} + +uint64_t time_since_now(const struct timespec *s) { return mtime_since_now(s) / 1000; } #if defined(FIO_HAVE_CPU_AFFINITY) && defined(ARCH_HAVE_CPU_CLOCK) && \ - defined(CONFIG_SFAA) + defined(CONFIG_SYNC_SYNC) && defined(CONFIG_CMP_SWAP) -#define CLOCK_ENTRIES 100000 +#define CLOCK_ENTRIES_DEBUG 100000 +#define CLOCK_ENTRIES_TEST 1000 struct clock_entry { uint32_t seq; @@ -449,15 +555,17 @@ struct clock_thread { pthread_t thread; int cpu; - pthread_mutex_t lock; - pthread_mutex_t started; + int debug; + struct fio_sem lock; + unsigned long nr_entries; uint32_t *seq; struct clock_entry *entries; }; -static inline uint32_t atomic32_inc_return(uint32_t *seq) +static inline uint32_t atomic32_compare_and_swap(uint32_t *ptr, uint32_t old, + uint32_t new) { - return 1 + __sync_fetch_and_add(seq, 1); + return __sync_val_compare_and_swap(ptr, old, new); } static void *clock_thread_fn(void *data) @@ -465,49 +573,70 @@ struct clock_thread *t = data; struct clock_entry *c; os_cpu_mask_t cpu_mask; - uint32_t last_seq; + unsigned long long first; int i; - memset(&cpu_mask, 0, sizeof(cpu_mask)); + if (fio_cpuset_init(&cpu_mask)) { + int __err = 
errno; + + log_err("clock cpuset init failed: %s\n", strerror(__err)); + goto err_out; + } + fio_cpu_set(&cpu_mask, t->cpu); if (fio_setaffinity(gettid(), cpu_mask) == -1) { - log_err("clock setaffinity failed\n"); - return (void *) 1; + int __err = errno; + + log_err("clock setaffinity failed: %s\n", strerror(__err)); + goto err; } - pthread_mutex_lock(&t->lock); - pthread_mutex_unlock(&t->started); + fio_sem_down(&t->lock); - last_seq = 0; + first = get_cpu_clock(); c = &t->entries[0]; - for (i = 0; i < CLOCK_ENTRIES; i++, c++) { + for (i = 0; i < t->nr_entries; i++, c++) { uint32_t seq; uint64_t tsc; c->cpu = t->cpu; do { - seq = atomic32_inc_return(t->seq); - if (seq < last_seq) + seq = *t->seq; + if (seq == UINT_MAX) break; + __sync_synchronize(); tsc = get_cpu_clock(); - } while (seq != *t->seq); + } while (seq != atomic32_compare_and_swap(t->seq, seq, seq + 1)); + + if (seq == UINT_MAX) + break; c->seq = seq; c->tsc = tsc; } - log_info("cs: cpu%3d: %llu clocks seen\n", t->cpu, - (unsigned long long) t->entries[i - 1].tsc - t->entries[0].tsc); + if (t->debug) { + unsigned long long clocks; + + clocks = t->entries[i - 1].tsc - t->entries[0].tsc; + log_info("cs: cpu%3d: %llu clocks seen, first %llu\n", t->cpu, + clocks, first); + } /* * The most common platform clock breakage is returning zero * indefinitely. Check for that and return failure. 
*/ - if (!t->entries[i - 1].tsc && !t->entries[0].tsc) - return (void *) 1; + if (i > 1 && !t->entries[i - 1].tsc && !t->entries[0].tsc) + goto err; + fio_cpuset_exit(&cpu_mask); return NULL; +err: + fio_cpuset_exit(&cpu_mask); +err_out: + return (void *) 1; } static int clock_cmp(const void *p1, const void *p2) @@ -521,69 +650,84 @@ return c1->seq - c2->seq; } -int fio_monotonic_clocktest(void) +int fio_monotonic_clocktest(int debug) { - struct clock_thread *threads; + struct clock_thread *cthreads; unsigned int nr_cpus = cpus_online(); struct clock_entry *entries; - unsigned long tentries, failed; + unsigned long nr_entries, tentries, failed = 0; struct clock_entry *prev, *this; uint32_t seq = 0; - int i; + unsigned int i; - log_info("cs: reliable_tsc: %s\n", tsc_reliable ? "yes" : "no"); + if (debug) { + log_info("cs: reliable_tsc: %s\n", tsc_reliable ? "yes" : "no"); + +#ifdef FIO_INC_DEBUG + fio_debug |= 1U << FD_TIME; +#endif + nr_entries = CLOCK_ENTRIES_DEBUG; + } else + nr_entries = CLOCK_ENTRIES_TEST; - fio_debug |= 1U << FD_TIME; calibrate_cpu_clock(); - fio_debug &= ~(1U << FD_TIME); - threads = malloc(nr_cpus * sizeof(struct clock_thread)); - tentries = CLOCK_ENTRIES * nr_cpus; + if (debug) { +#ifdef FIO_INC_DEBUG + fio_debug &= ~(1U << FD_TIME); +#endif + } + + cthreads = malloc(nr_cpus * sizeof(struct clock_thread)); + tentries = nr_entries * nr_cpus; entries = malloc(tentries * sizeof(struct clock_entry)); - log_info("cs: Testing %u CPUs\n", nr_cpus); + if (debug) + log_info("cs: Testing %u CPUs\n", nr_cpus); for (i = 0; i < nr_cpus; i++) { - struct clock_thread *t = &threads[i]; + struct clock_thread *t = &cthreads[i]; t->cpu = i; + t->debug = debug; t->seq = &seq; - t->entries = &entries[i * CLOCK_ENTRIES]; - pthread_mutex_init(&t->lock, NULL); - pthread_mutex_init(&t->started, NULL); - pthread_mutex_lock(&t->lock); - pthread_create(&t->thread, NULL, clock_thread_fn, t); + t->nr_entries = nr_entries; + t->entries = &entries[i * nr_entries]; + 
__fio_sem_init(&t->lock, FIO_SEM_LOCKED); + if (pthread_create(&t->thread, NULL, clock_thread_fn, t)) { + failed++; + nr_cpus = i; + break; + } } for (i = 0; i < nr_cpus; i++) { - struct clock_thread *t = &threads[i]; + struct clock_thread *t = &cthreads[i]; - pthread_mutex_lock(&t->started); + fio_sem_up(&t->lock); } for (i = 0; i < nr_cpus; i++) { - struct clock_thread *t = &threads[i]; - - pthread_mutex_unlock(&t->lock); - } - - for (failed = i = 0; i < nr_cpus; i++) { - struct clock_thread *t = &threads[i]; + struct clock_thread *t = &cthreads[i]; void *ret; pthread_join(t->thread, &ret); if (ret) failed++; + __fio_sem_remove(&t->lock); } - free(threads); + free(cthreads); if (failed) { - log_err("Clocksource test: %lu threads failed\n", failed); + if (debug) + log_err("Clocksource test: %lu threads failed\n", failed); goto err; } qsort(entries, tentries, sizeof(struct clock_entry), clock_cmp); + /* silence silly gcc */ + prev = NULL; for (failed = i = 0; i < tentries; i++) { this = &entries[i]; @@ -595,6 +739,11 @@ if (prev->tsc > this->tsc) { uint64_t diff = prev->tsc - this->tsc; + if (!debug) { + failed++; + break; + } + log_info("cs: CPU clock mismatch (diff=%llu):\n", (unsigned long long) diff); log_info("\t CPU%3u: TSC=%llu, SEQ=%u\n", prev->cpu, (unsigned long long) prev->tsc, prev->seq); @@ -605,11 +754,12 @@ prev = this; } - if (failed) - log_info("cs: Failed: %lu\n", failed); - else - log_info("cs: Pass!\n"); - + if (debug) { + if (failed) + log_info("cs: Failed: %lu\n", failed); + else + log_info("cs: Pass!\n"); + } err: free(entries); return !!failed; @@ -617,10 +767,11 @@ #else /* defined(FIO_HAVE_CPU_AFFINITY) && defined(ARCH_HAVE_CPU_CLOCK) */ -int fio_monotonic_clocktest(void) +int fio_monotonic_clocktest(int debug) { - log_info("cs: current platform does not support CPU clocks\n"); - return 0; + if (debug) + log_info("cs: current platform does not support CPU clocks\n"); + return 1; } #endif diff -Nru fio-2.1.3/gettime.h fio-3.16/gettime.h --- 
fio-2.1.3/gettime.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/gettime.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,6 +1,10 @@ #ifndef FIO_GETTIME_H #define FIO_GETTIME_H +#include + +#include "arch/arch.h" + /* * Clock sources */ @@ -11,13 +15,31 @@ CS_INVAL, }; -extern void fio_gettime(struct timeval *, void *); +extern void fio_gettime(struct timespec *, void *); extern void fio_gtod_init(void); extern void fio_clock_init(void); extern int fio_start_gtod_thread(void); -extern int fio_monotonic_clocktest(void); -extern void fio_local_clock_init(int); +extern int fio_monotonic_clocktest(int debug); +extern void fio_local_clock_init(void); + +extern struct timespec *fio_ts; + +static inline int fio_gettime_offload(struct timespec *ts) +{ + time_t last_sec; + + if (!fio_ts) + return 0; + + do { + read_barrier(); + last_sec = ts->tv_sec = fio_ts->tv_sec; + ts->tv_nsec = fio_ts->tv_nsec; + } while (fio_ts->tv_sec != last_sec); + + return 1; +} -extern struct timeval *fio_tv; +extern void fio_gtod_set_cpu(unsigned int cpu); #endif diff -Nru fio-2.1.3/gettime-thread.c fio-3.16/gettime-thread.c --- fio-2.1.3/gettime-thread.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/gettime-thread.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,34 +1,53 @@ -#include -#include #include #include #include "fio.h" #include "smalloc.h" -struct timeval *fio_tv = NULL; +struct timespec *fio_ts = NULL; int fio_gtod_offload = 0; -int fio_gtod_cpu = -1; static pthread_t gtod_thread; +static os_cpu_mask_t fio_gtod_cpumask; void fio_gtod_init(void) { - fio_tv = smalloc(sizeof(struct timeval)); - if (!fio_tv) - log_err("fio: smalloc pool exhausted\n"); + if (fio_ts) + return; + + fio_ts = smalloc(sizeof(*fio_ts)); } static void fio_gtod_update(void) { - if (fio_tv) - gettimeofday(fio_tv, NULL); + if (fio_ts) { + struct timeval __tv; + + gettimeofday(&__tv, NULL); + fio_ts->tv_sec = __tv.tv_sec; + write_barrier(); + fio_ts->tv_nsec = __tv.tv_usec * 1000; + write_barrier(); + } } +struct 
gtod_cpu_data { + struct fio_sem *sem; + unsigned int cpu; +}; + static void *gtod_thread_main(void *data) { - struct fio_mutex *mutex = data; + struct fio_sem *sem = data; + int ret; + + ret = fio_setaffinity(gettid(), fio_gtod_cpumask); - fio_mutex_up(mutex); + fio_sem_up(sem); + + if (ret == -1) { + log_err("gtod: setaffinity failed\n"); + return NULL; + } /* * As long as we have jobs around, update the clock. It would be nice @@ -46,17 +65,17 @@ int fio_start_gtod_thread(void) { - struct fio_mutex *mutex; + struct fio_sem *sem; pthread_attr_t attr; int ret; - mutex = fio_mutex_init(FIO_MUTEX_LOCKED); - if (!mutex) + sem = fio_sem_init(FIO_SEM_LOCKED); + if (!sem) return 1; pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN); - ret = pthread_create(>od_thread, &attr, gtod_thread_main, NULL); + pthread_attr_setstacksize(&attr, 2 * PTHREAD_STACK_MIN); + ret = pthread_create(>od_thread, &attr, gtod_thread_main, sem); pthread_attr_destroy(&attr); if (ret) { log_err("Can't create gtod thread: %s\n", strerror(ret)); @@ -65,16 +84,21 @@ ret = pthread_detach(gtod_thread); if (ret) { - log_err("Can't detatch gtod thread: %s\n", strerror(ret)); + log_err("Can't detach gtod thread: %s\n", strerror(ret)); goto err; } - dprint(FD_MUTEX, "wait on startup_mutex\n"); - fio_mutex_down(mutex); - dprint(FD_MUTEX, "done waiting on startup_mutex\n"); + dprint(FD_MUTEX, "wait on startup_sem\n"); + fio_sem_down(sem); + dprint(FD_MUTEX, "done waiting on startup_sem\n"); err: - fio_mutex_remove(mutex); + fio_sem_remove(sem); return ret; } - +void fio_gtod_set_cpu(unsigned int cpu) +{ +#ifdef FIO_HAVE_CPU_AFFINITY + fio_cpu_set(&fio_gtod_cpumask, cpu); +#endif +} diff -Nru fio-2.1.3/gfio.c fio-3.16/gfio.c --- fio-2.1.3/gfio.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/gfio.c 2019-09-20 01:01:52.000000000 +0000 @@ -18,12 +18,13 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free 
Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * */ #include -#include +#include #include +#include #include #include @@ -37,7 +38,7 @@ #include "gclient.h" #include "graph.h" -static int gfio_server_running; +static bool gfio_server_running; static unsigned int gfio_graph_limit = 100; GdkColor gfio_color_white; @@ -444,12 +445,12 @@ while (!flist_empty(&gc->o_list)) { struct gfio_client_options *gco; - gco = flist_entry(gc->o_list.next, struct gfio_client_options, list); + gco = flist_first_entry(&gc->o_list, struct gfio_client_options, list); flist_del(&gco->list); free(gco); } - ret = fio_client_send_ini(gc->client, ge->job_file); + ret = fio_client_send_ini(gc->client, ge->job_file, false); if (!ret) return 0; @@ -459,17 +460,19 @@ static void *server_thread(void *arg) { - is_backend = 1; - gfio_server_running = 1; + fio_server_create_sk_key(); + is_backend = true; + gfio_server_running = true; fio_start_server(NULL); - gfio_server_running = 0; + gfio_server_running = false; + fio_server_destroy_sk_key(); return NULL; } static void gfio_start_server(struct gui *ui) { if (!gfio_server_running) { - gfio_server_running = 1; + gfio_server_running = true; pthread_create(&ui->server_t, NULL, server_thread, NULL); pthread_detach(ui->server_t); } @@ -1213,7 +1216,7 @@ { const char *authors[] = { "Jens Axboe ", - "Stephen Carmeron ", + "Stephen Cameron ", NULL }; const char *license[] = { @@ -1238,10 +1241,10 @@ "program-name", "gfio", "comments", "Gtk2 UI for fio", "license", license_trans, - "website", "http://git.kernel.dk/?p=fio.git;a=summary", + "website", "http://git.kernel.dk/cgit/fio/", "authors", authors, "version", fio_version_string, - "copyright", "© 2012 Jens Axboe ", + "copyright", "© 2012-2017 Jens Axboe ", "logo-icon-name", "fio", /* Must be last: */ "wrap-license", TRUE, @@ -1269,7 +1272,7 @@ { "Quit", GTK_STOCK_QUIT, NULL, "Q", NULL, 
G_CALLBACK(quit_clicked) }, { "About", GTK_STOCK_ABOUT, NULL, NULL, NULL, G_CALLBACK(about_dialog) }, }; -static gint nmenu_items = sizeof(menu_items) / sizeof(menu_items[0]); +static gint nmenu_items = ARRAY_SIZE(menu_items); static const gchar *ui_string = " \ \ @@ -1384,7 +1387,7 @@ g_signal_connect(ge->eta.names, "changed", G_CALLBACK(combo_entry_changed), ge); g_signal_connect(ge->eta.names, "destroy", G_CALLBACK(combo_entry_destroy), ge); ge->eta.iotype.entry = new_info_entry_in_frame(probe_box, "IO"); - ge->eta.bs.entry = new_info_entry_in_frame(probe_box, "Blocksize (Read/Write)"); + ge->eta.bs.entry = new_info_entry_in_frame(probe_box, "Blocksize (Read/Write/Trim)"); ge->eta.ioengine.entry = new_info_entry_in_frame(probe_box, "IO Engine"); ge->eta.iodepth.entry = new_info_entry_in_frame(probe_box, "IO Depth"); ge->eta.jobs = new_info_entry_in_frame(probe_box, "Jobs"); @@ -1393,11 +1396,11 @@ probe_box = gtk_hbox_new(FALSE, 3); gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, FALSE, FALSE, 3); ge->eta.read_bw = new_info_entry_in_frame_rgb(probe_box, "Read BW", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); - ge->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); + ge->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "Read IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B); ge->eta.write_bw = new_info_entry_in_frame_rgb(probe_box, "Write BW", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); - ge->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); + ge->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "Write IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B); ge->eta.trim_bw = new_info_entry_in_frame_rgb(probe_box, "Trim BW", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); - ge->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B); + ge->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "Trim IOPS", GFIO_TRIM_R, 
GFIO_TRIM_G, GFIO_TRIM_B); /* * Only add this if we have a commit rate @@ -1677,7 +1680,7 @@ * Without it, the update that happens in gfio_update_thread_status * doesn't really happen in a timely fashion, you need expose events */ -#if !GTK_CHECK_VERSION(2, 24, 0) +#if !GLIB_CHECK_VERSION(2, 31, 0) if (!g_thread_supported()) g_thread_init(NULL); #endif @@ -1687,7 +1690,9 @@ gtk_init(argc, argv); settings = gtk_settings_get_default(); gtk_settings_set_long_property(settings, "gtk_tooltip_timeout", 10, "gfio setting"); +#if !GLIB_CHECK_VERSION(2, 36, 0) g_type_init(); +#endif gdk_color_parse("#fffff4", &gfio_color_lightyellow); gdk_color_parse("white", &gfio_color_white); diff -Nru fio-2.1.3/.gitignore fio-3.16/.gitignore --- fio-2.1.3/.gitignore 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/.gitignore 2019-09-20 01:01:52.000000000 +0000 @@ -1,5 +1,6 @@ *.d *.o +*.exe /.depend /FIO-VERSION-FILE /config-host.h @@ -7,3 +8,20 @@ /config.log /cscope.out /fio +/gfio +/t/axmap +/t/fio-btrace2fio +/t/fio-dedupe +/t/fio-genzipf +/t/fio-verify-state +/t/gen-rand +/t/ieee754 +/t/lfsr-test +/t/stest +/unittests/unittest +y.tab.* +lex.yy.c +*.un~ +doc/output +/tags +/TAGS diff -Nru fio-2.1.3/goptions.c fio-3.16/goptions.c --- fio-2.1.3/goptions.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/goptions.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -11,6 +11,7 @@ #include "ghelpers.h" #include "gerror.h" #include "parse.h" +#include "optgroup.h" struct gopt { GtkWidget *box; @@ -92,10 +93,10 @@ static GNode *gopt_dep_tree; static GtkWidget *gopt_get_group_frame(struct gopt_job_view *gjv, - GtkWidget *box, unsigned int groupmask) + GtkWidget *box, uint64_t groupmask) { - unsigned int mask, group; - struct opt_group *og; + uint64_t mask, group; + const struct opt_group *og; GtkWidget *frame, *hbox; struct gopt_frame_widget *gfw; @@ -107,7 +108,7 @@ if (!og) return NULL; - group = ffz(~groupmask); + group = 
ffz64(~groupmask); gfw = &gjv->g_widgets[group]; if (!gfw->vbox[0]) { frame = gtk_frame_new(og->name); @@ -825,7 +826,7 @@ unsigned long long *p, unsigned int idx) { struct gopt_str_val *g; - const gchar *postfix[] = { "B", "KB", "MB", "GB", "PB", "TB", "" }; + const gchar *postfix[] = { "B", "KiB", "MiB", "GiB", "PiB", "PiB", "" }; GtkWidget *label; int i; @@ -874,7 +875,7 @@ struct gopt_str_val *g; if (o->off1) - ullp = td_var(to, o->off1); + ullp = td_var(to, o, o->off1); g = container_of(gopt, struct gopt_str_val, gopt); if (ullp) @@ -886,7 +887,7 @@ struct gopt_int *i; if (o->off1) - ullp = td_var(to, o->off1); + ullp = td_var(to, o, o->off1); i = container_of(gopt, struct gopt_int, gopt); if (ullp) @@ -899,7 +900,7 @@ struct gopt_combo *c; if (o->off1) - ip = td_var(to, o->off1); + ip = td_var(to, o, o->off1); c = container_of(gopt, struct gopt_combo, gopt); if (ip) @@ -909,7 +910,7 @@ struct gopt_int *i; if (o->off1) - ip = td_var(to, o->off1); + ip = td_var(to, o, o->off1); i = container_of(gopt, struct gopt_int, gopt); if (ip) @@ -922,7 +923,7 @@ struct gopt_bool *b; if (o->off1) - ip = td_var(to, o->off1); + ip = td_var(to, o, o->off1); b = container_of(gopt, struct gopt_bool, gopt); if (ip) @@ -935,7 +936,7 @@ struct gopt_combo *c; if (o->off1) - ip = td_var(to, o->off1); + ip = td_var(to, o, o->off1); c = container_of(gopt, struct gopt_combo, gopt); if (ip) @@ -945,7 +946,7 @@ char *text = NULL; if (o->off1) { - char **p = td_var(to, o->off1); + char **p = td_var(to, o, o->off1); text = *p; } @@ -961,7 +962,7 @@ char *text = NULL; if (o->off1) { - char **p = td_var(to, o->off1); + char **p = td_var(to, o, o->off1); text = *p; } @@ -983,10 +984,10 @@ break; case FIO_OPT_RANGE: { struct gopt_range *r; - unsigned int *ip[4] = { td_var(to, o->off1), - td_var(to, o->off2), - td_var(to, o->off3), - td_var(to, o->off4) }; + unsigned int *ip[4] = { td_var(to, o, o->off1), + td_var(to, o, o->off2), + td_var(to, o, o->off3), + td_var(to, o, o->off4) }; r = 
container_of(gopt, struct gopt_range, gopt); gopt_int_range_set_val(r, *ip); @@ -1014,7 +1015,7 @@ unsigned long long *ullp = NULL; if (o->off1) - ullp = td_var(to, o->off1); + ullp = td_var(to, o, o->off1); go = gopt_new_str_val(gjv, o, ullp, opt_index); break; @@ -1023,7 +1024,7 @@ unsigned long long *ullp = NULL; if (o->off1) - ullp = td_var(to, o->off1); + ullp = td_var(to, o, o->off1); go = gopt_new_ullong(gjv, o, ullp, opt_index); break; @@ -1033,14 +1034,14 @@ unsigned int *ip = NULL; if (o->off1) - ip = td_var(to, o->off1); + ip = td_var(to, o, o->off1); go = gopt_new_combo_int(gjv, o, ip, opt_index); } else { unsigned int *ip = NULL; if (o->off1) - ip = td_var(to, o->off1); + ip = td_var(to, o, o->off1); go = gopt_new_int(gjv, o, ip, opt_index); } @@ -1050,7 +1051,7 @@ unsigned int *ip = NULL; if (o->off1) - ip = td_var(to, o->off1); + ip = td_var(to, o, o->off1); go = gopt_new_bool(gjv, o, ip, opt_index); break; @@ -1060,7 +1061,7 @@ unsigned int *ip = NULL; if (o->off1) - ip = td_var(to, o->off1); + ip = td_var(to, o, o->off1); go = gopt_new_combo_int(gjv, o, ip, opt_index); } else { @@ -1074,7 +1075,7 @@ char *text = NULL; if (o->off1) { - char **p = td_var(to, o->off1); + char **p = td_var(to, o, o->off1); text = *p; } @@ -1090,10 +1091,10 @@ go = gopt_new_str_multi(gjv, o, opt_index); break; case FIO_OPT_RANGE: { - unsigned int *ip[4] = { td_var(to, o->off1), - td_var(to, o->off2), - td_var(to, o->off3), - td_var(to, o->off4) }; + unsigned int *ip[4] = { td_var(to, o, o->off1), + td_var(to, o, o->off2), + td_var(to, o, o->off3), + td_var(to, o, o->off4) }; go = gopt_new_int_range(gjv, o, ip, opt_index); break; @@ -1135,11 +1136,11 @@ */ for (i = 0; fio_options[i].name; i++) { struct fio_option *o = &fio_options[i]; - unsigned int mask = o->category; - struct opt_group *og; + uint64_t mask = o->category; + const struct opt_group *og; while ((og = opt_group_from_mask(&mask)) != NULL) { - GtkWidget *vbox = gjv->vboxes[ffz(~og->mask)]; + GtkWidget *vbox = 
gjv->vboxes[ffz64(~og->mask)]; hbox = gtk_hbox_new(FALSE, 3); gtk_box_pack_start(GTK_BOX(vbox), hbox, FALSE, FALSE, 5); @@ -1177,19 +1178,20 @@ return vbox; } -static GtkWidget *gopt_add_group_tab(GtkWidget *notebook, struct opt_group *og) +static GtkWidget *gopt_add_group_tab(GtkWidget *notebook, + const struct opt_group *og) { return gopt_add_tab(notebook, og->name); } static void gopt_add_group_tabs(GtkWidget *notebook, struct gopt_job_view *gjv) { - struct opt_group *og; + const struct opt_group *og; unsigned int i; i = 0; do { - unsigned int mask = (1U << i); + uint64_t mask = (1ULL << i); og = opt_group_from_mask(&mask); if (!og) @@ -1203,7 +1205,7 @@ struct gopt_str_multi *m, struct fio_option *o) { - unsigned int *ip = td_var(gjv->o, o->off1); + unsigned int *ip = td_var(gjv->o, o, o->off1); struct value_pair *vp; gboolean set; guint val = 0; @@ -1216,7 +1218,7 @@ break; set = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(m->checks[i])); if (set) { - if (vp->or) + if (vp->orval) val |= vp->oval; else val = vp->oval; @@ -1233,10 +1235,10 @@ struct gopt_range *r, struct fio_option *o) { - unsigned int *ip[4] = { td_var(gjv->o, o->off1), - td_var(gjv->o, o->off2), - td_var(gjv->o, o->off3), - td_var(gjv->o, o->off4) }; + unsigned int *ip[4] = { td_var(gjv->o, o, o->off1), + td_var(gjv->o, o, o->off2), + td_var(gjv->o, o, o->off3), + td_var(gjv->o, o, o->off4) }; gint val; int i; @@ -1250,7 +1252,7 @@ struct gopt_str_val *s, struct fio_option *o) { - unsigned long long *ullp = td_var(gjv->o, o->off1); + unsigned long long *ullp = td_var(gjv->o, o, o->off1); GtkAdjustment *adj; gint index; @@ -1274,7 +1276,7 @@ static void gopt_handle_str_changed(struct gopt_job_view *gjv, struct gopt_str *s, struct fio_option *o) { - char **p = td_var(gjv->o, o->off1); + char **p = td_var(gjv->o, o, o->off1); if (*p) free(*p); @@ -1285,7 +1287,7 @@ static void gopt_handle_bool_changed(struct gopt_job_view *gjv, struct gopt_bool *b, struct fio_option *o) { - unsigned int *ip = 
td_var(gjv->o, o->off1); + unsigned int *ip = td_var(gjv->o, o, o->off1); gboolean set; set = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(b->check)); @@ -1295,7 +1297,7 @@ static void gopt_handle_int_changed(struct gopt_job_view *gjv, struct gopt_int *i, struct fio_option *o) { - unsigned int *ip = td_var(gjv->o, o->off1); + unsigned int *ip = td_var(gjv->o, o, o->off1); GtkAdjustment *adj; guint val; @@ -1308,7 +1310,7 @@ struct gopt_combo *c, struct fio_option *o) { - char **p = td_var(gjv->o, o->off1); + char **p = td_var(gjv->o, o, o->off1); if (*p) free(*p); @@ -1320,7 +1322,7 @@ struct gopt_combo *c, struct fio_option *o) { - unsigned int *ip = td_var(gjv->o, o->off1); + unsigned int *ip = td_var(gjv->o, o, o->off1); gint index; index = gtk_combo_box_get_active(GTK_COMBO_BOX(c->combo)); @@ -1433,7 +1435,7 @@ goto done; while (!flist_empty(&gjv->changed_list)) { - gopt = flist_entry(gjv->changed_list.next, struct gopt, changed_list); + gopt = flist_first_entry(&gjv->changed_list, struct gopt, changed_list); flist_del_init(&gopt->changed_list); } @@ -1577,7 +1579,7 @@ gjv = calloc(1, sizeof(*gjv)); INIT_FLIST_HEAD(&gjv->changed_list); - gco = flist_entry(gc->o_list.next, struct gfio_client_options, list); + gco = flist_first_entry(&gc->o_list, struct gfio_client_options, list); gjv->o = &gco->o; gjv->dialog = dialog; gjv->client = gc; diff -Nru fio-2.1.3/graph.c fio-3.16/graph.c --- fio-2.1.3/graph.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/graph.c 2019-09-20 01:01:52.000000000 +0000 @@ -17,11 +17,11 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* */ #include -#include +#include #include #include #include @@ -687,7 +687,7 @@ */ while (!(v->flags & GV_F_ON_PRIO)) { assert(!flist_empty(&v->alias)); - v = flist_entry(v->alias.next, struct graph_value, alias); + v = flist_first_entry(&v->alias, struct graph_value, alias); } prio_tree_remove(&l->prio_tree, &v->node); @@ -698,7 +698,7 @@ while (!flist_empty(&v->alias)) { struct graph_value *a; - a = flist_entry(v->alias.next, struct graph_value, alias); + a = flist_first_entry(&v->alias, struct graph_value, alias); flist_del_init(&a->alias); __graph_value_drop(l, a); @@ -773,7 +773,7 @@ to_drop = 2; while (to_drop-- && !flist_empty(&i->value_list)) { - x = flist_entry(i->value_list.next, struct graph_value, list); + x = flist_first_entry(&i->value_list, struct graph_value, list); graph_value_drop(i, x); /* @@ -836,7 +836,7 @@ struct graph_value *i; while (!flist_empty(&l->value_list)) { - i = flist_entry(l->value_list.next, struct graph_value, list); + i = flist_first_entry(&l->value_list, struct graph_value, list); graph_value_drop(l, i); } } @@ -846,7 +846,7 @@ struct graph_label *i; while (!flist_empty(&g->label_list)) { - i = flist_entry(g->label_list.next, struct graph_label, list); + i = flist_first_entry(&g->label_list, struct graph_label, list); flist_del(&i->list); graph_free_values(i); free(i); @@ -1010,7 +1010,7 @@ } } if (!flist_empty(&v->alias)) - v = flist_entry(v->alias.next, struct graph_value, alias); + v = flist_first_entry(&v->alias, struct graph_value, alias); } while (v != rootv); } while ((n = prio_tree_next(&iter)) != NULL); diff -Nru fio-2.1.3/hash.h fio-3.16/hash.h --- fio-2.1.3/hash.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/hash.h 2019-09-20 01:01:52.000000000 +0000 @@ -28,15 +28,31 @@ #error Define GOLDEN_RATIO_PRIME for your wordsize. #endif -#define GR_PRIME_64 0x9e37fffffffc0001ULL +/* + * The above primes are actively bad for hashing, since they are + * too sparse. 
The 32-bit one is mostly ok, the 64-bit one causes + * real problems. Besides, the "prime" part is pointless for the + * multiplicative hash. + * + * Although a random odd number will do, it turns out that the golden + * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice + * properties. + * + * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2. + * (See Knuth vol 3, section 6.4, exercise 9.) + */ +#define GOLDEN_RATIO_32 0x61C88647 +#define GOLDEN_RATIO_64 0x61C8864680B583EBull -static inline unsigned long __hash_long(unsigned long val) +static inline unsigned long __hash_long(uint64_t val) { - unsigned long hash = val; + uint64_t hash = val; #if BITS_PER_LONG == 64 + hash *= GOLDEN_RATIO_64; +#else /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ - unsigned long n = hash; + uint64_t n = hash; n <<= 18; hash -= n; n <<= 33; @@ -49,9 +65,6 @@ hash += n; n <<= 2; hash += n; -#else - /* On some cpus multiply is faster, on others gcc will do shifts */ - hash *= GOLDEN_RATIO_PRIME; #endif return hash; @@ -65,7 +78,7 @@ static inline uint64_t __hash_u64(uint64_t val) { - return val * GR_PRIME_64; + return val * GOLDEN_RATIO_64; } static inline unsigned long hash_ptr(void *ptr, unsigned int bits) @@ -77,7 +90,7 @@ * Bob Jenkins jhash */ -#define JHASH_INITVAL GOLDEN_RATIO_PRIME +#define JHASH_INITVAL GOLDEN_RATIO_32 static inline uint32_t rol32(uint32_t word, uint32_t shift) { @@ -128,17 +141,17 @@ /* Last block: affect all 32 bits of (c) */ /* All the case statements fall through */ switch (length) { - case 12: c += (uint32_t) k[11] << 24; - case 11: c += (uint32_t) k[10] << 16; - case 10: c += (uint32_t) k[9] << 8; - case 9: c += k[8]; - case 8: b += (uint32_t) k[7] << 24; - case 7: b += (uint32_t) k[6] << 16; - case 6: b += (uint32_t) k[5] << 8; - case 5: b += k[4]; - case 4: a += (uint32_t) k[3] << 24; - case 3: a += (uint32_t) k[2] << 16; - case 2: a += (uint32_t) k[1] << 8; + case 12: c += (uint32_t) k[11] << 24; /* fall 
through */ + case 11: c += (uint32_t) k[10] << 16; /* fall through */ + case 10: c += (uint32_t) k[9] << 8; /* fall through */ + case 9: c += k[8]; /* fall through */ + case 8: b += (uint32_t) k[7] << 24; /* fall through */ + case 7: b += (uint32_t) k[6] << 16; /* fall through */ + case 6: b += (uint32_t) k[5] << 8; /* fall through */ + case 5: b += k[4]; /* fall through */ + case 4: a += (uint32_t) k[3] << 24; /* fall through */ + case 3: a += (uint32_t) k[2] << 16; /* fall through */ + case 2: a += (uint32_t) k[1] << 8; /* fall through */ case 1: a += k[0]; __jhash_final(a, b, c); case 0: /* Nothing left to add */ diff -Nru fio-2.1.3/helpers.c fio-3.16/helpers.c --- fio-2.1.3/helpers.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/helpers.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,13 +1,6 @@ -#include #include -#include -#include -#include -#include -#include "compiler/compiler.h" -#include "arch/arch.h" -#include "os/os.h" +#include "helpers.h" #ifndef CONFIG_LINUX_FALLOCATE int fallocate(int fd, int mode, off_t offset, off_t len) diff -Nru fio-2.1.3/helpers.h fio-3.16/helpers.h --- fio-2.1.3/helpers.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/helpers.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,10 +1,9 @@ #ifndef FIO_HELPERS_H #define FIO_HELPERS_H -#include "compiler/compiler.h" - #include -#include + +#include "os/os.h" extern int fallocate(int fd, int mode, off_t offset, off_t len); extern int posix_fallocate(int fd, off_t offset, off_t len); diff -Nru fio-2.1.3/helper_thread.c fio-3.16/helper_thread.c --- fio-2.1.3/helper_thread.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/helper_thread.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,193 @@ +#ifdef CONFIG_VALGRIND_DEV +#include +#else +#define DRD_IGNORE_VAR(x) do { } while (0) +#endif + +#include "fio.h" +#include "smalloc.h" +#include "helper_thread.h" +#include "steadystate.h" +#include "pshared.h" + +static struct helper_data { + volatile int exit; + volatile int reset; + volatile int 
do_stat; + struct sk_out *sk_out; + pthread_t thread; + pthread_mutex_t lock; + pthread_cond_t cond; + struct fio_sem *startup_sem; +} *helper_data; + +void helper_thread_destroy(void) +{ + pthread_cond_destroy(&helper_data->cond); + pthread_mutex_destroy(&helper_data->lock); + sfree(helper_data); +} + +void helper_reset(void) +{ + if (!helper_data) + return; + + pthread_mutex_lock(&helper_data->lock); + + if (!helper_data->reset) { + helper_data->reset = 1; + pthread_cond_signal(&helper_data->cond); + } + + pthread_mutex_unlock(&helper_data->lock); +} + +void helper_do_stat(void) +{ + if (!helper_data) + return; + + pthread_mutex_lock(&helper_data->lock); + helper_data->do_stat = 1; + pthread_cond_signal(&helper_data->cond); + pthread_mutex_unlock(&helper_data->lock); +} + +bool helper_should_exit(void) +{ + if (!helper_data) + return true; + + return helper_data->exit; +} + +void helper_thread_exit(void) +{ + void *ret; + + pthread_mutex_lock(&helper_data->lock); + helper_data->exit = 1; + pthread_cond_signal(&helper_data->cond); + pthread_mutex_unlock(&helper_data->lock); + + pthread_join(helper_data->thread, &ret); +} + +static void *helper_thread_main(void *data) +{ + struct helper_data *hd = data; + unsigned int msec_to_next_event, next_log, next_ss = STEADYSTATE_MSEC; + struct timeval tv; + struct timespec ts, last_du, last_ss; + int ret = 0; + + sk_out_assign(hd->sk_out); + + gettimeofday(&tv, NULL); + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000; + memcpy(&last_du, &ts, sizeof(ts)); + memcpy(&last_ss, &ts, sizeof(ts)); + + fio_sem_up(hd->startup_sem); + + msec_to_next_event = DISK_UTIL_MSEC; + while (!ret && !hd->exit) { + uint64_t since_du, since_ss = 0; + + timespec_add_msec(&ts, msec_to_next_event); + + pthread_mutex_lock(&hd->lock); + pthread_cond_timedwait(&hd->cond, &hd->lock, &ts); + + gettimeofday(&tv, NULL); + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000; + + if (hd->reset) { + memcpy(&last_du, &ts, sizeof(ts)); + 
memcpy(&last_ss, &ts, sizeof(ts)); + hd->reset = 0; + } + + pthread_mutex_unlock(&hd->lock); + + since_du = mtime_since(&last_du, &ts); + if (since_du >= DISK_UTIL_MSEC || DISK_UTIL_MSEC - since_du < 10) { + ret = update_io_ticks(); + timespec_add_msec(&last_du, DISK_UTIL_MSEC); + msec_to_next_event = DISK_UTIL_MSEC; + if (since_du >= DISK_UTIL_MSEC) + msec_to_next_event -= (since_du - DISK_UTIL_MSEC); + } else + msec_to_next_event = DISK_UTIL_MSEC - since_du; + + if (hd->do_stat) { + hd->do_stat = 0; + __show_running_run_stats(); + } + + next_log = calc_log_samples(); + if (!next_log) + next_log = DISK_UTIL_MSEC; + + if (steadystate_enabled) { + since_ss = mtime_since(&last_ss, &ts); + if (since_ss >= STEADYSTATE_MSEC || STEADYSTATE_MSEC - since_ss < 10) { + steadystate_check(); + timespec_add_msec(&last_ss, since_ss); + if (since_ss > STEADYSTATE_MSEC) + next_ss = STEADYSTATE_MSEC - (since_ss - STEADYSTATE_MSEC); + else + next_ss = STEADYSTATE_MSEC; + } else + next_ss = STEADYSTATE_MSEC - since_ss; + } + + msec_to_next_event = min(min(next_log, msec_to_next_event), next_ss); + dprint(FD_HELPERTHREAD, "since_ss: %llu, next_ss: %u, next_log: %u, msec_to_next_event: %u\n", (unsigned long long)since_ss, next_ss, next_log, msec_to_next_event); + + if (!is_backend) + print_thread_status(); + } + + fio_writeout_logs(false); + + sk_out_drop(); + return NULL; +} + +int helper_thread_create(struct fio_sem *startup_sem, struct sk_out *sk_out) +{ + struct helper_data *hd; + int ret; + + hd = scalloc(1, sizeof(*hd)); + + setup_disk_util(); + steadystate_setup(); + + hd->sk_out = sk_out; + + ret = mutex_cond_init_pshared(&hd->lock, &hd->cond); + if (ret) + return 1; + + hd->startup_sem = startup_sem; + + DRD_IGNORE_VAR(helper_data); + + ret = pthread_create(&hd->thread, NULL, helper_thread_main, hd); + if (ret) { + log_err("Can't create helper thread: %s\n", strerror(ret)); + return 1; + } + + helper_data = hd; + + dprint(FD_MUTEX, "wait on startup_sem\n"); + 
fio_sem_down(startup_sem); + dprint(FD_MUTEX, "done waiting on startup_sem\n"); + return 0; +} diff -Nru fio-2.1.3/helper_thread.h fio-3.16/helper_thread.h --- fio-2.1.3/helper_thread.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/helper_thread.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,11 @@ +#ifndef FIO_HELPER_THREAD_H +#define FIO_HELPER_THREAD_H + +extern void helper_reset(void); +extern void helper_do_stat(void); +extern bool helper_should_exit(void); +extern void helper_thread_destroy(void); +extern void helper_thread_exit(void); +extern int helper_thread_create(struct fio_sem *, struct sk_out *); + +#endif diff -Nru fio-2.1.3/HOWTO fio-3.16/HOWTO --- fio-2.1.3/HOWTO 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/HOWTO 2019-09-20 01:01:52.000000000 +0000 @@ -1,1781 +1,4214 @@ -Table of contents ------------------ +How fio works +------------- -1. Overview -2. How fio works -3. Running fio -4. Job file format -5. Detailed list of parameters -6. Normal output -7. Terse output -8. Trace file format -9. CPU idleness profiling - -1.0 Overview and history ------------------------- -fio was originally written to save me the hassle of writing special test -case programs when I wanted to test a specific workload, either for -performance reasons or to find/reproduce a bug. The process of writing -such a test app can be tiresome, especially if you have to do it often. -Hence I needed a tool that would be able to simulate a given io workload -without resorting to writing a tailored test case again and again. - -A test work load is difficult to define, though. There can be any number -of processes or threads involved, and they can each be using their own -way of generating io. You could have someone dirtying large amounts of -memory in an memory mapped file, or maybe several threads issuing -reads using asynchronous io. fio needed to be flexible enough to -simulate both of these cases, and many more. 
+The first step in getting fio to simulate a desired I/O workload, is writing a +job file describing that specific setup. A job file may contain any number of +threads and/or files -- the typical contents of the job file is a *global* +section defining shared parameters, and one or more job sections describing the +jobs involved. When run, fio parses this file and sets everything up as +described. If we break down a job from top to bottom, it contains the following +basic parameters: -2.0 How fio works ------------------ -The first step in getting fio to simulate a desired io workload, is -writing a job file describing that specific setup. A job file may contain -any number of threads and/or files - the typical contents of the job file -is a global section defining shared parameters, and one or more job -sections describing the jobs involved. When run, fio parses this file -and sets everything up as described. If we break down a job from top to -bottom, it contains the following basic parameters: +`I/O type`_ - IO type Defines the io pattern issued to the file(s). - We may only be reading sequentially from this - file(s), or we may be writing randomly. Or even - mixing reads and writes, sequentially or randomly. + Defines the I/O pattern issued to the file(s). We may only be reading + sequentially from this file(s), or we may be writing randomly. Or even + mixing reads and writes, sequentially or randomly. + Should we be doing buffered I/O, or direct/raw I/O? - Block size In how large chunks are we issuing io? This may be - a single value, or it may describe a range of - block sizes. +`Block size`_ - IO size How much data are we going to be reading/writing. + In how large chunks are we issuing I/O? This may be a single value, + or it may describe a range of block sizes. - IO engine How do we issue io? We could be memory mapping the - file, we could be using regular read/write, we - could be using splice, async io, syslet, or even - SG (SCSI generic sg). 
+`I/O size`_ - IO depth If the io engine is async, how large a queuing - depth do we want to maintain? + How much data are we going to be reading/writing. - IO type Should we be doing buffered io, or direct/raw io? +`I/O engine`_ - Num files How many files are we spreading the workload over. + How do we issue I/O? We could be memory mapping the file, we could be + using regular read/write, we could be using splice, async I/O, or even + SG (SCSI generic sg). - Num threads How many threads or processes should we spread - this workload over. +`I/O depth`_ -The above are the basic parameters defined for a workload, in addition -there's a multitude of parameters that modify other aspects of how this -job behaves. + If the I/O engine is async, how large a queuing depth do we want to + maintain? -3.0 Running fio ---------------- -See the README file for command line parameters, there are only a few -of them. +`Target file/device`_ -Running fio is normally the easiest part - you just give it the job file -(or job files) as parameters: + How many files are we spreading the workload over. -$ fio job_file +`Threads, processes and job synchronization`_ -and it will start doing what the job_file tells it to do. You can give -more than one job file on the command line, fio will serialize the running -of those files. Internally that is the same as using the 'stonewall' -parameter described the the parameter section. - -If the job file contains only one job, you may as well just give the -parameters on the command line. The command line parameters are identical -to the job parameters, with a few extra that control global parameters -(see README). For example, for the job file parameter iodepth=2, the -mirror command line option would be --iodepth 2 or --iodepth=2. You can -also use the command line for giving more than one job entry. For each ---name option that fio sees, it will start a new job with that name. 
-Command line entries following a --name entry will apply to that job, -until there are no more entries or a new --name entry is seen. This is -similar to the job file options, where each option applies to the current -job until a new [] job entry is seen. - -fio does not need to run as root, except if the files or devices specified -in the job section requires that. Some other options may also be restricted, -such as memory locking, io scheduler switching, and decreasing the nice value. + How many threads or processes should we spread this workload over. +The above are the basic parameters defined for a workload, in addition there's a +multitude of parameters that modify other aspects of how this job behaves. -4.0 Job file format -------------------- -As previously described, fio accepts one or more job files describing -what it is supposed to do. The job file format is the classic ini file, -where the names enclosed in [] brackets define the job name. You are free -to use any ascii name you want, except 'global' which has special meaning. -A global section sets defaults for the jobs described in that file. A job -may override a global section parameter, and a job file may even have -several global sections if so desired. A job is only affected by a global -section residing above it. If the first character in a line is a ';' or a -'#', the entire line is discarded as a comment. + +Command line options +-------------------- + +.. option:: --debug=type + + Enable verbose tracing `type` of various fio actions. May be ``all`` for all types + or individual types separated by a comma (e.g. ``--debug=file,mem`` will + enable file and memory debugging). Currently, additional logging is + available for: + + *process* + Dump info related to processes. + *file* + Dump info related to file actions. + *io* + Dump info related to I/O queuing. + *mem* + Dump info related to memory allocations. + *blktrace* + Dump info related to blktrace setup. 
+ *verify* + Dump info related to I/O verification. + *all* + Enable all debug options. + *random* + Dump info related to random offset generation. + *parse* + Dump info related to option matching and parsing. + *diskutil* + Dump info related to disk utilization updates. + *job:x* + Dump info only related to job number x. + *mutex* + Dump info only related to mutex up/down ops. + *profile* + Dump info related to profile extensions. + *time* + Dump info related to internal time keeping. + *net* + Dump info related to networking connections. + *rate* + Dump info related to I/O rate switching. + *compress* + Dump info related to log compress/decompress. + *steadystate* + Dump info related to steadystate detection. + *helperthread* + Dump info related to the helper thread. + *zbd* + Dump info related to support for zoned block devices. + *?* or *help* + Show available debug options. + +.. option:: --parse-only + + Parse options only, don't start any I/O. + +.. option:: --merge-blktrace-only + + Merge blktraces only, don't start any I/O. + +.. option:: --output=filename + + Write output to file `filename`. + +.. option:: --output-format=format + + Set the reporting `format` to `normal`, `terse`, `json`, or `json+`. Multiple + formats can be selected, separated by a comma. `terse` is a CSV based + format. `json+` is like `json`, except it adds a full dump of the latency + buckets. + +.. option:: --bandwidth-log + + Generate aggregate bandwidth logs. + +.. option:: --minimal + + Print statistics in a terse, semicolon-delimited format. + +.. option:: --append-terse + + Print statistics in selected mode AND terse, semicolon-delimited format. + **Deprecated**, use :option:`--output-format` instead to select multiple + formats. + +.. option:: --terse-version=version + + Set terse `version` output format (default 3, or 2 or 4 or 5). + +.. option:: --version + + Print version information and exit. + +.. option:: --help + + Print a summary of the command line options and exit. 
+ +.. option:: --cpuclock-test + + Perform test and validation of internal CPU clock. + +.. option:: --crctest=[test] + + Test the speed of the built-in checksumming functions. If no argument is + given, all of them are tested. Alternatively, a comma separated list can + be passed, in which case the given ones are tested. + +.. option:: --cmdhelp=command + + Print help information for `command`. May be ``all`` for all commands. + +.. option:: --enghelp=[ioengine[,command]] + + List all commands defined by `ioengine`, or print help for `command` + defined by `ioengine`. If no `ioengine` is given, list all + available ioengines. + +.. option:: --showcmd=jobfile + + Convert `jobfile` to a set of command-line options. + +.. option:: --readonly + + Turn on safety read-only checks, preventing writes and trims. The + ``--readonly`` option is an extra safety guard to prevent users from + accidentally starting a write or trim workload when that is not desired. + Fio will only modify the device under test if + `rw=write/randwrite/rw/randrw/trim/randtrim/trimwrite` is given. This + safety net can be used as an extra precaution. + +.. option:: --eta=when + + Specifies when real-time ETA estimate should be printed. `when` may be + `always`, `never` or `auto`. `auto` is the default, it prints ETA + when requested if the output is a TTY. `always` disregards the output + type, and prints ETA when requested. `never` never prints ETA. + +.. option:: --eta-interval=time + + By default, fio requests client ETA status roughly every second. With + this option, the interval is configurable. Fio imposes a minimum + allowed time to avoid flooding the console, less than 250 msec is + not supported. + +.. option:: --eta-newline=time + + Force a new line for every `time` period passed. When the unit is omitted, + the value is interpreted in seconds. + +.. option:: --status-interval=time + + Force a full status dump of cumulative (from job start) values at `time` + intervals. 
This option does *not* provide per-period measurements. So + values such as bandwidth are running averages. When the time unit is omitted, + `time` is interpreted in seconds. Note that using this option with + ``--output-format=json`` will yield output that technically isn't valid + json, since the output will be collated sets of valid json. It will need + to be split into valid sets of json after the run. + +.. option:: --section=name + + Only run specified section `name` in job file. Multiple sections can be specified. + The ``--section`` option allows one to combine related jobs into one file. + E.g. one job file could define light, moderate, and heavy sections. Tell + fio to run only the "heavy" section by giving ``--section=heavy`` + command line option. One can also specify the "write" operations in one + section and "verify" operation in another section. The ``--section`` option + only applies to job sections. The reserved *global* section is always + parsed and used. + +.. option:: --alloc-size=kb + + Allocate additional internal smalloc pools of size `kb` in KiB. The + ``--alloc-size`` option increases shared memory set aside for use by fio. + If running large jobs with randommap enabled, fio can run out of memory. + Smalloc is an internal allocator for shared structures from a fixed size + memory pool and can grow to 16 pools. The pool size defaults to 16MiB. + + NOTE: While running :file:`.fio_smalloc.*` backing store files are visible + in :file:`/tmp`. + +.. option:: --warnings-fatal + + All fio parser warnings are fatal, causing fio to exit with an + error. + +.. option:: --max-jobs=nr + + Set the maximum number of threads/processes to support to `nr`. + NOTE: On Linux, it may be necessary to increase the shared-memory + limit (:file:`/proc/sys/kernel/shmmax`) if fio runs into errors while + creating jobs. + +.. option:: --server=args + + Start a backend server, with `args` specifying what to listen to. + See `Client/Server`_ section. + +.. 
option:: --daemonize=pidfile + + Background a fio server, writing the pid to the given `pidfile` file. + +.. option:: --client=hostname + + Instead of running the jobs locally, send and run them on the given `hostname` + or set of `hostname`\s. See `Client/Server`_ section. + +.. option:: --remote-config=file + + Tell fio server to load this local `file`. + +.. option:: --idle-prof=option + + Report CPU idleness. `option` is one of the following: + + **calibrate** + Run unit work calibration only and exit. + + **system** + Show aggregate system idleness and unit work. + + **percpu** + As **system** but also show per CPU idleness. + +.. option:: --inflate-log=log + + Inflate and output compressed `log`. + +.. option:: --trigger-file=file + + Execute trigger command when `file` exists. + +.. option:: --trigger-timeout=time + + Execute trigger at this `time`. + +.. option:: --trigger=command + + Set this `command` as local trigger. + +.. option:: --trigger-remote=command + + Set this `command` as remote trigger. + +.. option:: --aux-path=path + + Use the directory specified by `path` for generated state files instead + of the current working directory. + +Any parameters following the options will be assumed to be job files, unless +they match a job file parameter. Multiple job files can be listed and each job +file will be regarded as a separate group. Fio will :option:`stonewall` +execution between each group. + + +Job file format +--------------- + +As previously described, fio accepts one or more job files describing what it is +supposed to do. The job file format is the classic ini file, where the names +enclosed in [] brackets define the job name. You are free to use any ASCII name +you want, except *global* which has special meaning. Following the job name is +a sequence of zero or more parameters, one per line, that define the behavior of +the job. If the first character in a line is a ';' or a '#', the entire line is +discarded as a comment. 
+ +A *global* section sets defaults for the jobs described in that file. A job may +override a *global* section parameter, and a job file may even have several +*global* sections if so desired. A job is only affected by a *global* section +residing above it. + +The :option:`--cmdhelp` option also lists all options. If used with a `command` +argument, :option:`--cmdhelp` will detail the given `command`. + +See the `examples/` directory for inspiration on how to write job files. Note +the copyright and license requirements currently apply to `examples/` files. So let's look at a really simple job file that defines two processes, each -randomly reading from a 128MB file. +randomly reading from a 128MiB file: + +.. code-block:: ini -; -- start job file -- -[global] -rw=randread -size=128m + ; -- start job file -- + [global] + rw=randread + size=128m -[job1] + [job1] -[job2] + [job2] -; -- end job file -- + ; -- end job file -- -As you can see, the job file sections themselves are empty as all the -described parameters are shared. As no filename= option is given, fio -makes up a filename for each of the jobs as it sees fit. On the command -line, this job would look as follows: +As you can see, the job file sections themselves are empty as all the described +parameters are shared. As no :option:`filename` option is given, fio makes up a +`filename` for each of the jobs as it sees fit. On the command line, this job +would look as follows:: $ fio --name=global --rw=randread --size=128m --name=job1 --name=job2 -Let's look at an example that has a number of processes writing randomly -to files. +Let's look at an example that has a number of processes writing randomly to +files: -; -- start job file -- -[random-writers] -ioengine=libaio -iodepth=4 -rw=randwrite -bs=32k -direct=0 -size=64m -numjobs=4 - -; -- end job file -- - -Here we have no global section, as we only have one job defined anyway. -We want to use async io here, with a depth of 4 for each file. 
We also -increased the buffer size used to 32KB and define numjobs to 4 to -fork 4 identical jobs. The result is 4 processes each randomly writing -to their own 64MB file. Instead of using the above job file, you could -have given the parameters on the command line. For this case, you would -specify: +.. code-block:: ini -$ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4 + ; -- start job file -- + [random-writers] + ioengine=libaio + iodepth=4 + rw=randwrite + bs=32k + direct=0 + size=64m + numjobs=4 + ; -- end job file -- + +Here we have no *global* section, as we only have one job defined anyway. We +want to use async I/O here, with a depth of 4 for each file. We also increased +the buffer size used to 32KiB and define numjobs to 4 to fork 4 identical +jobs. The result is 4 processes each randomly writing to their own 64MiB +file. Instead of using the above job file, you could have given the parameters +on the command line. For this case, you would specify:: -4.1 Environment variables -------------------------- +$ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4 -fio also supports environment variable expansion in job files. Any -substring of the form "${VARNAME}" as part of an option value (in other -words, on the right of the `='), will be expanded to the value of the -environment variable called VARNAME. If no such environment variable -is defined, or VARNAME is the empty string, the empty string will be -substituted. +When fio is utilized as a basis of any reasonably large test suite, it might be +desirable to share a set of standardized settings across multiple job files. 
+Instead of copy/pasting such settings, any section may pull in an external +:file:`filename.fio` file with *include filename* directive, as in the following +example:: + + ; -- start job file including.fio -- + [global] + filename=/tmp/test + filesize=1m + include glob-include.fio + + [test] + rw=randread + bs=4k + time_based=1 + runtime=10 + include test-include.fio + ; -- end job file including.fio -- + +.. code-block:: ini + + ; -- start job file glob-include.fio -- + thread=1 + group_reporting=1 + ; -- end job file glob-include.fio -- + +.. code-block:: ini + + ; -- start job file test-include.fio -- + ioengine=libaio + iodepth=4 + ; -- end job file test-include.fio -- + +Settings pulled into a section apply to that section only (except *global* +section). Include directives may be nested in that any included file may contain +further include directive(s). Include files may not contain [] sections. + + +Environment variables +~~~~~~~~~~~~~~~~~~~~~ + +Fio also supports environment variable expansion in job files. Any sub-string of +the form ``${VARNAME}`` as part of an option value (in other words, on the right +of the '='), will be expanded to the value of the environment variable called +`VARNAME`. If no such environment variable is defined, or `VARNAME` is the +empty string, the empty string will be substituted. -As an example, let's look at a sample fio invocation and job file: +As an example, let's look at a sample fio invocation and job file:: $ SIZE=64m NUMJOBS=4 fio jobfile.fio -; -- start job file -- -[random-writers] -rw=randwrite -size=${SIZE} -numjobs=${NUMJOBS} -; -- end job file -- +.. code-block:: ini + + ; -- start job file -- + [random-writers] + rw=randwrite + size=${SIZE} + numjobs=${NUMJOBS} + ; -- end job file -- This will expand to the following equivalent job file at runtime: -; -- start job file -- -[random-writers] -rw=randwrite -size=64m -numjobs=4 -; -- end job file -- +.. 
code-block:: ini -fio ships with a few example job files, you can also look there for -inspiration. + ; -- start job file -- + [random-writers] + rw=randwrite + size=64m + numjobs=4 + ; -- end job file -- -4.2 Reserved keywords ---------------------- +Fio ships with a few example job files, you can also look there for inspiration. + +Reserved keywords +~~~~~~~~~~~~~~~~~ Additionally, fio has a set of reserved keywords that will be replaced internally with the appropriate value. Those keywords are: -$pagesize The architecture page size of the running system -$mb_memory Megabytes of total memory in the system -$ncpus Number of online available CPUs +**$pagesize** + + The architecture page size of the running system. + +**$mb_memory** + + Megabytes of total memory in the system. + +**$ncpus** + + Number of online available CPUs. These can be used on the command line or in the job file, and will be -automatically substituted with the current system values when the job -is run. Simple math is also supported on these keywords, so you can -perform actions like: - -size=8*$mb_memory - -and get that properly expanded to 8 times the size of memory in the -machine. - - -5.0 Detailed list of parameters -------------------------------- - -This section describes in details each parameter associated with a job. -Some parameters take an option of a given type, such as an integer or -a string. The following types are used: - -str String. This is a sequence of alpha characters. -time Integer with possible time suffix. In seconds unless otherwise - specified, use eg 10m for 10 minutes. Accepts s/m/h for seconds, - minutes, and hours. -int SI integer. A whole number value, which may contain a suffix - describing the base of the number. Accepted suffixes are k/m/g/t/p, - meaning kilo, mega, giga, tera, and peta. The suffix is not case - sensitive, and you may also include trailing 'b' (eg 'kb' is the same - as 'k'). 
So if you want to specify 4096, you could either write - out '4096' or just give 4k. The suffixes signify base 2 values, so - 1024 is 1k and 1024k is 1m and so on, unless the suffix is explicitly - set to a base 10 value using 'kib', 'mib', 'gib', etc. If that is the - case, then 1000 is used as the multiplier. This can be handy for - disks, since manufacturers generally use base 10 values when listing - the capacity of a drive. If the option accepts an upper and lower - range, use a colon ':' or minus '-' to separate such values. May also - include a prefix to indicate numbers base. If 0x is used, the number - is assumed to be hexadecimal. See irange. -bool Boolean. Usually parsed as an integer, however only defined for +automatically substituted with the current system values when the job is +run. Simple math is also supported on these keywords, so you can perform actions +like:: + + size=8*$mb_memory + +and get that properly expanded to 8 times the size of memory in the machine. + + +Job file parameters +------------------- + +This section describes in detail each parameter associated with a job. Some +parameters take an option of a given type, such as an integer or a +string. Anywhere a numeric value is required, an arithmetic expression may be +used, provided it is surrounded by parentheses. Supported operators are: + + - addition (+) + - subtraction (-) + - multiplication (*) + - division (/) + - modulus (%) + - exponentiation (^) + +For time values in expressions, units are microseconds by default. This is +different than for time values not in expressions (not enclosed in +parentheses). The following types are used: + + +Parameter types +~~~~~~~~~~~~~~~ + +**str** + String: A sequence of alphanumeric characters. + +**time** + Integer with possible time suffix. Without a unit, the value is interpreted as
Accepts a suffix of 'd' for days, 'h' for + hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and + 'us' (or 'usec') for microseconds. For example, use 10m for 10 minutes. + +.. _int: + +**int** + Integer. A whole number value, which may contain an integer prefix + and an integer suffix: + + [*integer prefix*] **number** [*integer suffix*] + + The optional *integer prefix* specifies the number's base. The default + is decimal. *0x* specifies hexadecimal. + + The optional *integer suffix* specifies the number's units, and includes an + optional unit prefix and an optional unit. For quantities of data, the + default unit is bytes. For quantities of time, the default unit is seconds + unless otherwise specified. + + With :option:`kb_base`\=1000, fio follows international standards for unit + prefixes. To specify power-of-10 decimal values defined in the + International System of Units (SI): + + * *K* -- means kilo (K) or 1000 + * *M* -- means mega (M) or 1000**2 + * *G* -- means giga (G) or 1000**3 + * *T* -- means tera (T) or 1000**4 + * *P* -- means peta (P) or 1000**5 + + To specify power-of-2 binary values defined in IEC 80000-13: + + * *Ki* -- means kibi (Ki) or 1024 + * *Mi* -- means mebi (Mi) or 1024**2 + * *Gi* -- means gibi (Gi) or 1024**3 + * *Ti* -- means tebi (Ti) or 1024**4 + * *Pi* -- means pebi (Pi) or 1024**5 + + With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite + from those specified in the SI and IEC 80000-13 standards to provide + compatibility with old scripts. For example, 4k means 4096. + + For quantities of data, an optional unit of 'B' may be included + (e.g., 'kB' is the same as 'k'). + + The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega, + not milli). 'b' and 'B' both mean byte, not bit. 
+ + Examples with :option:`kb_base`\=1000: + + * *4 KiB*: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB + * *1 MiB*: 1048576, 1mi, 1024ki + * *1 MB*: 1000000, 1m, 1000k + * *1 TiB*: 1099511627776, 1ti, 1024gi, 1048576mi + * *1 TB*: 1000000000000, 1t, 1000g, 1000000m + + Examples with :option:`kb_base`\=1024 (default): + + * *4 KiB*: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB + * *1 MiB*: 1048576, 1m, 1024k + * *1 MB*: 1000000, 1mi, 1000ki + * *1 TiB*: 1099511627776, 1t, 1024g, 1048576m + * *1 TB*: 1000000000000, 1ti, 1000gi, 1000000mi + + To specify times (units are not case sensitive): + + * *D* -- means days + * *H* -- means hours + * *M* -- means minutes + * *s* -- or sec means seconds (default) + * *ms* -- or *msec* means milliseconds + * *us* -- or *usec* means microseconds + + If the option accepts an upper and lower range, use a colon ':' or + minus '-' to separate such values. See :ref:`irange <irange>`. + If the lower value specified happens to be larger than the upper value + the two values are swapped. + +.. _bool: + +**bool** + Boolean. Usually parsed as an integer, however only defined for true and false (1 and 0). -irange Integer range with suffix. Allows value range to be given, such - as 1024-4096. A colon may also be used as the separator, eg - 1k:4k. If the option allows two sets of ranges, they can be - specified with a ',' or '/' delimiter: 1k-4k/8k-32k. Also see - int. -float_list A list of floating numbers, separated by a ':' character. - -With the above in mind, here follows the complete list of fio job -parameters. - -name=str ASCII name of the job. This may be used to override the - name printed by fio for this job. Otherwise the job - name is used. On the command line this parameter has the - special purpose of also signaling the start of a new - job. - -description=str Text description of the job. Doesn't do anything except - dump this text description when this job is run. It's - not parsed. - -directory=str Prefix filenames with this directory.
Used to place files - in a different location than "./". - -filename=str Fio normally makes up a filename based on the job name, - thread number, and file number. If you want to share - files between threads in a job or several jobs, specify - a filename for each of them to override the default. If - the ioengine used is 'net', the filename is the host, port, - and protocol to use in the format of =host,port,protocol. - See ioengine=net for more. If the ioengine is file based, you - can specify a number of files by separating the names with a - ':' colon. So if you wanted a job to open /dev/sda and /dev/sdb - as the two working files, you would use - filename=/dev/sda:/dev/sdb. On Windows, disk devices are - accessed as \\.\PhysicalDrive0 for the first device, - \\.\PhysicalDrive1 for the second etc. Note: Windows and - FreeBSD prevent write access to areas of the disk containing - in-use data (e.g. filesystems). - If the wanted filename does need to include a colon, then - escape that with a '\' character. For instance, if the filename - is "/dev/dsk/foo@3,0:c", then you would use - filename="/dev/dsk/foo@3,0\:c". '-' is a reserved name, meaning - stdin or stdout. Which of the two depends on the read/write - direction set. - -filename_format=str - If sharing multiple files between jobs, it is usually necessary - to have fio generate the exact names that you want. By default, - fio will name a file based on the default file format - specification of jobname.jobnumber.filenumber. With this - option, that can be customized. Fio will recognize and replace - the following keywords in this string: - - $jobname - The name of the worker thread or process. - - $jobnum - The incremental number of the worker thread or - process. - - $filenum - The incremental number of the file for that worker - thread or process. - - To have dependent jobs share a set of files, this option can - be set to have fio generate filenames that are shared between - the two. 
For instance, if testfiles.$filenum is specified, - file number 4 for any job will be named testfiles.4. The - default of $jobname.$jobnum.$filenum will be used if - no other format specifier is given. - -opendir=str Tell fio to recursively add any file it can find in this - directory and down the file system tree. - -lockfile=str Fio defaults to not locking any files before it does - IO to them. If a file or file descriptor is shared, fio - can serialize IO to that file to make the end result - consistent. This is usual for emulating real workloads that - share files. The lock modes are: - - none No locking. The default. - exclusive Only one thread/process may do IO, - excluding all others. - readwrite Read-write locking on the file. Many - readers may access the file at the - same time, but writes get exclusive - access. - -readwrite=str -rw=str Type of io pattern. Accepted values are: - - read Sequential reads - write Sequential writes - randwrite Random writes - randread Random reads - rw,readwrite Sequential mixed reads and writes - randrw Random mixed reads and writes - - For the mixed io types, the default is to split them 50/50. - For certain types of io the result may still be skewed a bit, - since the speed may be different. It is possible to specify - a number of IO's to do before getting a new offset, this is - one by appending a ':' to the end of the string given. - For a random read, it would look like 'rw=randread:8' for - passing in an offset modifier with a value of 8. If the - suffix is used with a sequential IO pattern, then the value - specified will be added to the generated offset for each IO. - For instance, using rw=write:4k will skip 4k for every - write. It turns sequential IO into sequential IO with holes. - See the 'rw_sequencer' option. - -rw_sequencer=str If an offset modifier is given by appending a number to - the rw= line, then this option controls how that - number modifies the IO offset being generated. 
Accepted - values are: - - sequential Generate sequential offset - identical Generate the same offset - - 'sequential' is only useful for random IO, where fio would - normally generate a new random offset for every IO. If you - append eg 8 to randread, you would get a new random offset for - every 8 IO's. The result would be a seek for only every 8 - IO's, instead of for every IO. Use rw=randread:8 to specify - that. As sequential IO is already sequential, setting - 'sequential' for that would not result in any differences. - 'identical' behaves in a similar fashion, except it sends - the same offset 8 number of times before generating a new - offset. - -kb_base=int The base unit for a kilobyte. The defacto base is 2^10, 1024. - Storage manufacturers like to use 10^3 or 1000 as a base - ten unit instead, for obvious reasons. Allow values are - 1024 or 1000, with 1024 being the default. - -unified_rw_reporting=bool Fio normally reports statistics on a per - data direction basis, meaning that read, write, and trim are - accounted and reported separately. If this option is set, - the fio will sum the results and report them as "mixed" - instead. - -randrepeat=bool For random IO workloads, seed the generator in a predictable - way so that results are repeatable across repetitions. - -use_os_rand=bool Fio can either use the random generator supplied by the OS - to generator random offsets, or it can use it's own internal - generator (based on Tausworthe). Default is to use the - internal generator, which is often of better quality and - faster. - -fallocate=str Whether pre-allocation is performed when laying down files. - Accepted values are: - - none Do not pre-allocate space - posix Pre-allocate via posix_fallocate() - keep Pre-allocate via fallocate() with - FALLOC_FL_KEEP_SIZE set - 0 Backward-compatible alias for 'none' - 1 Backward-compatible alias for 'posix' - - May not be available on all supported platforms. 
'keep' is only - available on Linux.If using ZFS on Solaris this must be set to - 'none' because ZFS doesn't support it. Default: 'posix'. - -fadvise_hint=bool By default, fio will use fadvise() to advise the kernel - on what IO patterns it is likely to issue. Sometimes you - want to test specific IO patterns without telling the - kernel about it, in which case you can disable this option. - If set, fio will use POSIX_FADV_SEQUENTIAL for sequential - IO and POSIX_FADV_RANDOM for random IO. - -size=int The total size of file io for this job. Fio will run until - this many bytes has been transferred, unless runtime is - limited by other options (such as 'runtime', for instance). - Unless specific nrfiles and filesize options are given, - fio will divide this size between the available files - specified by the job. If not set, fio will use the full - size of the given files or devices. If the the files - do not exist, size must be given. It is also possible to - give size as a percentage between 1 and 100. If size=20% - is given, fio will use 20% of the full size of the given - files or devices. - -filesize=int Individual file sizes. May be a range, in which case fio - will select sizes for files at random within the given range - and limited to 'size' in total (if that is given). If not - given, each created file is the same size. - -fill_device=bool -fill_fs=bool Sets size to something really large and waits for ENOSPC (no - space left on device) as the terminating condition. Only makes - sense with sequential write. For a read workload, the mount - point will be filled first then IO started on the result. This - option doesn't make sense if operating on a raw device node, - since the size of that is already known by the file system. - Additionally, writing beyond end-of-device will not return - ENOSPC there. - -blocksize=int -bs=int The block size used for the io units. Defaults to 4k. Values - can be given for both read and writes. 
If a single int is - given, it will apply to both. If a second int is specified - after a comma, it will apply to writes only. In other words, - the format is either bs=read_and_write or bs=read,write,trim. - bs=4k,8k will thus use 4k blocks for reads, 8k blocks for - writes, and 8k for trims. You can terminate the list with - a trailing comma. bs=4k,8k, would use the default value for - trims.. If you only wish to set the write size, you - can do so by passing an empty read size - bs=,8k will set - 8k for writes and leave the read default value. - -blockalign=int -ba=int At what boundary to align random IO offsets. Defaults to - the same as 'blocksize' the minimum blocksize given. - Minimum alignment is typically 512b for using direct IO, - though it usually depends on the hardware block size. This - option is mutually exclusive with using a random map for - files, so it will turn off that option. - -blocksize_range=irange -bsrange=irange Instead of giving a single block size, specify a range - and fio will mix the issued io block sizes. The issued - io unit will always be a multiple of the minimum value - given (also see bs_unaligned). Applies to both reads and - writes, however a second range can be given after a comma. - See bs=. - -bssplit=str Sometimes you want even finer grained control of the - block sizes issued, not just an even split between them. - This option allows you to weight various block sizes, - so that you are able to define a specific amount of - block sizes issued. The format for this option is: - - bssplit=blocksize/percentage:blocksize/percentage - - for as many block sizes as needed. So if you want to define - a workload that has 50% 64k blocks, 10% 4k blocks, and - 40% 32k blocks, you would write: - - bssplit=4k/10:64k/50:32k/40 - - Ordering does not matter. If the percentage is left blank, - fio will fill in the remaining values evenly. 
So a bssplit - option like this one: - - bssplit=4k/50:1k/:32k/ - - would have 50% 4k ios, and 25% 1k and 32k ios. The percentages - always add up to 100, if bssplit is given a range that adds - up to more, it will error out. - - bssplit also supports giving separate splits to reads and - writes. The format is identical to what bs= accepts. You - have to separate the read and write parts with a comma. So - if you want a workload that has 50% 2k reads and 50% 4k reads, - while having 90% 4k writes and 10% 8k writes, you would - specify: - - bssplit=2k/50:4k/50,4k/90,8k/10 - -blocksize_unaligned -bs_unaligned If this option is given, any byte size value within bsrange - may be used as a block range. This typically wont work with - direct IO, as that normally requires sector alignment. - -bs_is_seq_rand If this option is set, fio will use the normal read,write - blocksize settings as sequential,random instead. Any random - read or write will use the WRITE blocksize settings, and any - sequential read or write will use the READ blocksize setting. - -zero_buffers If this option is given, fio will init the IO buffers to - all zeroes. The default is to fill them with random data. - -refill_buffers If this option is given, fio will refill the IO buffers - on every submit. The default is to only fill it at init - time and reuse that data. Only makes sense if zero_buffers - isn't specified, naturally. If data verification is enabled, - refill_buffers is also automatically enabled. - -scramble_buffers=bool If refill_buffers is too costly and the target is - using data deduplication, then setting this option will - slightly modify the IO buffer contents to defeat normal - de-dupe attempts. This is not enough to defeat more clever - block compression attempts, but it will stop naive dedupe of - blocks. Default: true. - -buffer_compress_percentage=int If this is set, then fio will attempt to - provide IO buffer content (on WRITEs) that compress to - the specified level. 
Fio does this by providing a mix of - random data and zeroes. Note that this is per block size - unit, for file/disk wide compression level that matches - this setting, you'll also want to set refill_buffers. - -buffer_compress_chunk=int See buffer_compress_percentage. This - setting allows fio to manage how big the ranges of random - data and zeroed data is. Without this set, fio will - provide buffer_compress_percentage of blocksize random - data, followed by the remaining zeroed. With this set - to some chunk size smaller than the block size, fio can - alternate random and zeroed data throughout the IO - buffer. - -nrfiles=int Number of files to use for this job. Defaults to 1. - -openfiles=int Number of files to keep open at the same time. Defaults to - the same as nrfiles, can be set smaller to limit the number - simultaneous opens. - -file_service_type=str Defines how fio decides which file from a job to - service next. The following types are defined: - - random Just choose a file at random. - - roundrobin Round robin over open files. This - is the default. - - sequential Finish one file before moving on to - the next. Multiple files can still be - open depending on 'openfiles'. - - The string can have a number appended, indicating how - often to switch to a new file. So if option random:4 is - given, fio will switch to a new random file after 4 ios - have been issued. - -ioengine=str Defines how the job issues io to the file. The following - types are defined: - - sync Basic read(2) or write(2) io. lseek(2) is - used to position the io location. - - psync Basic pread(2) or pwrite(2) io. - - vsync Basic readv(2) or writev(2) IO. - - psyncv Basic preadv(2) or pwritev(2) IO. - - libaio Linux native asynchronous io. Note that Linux - may only support queued behaviour with - non-buffered IO (set direct=1 or buffered=0). - This engine defines engine specific options. - - posixaio glibc posix asynchronous io. - - solarisaio Solaris native asynchronous io. 
- - windowsaio Windows native asynchronous io. - - mmap File is memory mapped and data copied - to/from using memcpy(3). - - splice splice(2) is used to transfer the data and - vmsplice(2) to transfer data from user - space to the kernel. - - syslet-rw Use the syslet system calls to make - regular read/write async. - - sg SCSI generic sg v3 io. May either be - synchronous using the SG_IO ioctl, or if - the target is an sg character device - we use read(2) and write(2) for asynchronous - io. - - null Doesn't transfer any data, just pretends - to. This is mainly used to exercise fio - itself and for debugging/testing purposes. - - net Transfer over the network to given host:port. - Depending on the protocol used, the hostname, - port, listen and filename options are used to - specify what sort of connection to make, while - the protocol option determines which protocol - will be used. - This engine defines engine specific options. - - netsplice Like net, but uses splice/vmsplice to - map data and send/receive. - This engine defines engine specific options. - - cpuio Doesn't transfer any data, but burns CPU - cycles according to the cpuload= and - cpucycle= options. Setting cpuload=85 - will cause that job to do nothing but burn - 85% of the CPU. In case of SMP machines, - use numjobs= to get desired CPU - usage, as the cpuload only loads a single - CPU at the desired rate. - - guasi The GUASI IO engine is the Generic Userspace - Asyncronous Syscall Interface approach - to async IO. See - - http://www.xmailserver.org/guasi-lib.html - - for more info on GUASI. - - rdma The RDMA I/O engine supports both RDMA - memory semantics (RDMA_WRITE/RDMA_READ) and - channel semantics (Send/Recv) for the - InfiniBand, RoCE and iWARP protocols. - - falloc IO engine that does regular fallocate to - simulate data transfer as fio ioengine. 
- DDIR_READ does fallocate(,mode = keep_size,) - DDIR_WRITE does fallocate(,mode = 0) - DDIR_TRIM does fallocate(,mode = punch_hole) - - e4defrag IO engine that does regular EXT4_IOC_MOVE_EXT - ioctls to simulate defragment activity in - request to DDIR_WRITE event - - external Prefix to specify loading an external - IO engine object file. Append the engine - filename, eg ioengine=external:/tmp/foo.o - to load ioengine foo.o in /tmp. - -iodepth=int This defines how many io units to keep in flight against - the file. The default is 1 for each file defined in this - job, can be overridden with a larger value for higher - concurrency. Note that increasing iodepth beyond 1 will not - affect synchronous ioengines (except for small degress when - verify_async is in use). Even async engines may impose OS - restrictions causing the desired depth not to be achieved. - This may happen on Linux when using libaio and not setting - direct=1, since buffered IO is not async on that OS. Keep an - eye on the IO depth distribution in the fio output to verify - that the achieved depth is as expected. Default: 1. - -iodepth_batch_submit=int -iodepth_batch=int This defines how many pieces of IO to submit at once. - It defaults to 1 which means that we submit each IO - as soon as it is available, but can be raised to submit - bigger batches of IO at the time. - -iodepth_batch_complete=int This defines how many pieces of IO to retrieve - at once. It defaults to 1 which means that we'll ask - for a minimum of 1 IO in the retrieval process from - the kernel. The IO retrieval will go on until we - hit the limit set by iodepth_low. If this variable is - set to 0, then fio will always check for completed - events before queuing more IO. This helps reduce - IO latency, at the cost of more retrieval system calls. - -iodepth_low=int The low water mark indicating when to start filling - the queue again. 
Defaults to the same as iodepth, meaning - that fio will attempt to keep the queue full at all times. - If iodepth is set to eg 16 and iodepth_low is set to 4, then - after fio has filled the queue of 16 requests, it will let - the depth drain down to 4 before starting to fill it again. - -direct=bool If value is true, use non-buffered io. This is usually - O_DIRECT. Note that ZFS on Solaris doesn't support direct io. - On Windows the synchronous ioengines don't support direct io. - -buffered=bool If value is true, use buffered io. This is the opposite - of the 'direct' option. Defaults to true. - -offset=int Start io at the given offset in the file. The data before - the given offset will not be touched. This effectively - caps the file size at real_size - offset. - -offset_increment=int If this is provided, then the real offset becomes - the offset + offset_increment * thread_number, where the - thread number is a counter that starts at 0 and is incremented - for each job. This option is useful if there are several jobs - which are intended to operate on a file in parallel in disjoint - segments, with even spacing between the starting points. - -number_ios=int Fio will normally perform IOs until it has exhausted the size - of the region set by size=, or if it exhaust the allocated - time (or hits an error condition). With this setting, the - range/size can be set independently of the number of IOs to - perform. When fio reaches this number, it will exit normally - and report status. - -fsync=int If writing to a file, issue a sync of the dirty data - for every number of blocks given. For example, if you give - 32 as a parameter, fio will sync the file for every 32 - writes issued. If fio is using non-buffered io, we may - not sync the file. The exception is the sg io engine, which - synchronizes the disk cache anyway. - -fdatasync=int Like fsync= but uses fdatasync() to only sync data and not - metadata blocks. 
- In FreeBSD and Windows there is no fdatasync(), this falls back to - using fsync() - -sync_file_range=str:val Use sync_file_range() for every 'val' number of - write operations. Fio will track range of writes that - have happened since the last sync_file_range() call. 'str' - can currently be one or more of: - - wait_before SYNC_FILE_RANGE_WAIT_BEFORE - write SYNC_FILE_RANGE_WRITE - wait_after SYNC_FILE_RANGE_WAIT_AFTER - - So if you do sync_file_range=wait_before,write:8, fio would - use SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE for - every 8 writes. Also see the sync_file_range(2) man page. - This option is Linux specific. - -overwrite=bool If true, writes to a file will always overwrite existing - data. If the file doesn't already exist, it will be - created before the write phase begins. If the file exists - and is large enough for the specified write phase, nothing - will be done. - -end_fsync=bool If true, fsync file contents when a write stage has completed. - -fsync_on_close=bool If true, fio will fsync() a dirty file on close. - This differs from end_fsync in that it will happen on every - file close, not just at the end of the job. - -rwmixread=int How large a percentage of the mix should be reads. - -rwmixwrite=int How large a percentage of the mix should be writes. If both - rwmixread and rwmixwrite is given and the values do not add - up to 100%, the latter of the two will be used to override - the first. This may interfere with a given rate setting, - if fio is asked to limit reads or writes to a certain rate. - If that is the case, then the distribution may be skewed. - -random_distribution=str:float By default, fio will use a completely uniform - random distribution when asked to perform random IO. Sometimes - it is useful to skew the distribution in specific ways, - ensuring that some parts of the data is more hot than others. 
- fio includes the following distribution models: - - random Uniform random distribution - zipf Zipf distribution - pareto Pareto distribution - - When using a zipf or pareto distribution, an input value - is also needed to define the access pattern. For zipf, this - is the zipf theta. For pareto, it's the pareto power. Fio - includes a test program, genzipf, that can be used visualize - what the given input values will yield in terms of hit rates. - If you wanted to use zipf with a theta of 1.2, you would use - random_distribution=zipf:1.2 as the option. If a non-uniform - model is used, fio will disable use of the random map. - -percentage_random=int For a random workload, set how big a percentage should - be random. This defaults to 100%, in which case the workload - is fully random. It can be set from anywhere from 0 to 100. - Setting it to 0 would make the workload fully sequential. Any - setting in between will result in a random mix of sequential - and random IO, at the given percentages. It is possible to - set different values for reads, writes, and trim. To do so, - simply use a comma separated list. See blocksize. - -norandommap Normally fio will cover every block of the file when doing - random IO. If this option is given, fio will just get a - new random offset without looking at past io history. This - means that some blocks may not be read or written, and that - some blocks may be read/written more than once. This option - is mutually exclusive with verify= if and only if multiple - blocksizes (via bsrange=) are used, since fio only tracks - complete rewrites of blocks. - -softrandommap=bool See norandommap. If fio runs with the random block map - enabled and it fails to allocate the map, if this option is - set it will continue without a random block map. As coverage - will not be as complete as with random maps, this option is - disabled by default. 
- -random_generator=str Fio supports the following engines for generating - IO offsets for random IO: - - tausworthe Strong 2^88 cycle random number generator - lfsr Linear feedback shift register generator - - Tausworthe is a strong random number generator, but it - requires tracking on the side if we want to ensure that - blocks are only read or written once. LFSR guarantees - that we never generate the same offset twice, and it's - also less computationally expensive. It's not a true - random generator, however, though for IO purposes it's - typically good enough. LFSR only works with single - block sizes, not with workloads that use multiple block - sizes. If used with such a workload, fio may read or write - some blocks multiple times. - -nice=int Run the job with the given nice value. See man nice(2). - -prio=int Set the io priority value of this job. Linux limits us to - a positive value between 0 and 7, with 0 being the highest. - See man ionice(1). - -prioclass=int Set the io priority class. See man ionice(1). - -thinktime=int Stall the job x microseconds after an io has completed before - issuing the next. May be used to simulate processing being - done by an application. See thinktime_blocks and - thinktime_spin. - -thinktime_spin=int - Only valid if thinktime is set - pretend to spend CPU time - doing something with the data received, before falling back - to sleeping for the rest of the period specified by - thinktime. - -thinktime_blocks=int - Only valid if thinktime is set - control how many blocks - to issue, before waiting 'thinktime' usecs. If not set, - defaults to 1 which will make fio wait 'thinktime' usecs - after every block. This effectively makes any queue depth - setting redundant, since no more than 1 IO will be queued - before we have to complete it and do our thinktime. In - other words, this setting effectively caps the queue depth - if the latter is larger. - -rate=int Cap the bandwidth used by this job. 
The number is in bytes/sec, - the normal suffix rules apply. You can use rate=500k to limit - reads and writes to 500k each, or you can specify read and - writes separately. Using rate=1m,500k would limit reads to - 1MB/sec and writes to 500KB/sec. Capping only reads or - writes can be done with rate=,500k or rate=500k,. The former - will only limit writes (to 500KB/sec), the latter will only - limit reads. - -ratemin=int Tell fio to do whatever it can to maintain at least this - bandwidth. Failing to meet this requirement, will cause - the job to exit. The same format as rate is used for - read vs write separation. - -rate_iops=int Cap the bandwidth to this number of IOPS. Basically the same - as rate, just specified independently of bandwidth. If the - job is given a block size range instead of a fixed value, - the smallest block size is used as the metric. The same format - as rate is used for read vs write separation. - -rate_iops_min=int If fio doesn't meet this rate of IO, it will cause - the job to exit. The same format as rate is used for read vs - write separation. - -max_latency=int If set, fio will exit the job if it exceeds this maximum - latency. It will exit with an ETIME error. - -ratecycle=int Average bandwidth for 'rate' and 'ratemin' over this number - of milliseconds. - -cpumask=int Set the CPU affinity of this job. The parameter given is a - bitmask of allowed CPU's the job may run on. So if you want - the allowed CPUs to be 1 and 5, you would pass the decimal - value of (1 << 1 | 1 << 5), or 34. See man - sched_setaffinity(2). This may not work on all supported - operating systems or kernel versions. This option doesn't - work well for a higher CPU count than what you can store in - an integer mask, so it can only control cpus 1-32. For - boxes with larger CPU counts, use cpus_allowed. - -cpus_allowed=str Controls the same options as cpumask, but it allows a text - setting of the permitted CPUs instead. 
So to use CPUs 1 and - 5, you would specify cpus_allowed=1,5. This option also - allows a range of CPUs. Say you wanted a binding to CPUs - 1, 5, and 8-15, you would set cpus_allowed=1,5,8-15. - -numa_cpu_nodes=str Set this job running on specified NUMA nodes' CPUs. The - arguments allow comma delimited list of cpu numbers, - A-B ranges, or 'all'. Note, to enable numa options support, - fio must be built on a system with libnuma-dev(el) installed. - -numa_mem_policy=str Set this job's memory policy and corresponding NUMA - nodes. Format of the arguments: - [:] - `mode' is one of the following memory policy: - default, prefer, bind, interleave, local - For `default' and `local' memory policy, no node is - needed to be specified. - For `prefer', only one node is allowed. - For `bind' and `interleave', it allows a comma delimited - list of numbers, A-B ranges, or 'all'. - -startdelay=time Start this job the specified number of seconds after fio - has started. Only useful if the job file contains several - jobs, and you want to delay starting some jobs to a certain - time. - -runtime=time Tell fio to terminate processing after the specified number - of seconds. It can be quite hard to determine for how long - a specified job will run, so this parameter is handy to - cap the total runtime to a given time. - -time_based If set, fio will run for the duration of the runtime - specified even if the file(s) are completely read or - written. It will simply loop over the same workload - as many times as the runtime allows. - -ramp_time=time If set, fio will run the specified workload for this amount - of time before logging any performance numbers. Useful for - letting performance settle before logging results, thus - minimizing the runtime required for stable results. Note - that the ramp_time is considered lead in time for a job, - thus it will increase the total runtime if a special timeout - or runtime is specified. 
- -invalidate=bool Invalidate the buffer/page cache parts for this file prior - to starting io. Defaults to true. - -sync=bool Use sync io for buffered writes. For the majority of the - io engines, this means using O_SYNC. - -iomem=str -mem=str Fio can use various types of memory as the io unit buffer. - The allowed values are: - - malloc Use memory from malloc(3) as the buffers. - - shm Use shared memory as the buffers. Allocated - through shmget(2). - - shmhuge Same as shm, but use huge pages as backing. - - mmap Use mmap to allocate buffers. May either be - anonymous memory, or can be file backed if - a filename is given after the option. The - format is mem=mmap:/path/to/file. - - mmaphuge Use a memory mapped huge file as the buffer - backing. Append filename after mmaphuge, ala - mem=mmaphuge:/hugetlbfs/file - - The area allocated is a function of the maximum allowed - bs size for the job, multiplied by the io depth given. Note - that for shmhuge and mmaphuge to work, the system must have - free huge pages allocated. This can normally be checked - and set by reading/writing /proc/sys/vm/nr_hugepages on a - Linux system. Fio assumes a huge page is 4MB in size. So - to calculate the number of huge pages you need for a given - job file, add up the io depth of all jobs (normally one unless - iodepth= is used) and multiply by the maximum bs set. Then - divide that number by the huge page size. You can see the - size of the huge pages in /proc/meminfo. If no huge pages - are allocated by having a non-zero number in nr_hugepages, - using mmaphuge or shmhuge will fail. Also see hugepage-size. - - mmaphuge also needs to have hugetlbfs mounted and the file - location should point there. So if it's mounted in /huge, - you would use mem=mmaphuge:/huge/somefile. - -iomem_align=int This indiciates the memory alignment of the IO memory buffers. 
- Note that the given alignment is applied to the first IO unit - buffer, if using iodepth the alignment of the following buffers - are given by the bs used. In other words, if using a bs that is - a multiple of the page sized in the system, all buffers will - be aligned to this value. If using a bs that is not page - aligned, the alignment of subsequent IO memory buffers is the - sum of the iomem_align and bs used. - -hugepage-size=int - Defines the size of a huge page. Must at least be equal - to the system setting, see /proc/meminfo. Defaults to 4MB. - Should probably always be a multiple of megabytes, so using - hugepage-size=Xm is the preferred way to set this to avoid - setting a non-pow-2 bad value. - -exitall When one job finishes, terminate the rest. The default is - to wait for each job to finish, sometimes that is not the - desired action. - -bwavgtime=int Average the calculated bandwidth over the given time. Value - is specified in milliseconds. - -iopsavgtime=int Average the calculated IOPS over the given time. Value - is specified in milliseconds. - -create_serialize=bool If true, serialize the file creating for the jobs. - This may be handy to avoid interleaving of data - files, which may greatly depend on the filesystem - used and even the number of processors in the system. - -create_fsync=bool fsync the data file after creation. This is the - default. - -create_on_open=bool Don't pre-setup the files for IO, just create open() - when it's time to do IO to that file. - -create_only=bool If true, fio will only run the setup phase of the job. - If files need to be laid out or updated on disk, only - that will be done. The actual job contents are not - executed. - -pre_read=bool If this is given, files will be pre-read into memory before - starting the given IO operation. This will also clear - the 'invalidate' flag, since it is pointless to pre-read - and then drop the cache. 
This will only work for IO engines - that are seekable, since they allow you to read the same data - multiple times. Thus it will not work on eg network or splice - IO. - -unlink=bool Unlink the job files when done. Not the default, as repeated - runs of that job would then waste time recreating the file - set again and again. - -loops=int Run the specified number of iterations of this job. Used - to repeat the same workload a given number of times. Defaults - to 1. - -do_verify=bool Run the verify phase after a write phase. Only makes sense if - verify is set. Defaults to 1. - -verify=str If writing to a file, fio can verify the file contents - after each iteration of the job. The allowed values are: - - md5 Use an md5 sum of the data area and store - it in the header of each block. - - crc64 Use an experimental crc64 sum of the data - area and store it in the header of each - block. - - crc32c Use a crc32c sum of the data area and store - it in the header of each block. - - crc32c-intel Use hardware assisted crc32c calcuation - provided on SSE4.2 enabled processors. Falls - back to regular software crc32c, if not - supported by the system. - - crc32 Use a crc32 sum of the data area and store - it in the header of each block. - - crc16 Use a crc16 sum of the data area and store - it in the header of each block. - - crc7 Use a crc7 sum of the data area and store - it in the header of each block. - - sha512 Use sha512 as the checksum function. - - sha256 Use sha256 as the checksum function. - - sha1 Use optimized sha1 as the checksum function. - - meta Write extra information about each io - (timestamp, block number etc.). The block - number is verified. See also verify_pattern. - - null Only pretend to verify. Useful for testing - internals with ioengine=null, not for much - else. - - This option can be used for repeated burn-in tests of a - system to make sure that the written data is also - correctly read back. 
If the data direction given is - a read or random read, fio will assume that it should - verify a previously written file. If the data direction - includes any form of write, the verify will be of the - newly written data. - -verifysort=bool If set, fio will sort written verify blocks when it deems - it faster to read them back in a sorted manner. This is - often the case when overwriting an existing file, since - the blocks are already laid out in the file system. You - can ignore this option unless doing huge amounts of really - fast IO where the red-black tree sorting CPU time becomes - significant. - -verify_offset=int Swap the verification header with data somewhere else - in the block before writing. Its swapped back before - verifying. - -verify_interval=int Write the verification header at a finer granularity - than the blocksize. It will be written for chunks the - size of header_interval. blocksize should divide this - evenly. - -verify_pattern=str If set, fio will fill the io buffers with this - pattern. Fio defaults to filling with totally random - bytes, but sometimes it's interesting to fill with a known - pattern for io verification purposes. Depending on the - width of the pattern, fio will fill 1/2/3/4 bytes of the - buffer at the time(it can be either a decimal or a hex number). - The verify_pattern if larger than a 32-bit quantity has to - be a hex number that starts with either "0x" or "0X". Use - with verify=meta. - -verify_fatal=bool Normally fio will keep checking the entire contents - before quitting on a block verification failure. If this - option is set, fio will exit the job on the first observed - failure. - -verify_dump=bool If set, dump the contents of both the original data - block and the data block we read off disk to files. This - allows later analysis to inspect just what kind of data - corruption occurred. Off by default. - -verify_async=int Fio will normally verify IO inline from the submitting - thread. 
This option takes an integer describing how many - async offload threads to create for IO verification instead, - causing fio to offload the duty of verifying IO contents - to one or more separate threads. If using this offload - option, even sync IO engines can benefit from using an - iodepth setting higher than 1, as it allows them to have - IO in flight while verifies are running. - -verify_async_cpus=str Tell fio to set the given CPU affinity on the - async IO verification threads. See cpus_allowed for the - format used. - -verify_backlog=int Fio will normally verify the written contents of a - job that utilizes verify once that job has completed. In - other words, everything is written then everything is read - back and verified. You may want to verify continually - instead for a variety of reasons. Fio stores the meta data - associated with an IO block in memory, so for large - verify workloads, quite a bit of memory would be used up - holding this meta data. If this option is enabled, fio - will write only N blocks before verifying these blocks. - - will verify the previously written blocks before continuing - to write new ones. - -verify_backlog_batch=int Control how many blocks fio will verify - if verify_backlog is set. If not set, will default to - the value of verify_backlog (meaning the entire queue - is read back and verified). If verify_backlog_batch is - less than verify_backlog then not all blocks will be verified, - if verify_backlog_batch is larger than verify_backlog, some - blocks will be verified more than once. - -stonewall -wait_for_previous Wait for preceeding jobs in the job file to exit, before - starting this one. Can be used to insert serialization - points in the job file. A stone wall also implies starting - a new reporting group. - -new_group Start a new reporting group. See: group_reporting. - -numjobs=int Create the specified number of clones of this job. 
May be - used to setup a larger number of threads/processes doing - the same thing. Each thread is reported separately; to see - statistics for all clones as a whole, use group_reporting in - conjunction with new_group. - -group_reporting It may sometimes be interesting to display statistics for - groups of jobs as a whole instead of for each individual job. - This is especially true if 'numjobs' is used; looking at - individual thread/process output quickly becomes unwieldy. - To see the final report per-group instead of per-job, use - 'group_reporting'. Jobs in a file will be part of the same - reporting group, unless if separated by a stonewall, or by - using 'new_group'. - -thread fio defaults to forking jobs, however if this option is - given, fio will use pthread_create(3) to create threads - instead. - -zonesize=int Divide a file into zones of the specified size. See zoneskip. - -zoneskip=int Skip the specified number of bytes when zonesize data has - been read. The two zone options can be used to only do - io on zones of a file. - -write_iolog=str Write the issued io patterns to the specified file. See - read_iolog. Specify a separate file for each job, otherwise - the iologs will be interspersed and the file may be corrupt. - -read_iolog=str Open an iolog with the specified file name and replay the - io patterns it contains. This can be used to store a - workload and replay it sometime later. The iolog given - may also be a blktrace binary file, which allows fio - to replay a workload captured by blktrace. See blktrace - for how to capture such logging data. For blktrace replay, - the file needs to be turned into a blkparse binary data - file first (blkparse -o /dev/null -d file_for_fio.bin). - -replay_no_stall=int When replaying I/O with read_iolog the default behavior - is to attempt to respect the time stamps within the log and - replay them with the appropriate delay between IOPS. 
By - setting this variable fio will not respect the timestamps and - attempt to replay them as fast as possible while still - respecting ordering. The result is the same I/O pattern to a - given device, but different timings. - -replay_redirect=str While replaying I/O patterns using read_iolog the - default behavior is to replay the IOPS onto the major/minor - device that each IOP was recorded from. This is sometimes - undesireable because on a different machine those major/minor - numbers can map to a different device. Changing hardware on - the same system can also result in a different major/minor - mapping. Replay_redirect causes all IOPS to be replayed onto - the single specified device regardless of the device it was - recorded from. i.e. replay_redirect=/dev/sdc would cause all - IO in the blktrace to be replayed onto /dev/sdc. This means - multiple devices will be replayed onto a single, if the trace - contains multiple devices. If you want multiple devices to be - replayed concurrently to multiple redirected devices you must - blkparse your trace into separate traces and replay them with - independent fio invocations. Unfortuantely this also breaks - the strict time ordering between multiple device accesses. - -write_bw_log=str If given, write a bandwidth log of the jobs in this job - file. Can be used to store data of the bandwidth of the - jobs in their lifetime. The included fio_generate_plots - script uses gnuplot to turn these text files into nice - graphs. See write_lat_log for behaviour of given - filename. For this option, the suffix is _bw.log. - -write_lat_log=str Same as write_bw_log, except that this option stores io - submission, completion, and total latencies instead. If no - filename is given with this option, the default filename of - "jobname_type.log" is used. Even if the filename is given, - fio will still append the type of log. 
So if one specifies - - write_lat_log=foo - - The actual log names will be foo_slat.log, foo_clat.log, - and foo_lat.log. This helps fio_generate_plot find the logs - automatically. - -write_bw_log=str If given, write an IOPS log of the jobs in this job - file. See write_bw_log. - -write_iops_log=str Same as write_bw_log, but writes IOPS. If no filename is - given with this option, the default filename of - "jobname_type.log" is used. Even if the filename is given, - fio will still append the type of log. - -log_avg_msec=int By default, fio will log an entry in the iops, latency, - or bw log for every IO that completes. When writing to the - disk log, that can quickly grow to a very large size. Setting - this option makes fio average each log entry over the - specified period of time, reducing the resolution of the log. - Defaults to 0. - -lockmem=int Pin down the specified amount of memory with mlock(2). Can - potentially be used instead of removing memory or booting - with less memory to simulate a smaller amount of memory. - The amount specified is per worker. - -exec_prerun=str Before running this job, issue the command specified - through system(3). Output is redirected in a file called - jobname.prerun.txt. - -exec_postrun=str After the job completes, issue the command specified - through system(3). Output is redirected in a file called - jobname.postrun.txt. - -ioscheduler=str Attempt to switch the device hosting the file to the specified - io scheduler before running. - -disk_util=bool Generate disk utilization statistics, if the platform - supports it. Defaults to on. - -disable_lat=bool Disable measurements of total latency numbers. Useful - only for cutting back the number of calls to gettimeofday, - as that does impact performance at really high IOPS rates. - Note that to really get rid of a large amount of these - calls, this option must be used with disable_slat and - disable_bw as well. 
- -disable_clat=bool Disable measurements of completion latency numbers. See - disable_lat. - -disable_slat=bool Disable measurements of submission latency numbers. See - disable_slat. - -disable_bw=bool Disable measurements of throughput/bandwidth numbers. See - disable_lat. - -clat_percentiles=bool Enable the reporting of percentiles of - completion latencies. - -percentile_list=float_list Overwrite the default list of percentiles - for completion latencies. Each number is a floating - number in the range (0,100], and the maximum length of - the list is 20. Use ':' to separate the numbers, and - list the numbers in ascending order. For example, - --percentile_list=99.5:99.9 will cause fio to report - the values of completion latency below which 99.5% and - 99.9% of the observed latencies fell, respectively. - -clocksource=str Use the given clocksource as the base of timing. The - supported options are: - - gettimeofday gettimeofday(2) - - clock_gettime clock_gettime(2) - - cpu Internal CPU clock source - - cpu is the preferred clocksource if it is reliable, as it - is very fast (and fio is heavy on time calls). Fio will - automatically use this clocksource if it's supported and - considered reliable on the system it is running on, unless - another clocksource is specifically set. For x86/x86-64 CPUs, - this means supporting TSC Invariant. - -gtod_reduce=bool Enable all of the gettimeofday() reducing options - (disable_clat, disable_slat, disable_bw) plus reduce - precision of the timeout somewhat to really shrink - the gettimeofday() call count. With this option enabled, - we only do about 0.4% of the gtod() calls we would have - done if all time keeping was enabled. - -gtod_cpu=int Sometimes it's cheaper to dedicate a single thread of - execution to just getting the current time. Fio (and - databases, for instance) are very intensive on gettimeofday() - calls. 
With this option, you can set one CPU aside for - doing nothing but logging current time to a shared memory - location. Then the other threads/processes that run IO - workloads need only copy that segment, instead of entering - the kernel with a gettimeofday() call. The CPU set aside - for doing these time calls will be excluded from other - uses. Fio will manually clear it from the CPU mask of other - jobs. - -continue_on_error=str Normally fio will exit the job on the first observed - failure. If this option is set, fio will continue the job when - there is a 'non-fatal error' (EIO or EILSEQ) until the runtime - is exceeded or the I/O size specified is completed. If this - option is used, there are two more stats that are appended, - the total error count and the first error. The error field - given in the stats is the first error that was hit during the - run. - - The allowed values are: - - none Exit on any IO or verify errors. - - read Continue on read errors, exit on all others. - - write Continue on write errors, exit on all others. - - io Continue on any IO error, exit on all others. - - verify Continue on verify errors, exit on all others. - - all Continue on all errors. - - 0 Backward-compatible alias for 'none'. - - 1 Backward-compatible alias for 'all'. - -ignore_error=str Sometimes you want to ignore some errors during test - in that case you can specify error list for each error type. - ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST - errors for given error type is separated with ':'. Error - may be symbol ('ENOSPC', 'ENOMEM') or integer. - Example: - ignore_error=EAGAIN,ENOSPC:122 - This option will ignore EAGAIN from READ, and ENOSPC and - 122(EDQUOT) from WRITE. - -error_dump=bool If set dump every error even if it is non fatal, true - by default. If disabled only fatal error will be dumped - -cgroup=str Add job to this control group. If it doesn't exist, it will - be created. 
The system must have a mounted cgroup blkio - mount point for this to work. If your system doesn't have it - mounted, you can do so with: - # mount -t cgroup -o blkio none /cgroup +.. _irange: -cgroup_weight=int Set the weight of the cgroup to this value. See - the documentation that comes with the kernel, allowed values - are in the range of 100..1000. - -cgroup_nodelete=bool Normally fio will delete the cgroups it has created after - the job completion. To override this behavior and to leave - cgroups around after the job completion, set cgroup_nodelete=1. - This can be useful if one wants to inspect various cgroup - files after job completion. Default: false - -uid=int Instead of running as the invoking user, set the user ID to - this value before the thread/process does any work. - -gid=int Set group ID, see uid. - -flow_id=int The ID of the flow. If not specified, it defaults to being a - global flow. See flow. - -flow=int Weight in token-based flow control. If this value is used, then - there is a 'flow counter' which is used to regulate the - proportion of activity between two or more jobs. fio attempts - to keep this flow counter near zero. The 'flow' parameter - stands for how much should be added or subtracted to the flow - counter on each iteration of the main I/O loop. That is, if - one job has flow=8 and another job has flow=-1, then there - will be a roughly 1:8 ratio in how much one runs vs the other. - -flow_watermark=int The maximum value that the absolute value of the flow - counter is allowed to reach before the job must wait for a - lower value of the counter. +**irange** + Integer range with suffix. Allows value range to be given, such as + 1024-4096. A colon may also be used as the separator, e.g. 1k:4k. If the + option allows two sets of ranges, they can be specified with a ',' or '/' + delimiter: 1k-4k/8k-32k. Also see :ref:`int `. 
-flow_sleep=int The period of time, in microseconds, to wait after the flow - watermark has been exceeded before retrying operations +**float_list** + A list of floating point numbers, separated by a ':' character. -In addition, there are some parameters which are only valid when a specific -ioengine is in use. These are used identically to normal parameters, with the -caveat that when used on the command line, they must come after the ioengine -that defines them is selected. - -[libaio] userspace_reap Normally, with the libaio engine in use, fio will use - the io_getevents system call to reap newly returned events. - With this flag turned on, the AIO ring will be read directly - from user-space to reap events. The reaping mode is only - enabled when polling for a minimum of 0 events (eg when - iodepth_batch_complete=0). - -[cpu] cpuload=int Attempt to use the specified percentage of CPU cycles. - -[cpu] cpuchunks=int Split the load into cycles of the given time. In - microseconds. - -[netsplice] hostname=str -[net] hostname=str The host name or IP address to use for TCP or UDP based IO. - If the job is a TCP listener or UDP reader, the hostname is not - used and must be omitted unless it is a valid UDP multicast - address. - -[netsplice] port=int -[net] port=int The TCP or UDP port to bind to or connect to. - -[netsplice] interface=str -[net] interface=str The IP address of the network interface used to send or - receive UDP multicast - -[netsplice] ttl=int -[net] ttl=int Time-to-live value for outgoing UDP multicast packets. - Default: 1 - -[netsplice] nodelay=bool -[net] nodelay=bool Set TCP_NODELAY on TCP connections. - -[netsplice] protocol=str -[netsplice] proto=str -[net] protocol=str -[net] proto=str The network protocol to use. 
Accepted values are: - - tcp Transmission control protocol - udp User datagram protocol - unix UNIX domain socket - - When the protocol is TCP or UDP, the port must also be given, - as well as the hostname if the job is a TCP listener or UDP - reader. For unix sockets, the normal filename option should be - used and the port is invalid. - -[net] listen For TCP network connections, tell fio to listen for incoming - connections rather than initiating an outgoing connection. The - hostname must be omitted if this option is used. -[net] pingpong Normaly a network writer will just continue writing data, and - a network reader will just consume packages. If pingpong=1 - is set, a writer will send its normal payload to the reader, - then wait for the reader to send the same payload back. This - allows fio to measure network latencies. The submission - and completion latencies then measure local time spent - sending or receiving, and the completion latency measures - how long it took for the other end to receive and send back. - For UDP multicast traffic pingpong=1 should only be set for a - single reader when multiple readers are listening to the same - address. - -[e4defrag] donorname=str - File will be used as a block donor(swap extents between files) -[e4defrag] inplace=int - Configure donor file blocks allocation strategy - 0(default): Preallocate donor's file on init - 1 : allocate space immidietly inside defragment event, - and free right after event +With the above in mind, here follows the complete list of fio job parameters. +Units +~~~~~ -6.0 Interpreting the output ---------------------------- +.. option:: kb_base=int -fio spits out a lot of output. While running, fio will display the -status of the jobs created. An example of that would be: + Select the interpretation of unit prefixes in input parameters. 
-Threads: 1: [_r] [24.8% done] [ 13509/ 8334 kb/s] [eta 00h:01m:31s] + **1000** + Inputs comply with IEC 80000-13 and the International + System of Units (SI). Use: -The characters inside the square brackets denote the current status of -each thread. The possible values (in typical life cycle order) are: + - power-of-2 values with IEC prefixes (e.g., KiB) + - power-of-10 values with SI prefixes (e.g., kB) -Idle Run ----- --- -P Thread setup, but not started. -C Thread created. -I Thread initialized, waiting or generating necessary data. - p Thread running pre-reading file(s). - R Running, doing sequential reads. - r Running, doing random reads. - W Running, doing sequential writes. - w Running, doing random writes. - M Running, doing mixed sequential reads/writes. - m Running, doing mixed random reads/writes. - F Running, currently waiting for fsync() - V Running, doing verification of written data. -E Thread exited, not reaped by main thread yet. -_ Thread reaped, or -X Thread reaped, exited with an error. -K Thread reaped, exited due to signal. - -The other values are fairly self explanatory - number of threads -currently running and doing io, rate of io since last check (read speed -listed first, then write speed), and the estimated completion percentage -and time for the running group. It's impossible to estimate runtime of -the following groups (if any). Note that the string is displayed in order, -so it's possible to tell which of the jobs are currently doing what. The -first character is the first job defined in the job file, and so forth. - -When fio is done (or interrupted by ctrl-c), it will show the data for -each thread, group of threads, and disks in that order. 
For each data -direction, the output looks like: - -Client1 (g=0): err= 0: - write: io= 32MB, bw= 666KB/s, iops=89 , runt= 50320msec - slat (msec): min= 0, max= 136, avg= 0.03, stdev= 1.92 - clat (msec): min= 0, max= 631, avg=48.50, stdev=86.82 - bw (KB/s) : min= 0, max= 1196, per=51.00%, avg=664.02, stdev=681.68 - cpu : usr=1.49%, sys=0.25%, ctx=7969, majf=0, minf=17 - IO depths : 1=0.1%, 2=0.3%, 4=0.5%, 8=99.0%, 16=0.0%, 32=0.0%, >32=0.0% - submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% - complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% - issued r/w: total=0/32768, short=0/0 - lat (msec): 2=1.6%, 4=0.0%, 10=3.2%, 20=12.8%, 50=38.4%, 100=24.8%, - lat (msec): 250=15.2%, 500=0.0%, 750=0.0%, 1000=0.0%, >=2048=0.0% - -The client number is printed, along with the group id and error of that -thread. Below is the io statistics, here for writes. In the order listed, -they denote: - -io= Number of megabytes io performed -bw= Average bandwidth rate -iops= Average IOs performed per second -runt= The runtime of that thread - slat= Submission latency (avg being the average, stdev being the - standard deviation). This is the time it took to submit - the io. For sync io, the slat is really the completion - latency, since queue/complete is one operation there. This - value can be in milliseconds or microseconds, fio will choose - the most appropriate base and print that. In the example - above, milliseconds is the best scale. Note: in --minimal mode - latencies are always expressed in microseconds. - clat= Completion latency. Same names as slat, this denotes the - time from submission to completion of the io pieces. For - sync io, clat will usually be equal (or very close) to 0, - as the time from submit to complete is basically just - CPU time (io has already been done, see slat explanation). - bw= Bandwidth. 
Same names as the xlat stats, but also includes - an approximate percentage of total aggregate bandwidth - this thread received in this group. This last value is - only really useful if the threads in this group are on the - same disk, since they are then competing for disk access. -cpu= CPU usage. User and system time, along with the number - of context switches this thread went through, usage of - system and user time, and finally the number of major - and minor page faults. -IO depths= The distribution of io depths over the job life time. The - numbers are divided into powers of 2, so for example the - 16= entries includes depths up to that value but higher - than the previous entry. In other words, it covers the - range from 16 to 31. -IO submit= How many pieces of IO were submitting in a single submit - call. Each entry denotes that amount and below, until - the previous entry - eg, 8=100% mean that we submitted - anywhere in between 5-8 ios per submit call. -IO complete= Like the above submit number, but for completions instead. -IO issued= The number of read/write requests issued, and how many - of them were short. -IO latencies= The distribution of IO completion latencies. This is the - time from when IO leaves fio and when it gets completed. - The numbers follow the same pattern as the IO depths, - meaning that 2=1.6% means that 1.6% of the IO completed - within 2 msecs, 20=12.8% means that 12.8% of the IO - took more than 10 msecs, but less than (or equal to) 20 msecs. + **1024** + Compatibility mode (default). To avoid breaking old scripts: -After each client has been listed, the group statistics are printed. 
They -will look like this: + - power-of-2 values with SI prefixes + - power-of-10 values with IEC prefixes -Run status group 0 (all jobs): - READ: io=64MB, aggrb=22178, minb=11355, maxb=11814, mint=2840msec, maxt=2955msec - WRITE: io=64MB, aggrb=1302, minb=666, maxb=669, mint=50093msec, maxt=50320msec + See :option:`bs` for more details on input parameters. -For each data direction, it prints: + Outputs always use correct prefixes. Most outputs include both + side-by-side, like:: -io= Number of megabytes io performed. -aggrb= Aggregate bandwidth of threads in this group. -minb= The minimum average bandwidth a thread saw. -maxb= The maximum average bandwidth a thread saw. -mint= The smallest runtime of the threads in that group. -maxt= The longest runtime of the threads in that group. + bw=2383.3kB/s (2327.4KiB/s) -And finally, the disk statistics are printed. They will look like this: + If only one value is reported, then kb_base selects the one to use: -Disk stats (read/write): - sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% + **1000** -- SI prefixes -Each value is printed for both reads and writes, with reads first. The -numbers denote: + **1024** -- IEC prefixes -ios= Number of ios performed by all groups. -merge= Number of merges io the io scheduler. -ticks= Number of ticks we kept the disk busy. -io_queue= Total time spent in the disk queue. -util= The disk utilization. A value of 100% means we kept the disk - busy constantly, 50% would be a disk idling half of the time. +.. option:: unit_base=int -It is also possible to get fio to dump the current output while it is -running, without terminating the job. To do that, send fio the USR1 signal. -You can also get regularly timed dumps by using the --status-interval -parameter, or by creating a file in /tmp named fio-dump-status. If fio -sees this file, it will unlink it and dump the current output status. + Base unit for reporting. 
Allowed values are: + **0** + Use auto-detection (default). + **8** + Byte based. + **1** + Bit based. -7.0 Terse output ----------------- -For scripted usage where you typically want to generate tables or graphs -of the results, fio can output the results in a semicolon separated format. -The format is one long line of values, such as: - -2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00% -A description of this job goes here. - -The job description (if provided) follows on a second line. - -To enable terse output, use the --minimal command line option. The first -value is the version of the terse output format. If the output has to -be changed for some reason, this number will be incremented by 1 to -signify that change. 
- -Split up, the format is as follows: - - terse version, fio version, jobname, groupid, error - READ status: - Total IO (KB), bandwidth (KB/sec), IOPS, runtime (msec) - Submission latency: min, max, mean, deviation (usec) - Completion latency: min, max, mean, deviation (usec) - Completion latency percentiles: 20 fields (see below) - Total latency: min, max, mean, deviation (usec) - Bw (KB/s): min, max, aggregate percentage of total, mean, deviation - WRITE status: - Total IO (KB), bandwidth (KB/sec), IOPS, runtime (msec) - Submission latency: min, max, mean, deviation (usec) - Completion latency: min, max, mean, deviation (usec) - Completion latency percentiles: 20 fields (see below) - Total latency: min, max, mean, deviation (usec) - Bw (KB/s): min, max, aggregate percentage of total, mean, deviation - CPU usage: user, system, context switches, major faults, minor faults - IO depths: <=1, 2, 4, 8, 16, 32, >=64 - IO latencies microseconds: <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000 - IO latencies milliseconds: <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 - Disk utilization: Disk name, Read ios, write ios, - Read merges, write merges, - Read ticks, write ticks, - Time spent in queue, disk utilization percentage - Additional Info (dependant on continue_on_error, default off): total # errors, first error code - - Additional Info (dependant on description being set): Text description - -Completion latency percentiles can be a grouping of up to 20 sets, so -for the terse output fio writes all of them. Each field will look like this: - - 1.00%=6112 - -which is the Xth percentile, and the usec latency associated with it. - -For disk utilization, all disks used by fio are shown. So for each disk -there will be a disk utilization section. - - -8.0 Trace file format ---------------------- -There are two trace file format that you can encounter. The older (v1) format -is unsupported since version 1.20-rc3 (March 2008). 
It will still be described -below in case that you get an old trace and want to understand it. +Job description +~~~~~~~~~~~~~~~ -In any case the trace is a simple text file with a single action per line. +.. option:: name=str + ASCII name of the job. This may be used to override the name printed by fio + for this job. Otherwise the job name is used. On the command line this + parameter has the special purpose of also signaling the start of a new job. -8.1 Trace file format v1 ------------------------- -Each line represents a single io action in the following format: +.. option:: description=str -rw, offset, length + Text description of the job. Doesn't do anything except dump this text + description when this job is run. It's not parsed. -where rw=0/1 for read/write, and the offset and length entries being in bytes. +.. option:: loops=int -This format is not supported in Fio versions => 1.20-rc3. + Run the specified number of iterations of this job. Used to repeat the same + workload a given number of times. Defaults to 1. +.. option:: numjobs=int -8.2 Trace file format v2 ------------------------- -The second version of the trace file format was added in Fio version 1.17. -It allows to access more then one file per trace and has a bigger set of -possible file actions. + Create the specified number of clones of this job. Each clone of job + is spawned as an independent thread or process. May be used to setup a + larger number of threads/processes doing the same thing. Each thread is + reported separately; to see statistics for all clones as a whole, use + :option:`group_reporting` in conjunction with :option:`new_group`. + See :option:`--max-jobs`. Default: 1. -The first line of the trace file has to be: -fio version 2 iolog +Time related parameters +~~~~~~~~~~~~~~~~~~~~~~~ -Following this can be lines in two different formats, which are described below. +.. option:: runtime=time + + Tell fio to terminate processing after the specified period of time. 
It + can be quite hard to determine for how long a specified job will run, so + this parameter is handy to cap the total runtime to a given time. When + the unit is omitted, the value is interpreted in seconds. + +.. option:: time_based + + If set, fio will run for the duration of the :option:`runtime` specified + even if the file(s) are completely read or written. It will simply loop over + the same workload as many times as the :option:`runtime` allows. + +.. option:: startdelay=irange(time) + + Delay the start of job for the specified amount of time. Can be a single + value or a range. When given as a range, each thread will choose a value + randomly from within the range. Value is in seconds if a unit is omitted. + +.. option:: ramp_time=time + + If set, fio will run the specified workload for this amount of time before + logging any performance numbers. Useful for letting performance settle + before logging results, thus minimizing the runtime required for stable + results. Note that the ``ramp_time`` is considered lead in time for a job, + thus it will increase the total runtime if a special timeout or + :option:`runtime` is specified. When the unit is omitted, the value is + given in seconds. + +.. option:: clocksource=str + + Use the given clocksource as the base of timing. The supported options are: + + **gettimeofday** + :manpage:`gettimeofday(2)` + + **clock_gettime** + :manpage:`clock_gettime(2)` + + **cpu** + Internal CPU clock source + + cpu is the preferred clocksource if it is reliable, as it is very fast (and + fio is heavy on time calls). Fio will automatically use this clocksource if + it's supported and considered reliable on the system it is running on, + unless another clocksource is specifically set. For x86/x86-64 CPUs, this + means supporting TSC Invariant. + +.. 
option:: gtod_reduce=bool + + Enable all of the :manpage:`gettimeofday(2)` reducing options + (:option:`disable_clat`, :option:`disable_slat`, :option:`disable_bw_measurement`) plus + reduce precision of the timeout somewhat to really shrink the + :manpage:`gettimeofday(2)` call count. With this option enabled, we only do + about 0.4% of the :manpage:`gettimeofday(2)` calls we would have done if all + time keeping was enabled. + +.. option:: gtod_cpu=int + + Sometimes it's cheaper to dedicate a single thread of execution to just + getting the current time. Fio (and databases, for instance) are very + intensive on :manpage:`gettimeofday(2)` calls. With this option, you can set + one CPU aside for doing nothing but logging current time to a shared memory + location. Then the other threads/processes that run I/O workloads need only + copy that segment, instead of entering the kernel with a + :manpage:`gettimeofday(2)` call. The CPU set aside for doing these time + calls will be excluded from other uses. Fio will manually clear it from the + CPU mask of other jobs. + + +Target file/device +~~~~~~~~~~~~~~~~~~ + +.. option:: directory=str + + Prefix filenames with this directory. Used to place files in a different + location than :file:`./`. You can specify a number of directories by + separating the names with a ':' character. These directories will be + assigned equally distributed to job clones created by :option:`numjobs` as + long as they are using generated filenames. If specific `filename(s)` are + set fio will use the first listed directory, and thereby matching the + `filename` semantic (which generates a file for each clone if not + specified, but lets all clones use the same file if set). + + See the :option:`filename` option for information on how to escape "``:``" and + "``\``" characters within the directory path itself. + + Note: To control the directory fio will use for internal state files + use :option:`--aux-path`. + +.. 
option:: filename=str + + Fio normally makes up a `filename` based on the job name, thread number, and + file number (see :option:`filename_format`). If you want to share files + between threads in a job or several + jobs with fixed file paths, specify a `filename` for each of them to override + the default. If the ioengine is file based, you can specify a number of files + by separating the names with a ':' colon. So if you wanted a job to open + :file:`/dev/sda` and :file:`/dev/sdb` as the two working files, you would use + ``filename=/dev/sda:/dev/sdb``. This also means that whenever this option is + specified, :option:`nrfiles` is ignored. The size of regular files specified + by this option will be :option:`size` divided by number of files unless an + explicit size is specified by :option:`filesize`. + + Each colon and backslash in the wanted path must be escaped with a ``\`` + character. For instance, if the path is :file:`/dev/dsk/foo@3,0:c` then you + would use ``filename=/dev/dsk/foo@3,0\:c`` and if the path is + :file:`F:\\filename` then you would use ``filename=F\:\\filename``. + + On Windows, disk devices are accessed as :file:`\\\\.\\PhysicalDrive0` for + the first device, :file:`\\\\.\\PhysicalDrive1` for the second etc. + Note: Windows and FreeBSD prevent write access to areas + of the disk containing in-use data (e.g. filesystems). + + The filename "`-`" is a reserved name, meaning *stdin* or *stdout*. Which + of the two depends on the read/write direction set. + +.. option:: filename_format=str + + If sharing multiple files between jobs, it is usually necessary to have fio + generate the exact names that you want. By default, fio will name a file + based on the default file format specification of + :file:`jobname.jobnumber.filenumber`. With this option, that can be + customized. Fio will recognize and replace the following keywords in this + string: + + **$jobname** + The name of the worker thread or process. 
+ **$jobnum** + The incremental number of the worker thread or process. + **$filenum** + The incremental number of the file for that worker thread or + process. + + To have dependent jobs share a set of files, this option can be set to have + fio generate filenames that are shared between the two. For instance, if + :file:`testfiles.$filenum` is specified, file number 4 for any job will be + named :file:`testfiles.4`. The default of :file:`$jobname.$jobnum.$filenum` + will be used if no other format specifier is given. + + If you specify a path then the directories will be created up to the + main directory for the file. So for example if you specify + ``filename_format=a/b/c/$jobnum`` then the directories a/b/c will be + created before the file setup part of the job. If you specify + :option:`directory` then the path will be relative that directory, + otherwise it is treated as the absolute path. + +.. option:: unique_filename=bool + + To avoid collisions between networked clients, fio defaults to prefixing any + generated filenames (with a directory specified) with the source of the + client connecting. To disable this behavior, set this option to 0. + +.. option:: opendir=str + + Recursively open any files below directory `str`. + +.. option:: lockfile=str + + Fio defaults to not locking any files before it does I/O to them. If a file + or file descriptor is shared, fio can serialize I/O to that file to make the + end result consistent. This is usual for emulating real workloads that share + files. The lock modes are: + + **none** + No locking. The default. + **exclusive** + Only one thread or process may do I/O at a time, excluding all + others. + **readwrite** + Read-write locking on the file. Many readers may + access the file at the same time, but writes get exclusive access. + +.. option:: nrfiles=int + + Number of files to use for this job. Defaults to 1. 
The size of files + will be :option:`size` divided by this unless explicit size is specified by + :option:`filesize`. Files are created for each thread separately, and each + file will have a file number within its name by default, as explained in + :option:`filename` section. + + +.. option:: openfiles=int + + Number of files to keep open at the same time. Defaults to the same as + :option:`nrfiles`, can be set smaller to limit the number of simultaneous + opens. + +.. option:: file_service_type=str + + Defines how fio decides which file from a job to service next. The following + types are defined: + + **random** + Choose a file at random. + + **roundrobin** + Round robin over opened files. This is the default. + + **sequential** + Finish one file before moving on to the next. Multiple files can + still be open depending on :option:`openfiles`. + + **zipf** + Use a *Zipf* distribution to decide what file to access. + + **pareto** + Use a *Pareto* distribution to decide what file to access. + + **normal** + Use a *Gaussian* (normal) distribution to decide what file to + access. + + **gauss** + Alias for normal. + + For *random*, *roundrobin*, and *sequential*, a postfix can be appended to + tell fio how many I/Os to issue before switching to a new file. For example, + specifying ``file_service_type=random:8`` would cause fio to issue + 8 I/Os before selecting a new file at random. For the non-uniform + distributions, a floating point postfix can be given to influence how the + distribution is skewed. See :option:`random_distribution` for a description + of how that would work. + +.. option:: ioscheduler=str + + Attempt to switch the device hosting the file to the specified I/O scheduler + before running. + +.. option:: create_serialize=bool + + If true, serialize the file creation for the jobs. This may be handy to + avoid interleaving of data files, which may greatly depend on the filesystem + used and even the number of processors in the system. Default: true. 
+ +.. option:: create_fsync=bool + + :manpage:`fsync(2)` the data file after creation. This is the default. + +.. option:: create_on_open=bool + + If true, don't pre-create files but allow the job's open() to create a file + when it's time to do I/O. Default: false -- pre-create all necessary files + when the job starts. + +.. option:: create_only=bool + + If true, fio will only run the setup phase of the job. If files need to be + laid out or updated on disk, only that will be done -- the actual job contents + are not executed. Default: false. + +.. option:: allow_file_create=bool + + If true, fio is permitted to create files as part of its workload. If this + option is false, then fio will error out if + the files it needs to use don't already exist. Default: true. + +.. option:: allow_mounted_write=bool + + If this isn't set, fio will abort jobs that are destructive (e.g. that write) + to what appears to be a mounted device or partition. This should help catch + creating inadvertently destructive tests, not realizing that the test will + destroy data on the mounted file system. Note that some platforms don't allow + writing against a mounted device regardless of this option. Default: false. + +.. option:: pre_read=bool + + If this is given, files will be pre-read into memory before starting the + given I/O operation. This will also clear the :option:`invalidate` flag, + since it is pointless to pre-read and then drop the cache. This will only + work for I/O engines that are seek-able, since they allow you to read the + same data multiple times. Thus it will not work on non-seekable I/O engines + (e.g. network, splice). Default: false. + +.. option:: unlink=bool + + Unlink the job files when done. Not the default, as repeated runs of that + job would then waste time recreating the file set again and again. Default: + false. + +.. option:: unlink_each_loop=bool + + Unlink job files after each iteration or loop. Default: false. + +.. 
option:: zonemode=str + + Accepted values are: + + **none** + The :option:`zonerange`, :option:`zonesize` and + :option:`zoneskip` parameters are ignored. + **strided** + I/O happens in a single zone until + :option:`zonesize` bytes have been transferred. + After that number of bytes has been + transferred processing of the next zone + starts. + **zbd** + Zoned block device mode. I/O happens + sequentially in each zone, even if random I/O + has been selected. Random I/O happens across + all zones instead of being restricted to a + single zone. The :option:`zoneskip` parameter + is ignored. :option:`zonerange` and + :option:`zonesize` must be identical. + +.. option:: zonerange=int + + Size of a single zone. See also :option:`zonesize` and + :option:`zoneskip`. + +.. option:: zonesize=int + + For :option:`zonemode` =strided, this is the number of bytes to + transfer before skipping :option:`zoneskip` bytes. If this parameter + is smaller than :option:`zonerange` then only a fraction of each zone + with :option:`zonerange` bytes will be accessed. If this parameter is + larger than :option:`zonerange` then each zone will be accessed + multiple times before skipping to the next zone. + + For :option:`zonemode` =zbd, this is the size of a single zone. The + :option:`zonerange` parameter is ignored in this mode. + +.. option:: zoneskip=int + + For :option:`zonemode` =strided, the number of bytes to skip after + :option:`zonesize` bytes of data have been transferred. This parameter + must be zero for :option:`zonemode` =zbd. + +.. option:: read_beyond_wp=bool + + This parameter applies to :option:`zonemode` =zbd only. + + Zoned block devices are block devices that consist of multiple zones. + Each zone has a type, e.g. conventional or sequential. A conventional + zone can be written at any offset that is a multiple of the block + size. Sequential zones must be written sequentially. The position at + which a write must occur is called the write pointer. 
A zoned block + device can be either drive managed, host managed or host aware. For + host managed devices the host must ensure that writes happen + sequentially. Fio recognizes host managed devices and serializes + writes to sequential zones for these devices. + + If a read occurs in a sequential zone beyond the write pointer then + the zoned block device will complete the read without reading any data + from the storage medium. Since such reads lead to unrealistically high + bandwidth and IOPS numbers fio only reads beyond the write pointer if + explicitly told to do so. Default: false. + +.. option:: max_open_zones=int + + When running a random write test across an entire drive many more + zones will be open than in a typical application workload. Hence this + command line option that allows to limit the number of open zones. The + number of open zones is defined as the number of zones to which write + commands are issued. + +.. option:: zone_reset_threshold=float + + A number between zero and one that indicates the ratio of logical + blocks with data to the total number of logical blocks in the test + above which zones should be reset periodically. + +.. option:: zone_reset_frequency=float + + A number between zero and one that indicates how often a zone reset + should be issued if the zone reset threshold has been exceeded. A zone + reset is submitted after each (1 / zone_reset_frequency) write + requests. This and the previous parameter can be used to simulate + garbage collection activity. + + +I/O type +~~~~~~~~ + +.. option:: direct=bool + + If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that + OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous + ioengines don't support direct I/O. Default: false. + +.. option:: atomic=bool + + If value is true, attempt to use atomic direct I/O. Atomic writes are + guaranteed to be stable once acknowledged by the operating system. 
Only + Linux supports O_ATOMIC right now. + +.. option:: buffered=bool + + If value is true, use buffered I/O. This is the opposite of the + :option:`direct` option. Defaults to true. + +.. option:: readwrite=str, rw=str + + Type of I/O pattern. Accepted values are: + + **read** + Sequential reads. + **write** + Sequential writes. + **trim** + Sequential trims (Linux block devices and SCSI + character devices only). + **randread** + Random reads. + **randwrite** + Random writes. + **randtrim** + Random trims (Linux block devices and SCSI + character devices only). + **rw,readwrite** + Sequential mixed reads and writes. + **randrw** + Random mixed reads and writes. + **trimwrite** + Sequential trim+write sequences. Blocks will be trimmed first, + then the same blocks will be written to. + + Fio defaults to read if the option is not specified. For the mixed I/O + types, the default is to split them 50/50. For certain types of I/O the + result may still be skewed a bit, since the speed may be different. + + It is possible to specify the number of I/Os to do before getting a new + offset by appending ``:<nr>`` to the end of the string given. For a + random read, it would look like ``rw=randread:8`` for passing in an offset + modifier with a value of 8. If the suffix is used with a sequential I/O + pattern, then the *<nr>* value specified will be **added** to the generated + offset for each I/O turning sequential I/O into sequential I/O with holes. + For instance, using ``rw=write:4k`` will skip 4k for every write. Also see + the :option:`rw_sequencer` option. + +.. option:: rw_sequencer=str + + If an offset modifier is given by appending a number to the ``rw=`` + line, then this option controls how that number modifies the I/O offset + being generated. Accepted values are: + + **sequential** + Generate sequential offset. + **identical** + Generate the same offset. 
+ + ``sequential`` is only useful for random I/O, where fio would normally + generate a new random offset for every I/O. If you append e.g. 8 to randread, + you would get a new random offset for every 8 I/Os. The result would be a + seek for only every 8 I/Os, instead of for every I/O. Use ``rw=randread:8`` + to specify that. As sequential I/O is already sequential, setting + ``sequential`` for that would not result in any differences. ``identical`` + behaves in a similar fashion, except it sends the same offset 8 number of + times before generating a new offset. + +.. option:: unified_rw_reporting=bool + + Fio normally reports statistics on a per data direction basis, meaning that + reads, writes, and trims are accounted and reported separately. If this + option is set fio sums the results and report them as "mixed" instead. + +.. option:: randrepeat=bool + + Seed the random number generator used for random I/O patterns in a + predictable way so the pattern is repeatable across runs. Default: true. + +.. option:: allrandrepeat=bool + + Seed all random number generators in a predictable way so results are + repeatable across runs. Default: false. + +.. option:: randseed=int + + Seed the random number generators based on this seed value, to be able to + control what sequence of output is being generated. If not set, the random + sequence depends on the :option:`randrepeat` setting. + +.. option:: fallocate=str + + Whether pre-allocation is performed when laying down files. + Accepted values are: + + **none** + Do not pre-allocate space. + + **native** + Use a platform's native pre-allocation call but fall back to + **none** behavior if it fails/is not implemented. + + **posix** + Pre-allocate via :manpage:`posix_fallocate(3)`. + + **keep** + Pre-allocate via :manpage:`fallocate(2)` with + FALLOC_FL_KEEP_SIZE set. + + **0** + Backward-compatible alias for **none**. + + **1** + Backward-compatible alias for **posix**. 
+ + May not be available on all supported platforms. **keep** is only available + on Linux. If using ZFS on Solaris this cannot be set to **posix** + because ZFS doesn't support pre-allocation. Default: **native** if any + pre-allocation methods are available, **none** if not. + +.. option:: fadvise_hint=str + + Use :manpage:`posix_fadvise(2)` or :manpage:`posix_fadvise(2)` to + advise the kernel on what I/O patterns are likely to be issued. + Accepted values are: + + **0** + Backwards-compatible hint for "no hint". + + **1** + Backwards compatible hint for "advise with fio workload type". This + uses **FADV_RANDOM** for a random workload, and **FADV_SEQUENTIAL** + for a sequential workload. + + **sequential** + Advise using **FADV_SEQUENTIAL**. + + **random** + Advise using **FADV_RANDOM**. + +.. option:: write_hint=str + + Use :manpage:`fcntl(2)` to advise the kernel what life time to expect + from a write. Only supported on Linux, as of version 4.13. Accepted + values are: + + **none** + No particular life time associated with this file. + + **short** + Data written to this file has a short life time. + + **medium** + Data written to this file has a medium life time. + + **long** + Data written to this file has a long life time. + + **extreme** + Data written to this file has a very long life time. + + The values are all relative to each other, and no absolute meaning + should be associated with them. + +.. option:: offset=int + + Start I/O at the provided offset in the file, given as either a fixed size in + bytes or a percentage. If a percentage is given, the generated offset will be + aligned to the minimum ``blocksize`` or to the value of ``offset_align`` if + provided. Data before the given offset will not be touched. This + effectively caps the file size at `real_size - offset`. Can be combined with + :option:`size` to constrain the start and end range of the I/O workload. 
+ A percentage can be specified by a number between 1 and 100 followed by '%', + for example, ``offset=20%`` to specify 20%. + +.. option:: offset_align=int + + If set to non-zero value, the byte offset generated by a percentage ``offset`` + is aligned upwards to this value. Defaults to 0 meaning that a percentage + offset is aligned to the minimum block size. + +.. option:: offset_increment=int + + If this is provided, then the real offset becomes `offset + offset_increment + * thread_number`, where the thread number is a counter that starts at 0 and + is incremented for each sub-job (i.e. when :option:`numjobs` option is + specified). This option is useful if there are several jobs which are + intended to operate on a file in parallel disjoint segments, with even + spacing between the starting points. Percentages can be used for this option. + If a percentage is given, the generated offset will be aligned to the minimum + ``blocksize`` or to the value of ``offset_align`` if provided. + +.. option:: number_ios=int + + Fio will normally perform I/Os until it has exhausted the size of the region + set by :option:`size`, or if it exhaust the allocated time (or hits an error + condition). With this setting, the range/size can be set independently of + the number of I/Os to perform. When fio reaches this number, it will exit + normally and report status. Note that this does not extend the amount of I/O + that will be done, it will only stop fio if this condition is met before + other end-of-job criteria. + +.. option:: fsync=int + + If writing to a file, issue an :manpage:`fsync(2)` (or its equivalent) of + the dirty data for every number of blocks given. For example, if you give 32 + as a parameter, fio will sync the file after every 32 writes issued. If fio is + using non-buffered I/O, we may not sync the file. The exception is the sg + I/O engine, which synchronizes the disk cache anyway. 
Defaults to 0, which + means fio does not periodically issue and wait for a sync to complete. Also + see :option:`end_fsync` and :option:`fsync_on_close`. + +.. option:: fdatasync=int + + Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and + not metadata blocks. In Windows, FreeBSD, and DragonFlyBSD there is no + :manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`. + Defaults to 0, which means fio does not periodically issue and wait for a + data-only sync to complete. + +.. option:: write_barrier=int + + Make every `N-th` write a barrier write. + +.. option:: sync_file_range=str:int + + Use :manpage:`sync_file_range(2)` for every `int` number of write + operations. Fio will track range of writes that have happened since the last + :manpage:`sync_file_range(2)` call. `str` can currently be one or more of: + + **wait_before** + SYNC_FILE_RANGE_WAIT_BEFORE + **write** + SYNC_FILE_RANGE_WRITE + **wait_after** + SYNC_FILE_RANGE_WAIT_AFTER + + So if you do ``sync_file_range=wait_before,write:8``, fio would use + ``SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE`` for every 8 + writes. Also see the :manpage:`sync_file_range(2)` man page. This option is + Linux specific. + +.. option:: overwrite=bool + + If true, writes to a file will always overwrite existing data. If the file + doesn't already exist, it will be created before the write phase begins. If + the file exists and is large enough for the specified write phase, nothing + will be done. Default: false. + +.. option:: end_fsync=bool + + If true, :manpage:`fsync(2)` file contents when a write stage has completed. + Default: false. + +.. option:: fsync_on_close=bool + + If true, fio will :manpage:`fsync(2)` a dirty file on close. This differs + from :option:`end_fsync` in that it will happen on every file close, not + just at the end of the job. Default: false. + +.. option:: rwmixread=int + + Percentage of a mixed workload that should be reads. Default: 50. + +.. 
option:: rwmixwrite=int + + Percentage of a mixed workload that should be writes. If both + :option:`rwmixread` and :option:`rwmixwrite` is given and the values do not + add up to 100%, the latter of the two will be used to override the + first. This may interfere with a given rate setting, if fio is asked to + limit reads or writes to a certain rate. If that is the case, then the + distribution may be skewed. Default: 50. + +.. option:: random_distribution=str:float[,str:float][,str:float] + + By default, fio will use a completely uniform random distribution when asked + to perform random I/O. Sometimes it is useful to skew the distribution in + specific ways, ensuring that some parts of the data is more hot than others. + fio includes the following distribution models: + + **random** + Uniform random distribution + + **zipf** + Zipf distribution + + **pareto** + Pareto distribution + + **normal** + Normal (Gaussian) distribution + + **zoned** + Zoned random distribution + + **zoned_abs** + Zone absolute random distribution + + When using a **zipf** or **pareto** distribution, an input value is also + needed to define the access pattern. For **zipf**, this is the `Zipf + theta`. For **pareto**, it's the `Pareto power`. Fio includes a test + program, :command:`fio-genzipf`, that can be used visualize what the given input + values will yield in terms of hit rates. If you wanted to use **zipf** with + a `theta` of 1.2, you would use ``random_distribution=zipf:1.2`` as the + option. If a non-uniform model is used, fio will disable use of the random + map. For the **normal** distribution, a normal (Gaussian) deviation is + supplied as a value between 0 and 100. + + For a **zoned** distribution, fio supports specifying percentages of I/O + access that should fall within what range of the file or device. 
For + example, given a criteria of: + + * 60% of accesses should be to the first 10% + * 30% of accesses should be to the next 20% + * 8% of accesses should be to the next 30% + * 2% of accesses should be to the next 40% + + we can define that through zoning of the random accesses. For the above + example, the user would do:: + + random_distribution=zoned:60/10:30/20:8/30:2/40 + + A **zoned_abs** distribution works exactly like the **zoned**, except + that it takes absolute sizes. For example, let's say you wanted to + define access according to the following criteria: + + * 60% of accesses should be to the first 20G + * 30% of accesses should be to the next 100G + * 10% of accesses should be to the next 500G + + we can define an absolute zoning distribution with: + + random_distribution=zoned_abs=60/20G:30/100G:10/500g + + For both **zoned** and **zoned_abs**, fio supports defining up to + 256 separate zones. + + Similarly to how :option:`bssplit` works for setting ranges and + percentages of block sizes. Like :option:`bssplit`, it's possible to + specify separate zones for reads, writes, and trims. If just one set + is given, it'll apply to all of them. This goes for both **zoned** + **zoned_abs** distributions. + +.. option:: percentage_random=int[,int][,int] + + For a random workload, set how big a percentage should be random. This + defaults to 100%, in which case the workload is fully random. It can be set + from anywhere from 0 to 100. Setting it to 0 would make the workload fully + sequential. Any setting in between will result in a random mix of sequential + and random I/O, at the given percentages. Comma-separated values may be + specified for reads, writes, and trims as described in :option:`blocksize`. + +.. option:: norandommap + + Normally fio will cover every block of the file when doing random I/O. If + this option is given, fio will just get a new random offset without looking + at past I/O history. 
This means that some blocks may not be read or written, + and that some blocks may be read/written more than once. If this option is + used with :option:`verify` and multiple blocksizes (via :option:`bsrange`), + only intact blocks are verified, i.e., partially-overwritten blocks are + ignored. With an async I/O engine and an I/O depth > 1, it is possible for + the same block to be overwritten, which can cause verification errors. Either + do not use norandommap in this case, or also use the lfsr random generator. + +.. option:: softrandommap=bool + + See :option:`norandommap`. If fio runs with the random block map enabled and + it fails to allocate the map, if this option is set it will continue without + a random block map. As coverage will not be as complete as with random maps, + this option is disabled by default. + +.. option:: random_generator=str + + Fio supports the following engines for generating I/O offsets for random I/O: + + **tausworthe** + Strong 2^88 cycle random number generator. + **lfsr** + Linear feedback shift register generator. + **tausworthe64** + Strong 64-bit 2^258 cycle random number generator. + + **tausworthe** is a strong random number generator, but it requires tracking + on the side if we want to ensure that blocks are only read or written + once. **lfsr** guarantees that we never generate the same offset twice, and + it's also less computationally expensive. It's not a true random generator, + however, though for I/O purposes it's typically good enough. **lfsr** only + works with single block sizes, not with workloads that use multiple block + sizes. If used with such a workload, fio may read or write some blocks + multiple times. The default value is **tausworthe**, unless the required + space exceeds 2^32 blocks. If it does, then **tausworthe64** is + selected automatically. + + +Block size +~~~~~~~~~~ + +.. option:: blocksize=int[,int][,int], bs=int[,int][,int] + + The block size in bytes used for I/O units. Default: 4096. 
A single value + applies to reads, writes, and trims. Comma-separated values may be + specified for reads, writes, and trims. A value not terminated in a comma + applies to subsequent types. + + Examples: + + **bs=256k** + means 256k for reads, writes and trims. + + **bs=8k,32k** + means 8k for reads, 32k for writes and trims. + + **bs=8k,32k,** + means 8k for reads, 32k for writes, and default for trims. + + **bs=,8k** + means default for reads, 8k for writes and trims. + + **bs=,8k,** + means default for reads, 8k for writes, and default for trims. + +.. option:: blocksize_range=irange[,irange][,irange], bsrange=irange[,irange][,irange] + + A range of block sizes in bytes for I/O units. The issued I/O unit will + always be a multiple of the minimum size, unless + :option:`blocksize_unaligned` is set. + + Comma-separated ranges may be specified for reads, writes, and trims as + described in :option:`blocksize`. + + Example: ``bsrange=1k-4k,2k-8k``. + +.. option:: bssplit=str[,str][,str] + + Sometimes you want even finer grained control of the block sizes + issued, not just an even split between them. This option allows you to + weight various block sizes, so that you are able to define a specific + amount of block sizes issued. The format for this option is:: + + bssplit=blocksize/percentage:blocksize/percentage + + for as many block sizes as needed. So if you want to define a workload + that has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would + write:: + + bssplit=4k/10:64k/50:32k/40 + + Ordering does not matter. If the percentage is left blank, fio will + fill in the remaining values evenly. So a bssplit option like this one:: + + bssplit=4k/50:1k/:32k/ + + would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always + add up to 100, if bssplit is given a range that adds up to more, it + will error out. + + Comma-separated values may be specified for reads, writes, and trims as + described in :option:`blocksize`. 
+ + If you want a workload that has 50% 2k reads and 50% 4k reads, while + having 90% 4k writes and 10% 8k writes, you would specify:: + + bssplit=2k/50:4k/50,4k/90:8k/10 + + Fio supports defining up to 64 different weights for each data + direction. + +.. option:: blocksize_unaligned, bs_unaligned + + If set, fio will issue I/O units with any size within + :option:`blocksize_range`, not just multiples of the minimum size. This + typically won't work with direct I/O, as that normally requires sector + alignment. + +.. option:: bs_is_seq_rand=bool + + If this option is set, fio will use the normal read,write blocksize settings + as sequential,random blocksize settings instead. Any random read or write + will use the WRITE blocksize settings, and any sequential read or write will + use the READ blocksize settings. + +.. option:: blockalign=int[,int][,int], ba=int[,int][,int] + + Boundary to which fio will align random I/O units. Default: + :option:`blocksize`. Minimum alignment is typically 512b for using direct + I/O, though it usually depends on the hardware block size. This option is + mutually exclusive with using a random map for files, so it will turn off + that option. Comma-separated values may be specified for reads, writes, and + trims as described in :option:`blocksize`. + + +Buffers and memory +~~~~~~~~~~~~~~~~~~ + +.. option:: zero_buffers + + Initialize buffers with all zeros. Default: fill buffers with random data. + +.. option:: refill_buffers + + If this option is given, fio will refill the I/O buffers on every + submit. Only makes sense if :option:`zero_buffers` isn't specified, + naturally. Defaults to being unset i.e., the buffer is only filled at + init time and the data in it is reused when possible but if any of + :option:`verify`, :option:`buffer_compress_percentage` or + :option:`dedupe_percentage` are enabled then `refill_buffers` is also + automatically enabled. + +.. 
option:: scramble_buffers=bool + + If :option:`refill_buffers` is too costly and the target is using data + deduplication, then setting this option will slightly modify the I/O buffer + contents to defeat normal de-dupe attempts. This is not enough to defeat + more clever block compression attempts, but it will stop naive dedupe of + blocks. Default: true. + +.. option:: buffer_compress_percentage=int + + If this is set, then fio will attempt to provide I/O buffer content + (on WRITEs) that compresses to the specified level. Fio does this by + providing a mix of random data followed by fixed pattern data. The + fixed pattern is either zeros, or the pattern specified by + :option:`buffer_pattern`. If the `buffer_pattern` option is used, it + might skew the compression ratio slightly. Setting + `buffer_compress_percentage` to a value other than 100 will also + enable :option:`refill_buffers` in order to reduce the likelihood that + adjacent blocks are so similar that they over compress when seen + together. See :option:`buffer_compress_chunk` for how to set a finer or + coarser granularity for the random/fixed data region. Defaults to unset + i.e., buffer data will not adhere to any compression level. + +.. option:: buffer_compress_chunk=int + + This setting allows fio to manage how big the random/fixed data region + is when using :option:`buffer_compress_percentage`. When + `buffer_compress_chunk` is set to some non-zero value smaller than the + block size, fio can repeat the random/fixed region throughout the I/O + buffer at the specified interval (which particularly useful when + bigger block sizes are used for a job). When set to 0, fio will use a + chunk size that matches the block size resulting in a single + random/fixed region within the I/O buffer. Defaults to 512. When the + unit is omitted, the value is interpreted in bytes. + +.. option:: buffer_pattern=str + + If set, fio will fill the I/O buffers with this pattern or with the contents + of a file. 
If not set, the contents of I/O buffers are defined by the other + options related to buffer contents. The setting can be any pattern of bytes, + and can be prefixed with 0x for hex values. It may also be a string, where + the string must then be wrapped with ``""``. Or it may also be a filename, + where the filename must be wrapped with ``''`` in which case the file is + opened and read. Note that not all the file contents will be read if that + would cause the buffers to overflow. So, for example:: + + buffer_pattern='filename' + + or:: + + buffer_pattern="abcd" + + or:: + + buffer_pattern=-12 + + or:: + + buffer_pattern=0xdeadface + + Also you can combine everything together in any order:: + + buffer_pattern=0xdeadface"abcd"-12'filename' + +.. option:: dedupe_percentage=int + + If set, fio will generate this percentage of identical buffers when + writing. These buffers will be naturally dedupable. The contents of the + buffers depend on what other buffer compression settings have been set. It's + possible to have the individual buffers either fully compressible, or not at + all -- this option only controls the distribution of unique buffers. Setting + this option will also enable :option:`refill_buffers` to prevent every buffer + being identical. + +.. option:: invalidate=bool + + Invalidate the buffer/page cache parts of the files to be used prior to + starting I/O if the platform and file type support it. Defaults to true. + This will be ignored if :option:`pre_read` is also specified for the + same job. + +.. option:: sync=bool + + Use synchronous I/O for buffered writes. For the majority of I/O engines, + this means using O_SYNC. Default: false. + +.. option:: iomem=str, mem=str + + Fio can use various types of memory as the I/O unit buffer. The allowed + values are: + + **malloc** + Use memory from :manpage:`malloc(3)` as the buffers. Default memory + type. + + **shm** + Use shared memory as the buffers. Allocated through + :manpage:`shmget(2)`. 
+ + **shmhuge** + Same as shm, but use huge pages as backing. + + **mmap** + Use :manpage:`mmap(2)` to allocate buffers. May either be anonymous memory, or can + be file backed if a filename is given after the option. The format + is `mem=mmap:/path/to/file`. + + **mmaphuge** + Use a memory mapped huge file as the buffer backing. Append filename + after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file`. + + **mmapshared** + Same as mmap, but use a MMAP_SHARED mapping. + + **cudamalloc** + Use GPU memory as the buffers for GPUDirect RDMA benchmark. + The :option:`ioengine` must be `rdma`. + + The area allocated is a function of the maximum allowed bs size for the job, + multiplied by the I/O depth given. Note that for **shmhuge** and + **mmaphuge** to work, the system must have free huge pages allocated. This + can normally be checked and set by reading/writing + :file:`/proc/sys/vm/nr_hugepages` on a Linux system. Fio assumes a huge page + is 4MiB in size. So to calculate the number of huge pages you need for a + given job file, add up the I/O depth of all jobs (normally one unless + :option:`iodepth` is used) and multiply by the maximum bs set. Then divide + that number by the huge page size. You can see the size of the huge pages in + :file:`/proc/meminfo`. If no huge pages are allocated by having a non-zero + number in `nr_hugepages`, using **mmaphuge** or **shmhuge** will fail. Also + see :option:`hugepage-size`. + + **mmaphuge** also needs to have hugetlbfs mounted and the file location + should point there. So if it's mounted in :file:`/huge`, you would use + `mem=mmaphuge:/huge/somefile`. + +.. option:: iomem_align=int, mem_align=int + + This indicates the memory alignment of the I/O memory buffers. Note that + the given alignment is applied to the first I/O unit buffer, if using + :option:`iodepth` the alignment of the following buffers are given by the + :option:`bs` used. 
In other words, if using a :option:`bs` that is a + multiple of the page sized in the system, all buffers will be aligned to + this value. If using a :option:`bs` that is not page aligned, the alignment + of subsequent I/O memory buffers is the sum of the :option:`iomem_align` and + :option:`bs` used. + +.. option:: hugepage-size=int + + Defines the size of a huge page. Must at least be equal to the system + setting, see :file:`/proc/meminfo`. Defaults to 4MiB. Should probably + always be a multiple of megabytes, so using ``hugepage-size=Xm`` is the + preferred way to set this to avoid setting a non-pow-2 bad value. + +.. option:: lockmem=int + + Pin the specified amount of memory with :manpage:`mlock(2)`. Can be used to + simulate a smaller amount of memory. The amount specified is per worker. + + +I/O size +~~~~~~~~ + +.. option:: size=int + + The total size of file I/O for each thread of this job. Fio will run until + this many bytes has been transferred, unless runtime is limited by other options + (such as :option:`runtime`, for instance, or increased/decreased by :option:`io_size`). + Fio will divide this size between the available files determined by options + such as :option:`nrfiles`, :option:`filename`, unless :option:`filesize` is + specified by the job. If the result of division happens to be 0, the size is + set to the physical size of the given files or devices if they exist. + If this option is not specified, fio will use the full size of the given + files or devices. If the files do not exist, size must be given. It is also + possible to give size as a percentage between 1 and 100. If ``size=20%`` is + given, fio will use 20% of the full size of the given files or devices. + Can be combined with :option:`offset` to constrain the start and end range + that I/O will be done within. + +.. 
option:: io_size=int, io_limit=int + + Normally fio operates within the region set by :option:`size`, which means + that the :option:`size` option sets both the region and size of I/O to be + performed. Sometimes that is not what you want. With this option, it is + possible to define just the amount of I/O that fio should do. For instance, + if :option:`size` is set to 20GiB and :option:`io_size` is set to 5GiB, fio + will perform I/O within the first 20GiB but exit when 5GiB have been + done. The opposite is also possible -- if :option:`size` is set to 20GiB, + and :option:`io_size` is set to 40GiB, then fio will do 40GiB of I/O within + the 0..20GiB region. + +.. option:: filesize=irange(int) + + Individual file sizes. May be a range, in which case fio will select sizes + for files at random within the given range and limited to :option:`size` in + total (if that is given). If not given, each created file is the same size. + This option overrides :option:`size` in terms of file size, which means + this value is used as a fixed size or possible range of each file. + +.. option:: file_append=bool + + Perform I/O after the end of the file. Normally fio will operate within the + size of a file. If this option is set, then fio will append to the file + instead. This has identical behavior to setting :option:`offset` to the size + of a file. This option is ignored on non-regular files. + +.. option:: fill_device=bool, fill_fs=bool + + Sets size to something really large and waits for ENOSPC (no space left on + device) as the terminating condition. Only makes sense with sequential + write. For a read workload, the mount point will be filled first then I/O + started on the result. This option doesn't make sense if operating on a raw + device node, since the size of that is already known by the file system. + Additionally, writing beyond end-of-device will not return ENOSPC there. + + +I/O engine +~~~~~~~~~~ + +.. 
option:: ioengine=str + + Defines how the job issues I/O to the file. The following types are defined: + + **sync** + Basic :manpage:`read(2)` or :manpage:`write(2)` + I/O. :manpage:`lseek(2)` is used to position the I/O location. + See :option:`fsync` and :option:`fdatasync` for syncing write I/Os. + + **psync** + Basic :manpage:`pread(2)` or :manpage:`pwrite(2)` I/O. Default on + all supported operating systems except for Windows. + + **vsync** + Basic :manpage:`readv(2)` or :manpage:`writev(2)` I/O. Will emulate + queuing by coalescing adjacent I/Os into a single submission. + + **pvsync** + Basic :manpage:`preadv(2)` or :manpage:`pwritev(2)` I/O. + + **pvsync2** + Basic :manpage:`preadv2(2)` or :manpage:`pwritev2(2)` I/O. + + **io_uring** + Fast Linux native asynchronous I/O. Supports async IO + for both direct and buffered IO. + This engine defines engine specific options. + + **libaio** + Linux native asynchronous I/O. Note that Linux may only support + queued behavior with non-buffered I/O (set ``direct=1`` or + ``buffered=0``). + This engine defines engine specific options. + + **posixaio** + POSIX asynchronous I/O using :manpage:`aio_read(3)` and + :manpage:`aio_write(3)`. + + **solarisaio** + Solaris native asynchronous I/O. + + **windowsaio** + Windows native asynchronous I/O. Default on Windows. + + **mmap** + File is memory mapped with :manpage:`mmap(2)` and data copied + to/from using :manpage:`memcpy(3)`. + + **splice** + :manpage:`splice(2)` is used to transfer the data and + :manpage:`vmsplice(2)` to transfer data from user space to the + kernel. + + **sg** + SCSI generic sg v3 I/O. May either be synchronous using the SG_IO + ioctl, or if the target is an sg character device we use + :manpage:`read(2)` and :manpage:`write(2)` for asynchronous + I/O. Requires :option:`filename` option to specify either block or + character devices. This engine supports trim operations. + The sg engine includes engine specific options. 
+ + **null** + Doesn't transfer any data, just pretends to. This is mainly used to + exercise fio itself and for debugging/testing purposes. + + **net** + Transfer over the network to given ``host:port``. Depending on the + :option:`protocol` used, the :option:`hostname`, :option:`port`, + :option:`listen` and :option:`filename` options are used to specify + what sort of connection to make, while the :option:`protocol` option + determines which protocol will be used. This engine defines engine + specific options. + + **netsplice** + Like **net**, but uses :manpage:`splice(2)` and + :manpage:`vmsplice(2)` to map data and send/receive. + This engine defines engine specific options. + + **cpuio** + Doesn't transfer any data, but burns CPU cycles according to the + :option:`cpuload` and :option:`cpuchunks` options. Setting + :option:`cpuload`\=85 will cause that job to do nothing but burn 85% + of the CPU. In case of SMP machines, use :option:`numjobs`\= + to get desired CPU usage, as the cpuload only loads a + single CPU at the desired rate. A job never finishes unless there is + at least one non-cpuio job. + + **guasi** + The GUASI I/O engine is the Generic Userspace Asynchronous Syscall + Interface approach to async I/O. See + + http://www.xmailserver.org/guasi-lib.html + + for more info on GUASI. + + **rdma** + The RDMA I/O engine supports both RDMA memory semantics + (RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the + InfiniBand, RoCE and iWARP protocols. This engine defines engine + specific options. + + **falloc** + I/O engine that does regular fallocate to simulate data transfer as + fio ioengine. + + DDIR_READ + does fallocate(,mode = FALLOC_FL_KEEP_SIZE,). + + DDIR_WRITE + does fallocate(,mode = 0). + + DDIR_TRIM + does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE). + + **ftruncate** + I/O engine that sends :manpage:`ftruncate(2)` operations in response + to write (DDIR_WRITE) events. 
Each ftruncate issued sets the file's + size to the current block offset. :option:`blocksize` is ignored. + + **e4defrag** + I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate + defragment activity in request to DDIR_WRITE event. + + **rados** + I/O engine supporting direct access to Ceph Reliable Autonomic + Distributed Object Store (RADOS) via librados. This ioengine + defines engine specific options. + + **rbd** + I/O engine supporting direct access to Ceph Rados Block Devices + (RBD) via librbd without the need to use the kernel rbd driver. This + ioengine defines engine specific options. + + **http** + I/O engine supporting GET/PUT requests over HTTP(S) with libcurl to + a WebDAV or S3 endpoint. This ioengine defines engine specific options. + + This engine only supports direct IO of iodepth=1; you need to scale this + via numjobs. blocksize defines the size of the objects to be created. + + TRIM is translated to object deletion. + + **gfapi** + Using GlusterFS libgfapi sync interface to direct access to + GlusterFS volumes without having to go through FUSE. This ioengine + defines engine specific options. + + **gfapi_async** + Using GlusterFS libgfapi async interface to direct access to + GlusterFS volumes without having to go through FUSE. This ioengine + defines engine specific options. + + **libhdfs** + Read and write through Hadoop (HDFS). The :option:`filename` option + is used to specify host,port of the hdfs name-node to connect. This + engine interprets offsets a little differently. In HDFS, files once + created cannot be modified so random writes are not possible. To + imitate this the libhdfs engine expects a bunch of small files to be + created over HDFS and will randomly pick a file from them + based on the offset generated by fio backend (see the example + job file to create such files, use ``rw=write`` option). Please + note, it may be necessary to set environment variables to work + with HDFS/libhdfs properly. 
Each job uses its own connection to + HDFS. + + **mtd** + Read, write and erase an MTD character device (e.g., + :file:`/dev/mtd0`). Discards are treated as erases. Depending on the + underlying device type, the I/O may have to go in a certain pattern, + e.g., on NAND, writing sequentially to erase blocks and discarding + before overwriting. The `trimwrite` mode works well for this + constraint. + + **pmemblk** + Read and write using filesystem DAX to a file on a filesystem + mounted with DAX on a persistent memory device through the PMDK + libpmemblk library. + + **dev-dax** + Read and write using device DAX to a persistent memory device (e.g., + /dev/dax0.0) through the PMDK libpmem library. + + **external** + Prefix to specify loading an external I/O engine object file. Append + the engine filename, e.g. ``ioengine=external:/tmp/foo.o`` to load + ioengine :file:`foo.o` in :file:`/tmp`. The path can be either + absolute or relative. See :file:`engines/skeleton_external.c` for + details of writing an external I/O engine. + + **filecreate** + Simply create the files and do no I/O to them. You still need to + set `filesize` so that all the accounting still occurs, but no + actual I/O will be done other than creating the file. + + **libpmem** + Read and write using mmap I/O to a file on a filesystem + mounted with DAX on a persistent memory device through the PMDK + libpmem library. + + **ime_psync** + Synchronous read and write using DDN's Infinite Memory Engine (IME). + This engine is very basic and issues calls to IME whenever an IO is + queued. + + **ime_psyncv** + Synchronous read and write using DDN's Infinite Memory Engine (IME). + This engine uses iovecs and will try to stack as much IOs as possible + (if the IOs are "contiguous" and the IO depth is not exceeded) + before issuing a call to IME. + + **ime_aio** + Asynchronous read and write using DDN's Infinite Memory Engine (IME). 
+ This engine will try to stack as many IOs as possible by creating + requests for IME. FIO will then decide when to commit these requests. + **libiscsi** + Read and write iscsi lun with libiscsi. + **nbd** + Read and write a Network Block Device (NBD). + +I/O engine specific parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition, there are some parameters which are only valid when a specific +:option:`ioengine` is in use. These are used identically to normal parameters, +with the caveat that when used on the command line, they must come after the +:option:`ioengine` that defines them is selected. + +.. option:: hipri : [io_uring] + + If this option is set, fio will attempt to use polled IO completions. + Normal IO completions generate interrupts to signal the completion of + IO, polled completions do not. Hence they require active reaping + by the application. The benefits are more efficient IO for high IOPS + scenarios, and lower latencies for low queue depth IO. + +.. option:: fixedbufs : [io_uring] + + If fio is asked to do direct IO, then Linux will map pages for each + IO call, and release them when IO is done. If this option is set, the + pages are pre-mapped before IO is started. This eliminates the need to + map and release for each IO. This is more efficient, and reduces the + IO latency as well. + +.. option:: registerfiles : [io_uring] + With this option, fio registers the set of files being used with the + kernel. This avoids the overhead of managing file counts in the kernel, + making the submission and completion part more lightweight. Required + for the below :option:`sqthread_poll` option. + +.. option:: sqthread_poll : [io_uring] + + Normally fio will submit IO by issuing a system call to notify the + kernel of available items in the SQ ring. If this option is set, the + act of submitting IO will be done by a polling thread in the kernel. + This frees up cycles for fio, at the cost of using more CPU in the + system. + +.. 
option:: sqthread_poll_cpu : [io_uring] + + When :option:`sqthread_poll` is set, this option provides a way to + define which CPU should be used for the polling thread. + +.. option:: userspace_reap : [libaio] + + Normally, with the libaio engine in use, fio will use the + :manpage:`io_getevents(2)` system call to reap newly returned events. With + this flag turned on, the AIO ring will be read directly from user-space to + reap events. The reaping mode is only enabled when polling for a minimum of + 0 events (e.g. when :option:`iodepth_batch_complete` `=0`). + +.. option:: hipri : [pvsync2] + + Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority + than normal. + +.. option:: hipri_percentage : [pvsync2] + + When hipri is set this determines the probability of a pvsync2 I/O being high + priority. The default is 100%. + +.. option:: cpuload=int : [cpuio] + + Attempt to use the specified percentage of CPU cycles. This is a mandatory + option when using cpuio I/O engine. + +.. option:: cpuchunks=int : [cpuio] + + Split the load into cycles of the given time. In microseconds. + +.. option:: exit_on_io_done=bool : [cpuio] + + Detect when I/O threads are done, then exit. + +.. option:: namenode=str : [libhdfs] + + The hostname or IP address of a HDFS cluster namenode to contact. + +.. option:: port=int + + [libhdfs] + + The listening port of the HDFS cluster namenode. + + [netsplice], [net] + + The TCP or UDP port to bind to or connect to. If this is used with + :option:`numjobs` to spawn multiple instances of the same job type, then + this will be the starting port number since fio will use a range of + ports. + + [rdma] + + The port to use for RDMA-CM communication. This should be the same value + on the client and the server side. + +.. option:: hostname=str : [netsplice] [net] [rdma] + + The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O. 
If the job + is a TCP listener or UDP reader, the hostname is not used and must be omitted + unless it is a valid UDP multicast address. + +.. option:: interface=str : [netsplice] [net] + + The IP address of the network interface used to send or receive UDP + multicast. + +.. option:: ttl=int : [netsplice] [net] + + Time-to-live value for outgoing UDP multicast packets. Default: 1. + +.. option:: nodelay=bool : [netsplice] [net] + + Set TCP_NODELAY on TCP connections. + +.. option:: protocol=str, proto=str : [netsplice] [net] + + The network protocol to use. Accepted values are: + + **tcp** + Transmission control protocol. + **tcpv6** + Transmission control protocol V6. + **udp** + User datagram protocol. + **udpv6** + User datagram protocol V6. + **unix** + UNIX domain socket. + + When the protocol is TCP or UDP, the port must also be given, as well as the + hostname if the job is a TCP listener or UDP reader. For unix sockets, the + normal :option:`filename` option should be used and the port is invalid. + +.. option:: listen : [netsplice] [net] + + For TCP network connections, tell fio to listen for incoming connections + rather than initiating an outgoing connection. The :option:`hostname` must + be omitted if this option is used. + +.. option:: pingpong : [netsplice] [net] + + Normally a network writer will just continue writing data, and a network + reader will just consume packages. If ``pingpong=1`` is set, a writer will + send its normal payload to the reader, then wait for the reader to send the + same payload back. This allows fio to measure network latencies. The + submission and completion latencies then measure local time spent sending or + receiving, and the completion latency measures how long it took for the + other end to receive and send back. For UDP multicast traffic + ``pingpong=1`` should only be set for a single reader when multiple readers + are listening to the same address. + +.. 
option:: window_size : [netsplice] [net] + + Set the desired socket buffer size for the connection. + +.. option:: mss : [netsplice] [net] + + Set the TCP maximum segment size (TCP_MAXSEG). + +.. option:: donorname=str : [e4defrag] + + File will be used as a block donor (swap extents between files). + +.. option:: inplace=int : [e4defrag] + + Configure donor file blocks allocation strategy: + + **0** + Default. Preallocate donor's file on init. + **1** + Allocate space immediately inside defragment event, and free right + after event. + +.. option:: clustername=str : [rbd,rados] + + Specifies the name of the Ceph cluster. + +.. option:: rbdname=str : [rbd] + + Specifies the name of the RBD. + +.. option:: pool=str : [rbd,rados] + + Specifies the name of the Ceph pool containing RBD or RADOS data. + +.. option:: clientname=str : [rbd,rados] + + Specifies the username (without the 'client.' prefix) used to access the + Ceph cluster. If the *clustername* is specified, the *clientname* shall be + the full *type.id* string. If no type. prefix is given, fio will add + 'client.' by default. + +.. option:: busy_poll=bool : [rbd,rados] + + Poll store instead of waiting for completion. Usually this provides better + throughput at cost of higher(up to 100%) CPU utilization. + +.. option:: skip_bad=bool : [mtd] + + Skip operations against known bad blocks. + +.. option:: hdfsdirectory : [libhdfs] + + libhdfs will create chunk in this HDFS directory. + +.. option:: chunk_size : [libhdfs] + + The size of the chunk to use for each file. + +.. option:: verb=str : [rdma] + + The RDMA verb to use on this side of the RDMA ioengine connection. Valid + values are write, read, send and recv. These correspond to the equivalent + RDMA verbs (e.g. write = rdma_write etc.). Note that this only needs to be + specified on the client side of the connection. See the examples folder. + +.. 
option:: bindname=str : [rdma] + + The name to use to bind the local RDMA-CM connection to a local RDMA device. + This could be a hostname or an IPv4 or IPv6 address. On the server side this + will be passed into the rdma_bind_addr() function and on the client site it + will be used in the rdma_resolve_add() function. This can be useful when + multiple paths exist between the client and the server or in certain loopback + configurations. + +.. option:: readfua=bool : [sg] + + With readfua option set to 1, read operations include + the force unit access (fua) flag. Default is 0. + +.. option:: writefua=bool : [sg] + + With writefua option set to 1, write operations include + the force unit access (fua) flag. Default is 0. + +.. option:: sg_write_mode=str : [sg] + + Specify the type of write commands to issue. This option can take three values: + + **write** + This is the default where write opcodes are issued as usual. + **verify** + Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This + directs the device to carry out a medium verification with no data + comparison. The writefua option is ignored with this selection. + **same** + Issue WRITE SAME commands. This transfers a single block to the device + and writes this same block of data to a contiguous sequence of LBAs + beginning at the specified offset. fio's block size parameter specifies + the amount of data written with each command. However, the amount of data + actually transferred to the device is equal to the device's block + (sector) size. For a device with 512 byte sectors, blocksize=8k will + write 16 sectors with each command. fio will still generate 8k of data + for each command but only the first 512 bytes will be used and + transferred to the device. The writefua option is ignored with this + selection. + +.. option:: http_host=str : [http] + + Hostname to connect to. For S3, this could be the bucket hostname. + Default is **localhost** + +.. 
option:: http_user=str : [http] + + Username for HTTP authentication. + +.. option:: http_pass=str : [http] + + Password for HTTP authentication. + +.. option:: https=str : [http] + + Enable HTTPS instead of http. *on* enables HTTPS; *insecure* + will enable HTTPS, but disable SSL peer verification (use with + caution!). Default is **off** + +.. option:: http_mode=str : [http] + + Which HTTP access mode to use: *webdav*, *swift*, or *s3*. + Default is **webdav** + +.. option:: http_s3_region=str : [http] + + The S3 region/zone string. + Default is **us-east-1** + +.. option:: http_s3_key=str : [http] + + The S3 secret key. + +.. option:: http_s3_keyid=str : [http] + + The S3 key/access id. + +.. option:: http_swift_auth_token=str : [http] + + The Swift auth token. See the example configuration file on how + to retrieve this. + +.. option:: http_verbose=int : [http] + + Enable verbose requests from libcurl. Useful for debugging. 1 + turns on verbose logging from libcurl, 2 additionally enables + HTTP IO tracing. Default is **0** + +.. option:: uri=str : [nbd] + + Specify the NBD URI of the server to test. The string + is a standard NBD URI + (see https://github.com/NetworkBlockDevice/nbd/tree/master/doc). + Example URIs: nbd://localhost:10809 + nbd+unix:///?socket=/tmp/socket + nbds://tlshost/exportname + +I/O depth +~~~~~~~~~ + +.. option:: iodepth=int + + Number of I/O units to keep in flight against the file. Note that + increasing *iodepth* beyond 1 will not affect synchronous ioengines (except + for small degrees when :option:`verify_async` is in use). Even async + engines may impose OS restrictions causing the desired depth not to be + achieved. This may happen on Linux when using libaio and not setting + :option:`direct`\=1, since buffered I/O is not async on that OS. Keep an + eye on the I/O depth distribution in the fio output to verify that the + achieved depth is as expected. Default: 1. + +.. 
option:: iodepth_batch_submit=int, iodepth_batch=int + + This defines how many pieces of I/O to submit at once. It defaults to 1 + which means that we submit each I/O as soon as it is available, but can be + raised to submit bigger batches of I/O at the time. If it is set to 0 the + :option:`iodepth` value will be used. + +.. option:: iodepth_batch_complete_min=int, iodepth_batch_complete=int + + This defines how many pieces of I/O to retrieve at once. It defaults to 1 + which means that we'll ask for a minimum of 1 I/O in the retrieval process + from the kernel. The I/O retrieval will go on until we hit the limit set by + :option:`iodepth_low`. If this variable is set to 0, then fio will always + check for completed events before queuing more I/O. This helps reduce I/O + latency, at the cost of more retrieval system calls. + +.. option:: iodepth_batch_complete_max=int + + This defines maximum pieces of I/O to retrieve at once. This variable should + be used along with :option:`iodepth_batch_complete_min`\=int variable, + specifying the range of min and max amount of I/O which should be + retrieved. By default it is equal to the :option:`iodepth_batch_complete_min` + value. + + Example #1:: + + iodepth_batch_complete_min=1 + iodepth_batch_complete_max= + + which means that we will retrieve at least 1 I/O and up to the whole + submitted queue depth. If none of I/O has been completed yet, we will wait. + + Example #2:: + + iodepth_batch_complete_min=0 + iodepth_batch_complete_max= + + which means that we can retrieve up to the whole submitted queue depth, but + if none of I/O has been completed yet, we will NOT wait and immediately exit + the system call. In this example we simply do polling. + +.. option:: iodepth_low=int + + The low water mark indicating when to start filling the queue + again. Defaults to the same as :option:`iodepth`, meaning that fio will + attempt to keep the queue full at all times. If :option:`iodepth` is set to + e.g. 
16 and *iodepth_low* is set to 4, then after fio has filled the queue of + 16 requests, it will let the depth drain down to 4 before starting to fill + it again. + +.. option:: serialize_overlap=bool + + Serialize in-flight I/Os that might otherwise cause or suffer from data races. + When two or more I/Os are submitted simultaneously, there is no guarantee that + the I/Os will be processed or completed in the submitted order. Further, if + two or more of those I/Os are writes, any overlapping region between them can + become indeterminate/undefined on certain storage. These issues can cause + verification to fail erratically when at least one of the racing I/Os is + changing data and the overlapping region has a non-zero size. Setting + ``serialize_overlap`` tells fio to avoid provoking this behavior by explicitly + serializing in-flight I/Os that have a non-zero overlap. Note that setting + this option can reduce both performance and the :option:`iodepth` achieved. + + This option only applies to I/Os issued for a single job except when it is + enabled along with :option:`io_submit_mode`\=offload. In offload mode, fio + will check for overlap among all I/Os submitted by offload jobs with :option:`serialize_overlap` + enabled. + + Default: false. + +.. option:: io_submit_mode=str + + This option controls how fio submits the I/O to the I/O engine. The default + is `inline`, which means that the fio job threads submit and reap I/O + directly. If set to `offload`, the job threads will offload I/O submission + to a dedicated pool of I/O threads. This requires some coordination and thus + has a bit of extra overhead, especially for lower queue depth I/O where it + can increase latencies. The benefit is that fio can manage submission rates + independently of the device completion rates. This avoids skewed latency + reporting if I/O gets backed up on the device side (the coordinated omission + problem). + + +I/O rate +~~~~~~~~ + +.. 
option:: thinktime=time + + Stall the job for the specified period of time after an I/O has completed before issuing the + next. May be used to simulate processing being done by an application. + When the unit is omitted, the value is interpreted in microseconds. See + :option:`thinktime_blocks` and :option:`thinktime_spin`. + +.. option:: thinktime_spin=time + + Only valid if :option:`thinktime` is set - pretend to spend CPU time doing + something with the data received, before falling back to sleeping for the + rest of the period specified by :option:`thinktime`. When the unit is + omitted, the value is interpreted in microseconds. + +.. option:: thinktime_blocks=int + + Only valid if :option:`thinktime` is set - control how many blocks to issue, + before waiting :option:`thinktime` usecs. If not set, defaults to 1 which will make + fio wait :option:`thinktime` usecs after every block. This effectively makes any + queue depth setting redundant, since no more than 1 I/O will be queued + before we have to complete it and do our :option:`thinktime`. In other words, this + setting effectively caps the queue depth if the latter is larger. + +.. option:: rate=int[,int][,int] + + Cap the bandwidth used by this job. The number is in bytes/sec, the normal + suffix rules apply. Comma-separated values may be specified for reads, + writes, and trims as described in :option:`blocksize`. + + For example, using `rate=1m,500k` would limit reads to 1MiB/sec and writes to + 500KiB/sec. Capping only reads or writes can be done with `rate=,500k` or + `rate=500k,` where the former will only limit writes (to 500KiB/sec) and the + latter will only limit reads. + +.. option:: rate_min=int[,int][,int] + + Tell fio to do whatever it can to maintain at least this bandwidth. Failing + to meet this requirement will cause the job to exit. Comma-separated values + may be specified for reads, writes, and trims as described in + :option:`blocksize`. + +.. 
option:: rate_iops=int[,int][,int] + + Cap the bandwidth to this number of IOPS. Basically the same as + :option:`rate`, just specified independently of bandwidth. If the job is + given a block size range instead of a fixed value, the smallest block size + is used as the metric. Comma-separated values may be specified for reads, + writes, and trims as described in :option:`blocksize`. + +.. option:: rate_iops_min=int[,int][,int] + + If fio doesn't meet this rate of I/O, it will cause the job to exit. + Comma-separated values may be specified for reads, writes, and trims as + described in :option:`blocksize`. + +.. option:: rate_process=str + + This option controls how fio manages rated I/O submissions. The default is + `linear`, which submits I/O in a linear fashion with fixed delays between + I/Os that gets adjusted based on I/O completion rates. If this is set to + `poisson`, fio will submit I/O based on a more real world random request + flow, known as the Poisson process + (https://en.wikipedia.org/wiki/Poisson_point_process). The lambda will be + 10^6 / IOPS for the given workload. + +.. option:: rate_ignore_thinktime=bool + + By default, fio will attempt to catch up to the specified rate setting, + if any kind of thinktime setting was used. If this option is set, then + fio will ignore the thinktime and continue doing IO at the specified + rate, instead of entering a catch-up mode after thinktime is done. + + +I/O latency +~~~~~~~~~~~ + +.. option:: latency_target=time + + If set, fio will attempt to find the max performance point that the given + workload will run at while maintaining a latency below this target. When + the unit is omitted, the value is interpreted in microseconds. See + :option:`latency_window` and :option:`latency_percentile`. + +.. option:: latency_window=time + + Used with :option:`latency_target` to specify the sample window that the job + is run at varying queue depths to test the performance. 
When the unit is + omitted, the value is interpreted in microseconds. + +.. option:: latency_percentile=float + + The percentage of I/Os that must fall within the criteria specified by + :option:`latency_target` and :option:`latency_window`. If not set, this + defaults to 100.0, meaning that all I/Os must be equal or below to the value + set by :option:`latency_target`. + +.. option:: max_latency=time + + If set, fio will exit the job with an ETIMEDOUT error if it exceeds this + maximum latency. When the unit is omitted, the value is interpreted in + microseconds. + +.. option:: rate_cycle=int + + Average bandwidth for :option:`rate` and :option:`rate_min` over this number + of milliseconds. Defaults to 1000. + + +I/O replay +~~~~~~~~~~ + +.. option:: write_iolog=str + + Write the issued I/O patterns to the specified file. See + :option:`read_iolog`. Specify a separate file for each job, otherwise the + iologs will be interspersed and the file may be corrupt. + +.. option:: read_iolog=str + + Open an iolog with the specified filename and replay the I/O patterns it + contains. This can be used to store a workload and replay it sometime + later. The iolog given may also be a blktrace binary file, which allows fio + to replay a workload captured by :command:`blktrace`. See + :manpage:`blktrace(8)` for how to capture such logging data. For blktrace + replay, the file needs to be turned into a blkparse binary data file first + (``blkparse -o /dev/null -d file_for_fio.bin``). + You can specify a number of files by separating the names with a ':' + character. See the :option:`filename` option for information on how to + escape ':' and '\' characters within the file names. These files will + be sequentially assigned to job clones created by :option:`numjobs`. + +.. option:: read_iolog_chunked=bool + + Determines how iolog is read. If false(default) entire :option:`read_iolog` + will be read at once. If selected true, input from iolog will be read + gradually. 
Useful when iolog is very large, or it is generated. + +.. option:: merge_blktrace_file=str + + When specified, rather than replaying the logs passed to :option:`read_iolog`, + the logs go through a merge phase which aggregates them into a single + blktrace. The resulting file is then passed on as the :option:`read_iolog` + parameter. The intention here is to make the order of events consistent. + This limits the influence of the scheduler compared to replaying multiple + blktraces via concurrent jobs. + +.. option:: merge_blktrace_scalars=float_list + + This is a percentage based option that is index paired with the list of + files passed to :option:`read_iolog`. When merging is performed, scale + the time of each event by the corresponding amount. For example, + ``--merge_blktrace_scalars="50:100"`` runs the first trace in halftime + and the second trace in realtime. This knob is separately tunable from + :option:`replay_time_scale` which scales the trace during runtime and + does not change the output of the merge unlike this option. + +.. option:: merge_blktrace_iters=float_list + + This is a whole number option that is index paired with the list of files + passed to :option:`read_iolog`. When merging is performed, run each trace + for the specified number of iterations. For example, + ``--merge_blktrace_iters="2:1"`` runs the first trace for two iterations + and the second trace for one iteration. + +.. option:: replay_no_stall=bool + + When replaying I/O with :option:`read_iolog` the default behavior is to + attempt to respect the timestamps within the log and replay them with the + appropriate delay between IOPS. By setting this variable fio will not + respect the timestamps and attempt to replay them as fast as possible while + still respecting ordering. The result is the same I/O pattern to a given + device, but different timings. + +.. 
option:: replay_time_scale=int + + When replaying I/O with :option:`read_iolog`, fio will honor the + original timing in the trace. With this option, it's possible to scale + the time. It's a percentage option, if set to 50 it means run at 50% + the original IO rate in the trace. If set to 200, run at twice the + original IO rate. Defaults to 100. + +.. option:: replay_redirect=str + + While replaying I/O patterns using :option:`read_iolog` the default behavior + is to replay the IOPS onto the major/minor device that each IOP was recorded + from. This is sometimes undesirable because on a different machine those + major/minor numbers can map to a different device. Changing hardware on the + same system can also result in a different major/minor mapping. + ``replay_redirect`` causes all I/Os to be replayed onto the single specified + device regardless of the device it was recorded + from. i.e. :option:`replay_redirect`\= :file:`/dev/sdc` would cause all I/O + in the blktrace or iolog to be replayed onto :file:`/dev/sdc`. This means + multiple devices will be replayed onto a single device, if the trace + contains multiple devices. If you want multiple devices to be replayed + concurrently to multiple redirected devices you must blkparse your trace + into separate traces and replay them with independent fio invocations. + Unfortunately this also breaks the strict time ordering between multiple + device accesses. + +.. option:: replay_align=int + + Force alignment of the byte offsets in a trace to this value. The value + must be a power of 2. + +.. option:: replay_scale=int + + Scale byte offsets down by this factor when replaying traces. Should most + likely use :option:`replay_align` as well. + +.. option:: replay_skip=str + + Sometimes it's useful to skip certain IO types in a replay trace. + This could be, for instance, eliminating the writes in the trace. + Or not replaying the trims/discards, if you are redirecting to + a device that doesn't support them. 
This option takes a comma + separated list of read, write, trim, sync. + + +Threads, processes and job synchronization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. option:: thread + + Fio defaults to creating jobs by using fork, however if this option is + given, fio will create jobs by using POSIX Threads' function + :manpage:`pthread_create(3)` to create threads instead. + +.. option:: wait_for=str + + If set, the current job won't be started until all workers of the specified + waitee job are done. + + ``wait_for`` operates on the job name basis, so there are a few + limitations. First, the waitee must be defined prior to the waiter job + (meaning no forward references). Second, if a job is being referenced as a + waitee, it must have a unique name (no duplicate waitees). + +.. option:: nice=int + + Run the job with the given nice value. See man :manpage:`nice(2)`. + + On Windows, values less than -15 set the process class to "High"; -1 through + -15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle" + priority class. + +.. option:: prio=int + + Set the I/O priority value of this job. Linux limits us to a positive value + between 0 and 7, with 0 being the highest. See man + :manpage:`ionice(1)`. Refer to an appropriate manpage for other operating + systems since meaning of priority may differ. + +.. option:: prioclass=int + + Set the I/O priority class. See man :manpage:`ionice(1)`. + +.. option:: cpus_allowed=str + + Controls the same options as :option:`cpumask`, but accepts a textual + specification of the permitted CPUs instead and CPUs are indexed from 0. So + to use CPUs 0 and 5 you would specify ``cpus_allowed=0,5``. This option also + allows a range of CPUs to be specified -- say you wanted a binding to CPUs + 0, 5, and 8 to 15, you would set ``cpus_allowed=0,5,8-15``. + + On Windows, when ``cpus_allowed`` is unset only CPUs from fio's current + processor group will be used and affinity settings are inherited from the + system. 
An fio build configured to target Windows 7 makes options that set + CPUs processor group aware and values will set both the processor group + and a CPU from within that group. For example, on a system where processor + group 0 has 40 CPUs and processor group 1 has 32 CPUs, ``cpus_allowed`` + values between 0 and 39 will bind CPUs from processor group 0 and + ``cpus_allowed`` values between 40 and 71 will bind CPUs from processor + group 1. When using ``cpus_allowed_policy=shared`` all CPUs specified by a + single ``cpus_allowed`` option must be from the same processor group. For + Windows fio builds not built for Windows 7, CPUs will only be selected from + (and be relative to) whatever processor group fio happens to be running in + and CPUs from other processor groups cannot be used. + +.. option:: cpus_allowed_policy=str + + Set the policy of how fio distributes the CPUs specified by + :option:`cpus_allowed` or :option:`cpumask`. Two policies are supported: + + **shared** + All jobs will share the CPU set specified. + **split** + Each job will get a unique CPU from the CPU set. + + **shared** is the default behavior, if the option isn't specified. If + **split** is specified, then fio will assign one cpu per job. If not + enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs + in the set. + +.. option:: cpumask=int + + Set the CPU affinity of this job. The parameter given is a bit mask of + allowed CPUs the job may run on. So if you want the allowed CPUs to be 1 + and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man + :manpage:`sched_setaffinity(2)`. This may not work on all supported + operating systems or kernel versions. This option doesn't work well for a + higher CPU count than what you can store in an integer mask, so it can only + control cpus 1-32. For boxes with larger CPU counts, use + :option:`cpus_allowed`. + +.. option:: numa_cpu_nodes=str + + Set this job running on specified NUMA nodes' CPUs. 
The arguments allow + comma delimited list of cpu numbers, A-B ranges, or `all`. Note, to enable + NUMA options support, fio must be built on a system with libnuma-dev(el) + installed. + +.. option:: numa_mem_policy=str + + Set this job's memory policy and corresponding NUMA nodes. Format of the + arguments:: + + [:] + + ``mode`` is one of the following memory policies: ``default``, ``prefer``, + ``bind``, ``interleave`` or ``local``. For ``default`` and ``local`` memory + policies, no node needs to be specified. For ``prefer``, only one node is + allowed. For ``bind`` and ``interleave`` the ``nodelist`` may be as + follows: a comma delimited list of numbers, A-B ranges, or `all`. + +.. option:: cgroup=str + + Add job to this control group. If it doesn't exist, it will be created. The + system must have a mounted cgroup blkio mount point for this to work. If + your system doesn't have it mounted, you can do so with:: + + # mount -t cgroup -o blkio none /cgroup + +.. option:: cgroup_weight=int + + Set the weight of the cgroup to this value. See the documentation that comes + with the kernel, allowed values are in the range of 100..1000. + +.. option:: cgroup_nodelete=bool + + Normally fio will delete the cgroups it has created after the job + completion. To override this behavior and to leave cgroups around after the + job completion, set ``cgroup_nodelete=1``. This can be useful if one wants + to inspect various cgroup files after job completion. Default: false. + +.. option:: flow_id=int + + The ID of the flow. If not specified, it defaults to being a global + flow. See :option:`flow`. + +.. option:: flow=int + + Weight in token-based flow control. If this value is used, then there is a + 'flow counter' which is used to regulate the proportion of activity between + two or more jobs. Fio attempts to keep this flow counter near zero. 
The + ``flow`` parameter stands for how much should be added or subtracted to the + flow counter on each iteration of the main I/O loop. That is, if one job has + ``flow=8`` and another job has ``flow=-1``, then there will be a roughly 1:8 + ratio in how much one runs vs the other. + +.. option:: flow_watermark=int + + The maximum value that the absolute value of the flow counter is allowed to + reach before the job must wait for a lower value of the counter. + +.. option:: flow_sleep=int + + The period of time, in microseconds, to wait after the flow watermark has + been exceeded before retrying operations. + +.. option:: stonewall, wait_for_previous + + Wait for preceding jobs in the job file to exit, before starting this + one. Can be used to insert serialization points in the job file. A stone + wall also implies starting a new reporting group, see + :option:`group_reporting`. + +.. option:: exitall + + By default, fio will continue running all other jobs when one job finishes + but sometimes this is not the desired action. Setting ``exitall`` will + instead make fio terminate all other jobs when one job finishes. + +.. option:: exec_prerun=str + + Before running this job, issue the command specified through + :manpage:`system(3)`. Output is redirected in a file called + :file:`jobname.prerun.txt`. + +.. option:: exec_postrun=str + + After the job completes, issue the command specified though + :manpage:`system(3)`. Output is redirected in a file called + :file:`jobname.postrun.txt`. + +.. option:: uid=int + + Instead of running as the invoking user, set the user ID to this value + before the thread/process does any work. + +.. option:: gid=int + + Set group ID, see :option:`uid`. + + +Verification +~~~~~~~~~~~~ + +.. option:: verify_only + + Do not perform specified workload, only verify data still matches previous + invocation of this workload. This option allows one to check data multiple + times at a later date without overwriting it. 
This option makes sense only + for workloads that write data, and does not support workloads with the + :option:`time_based` option set. + +.. option:: do_verify=bool + + Run the verify phase after a write phase. Only valid if :option:`verify` is + set. Default: true. + +.. option:: verify=str + + If writing to a file, fio can verify the file contents after each iteration + of the job. Each verification method also implies verification of special + header, which is written to the beginning of each block. This header also + includes meta information, like offset of the block, block number, timestamp + when block was written, etc. :option:`verify` can be combined with + :option:`verify_pattern` option. The allowed values are: + + **md5** + Use an md5 sum of the data area and store it in the header of + each block. + + **crc64** + Use an experimental crc64 sum of the data area and store it in the + header of each block. + + **crc32c** + Use a crc32c sum of the data area and store it in the header of + each block. This will automatically use hardware acceleration + (e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will + fall back to software crc32c if none is found. Generally the + fastest checksum fio supports when hardware accelerated. + + **crc32c-intel** + Synonym for crc32c. + + **crc32** + Use a crc32 sum of the data area and store it in the header of each + block. + + **crc16** + Use a crc16 sum of the data area and store it in the header of each + block. + + **crc7** + Use a crc7 sum of the data area and store it in the header of each + block. + + **xxhash** + Use xxhash as the checksum function. Generally the fastest software + checksum that fio supports. + + **sha512** + Use sha512 as the checksum function. + + **sha256** + Use sha256 as the checksum function. + + **sha1** + Use optimized sha1 as the checksum function. + + **sha3-224** + Use optimized sha3-224 as the checksum function. 
+ + **sha3-256** + Use optimized sha3-256 as the checksum function. + + **sha3-384** + Use optimized sha3-384 as the checksum function. + + **sha3-512** + Use optimized sha3-512 as the checksum function. + + **meta** + This option is deprecated, since now meta information is included in + generic verification header and meta verification happens by + default. For detailed information see the description of the + :option:`verify` setting. This option is kept for + compatibility's sake with old configurations. Do not use it. + + **pattern** + Verify a strict pattern. Normally fio includes a header with some + basic information and checksumming, but if this option is set, only + the specific pattern set with :option:`verify_pattern` is verified. + + **null** + Only pretend to verify. Useful for testing internals with + :option:`ioengine`\=null, not for much else. + + This option can be used for repeated burn-in tests of a system to make sure + that the written data is also correctly read back. If the data direction + given is a read or random read, fio will assume that it should verify a + previously written file. If the data direction includes any form of write, + the verify will be of the newly written data. + + To avoid false verification errors, do not use the norandommap option when + verifying data with async I/O engines and I/O depths > 1. Or use the + norandommap and the lfsr random generator together to avoid writing to the + same offset with multiple outstanding I/Os. + +.. option:: verify_offset=int + + Swap the verification header with data somewhere else in the block before + writing. It is swapped back before verifying. + +.. option:: verify_interval=int + + Write the verification header at a finer granularity than the + :option:`blocksize`. It will be written for chunks the size of + ``verify_interval``. :option:`blocksize` should divide this evenly. + +.. option:: verify_pattern=str + + If set, fio will fill the I/O buffers with this pattern. 
Fio defaults to + filling with totally random bytes, but sometimes it's interesting to fill + with a known pattern for I/O verification purposes. Depending on the width + of the pattern, fio will fill 1/2/3/4 bytes of the buffer at the time (it can + be either a decimal or a hex number). The ``verify_pattern`` if larger than + a 32-bit quantity has to be a hex number that starts with either "0x" or + "0X". Use with :option:`verify`. Also, ``verify_pattern`` supports %o + format, which means that for each block offset will be written and then + verified back, e.g.:: + + verify_pattern=%o + + Or use combination of everything:: + + verify_pattern=0xff%o"abcd"-12 + +.. option:: verify_fatal=bool + + Normally fio will keep checking the entire contents before quitting on a + block verification failure. If this option is set, fio will exit the job on + the first observed failure. Default: false. + +.. option:: verify_dump=bool + + If set, dump the contents of both the original data block and the data block + we read off disk to files. This allows later analysis to inspect just what + kind of data corruption occurred. Off by default. + +.. option:: verify_async=int + + Fio will normally verify I/O inline from the submitting thread. This option + takes an integer describing how many async offload threads to create for I/O + verification instead, causing fio to offload the duty of verifying I/O + contents to one or more separate threads. If using this offload option, even + sync I/O engines can benefit from using an :option:`iodepth` setting higher + than 1, as it allows them to have I/O in flight while verifies are running. + Defaults to 0 async threads, i.e. verification is not asynchronous. + +.. option:: verify_async_cpus=str + + Tell fio to set the given CPU affinity on the async I/O verification + threads. See :option:`cpus_allowed` for the format used. + +.. 
option:: verify_backlog=int + + Fio will normally verify the written contents of a job that utilizes verify + once that job has completed. In other words, everything is written then + everything is read back and verified. You may want to verify continually + instead for a variety of reasons. Fio stores the meta data associated with + an I/O block in memory, so for large verify workloads, quite a bit of memory + would be used up holding this meta data. If this option is enabled, fio will + write only N blocks before verifying these blocks. + +.. option:: verify_backlog_batch=int + + Control how many blocks fio will verify if :option:`verify_backlog` is + set. If not set, will default to the value of :option:`verify_backlog` + (meaning the entire queue is read back and verified). If + ``verify_backlog_batch`` is less than :option:`verify_backlog` then not all + blocks will be verified, if ``verify_backlog_batch`` is larger than + :option:`verify_backlog`, some blocks will be verified more than once. + +.. option:: verify_state_save=bool + + When a job exits during the write phase of a verify workload, save its + current state. This allows fio to replay up until that point, if the verify + state is loaded for the verify read phase. The format of the filename is, + roughly:: + + <type>-<jobname>-<jobindex>-verify.state. + + <type> is "local" for a local run, "sock" for a client/server socket + connection, and "ip" (192.168.0.1, for instance) for a networked + client/server connection. Defaults to true. + +.. option:: verify_state_load=bool + + If a verify termination trigger was used, fio stores the current write state + of each thread. This can be used at verification time so that fio knows how + far it should verify. Without this information, fio will run a full + verification pass, according to the settings in the job file used. Default + false. + +.. option:: trim_percentage=int + + Number of verify blocks to discard/trim. + +.. 
option:: trim_verify_zero=bool + + Verify that trim/discarded blocks are returned as zeros. + +.. option:: trim_backlog=int + + Trim after this number of blocks are written. + +.. option:: trim_backlog_batch=int + + Trim this number of I/O blocks. + +.. option:: experimental_verify=bool + + Enable experimental verification. + +Steady state +~~~~~~~~~~~~ + +.. option:: steadystate=str:float, ss=str:float + + Define the criterion and limit for assessing steady state performance. The + first parameter designates the criterion whereas the second parameter sets + the threshold. When the criterion falls below the threshold for the + specified duration, the job will stop. For example, `iops_slope:0.1%` will + direct fio to terminate the job when the least squares regression slope + falls below 0.1% of the mean IOPS. If :option:`group_reporting` is enabled + this will apply to all jobs in the group. Below is the list of available + steady state assessment criteria. All assessments are carried out using only + data from the rolling collection window. Threshold limits can be expressed + as a fixed value or as a percentage of the mean in the collection window. + + When using this feature, most jobs should include the :option:`time_based` + and :option:`runtime` options or the :option:`loops` option so that fio does not + stop running after it has covered the full size of the specified file(s) or device(s). + + **iops** + Collect IOPS data. Stop the job if all individual IOPS measurements + are within the specified limit of the mean IOPS (e.g., ``iops:2`` + means that all individual IOPS values must be within 2 of the mean, + whereas ``iops:0.2%`` means that all individual IOPS values must be + within 0.2% of the mean IOPS to terminate the job). + + **iops_slope** + Collect IOPS data and calculate the least squares regression + slope. Stop the job if the slope falls below the specified limit. + + **bw** + Collect bandwidth data. 
Stop the job if all individual bandwidth + measurements are within the specified limit of the mean bandwidth. + + **bw_slope** + Collect bandwidth data and calculate the least squares regression + slope. Stop the job if the slope falls below the specified limit. + +.. option:: steadystate_duration=time, ss_dur=time + + A rolling window of this duration will be used to judge whether steady state + has been reached. Data will be collected once per second. The default is 0 + which disables steady state detection. When the unit is omitted, the + value is interpreted in seconds. + +.. option:: steadystate_ramp_time=time, ss_ramp=time + + Allow the job to run for the specified duration before beginning data + collection for checking the steady state job termination criterion. The + default is 0. When the unit is omitted, the value is interpreted in seconds. + + +Measurements and reporting +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. option:: per_job_logs=bool + + If set, this generates bw/clat/iops log with per file private filenames. If + not set, jobs with identical names will share the log filename. Default: + true. + +.. option:: group_reporting + + It may sometimes be interesting to display statistics for groups of jobs as + a whole instead of for each individual job. This is especially true if + :option:`numjobs` is used; looking at individual thread/process output + quickly becomes unwieldy. To see the final report per-group instead of + per-job, use :option:`group_reporting`. Jobs in a file will be part of the + same reporting group, unless if separated by a :option:`stonewall`, or by + using :option:`new_group`. + +.. option:: new_group + + Start a new reporting group. See: :option:`group_reporting`. If not given, + all jobs in a file will be part of the same reporting group, unless + separated by a :option:`stonewall`. + +.. option:: stats=bool + + By default, fio collects and shows final output results for all jobs + that run. 
If this option is set to 0, then fio will ignore it in + the final stat output. + +.. option:: write_bw_log=str + + If given, write a bandwidth log for this job. Can be used to store data of + the bandwidth of the jobs in their lifetime. + + If no str argument is given, the default filename of + :file:`jobname_type.x.log` is used. Even when the argument is given, fio + will still append the type of log. So if one specifies:: + + write_bw_log=foo + + The actual log name will be :file:`foo_bw.x.log` where `x` is the index + of the job (`1..N`, where `N` is the number of jobs). If + :option:`per_job_logs` is false, then the filename will not include the + `.x` job index. + + The included :command:`fio_generate_plots` script uses :command:`gnuplot` to turn these + text files into nice graphs. See `Log File Formats`_ for how data is + structured within the file. + +.. option:: write_lat_log=str + + Same as :option:`write_bw_log`, except this option creates I/O + submission (e.g., :file:`name_slat.x.log`), completion (e.g., + :file:`name_clat.x.log`), and total (e.g., :file:`name_lat.x.log`) + latency files instead. See :option:`write_bw_log` for details about + the filename format and `Log File Formats`_ for how data is structured + within the files. + +.. option:: write_hist_log=str + + Same as :option:`write_bw_log` but writes an I/O completion latency + histogram file (e.g., :file:`name_hist.x.log`) instead. Note that this + file will be empty unless :option:`log_hist_msec` has also been set. + See :option:`write_bw_log` for details about the filename format and + `Log File Formats`_ for how data is structured within the file. + +.. option:: write_iops_log=str + + Same as :option:`write_bw_log`, but writes an IOPS file (e.g. + :file:`name_iops.x.log`) instead. Because fio defaults to individual + I/O logging, the value entry in the IOPS log will be 1 unless windowed + logging (see :option:`log_avg_msec`) has been enabled. 
See + :option:`write_bw_log` for details about the filename format and `Log + File Formats`_ for how data is structured within the file. + +.. option:: log_avg_msec=int + + By default, fio will log an entry in the iops, latency, or bw log for every + I/O that completes. When writing to the disk log, that can quickly grow to a + very large size. Setting this option makes fio average the each log entry + over the specified period of time, reducing the resolution of the log. See + :option:`log_max_value` as well. Defaults to 0, logging all entries. + Also see `Log File Formats`_. + +.. option:: log_hist_msec=int + + Same as :option:`log_avg_msec`, but logs entries for completion latency + histograms. Computing latency percentiles from averages of intervals using + :option:`log_avg_msec` is inaccurate. Setting this option makes fio log + histogram entries over the specified period of time, reducing log sizes for + high IOPS devices while retaining percentile accuracy. See + :option:`log_hist_coarseness` and :option:`write_hist_log` as well. + Defaults to 0, meaning histogram logging is disabled. + +.. option:: log_hist_coarseness=int + + Integer ranging from 0 to 6, defining the coarseness of the resolution of + the histogram logs enabled with :option:`log_hist_msec`. For each increment + in coarseness, fio outputs half as many bins. Defaults to 0, for which + histogram logs contain 1216 latency bins. See :option:`write_hist_log` + and `Log File Formats`_. + +.. option:: log_max_value=bool + + If :option:`log_avg_msec` is set, fio logs the average over that window. If + you instead want to log the maximum value, set this option to 1. Defaults to + 0, meaning that averaged values are logged. + +.. option:: log_offset=bool + + If this is set, the iolog options will include the byte offset for the I/O + entry as well as the other data values. Defaults to 0 meaning that + offsets are not present in logs. Also see `Log File Formats`_. + +.. 
option:: log_compression=int + + If this is set, fio will compress the I/O logs as it goes, to keep the + memory footprint lower. When a log reaches the specified size, that chunk is + removed and compressed in the background. Given that I/O logs are fairly + highly compressible, this yields a nice memory savings for longer runs. The + downside is that the compression will consume some background CPU cycles, so + it may impact the run. This, however, is also true if the logging ends up + consuming most of the system memory. So pick your poison. The I/O logs are + saved normally at the end of a run, by decompressing the chunks and storing + them in the specified log file. This feature depends on the availability of + zlib. + +.. option:: log_compression_cpus=str + + Define the set of CPUs that are allowed to handle online log compression for + the I/O jobs. This can provide better isolation between performance + sensitive jobs, and background compression work. See + :option:`cpus_allowed` for the format used. + +.. option:: log_store_compressed=bool + + If set, fio will store the log files in a compressed format. They can be + decompressed with fio, using the :option:`--inflate-log` command line + parameter. The files will be stored with a :file:`.fz` suffix. + +.. option:: log_unix_epoch=bool + + If set, fio will log Unix timestamps to the log files produced by enabling + write_type_log for each log type, instead of the default zero-based + timestamps. + +.. option:: block_error_percentiles=bool + + If set, record errors in trim block-sized units from writes and trims and + output a histogram of how many trims it took to get to errors, and what kind + of error was encountered. + +.. option:: bwavgtime=int + + Average the calculated bandwidth over the given time. Value is specified in + milliseconds. If the job also does bandwidth logging through + :option:`write_bw_log`, then the minimum of this option and + :option:`log_avg_msec` will be used. Default: 500ms. 
+ +.. option:: iopsavgtime=int + + Average the calculated IOPS over the given time. Value is specified in + milliseconds. If the job also does IOPS logging through + :option:`write_iops_log`, then the minimum of this option and + :option:`log_avg_msec` will be used. Default: 500ms. + +.. option:: disk_util=bool + + Generate disk utilization statistics, if the platform supports it. + Default: true. + +.. option:: disable_lat=bool + + Disable measurements of total latency numbers. Useful only for cutting back + the number of calls to :manpage:`gettimeofday(2)`, as that does impact + performance at really high IOPS rates. Note that to really get rid of a + large amount of these calls, this option must be used with + :option:`disable_slat` and :option:`disable_bw_measurement` as well. + +.. option:: disable_clat=bool + + Disable measurements of completion latency numbers. See + :option:`disable_lat`. + +.. option:: disable_slat=bool + + Disable measurements of submission latency numbers. See + :option:`disable_lat`. + +.. option:: disable_bw_measurement=bool, disable_bw=bool + + Disable measurements of throughput/bandwidth numbers. See + :option:`disable_lat`. + +.. option:: clat_percentiles=bool + + Enable the reporting of percentiles of completion latencies. This + option is mutually exclusive with :option:`lat_percentiles`. + +.. option:: lat_percentiles=bool + + Enable the reporting of percentiles of I/O latencies. This is similar + to :option:`clat_percentiles`, except that this includes the + submission latency. This option is mutually exclusive with + :option:`clat_percentiles`. + +.. option:: percentile_list=float_list + + Overwrite the default list of percentiles for completion latencies and + the block error histogram. Each number is a floating number in the + range (0,100], and the maximum length of the list is 20. Use ``:`` to + separate the numbers, and list the numbers in ascending order. 
For + example, ``--percentile_list=99.5:99.9`` will cause fio to report the + values of completion latency below which 99.5% and 99.9% of the observed + latencies fell, respectively. + +.. option:: significant_figures=int + + If using :option:`--output-format` of `normal`, set the significant + figures to this value. Higher values will yield more precise IOPS and + throughput units, while lower values will round. Requires a minimum + value of 1 and a maximum value of 10. Defaults to 4. + + +Error handling +~~~~~~~~~~~~~~ + +.. option:: exitall_on_error + + When one job finishes in error, terminate the rest. The default is to wait + for each job to finish. + +.. option:: continue_on_error=str + + Normally fio will exit the job on the first observed failure. If this option + is set, fio will continue the job when there is a 'non-fatal error' (EIO or + EILSEQ) until the runtime is exceeded or the I/O size specified is + completed. If this option is used, there are two more stats that are + appended, the total error count and the first error. The error field given + in the stats is the first error that was hit during the run. + + The allowed values are: + + **none** + Exit on any I/O or verify errors. + + **read** + Continue on read errors, exit on all others. + + **write** + Continue on write errors, exit on all others. + + **io** + Continue on any I/O error, exit on all others. + + **verify** + Continue on verify errors, exit on all others. + + **all** + Continue on all errors. + + **0** + Backward-compatible alias for 'none'. + + **1** + Backward-compatible alias for 'all'. + +.. option:: ignore_error=str + + Sometimes you want to ignore some errors during test in that case you can + specify error list for each error type, instead of only being able to + ignore the default 'non-fatal error' using :option:`continue_on_error`. + ``ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST`` errors for + given error type is separated with ':'. 
Error may be symbol ('ENOSPC', + 'ENOMEM') or integer. Example:: + + ignore_error=EAGAIN,ENOSPC:122 + + This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from + WRITE. This option works by overriding :option:`continue_on_error` with + the list of errors for each error type if any. + +.. option:: error_dump=bool + + If set dump every error even if it is non fatal, true by default. If + disabled only fatal error will be dumped. + +Running predefined workloads +---------------------------- + +Fio includes predefined profiles that mimic the I/O workloads generated by +other tools. + +.. option:: profile=str + + The predefined workload to run. Current profiles are: + + **tiobench** + Threaded I/O bench (tiotest/tiobench) like workload. + + **act** + Aerospike Certification Tool (ACT) like workload. + +To view a profile's additional options use :option:`--cmdhelp` after specifying +the profile. For example:: + + $ fio --profile=act --cmdhelp + +Act profile options +~~~~~~~~~~~~~~~~~~~ + +.. option:: device-names=str + :noindex: + + Devices to use. + +.. option:: load=int + :noindex: + + ACT load multiplier. Default: 1. + +.. option:: test-duration=time + :noindex: + + How long the entire test takes to run. When the unit is omitted, the value + is given in seconds. Default: 24h. + +.. option:: threads-per-queue=int + :noindex: + + Number of read I/O threads per device. Default: 8. + +.. option:: read-req-num-512-blocks=int + :noindex: + + Number of 512B blocks to read at the time. Default: 3. + +.. option:: large-block-op-kbytes=int + :noindex: + + Size of large block ops in KiB (writes). Default: 131072. + +.. option:: prep + :noindex: + + Set to run ACT prep phase. + +Tiobench profile options +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. option:: size=str + :noindex: + + Size in MiB. + +.. option:: block=int + :noindex: + + Block size in bytes. Default: 4096. + +.. option:: numruns=int + :noindex: + + Number of runs. + +.. 
option:: dir=str + :noindex: + + Test directory. + +.. option:: threads=int + :noindex: + + Number of threads. + +Interpreting the output +----------------------- + +.. + Example output was based on the following: + TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --time_based \ + --rate=1256k --bs=14K --name=quick --runtime=1s --name=mixed \ + --runtime=2m --rw=rw + +Fio spits out a lot of output. While running, fio will display the status of the +jobs created. An example of that would be:: + + Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s] + +The characters inside the first set of square brackets denote the current status of +each thread. The first character is the first job defined in the job file, and so +forth. The possible values (in typical life cycle order) are: + ++------+-----+-----------------------------------------------------------+ +| Idle | Run | | ++======+=====+===========================================================+ +| P | | Thread setup, but not started. | ++------+-----+-----------------------------------------------------------+ +| C | | Thread created. | ++------+-----+-----------------------------------------------------------+ +| I | | Thread initialized, waiting or generating necessary data. | ++------+-----+-----------------------------------------------------------+ +| | p | Thread running pre-reading file(s). | ++------+-----+-----------------------------------------------------------+ +| | / | Thread is in ramp period. | ++------+-----+-----------------------------------------------------------+ +| | R | Running, doing sequential reads. | ++------+-----+-----------------------------------------------------------+ +| | r | Running, doing random reads. | ++------+-----+-----------------------------------------------------------+ +| | W | Running, doing sequential writes. | ++------+-----+-----------------------------------------------------------+ +| | w | Running, doing random writes. 
| ++------+-----+-----------------------------------------------------------+ +| | M | Running, doing mixed sequential reads/writes. | ++------+-----+-----------------------------------------------------------+ +| | m | Running, doing mixed random reads/writes. | ++------+-----+-----------------------------------------------------------+ +| | D | Running, doing sequential trims. | ++------+-----+-----------------------------------------------------------+ +| | d | Running, doing random trims. | ++------+-----+-----------------------------------------------------------+ +| | F | Running, currently waiting for :manpage:`fsync(2)`. | ++------+-----+-----------------------------------------------------------+ +| | V | Running, doing verification of written data. | ++------+-----+-----------------------------------------------------------+ +| f | | Thread finishing. | ++------+-----+-----------------------------------------------------------+ +| E | | Thread exited, not reaped by main thread yet. | ++------+-----+-----------------------------------------------------------+ +| _ | | Thread reaped. | ++------+-----+-----------------------------------------------------------+ +| X | | Thread reaped, exited with an error. | ++------+-----+-----------------------------------------------------------+ +| K | | Thread reaped, exited due to signal. | ++------+-----+-----------------------------------------------------------+ + +.. + Example output was based on the following: + TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --runtime=58m \ + --time_based --rate=2512k --bs=256K --numjobs=10 \ + --name=readers --rw=read --name=writers --rw=write + +Fio will condense the thread string as not to take up more space on the command +line than needed. 
For instance, if you have 10 readers and 10 writers running, +the output would look like this:: + + Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s] + +Note that the status string is displayed in order, so it's possible to tell which of +the jobs are currently doing what. In the example above this means that jobs 1--10 +are readers and 11--20 are writers. + +The other values are fairly self explanatory -- number of threads currently +running and doing I/O, the number of currently open files (f=), the estimated +completion percentage, the rate of I/O since last check (read speed listed first, +then write speed and optionally trim speed) in terms of bandwidth and IOPS, +and time to completion for the current running group. It's impossible to estimate +runtime of the following groups (if any). + +.. + Example output was based on the following: + TZ=UTC fio --iodepth=16 --ioengine=posixaio --filename=/tmp/fiofile \ + --direct=1 --size=100M --time_based --runtime=50s --rate_iops=89 \ + --bs=7K --name=Client1 --rw=write + +When fio is done (or interrupted by :kbd:`Ctrl-C`), it will show the data for +each thread, group of threads, and disks in that order. 
For each overall thread (or +group) the output looks like:: + + Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017 + write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec) + slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50 + clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31 + lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79 + clat percentiles (usec): + | 1.00th=[ 302], 5.00th=[ 326], 10.00th=[ 343], 20.00th=[ 363], + | 30.00th=[ 392], 40.00th=[ 404], 50.00th=[ 416], 60.00th=[ 445], + | 70.00th=[ 816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627], + | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877], + | 99.99th=[78119] + bw ( KiB/s): min= 532, max= 686, per=0.10%, avg=622.87, stdev=24.82, samples= 100 + iops : min= 76, max= 98, avg=88.98, stdev= 3.54, samples= 100 + lat (usec) : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79% + lat (msec) : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37% + lat (msec) : 100=0.65% + cpu : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21 + IO depths : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0% + submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% + issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0 + latency : target=0, window=0, percentile=100.00%, depth=8 + +The job name (or first job's name when using :option:`group_reporting`) is printed, +along with the group id, count of jobs being aggregated, last error id seen (which +is 0 when there are no errors), pid/tid of that thread and the time the job/group +completed. Below are the I/O statistics for each data direction performed (showing +writes in the example above). In the order listed, they denote: + +**read/write/trim** + The string before the colon shows the I/O direction the statistics + are for. **IOPS** is the average I/Os performed per second. 
**BW** + is the average bandwidth rate shown as: value in power of 2 format + (value in power of 10 format). The last two values show: (**total + I/O performed** in power of 2 format / **runtime** of that thread). + +**slat** + Submission latency (**min** being the minimum, **max** being the + maximum, **avg** being the average, **stdev** being the standard + deviation). This is the time it took to submit the I/O. For + sync I/O this row is not displayed as the slat is really the + completion latency (since queue/complete is one operation there). + This value can be in nanoseconds, microseconds or milliseconds --- + fio will choose the most appropriate base and print that (in the + example above nanoseconds was the best scale). Note: in :option:`--minimal` mode + latencies are always expressed in microseconds. + +**clat** + Completion latency. Same names as slat, this denotes the time from + submission to completion of the I/O pieces. For sync I/O, clat will + usually be equal (or very close) to 0, as the time from submit to + complete is basically just CPU time (I/O has already been done, see slat + explanation). + +**lat** + Total latency. Same names as slat and clat, this denotes the time from + when fio created the I/O unit to completion of the I/O operation. + +**bw** + Bandwidth statistics based on samples. Same names as the xlat stats, + but also includes the number of samples taken (**samples**) and an + approximate percentage of total aggregate bandwidth this thread + received in its group (**per**). This last value is only really + useful if the threads in this group are on the same disk, since they + are then competing for disk access. + +**iops** + IOPS statistics based on samples. Same names as bw. + +**lat (nsec/usec/msec)** + The distribution of I/O completion latencies. This is the time from when + I/O leaves fio and when it gets completed. 
Unlike the separate + read/write/trim sections above, the data here and in the remaining + sections apply to all I/Os for the reporting group. 250=0.04% means that + 0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11% + of the I/Os required 250 to 499us for completion. + +**cpu** + CPU usage. User and system time, along with the number of context + switches this thread went through, usage of system and user time, and + finally the number of major and minor page faults. The CPU utilization + numbers are averages for the jobs in that reporting group, while the + context and fault counters are summed. + +**IO depths** + The distribution of I/O depths over the job lifetime. The numbers are + divided into powers of 2 and each entry covers depths from that value + up to those that are lower than the next entry -- e.g., 16= covers + depths from 16 to 31. Note that the range covered by a depth + distribution entry can be different to the range covered by the + equivalent submit/complete distribution entry. + +**IO submit** + How many pieces of I/O were submitted in a single submit call. Each + entry denotes that amount and below, until the previous entry -- e.g., + 16=100% means that we submitted anywhere between 9 to 16 I/Os per submit + call. Note that the range covered by a submit distribution entry can + be different to the range covered by the equivalent depth distribution + entry. + +**IO complete** + Like the above submit number, but for completions instead. + +**IO issued rwt** + The number of read/write/trim requests issued, and how many of them were + short or dropped. + +**IO latency** + These values are for :option:`latency_target` and related options. When + these options are engaged, this section describes the I/O depth required + to meet the specified latency target. + +.. 
+ Example output was based on the following: + TZ=UTC fio --ioengine=null --iodepth=2 --size=100M --numjobs=2 \ + --rate_process=poisson --io_limit=32M --name=read --bs=128k \ + --rate=11M --name=write --rw=write --bs=2k --rate=700k + +After each client has been listed, the group statistics are printed. They +will look like this:: + + Run status group 0 (all jobs): + READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s-10.8MiB/s (10.9MB/s-11.3MB/s), io=64.0MiB (67.1MB), run=2973-3069msec + WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s-621KiB/s (630kB/s-636kB/s), io=64.0MiB (67.1MB), run=52747-53223msec + +For each data direction it prints: + +**bw** + Aggregate bandwidth of threads in this group followed by the + minimum and maximum bandwidth of all the threads in this group. + Values outside of brackets are power-of-2 format and those + within are the equivalent value in a power-of-10 format. +**io** + Aggregate I/O performed of all threads in this group. The + format is the same as bw. +**run** + The smallest and longest runtimes of the threads in this group. + +And finally, the disk statistics are printed. This is Linux specific. They will look like this:: + + Disk stats (read/write): + sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% + +Each value is printed for both reads and writes, with reads first. The +numbers denote: + +**ios** + Number of I/Os performed by all groups. +**merge** + Number of merges performed by the I/O scheduler. +**ticks** + Number of ticks we kept the disk busy. +**in_queue** + Total time spent in the disk queue. +**util** + The disk utilization. A value of 100% means we kept the disk + busy constantly, 50% would be a disk idling half of the time. + +It is also possible to get fio to dump the current output while it is running, +without terminating the job. To do that, send fio the **USR1** signal. 
You can +also get regularly timed dumps by using the :option:`--status-interval` +parameter, or by creating a file in :file:`/tmp` named +:file:`fio-dump-status`. If fio sees this file, it will unlink it and dump the +current output status. + + +Terse output +------------ + +For scripted usage where you typically want to generate tables or graphs of the +results, fio can output the results in a semicolon separated format. The format +is one long line of values, such as:: + + 2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00% + A description of this job goes here. + +The job description (if provided) follows on a second line for terse v2. +It appears on the same line for other terse versions. + +To enable terse output, use the :option:`--minimal` or +:option:`--output-format`\=terse command line options. The +first value is the version of the terse output format. If the output has to be +changed for some reason, this number will be incremented by 1 to signify that +change. 
+ +Split up, the format is as follows (comments in brackets denote when a +field was introduced or whether it's specific to some terse version): + + :: + + terse version, fio version [v3], jobname, groupid, error + + READ status:: + + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples + + WRITE status: + + :: + + Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec) + Submission latency: min, max, mean, stdev (usec) + Completion latency: min, max, mean, stdev (usec) + Completion latency percentiles: 20 fields (see below) + Total latency: min, max, mean, stdev (usec) + Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5] + IOPS [v5]: min, max, mean, stdev, number of samples + + TRIM status [all but version 3]: + + Fields are similar to READ/WRITE status. + + CPU usage:: + + user, system, context switches, major faults, minor faults + + I/O depths:: + + <=1, 2, 4, 8, 16, 32, >=64 + + I/O latencies microseconds:: + + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000 + + I/O latencies milliseconds:: + + <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000 + + Disk utilization [v3]:: + + disk name, read ios, write ios, read merges, write merges, read ticks, write ticks, + time spent in queue, disk utilization percentage + + Additional Info (dependent on continue_on_error, default off):: + + total # errors, first error code + + Additional Info (dependent on description being set):: + + Text description + +Completion latency percentiles can be a grouping of up to 20 sets, so for the +terse output fio writes all of them. 
Each field will look like this:: + + 1.00%=6112 + +which is the Xth percentile, and the `usec` latency associated with it. + +For `Disk utilization`, all disks used by fio are shown. So for each disk there +will be a disk utilization section. + +Below is a single line containing short names for each of the fields in the +minimal output v3, separated by semicolons:: + + terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;dis
k_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util + +In client/server mode terse output differs from what appears when jobs are run +locally. Disk utilization data is omitted from the standard terse output and +for v3 and later appears on its own separate line at the end of each terse +reporting cycle. + + +JSON output +------------ + +The `json` output format is intended to be both human readable and convenient +for automated parsing. For the most part its sections mirror those of the +`normal` output. The `runtime` value is reported in msec and the `bw` value is +reported in 1024 bytes per second units. + + +JSON+ output +------------ + +The `json+` output format is identical to the `json` output format except that it +adds a full dump of the completion latency bins. Each `bins` object contains a +set of (key, value) pairs where keys are latency durations and values count how +many I/Os had completion latencies of the corresponding duration. For example, +consider: + + "bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... } + +This data indicates that one I/O required 87,552ns to complete, two I/Os required +100,864ns to complete, and 7529 I/Os required 107,008ns to complete. + +Also included with fio is a Python script `fio_jsonplus_clat2csv` that takes +json+ output and generates CSV-formatted latency data suitable for plotting. + +The latency durations actually represent the midpoints of latency intervals. +For details refer to :file:`stat.h`. + + +Trace file format +----------------- + +There are two trace file format that you can encounter. The older (v1) format is +unsupported since version 1.20-rc3 (March 2008). It will still be described +below in case that you get an old trace and want to understand it. 
 + +In any case the trace is a simple text file with a single action per line. + + +Trace file format v1 +~~~~~~~~~~~~~~~~~~~~ + +Each line represents a single I/O action in the following format:: + + rw, offset, length + +where `rw=0/1` for read/write, and the `offset` and `length` entries being in bytes. + +This format is not supported in fio versions >= 1.20-rc3. + + +Trace file format v2 +~~~~~~~~~~~~~~~~~~~~ + +The second version of the trace file format was added in fio version 1.17. It +allows access to more than one file per trace and has a bigger set of possible +file actions. + +The first line of the trace file has to be:: + + fio version 2 iolog + +Following this can be lines in two different formats, which are described below. + +The file management format:: + + filename action + +The `filename` is given as an absolute path. The `action` can be one of these: + +**add** + Add the given `filename` to the trace. +**open** + Open the file with the given `filename`. The `filename` has to have + been added with the **add** action before. +**close** + Close the file with the given `filename`. The file has to have been + opened before. + + +The file I/O action format:: + + filename action offset length + +The `filename` is given as an absolute path, and has to have been added and +opened before it can be used with this format. The `offset` and `length` are +given in bytes. The `action` can be one of these: + +**wait** + Wait for `offset` microseconds. Everything below 100 is discarded. + The time is relative to the previous `wait` statement. +**read** + Read `length` bytes beginning from `offset`. +**write** + Write `length` bytes beginning from `offset`. +**sync** + :manpage:`fsync(2)` the file. +**datasync** + :manpage:`fdatasync(2)` the file. +**trim** + Trim the given file from the given `offset` for `length` bytes. + + +I/O Replay - Merging Traces +--------------------------- + +Colocation is a common practice used to get the most out of a machine. 
+Knowing which workloads play nicely with each other and which ones don't is +a much harder task. While fio can replay workloads concurrently via multiple +jobs, it leaves some variability up to the scheduler making results harder to +reproduce. Merging is a way to make the order of events consistent. + +Merging is integrated into I/O replay and done when a +:option:`merge_blktrace_file` is specified. The list of files passed to +:option:`read_iolog` go through the merge process and output a single file +stored to the specified file. The output file is passed on as if it were the +only file passed to :option:`read_iolog`. An example would look like:: + + $ fio --read_iolog="<file1>:<file2>" --merge_blktrace_file="<output_file>" + +Creating only the merged file can be done by passing the command line argument +:option:`merge-blktrace-only`. + +Scaling traces can be done to see the relative impact of any particular trace +being slowed down or sped up. :option:`merge_blktrace_scalars` takes in a colon +separated list of percentage scalars. It is index paired with the files passed +to :option:`read_iolog`. + +With scaling, it may be desirable to match the running time of all traces. +This can be done with :option:`merge_blktrace_iters`. It is index paired with +:option:`read_iolog` just like :option:`merge_blktrace_scalars`. + +In an example, given two traces, A and B, each 60s long. If we want to see +the impact of trace A issuing IOs twice as fast and repeat trace A over the +runtime of trace B, the following can be done:: + + $ fio --read_iolog="<trace_a>:<trace_b>" --merge_blktrace_file="<output_file>" --merge_blktrace_scalars="50:100" --merge_blktrace_iters="2:1" + +This runs trace A at 2x the speed twice for approximately the same runtime as +a single run of trace B. + + +CPU idleness profiling +---------------------- + +In some cases, we want to understand CPU overhead in a test. For example, we +test patches for the specific goodness of whether they reduce CPU usage. 
+Fio implements a balloon approach to create a thread per CPU that runs at idle +priority, meaning that it only runs when nobody else needs the cpu. +By measuring the amount of work completed by the thread, idleness of each CPU +can be derived accordingly. + +A unit of work is defined as touching a full page of unsigned characters. Mean and +standard deviation of time to complete a unit of work is reported in "unit work" +section. Options can be chosen to report detailed percpu idleness or overall +system idleness by aggregating percpu stats. + + +Verification and triggers +------------------------- + +Fio is usually run in one of two ways, when data verification is done. The first +is a normal write job of some sort with verify enabled. When the write phase has +completed, fio switches to reads and verifies everything it wrote. The second +model is running just the write phase, and then later on running the same job +(but with reads instead of writes) to repeat the same I/O patterns and verify +the contents. Both of these methods depend on the write phase being completed, +as fio otherwise has no idea how much data was written. + +With verification triggers, fio supports dumping the current write state to +local files. Then a subsequent read verify workload can load this state and know +exactly where to stop. This is useful for testing cases where power is cut to a +server in a managed fashion, for instance. + +A verification trigger consists of two things: + +1) Storing the write state of each job. +2) Executing a trigger command. + +The write state is relatively small, on the order of hundreds of bytes to single +kilobytes. It contains information on the number of completions done, the last X +completions, etc. + +A trigger is invoked either through creation ('touch') of a specified file in +the system, or through a timeout setting. 
If fio is run with +:option:`--trigger-file`\= :file:`/tmp/trigger-file`, then it will continually +check for the existence of :file:`/tmp/trigger-file`. When it sees this file, it +will fire off the trigger (thus saving state, and executing the trigger +command). + +For client/server runs, there's both a local and remote trigger. If fio is +running as a server backend, it will send the job states back to the client for +safe storage, then execute the remote trigger, if specified. If a local trigger +is specified, the server will still send back the write state, but the client +will then execute the trigger. + +Verification trigger example +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Let's say we want to run a powercut test on the remote Linux machine 'server'. +Our write workload is in :file:`write-test.fio`. We want to cut power to 'server' at +some point during the run, and we'll run this test from the safety or our local +machine, 'localbox'. On the server, we'll start the fio backend normally:: + + server# fio --server + +and on the client, we'll fire off the workload:: + + localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-triger\"" + +We set :file:`/tmp/my-trigger` as the trigger file, and we tell fio to execute:: + + echo b > /proc/sysrq-trigger + +on the server once it has received the trigger and sent us the write state. This +will work, but it's not **really** cutting power to the server, it's merely +abruptly rebooting it. If we have a remote way of cutting power to the server +through IPMI or similar, we could do that through a local trigger command +instead. Let's assume we have a script that does IPMI reboot of a given hostname, +ipmi-reboot. 
On localbox, we could then have run fio with a local trigger +instead:: + + localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server" + +For this case, fio would wait for the server to send us the write state, then +execute ``ipmi-reboot server`` when that happened. + +Loading verify state +~~~~~~~~~~~~~~~~~~~~ + +To load stored write state, a read verification job file must contain the +:option:`verify_state_load` option. If that is set, fio will load the previously +stored state. For a local fio run this is done by loading the files directly, +and on a client/server run, the server backend will ask the client to send the +files over and load them from there. + + +Log File Formats +---------------- + +Fio supports a variety of log file formats, for logging latencies, bandwidth, +and IOPS. The logs share a common format, which looks like this: + + *time* (`msec`), *value*, *data direction*, *block size* (`bytes`), + *offset* (`bytes`) + +*Time* for the log entry is always in milliseconds. The *value* logged depends +on the type of log, it will be one of the following: + + **Latency log** + Value is latency in nsecs + **Bandwidth log** + Value is in KiB/sec + **IOPS log** + Value is IOPS + +*Data direction* is one of the following: + + **0** + I/O is a READ + **1** + I/O is a WRITE + **2** + I/O is a TRIM + +The entry's *block size* is always in bytes. The *offset* is the position in bytes +from the start of the file for that particular I/O. The logging of the offset can be +toggled with :option:`log_offset`. + +Fio defaults to logging every individual I/O but when windowed logging is set +through :option:`log_avg_msec`, either the average (by default) or the maximum +(:option:`log_max_value` is set) *value* seen over the specified period of time +is recorded. Each *data direction* seen within the window period will aggregate +its values in a separate row. 
Further, when using windowed logging the *block +size* and *offset* entries will always contain 0. + + +Client/Server +------------- + +Normally fio is invoked as a stand-alone application on the machine where the +I/O workload should be generated. However, the backend and frontend of fio can +be run separately i.e., the fio server can generate an I/O workload on the "Device +Under Test" while being controlled by a client on another machine. + +Start the server on the machine which has access to the storage DUT:: + + $ fio --server=args + +where `args` defines what fio listens to. The arguments are of the form +``type,hostname`` or ``IP,port``. *type* is either ``ip`` (or ip4) for TCP/IP +v4, ``ip6`` for TCP/IP v6, or ``sock`` for a local unix domain socket. +*hostname* is either a hostname or IP address, and *port* is the port to listen +to (only valid for TCP/IP, not a local socket). Some examples: + +1) ``fio --server`` + + Start a fio server, listening on all interfaces on the default port (8765). + +2) ``fio --server=ip:hostname,4444`` + + Start a fio server, listening on IP belonging to hostname and on port 4444. + +3) ``fio --server=ip6:::1,4444`` + + Start a fio server, listening on IPv6 localhost ::1 and on port 4444. + +4) ``fio --server=,4444`` + + Start a fio server, listening on all interfaces on port 4444. + +5) ``fio --server=1.2.3.4`` + + Start a fio server, listening on IP 1.2.3.4 on the default port. + +6) ``fio --server=sock:/tmp/fio.sock`` + + Start a fio server, listening on the local socket :file:`/tmp/fio.sock`. + +Once a server is running, a "client" can connect to the fio server with:: + + fio --client= + +where `local-args` are arguments for the client where it is running, `server` +is the connect string, and `remote-args` and `job file(s)` are sent to the +server. The `server` string follows the same format as it does on the server +side, to allow IP/hostname/socket and port strings. 
+ +Fio can connect to multiple servers this way:: -The file management format: + fio --client= --client= -filename action +If the job file is located on the fio server, then you can tell the server to +load a local file as well. This is done by using :option:`--remote-config` :: -The filename is given as an absolute path. The action can be one of these: + fio --client=server --remote-config /path/to/file.fio -add Add the given filename to the trace -open Open the file with the given filename. The filename has to have - been added with the add action before. -close Close the file with the given filename. The file has to have been - opened before. +Then fio will open this local (to the server) job file instead of being passed +one from the client. +If you have many servers (example: 100 VMs/containers), you can input a pathname +of a file containing host IPs/names as the parameter value for the +:option:`--client` option. For example, here is an example :file:`host.list` +file containing 2 hostnames:: -The file io action format: + host1.your.dns.domain + host2.your.dns.domain -filename action offset length +The fio command would then be:: -The filename is given as an absolute path, and has to have been added and opened -before it can be used with this format. The offset and length are given in -bytes. The action can be one of these: + fio --client=host.list -wait Wait for 'offset' microseconds. Everything below 100 is discarded. -read Read 'length' bytes beginning from 'offset' -write Write 'length' bytes beginning from 'offset' -sync fsync() the file -datasync fdatasync() the file -trim trim the given file from the given 'offset' for 'length' bytes +In this mode, you cannot input server-specific parameters or job files -- all +servers receive the same job file. +In order to let ``fio --client`` runs use a shared filesystem from multiple +hosts, ``fio --client`` now prepends the IP address of the server to the +filename. 
For example, if fio is using the directory :file:`/mnt/nfs/fio` and is +writing filename :file:`fileio.tmp`, with a :option:`--client` `hostfile` +containing two hostnames ``h1`` and ``h2`` with IP addresses 192.168.10.120 and +192.168.10.121, then fio will create two files:: -9.0 CPU idleness profiling --------------------------- -In some cases, we want to understand CPU overhead in a test. For example, -we test patches for the specific goodness of whether they reduce CPU usage. -fio implements a balloon approach to create a thread per CPU that runs at -idle priority, meaning that it only runs when nobody else needs the cpu. -By measuring the amount of work completed by the thread, idleness of each -CPU can be derived accordingly. + /mnt/nfs/fio/192.168.10.120.fileio.tmp + /mnt/nfs/fio/192.168.10.121.fileio.tmp -An unit work is defined as touching a full page of unsigned characters. Mean -and standard deviation of time to complete an unit work is reported in "unit -work" section. Options can be chosen to report detailed percpu idleness or -overall system idleness by aggregating percpu stats. +Terse output in client/server mode will differ slightly from what is produced +when fio is run in stand-alone mode. See the terse output section for details. 
diff -Nru fio-2.1.3/idletime.c fio-3.16/idletime.c --- fio-2.1.3/idletime.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/idletime.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,4 +1,5 @@ #include +#include "fio.h" #include "json.h" #include "idletime.h" @@ -11,7 +12,7 @@ static double calibrate_unit(unsigned char *data) { unsigned long t, i, j, k; - struct timeval tps; + struct timespec tps; double tunit = 0.0; for (i = 0; i < CALIBRATE_RUNS; i++) { @@ -43,16 +44,26 @@ return tunit / CALIBRATE_SCALE; } +static void free_cpu_affinity(struct idle_prof_thread *ipt) +{ +#if defined(FIO_HAVE_CPU_AFFINITY) + fio_cpuset_exit(&ipt->cpu_mask); +#endif +} + static int set_cpu_affinity(struct idle_prof_thread *ipt) { #if defined(FIO_HAVE_CPU_AFFINITY) - os_cpu_mask_t cpu_mask; + if (fio_cpuset_init(&ipt->cpu_mask)) { + log_err("fio: cpuset init failed\n"); + return -1; + } - memset(&cpu_mask, 0, sizeof(cpu_mask)); - fio_cpu_set(&cpu_mask, ipt->cpu); + fio_cpu_set(&ipt->cpu_mask, ipt->cpu); - if (fio_setaffinity(gettid(), cpu_mask)) { + if (fio_setaffinity(gettid(), ipt->cpu_mask)) { log_err("fio: fio_setaffinity failed\n"); + fio_cpuset_exit(&ipt->cpu_mask); return -1; } @@ -73,8 +84,10 @@ pthread_mutex_lock(&ipt->init_lock); /* exit if any other thread failed to start */ - if (ipc.status == IDLE_PROF_STATUS_ABORT) + if (ipc.status == IDLE_PROF_STATUS_ABORT) { + pthread_mutex_unlock(&ipt->init_lock); return NULL; + } retval = set_cpu_affinity(ipt); if (retval == -1) { @@ -96,7 +109,7 @@ if (retval == -1) { ipt->state = TD_EXITED; pthread_mutex_unlock(&ipt->init_lock); - return NULL; + goto do_exit; } ipt->state = TD_INITIALIZED; @@ -109,12 +122,16 @@ pthread_mutex_lock(&ipt->start_lock); /* exit if other threads failed to initialize */ - if (ipc.status == IDLE_PROF_STATUS_ABORT) - return NULL; + if (ipc.status == IDLE_PROF_STATUS_ABORT) { + pthread_mutex_unlock(&ipt->start_lock); + goto do_exit; + } /* exit if we are doing calibration only */ - if (ipc.status == 
IDLE_PROF_STATUS_CALI_STOP) - return NULL; + if (ipc.status == IDLE_PROF_STATUS_CALI_STOP) { + pthread_mutex_unlock(&ipt->start_lock); + goto do_exit; + } fio_gettime(&ipt->tps, NULL); ipt->state = TD_RUNNING; @@ -137,6 +154,8 @@ ipt->state = TD_EXITED; pthread_mutex_unlock(&ipt->start_lock); +do_exit: + free_cpu_affinity(ipt); return NULL; } @@ -165,7 +184,6 @@ void fio_idle_prof_init(void) { int i, ret; - struct timeval tp; struct timespec ts; pthread_attr_t tattr; struct idle_prof_thread *ipt; @@ -242,7 +260,7 @@ if ((ret = pthread_detach(ipt->thread))) { /* log error and let the thread spin */ - log_err("fio: pthread_detatch %s\n", strerror(ret)); + log_err("fio: pthread_detach %s\n", strerror(ret)); } } @@ -264,9 +282,8 @@ pthread_mutex_lock(&ipt->init_lock); while ((ipt->state != TD_EXITED) && (ipt->state!=TD_INITIALIZED)) { - fio_gettime(&tp, NULL); - ts.tv_sec = tp.tv_sec + 1; - ts.tv_nsec = tp.tv_usec * 1000; + fio_gettime(&ts, NULL); + ts.tv_sec += 1; pthread_cond_timedwait(&ipt->cond, &ipt->init_lock, &ts); } pthread_mutex_unlock(&ipt->init_lock); @@ -307,7 +324,6 @@ { int i; uint64_t runt; - struct timeval tp; struct timespec ts; struct idle_prof_thread *ipt; @@ -325,9 +341,8 @@ pthread_mutex_lock(&ipt->start_lock); while ((ipt->state != TD_EXITED) && (ipt->state!=TD_NOT_CREATED)) { - fio_gettime(&tp, NULL); - ts.tv_sec = tp.tv_sec + 1; - ts.tv_nsec = tp.tv_usec * 1000; + fio_gettime(&ts, NULL); + ts.tv_sec += 1; /* timed wait in case a signal is not received */ pthread_cond_timedwait(&ipt->cond, &ipt->start_lock, &ts); } @@ -336,7 +351,10 @@ /* calculate idleness */ if (ipc.cali_mean != 0.0) { runt = utime_since(&ipt->tps, &ipt->tpe); - ipt->idleness = ipt->loops * ipc.cali_mean / runt; + if (runt) + ipt->idleness = ipt->loops * ipc.cali_mean / runt; + else + ipt->idleness = 0.0; } else ipt->idleness = 0.0; } @@ -407,7 +425,7 @@ fio_idle_prof_init(); fio_idle_prof_start(); fio_idle_prof_stop(); - show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL); + 
show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL, NULL); return 1; } else if (strcmp("system", args) == 0) { ipc.opt = IDLE_PROF_OPT_SYSTEM; @@ -425,7 +443,8 @@ #endif } -void show_idle_prof_stats(int output, struct json_object *parent) +void show_idle_prof_stats(int output, struct json_object *parent, + struct buf_output *out) { int i, nr_cpus = ipc.nr_cpus; struct json_object *tmp; @@ -433,33 +452,29 @@ if (output == FIO_OUTPUT_NORMAL) { if (ipc.opt > IDLE_PROF_OPT_CALI) - log_info("\nCPU idleness:\n"); + log_buf(out, "\nCPU idleness:\n"); else if (ipc.opt == IDLE_PROF_OPT_CALI) - log_info("CPU idleness:\n"); + log_buf(out, "CPU idleness:\n"); if (ipc.opt >= IDLE_PROF_OPT_SYSTEM) - log_info(" system: %3.2f%%\n", fio_idle_prof_cpu_stat(-1)); + log_buf(out, " system: %3.2f%%\n", fio_idle_prof_cpu_stat(-1)); if (ipc.opt == IDLE_PROF_OPT_PERCPU) { - log_info(" percpu: %3.2f%%", fio_idle_prof_cpu_stat(0)); + log_buf(out, " percpu: %3.2f%%", fio_idle_prof_cpu_stat(0)); for (i = 1; i < nr_cpus; i++) - log_info(", %3.2f%%", fio_idle_prof_cpu_stat(i)); - log_info("\n"); + log_buf(out, ", %3.2f%%", fio_idle_prof_cpu_stat(i)); + log_buf(out, "\n"); } if (ipc.opt >= IDLE_PROF_OPT_CALI) { - log_info(" unit work: mean=%3.2fus,", ipc.cali_mean); - log_info(" stddev=%3.2f\n", ipc.cali_stddev); + log_buf(out, " unit work: mean=%3.2fus,", ipc.cali_mean); + log_buf(out, " stddev=%3.2f\n", ipc.cali_stddev); } - /* dynamic mem allocations can now be freed */ - if (ipc.opt != IDLE_PROF_OPT_NONE) - fio_idle_prof_cleanup(); - return; } - if ((ipc.opt != IDLE_PROF_OPT_NONE) && (output == FIO_OUTPUT_JSON)) { + if ((ipc.opt != IDLE_PROF_OPT_NONE) && (output & FIO_OUTPUT_JSON)) { if (!parent) return; @@ -479,7 +494,5 @@ json_object_add_value_float(tmp, "unit_mean", ipc.cali_mean); json_object_add_value_float(tmp, "unit_stddev", ipc.cali_stddev); - - fio_idle_prof_cleanup(); } } diff -Nru fio-2.1.3/idletime.h fio-3.16/idletime.h --- fio-2.1.3/idletime.h 2013-09-24 14:42:24.000000000 +0000 +++ 
fio-3.16/idletime.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,7 +1,9 @@ #ifndef FIO_IDLETIME_H #define FIO_IDLETIME_H -#include "fio.h" +#include +#include +#include "os/os.h" #define CALIBRATE_RUNS 10 #define CALIBRATE_SCALE 1000 @@ -25,15 +27,17 @@ pthread_t thread; int cpu; int state; - struct timeval tps; - struct timeval tpe; - double cali_time; /* microseconds to finish a unit wrok */ + struct timespec tps; + struct timespec tpe; + double cali_time; /* microseconds to finish a unit work */ double loops; double idleness; unsigned char *data; /* bytes to be touched */ pthread_cond_t cond; pthread_mutex_t init_lock; pthread_mutex_t start_lock; + + os_cpu_mask_t cpu_mask; }; struct idle_prof_common { @@ -52,6 +56,8 @@ extern void fio_idle_prof_start(void); extern void fio_idle_prof_stop(void); -extern void show_idle_prof_stats(int, struct json_object *); +extern void show_idle_prof_stats(int, struct json_object *, struct buf_output *); + +extern void fio_idle_prof_cleanup(void); #endif diff -Nru fio-2.1.3/init.c fio-3.16/init.c --- fio-2.1.3/init.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/init.c 2019-09-20 01:01:52.000000000 +0000 @@ -4,13 +4,17 @@ #include #include #include -#include #include #include #include #include #include -#include +#include +#ifdef CONFIG_VALGRIND_DEV +#include +#else +#define DRD_IGNORE_VAR(x) do { } while (0) +#endif #include "fio.h" #ifndef FIO_NO_HAVE_SHM_H @@ -24,9 +28,16 @@ #include "profile.h" #include "server.h" #include "idletime.h" - -#include "lib/getopt.h" -#include "lib/strcasestr.h" +#include "filelock.h" +#include "steadystate.h" +#include "blktrace.h" + +#include "oslib/getopt.h" +#include "oslib/strcasestr.h" + +#include "crc/test.h" +#include "lib/pow2.h" +#include "lib/memcpy.h" const char fio_version_string[] = FIO_VERSION; @@ -34,42 +45,50 @@ static char **ini_file; static int max_jobs = FIO_MAX_JOBS; -static int dump_cmdline; -static int def_timeout; -static int parse_only; +static bool dump_cmdline; +static 
bool parse_only; +static bool merge_blktrace_only; static struct thread_data def_thread; struct thread_data *threads = NULL; +static char **job_sections; +static int nr_job_sections; -int exitall_on_terminate = 0; +bool exitall_on_terminate = false; int output_format = FIO_OUTPUT_NORMAL; int eta_print = FIO_ETA_AUTO; +unsigned int eta_interval_msec = 1000; int eta_new_line = 0; FILE *f_out = NULL; FILE *f_err = NULL; -char **job_sections = NULL; -int nr_job_sections = 0; char *exec_profile = NULL; int warnings_fatal = 0; int terse_version = 3; -int is_backend = 0; +bool is_backend = false; +bool is_local_backend = false; int nr_clients = 0; -int log_syslog = 0; +bool log_syslog = false; -int write_bw_log = 0; -int read_only = 0; +bool write_bw_log = false; +bool read_only = false; int status_interval = 0; -static int write_lat_log; +char *trigger_file = NULL; +long long trigger_timeout = 0; +char *trigger_cmd = NULL; +char *trigger_remote_cmd = NULL; + +char *aux_path = NULL; static int prev_group_jobs; unsigned long fio_debug = 0; unsigned int fio_debug_jobno = -1; unsigned int *fio_debug_jobp = NULL; +unsigned int *fio_warned = NULL; static char cmd_optstr[256]; -static int did_arg; +static bool did_arg; #define FIO_CLIENT_FLAG (1 << 16) @@ -84,31 +103,31 @@ .val = 'o' | FIO_CLIENT_FLAG, }, { - .name = (char *) "timeout", - .has_arg = required_argument, - .val = 't' | FIO_CLIENT_FLAG, - }, - { .name = (char *) "latency-log", .has_arg = required_argument, .val = 'l' | FIO_CLIENT_FLAG, }, { .name = (char *) "bandwidth-log", - .has_arg = required_argument, + .has_arg = no_argument, .val = 'b' | FIO_CLIENT_FLAG, }, { .name = (char *) "minimal", - .has_arg = optional_argument, + .has_arg = no_argument, .val = 'm' | FIO_CLIENT_FLAG, }, { .name = (char *) "output-format", - .has_arg = optional_argument, + .has_arg = required_argument, .val = 'F' | FIO_CLIENT_FLAG, }, { + .name = (char *) "append-terse", + .has_arg = optional_argument, + .val = 'f', + }, + { .name = 
(char *) "version", .has_arg = no_argument, .val = 'v' | FIO_CLIENT_FLAG, @@ -144,6 +163,11 @@ .val = 'e' | FIO_CLIENT_FLAG, }, { + .name = (char *) "eta-interval", + .has_arg = required_argument, + .val = 'O' | FIO_CLIENT_FLAG, + }, + { .name = (char *) "eta-newline", .has_arg = required_argument, .val = 'E' | FIO_CLIENT_FLAG, @@ -163,6 +187,13 @@ .has_arg = required_argument, .val = 'x' | FIO_CLIENT_FLAG, }, +#ifdef CONFIG_ZLIB + { + .name = (char *) "inflate-log", + .has_arg = required_argument, + .val = 'X' | FIO_CLIENT_FLAG, + }, +#endif { .name = (char *) "alloc-size", .has_arg = required_argument, @@ -203,11 +234,26 @@ .val = 'C', }, { + .name = (char *) "remote-config", + .has_arg = required_argument, + .val = 'R', + }, + { .name = (char *) "cpuclock-test", .has_arg = no_argument, .val = 'T', }, { + .name = (char *) "crctest", + .has_arg = optional_argument, + .val = 'G', + }, + { + .name = (char *) "memcpytest", + .has_arg = optional_argument, + .val = 'M', + }, + { .name = (char *) "idle-prof", .has_arg = required_argument, .val = 'I', @@ -215,7 +261,37 @@ { .name = (char *) "status-interval", .has_arg = required_argument, - .val = 'L', + .val = 'L' | FIO_CLIENT_FLAG, + }, + { + .name = (char *) "trigger-file", + .has_arg = required_argument, + .val = 'W', + }, + { + .name = (char *) "trigger-timeout", + .has_arg = required_argument, + .val = 'B', + }, + { + .name = (char *) "trigger", + .has_arg = required_argument, + .val = 'H', + }, + { + .name = (char *) "trigger-remote", + .has_arg = required_argument, + .val = 'J', + }, + { + .name = (char *) "aux-path", + .has_arg = required_argument, + .val = 'K', + }, + { + .name = (char *) "merge-blktrace-only", + .has_arg = no_argument, + .val = 'A' | FIO_CLIENT_FLAG, }, { .name = NULL, @@ -224,27 +300,39 @@ void free_threads_shm(void) { - struct shmid_ds sbuf; - if (threads) { void *tp = threads; +#ifndef CONFIG_NO_SHM + struct shmid_ds sbuf; threads = NULL; shmdt(tp); shmctl(shm_id, IPC_RMID, &sbuf); shm_id = 
-1; +#else + threads = NULL; + free(tp); +#endif } } -void free_shm(void) +static void free_shm(void) { if (threads) { - file_hash_exit(); flow_exit(); fio_debug_jobp = NULL; + fio_warned = NULL; free_threads_shm(); } + free(trigger_file); + free(trigger_cmd); + free(trigger_remote_cmd); + trigger_file = trigger_cmd = trigger_remote_cmd = NULL; + + options_free(fio_options, &def_thread.o); + fio_filelock_exit(); + file_hash_exit(); scleanup(); } @@ -256,7 +344,7 @@ */ static int setup_thread_area(void) { - void *hash; + int i; if (threads) return 0; @@ -268,9 +356,9 @@ do { size_t size = max_jobs * sizeof(struct thread_data); - size += file_hash_size; - size += sizeof(unsigned int); + size += 2 * sizeof(unsigned int); +#ifndef CONFIG_NO_SHM shm_id = shmget(0, size, IPC_CREAT | 0600); if (shm_id != -1) break; @@ -278,10 +366,16 @@ perror("shmget"); break; } +#else + threads = malloc(size); + if (threads) + break; +#endif max_jobs >>= 1; } while (max_jobs); +#ifndef CONFIG_NO_SHM if (shm_id == -1) return 1; @@ -290,23 +384,90 @@ perror("shmat"); return 1; } + if (shm_attach_to_open_removed()) + shmctl(shm_id, IPC_RMID, NULL); +#endif memset(threads, 0, max_jobs * sizeof(struct thread_data)); - hash = (void *) threads + max_jobs * sizeof(struct thread_data); - fio_debug_jobp = (void *) hash + file_hash_size; + for (i = 0; i < max_jobs; i++) + DRD_IGNORE_VAR(threads[i]); + fio_debug_jobp = (unsigned int *)(threads + max_jobs); *fio_debug_jobp = -1; - file_hash_init(hash); + fio_warned = fio_debug_jobp + 1; + *fio_warned = 0; flow_init(); return 0; } +static void dump_print_option(struct print_option *p) +{ + const char *delim; + + if (!strcmp("description", p->name)) + delim = "\""; + else + delim = ""; + + log_info("--%s%s", p->name, p->value ? 
"" : " "); + if (p->value) + log_info("=%s%s%s ", delim, p->value, delim); +} + +static void dump_opt_list(struct thread_data *td) +{ + struct flist_head *entry; + struct print_option *p; + + if (flist_empty(&td->opt_list)) + return; + + flist_for_each(entry, &td->opt_list) { + p = flist_entry(entry, struct print_option, list); + dump_print_option(p); + } +} + +static void fio_dump_options_free(struct thread_data *td) +{ + while (!flist_empty(&td->opt_list)) { + struct print_option *p; + + p = flist_first_entry(&td->opt_list, struct print_option, list); + flist_del_init(&p->list); + free(p->name); + free(p->value); + free(p); + } +} + +static void copy_opt_list(struct thread_data *dst, struct thread_data *src) +{ + struct flist_head *entry; + + if (flist_empty(&src->opt_list)) + return; + + flist_for_each(entry, &src->opt_list) { + struct print_option *srcp, *dstp; + + srcp = flist_entry(entry, struct print_option, list); + dstp = malloc(sizeof(*dstp)); + dstp->name = strdup(srcp->name); + if (srcp->value) + dstp->value = strdup(srcp->value); + else + dstp->value = NULL; + flist_add_tail(&dstp->list, &dst->opt_list); + } +} + /* * Return a free job structure. 
*/ -static struct thread_data *get_new_job(int global, struct thread_data *parent, - int preserve_eo) +static struct thread_data *get_new_job(bool global, struct thread_data *parent, + bool preserve_eo, const char *jobname) { struct thread_data *td; @@ -325,7 +486,12 @@ td = &threads[thread_number++]; *td = *parent; + INIT_FLIST_HEAD(&td->opt_list); + if (parent != &def_thread) + copy_opt_list(td, parent); + td->io_ops = NULL; + td->io_ops_init = 0; if (!preserve_eo) td->eo = NULL; @@ -337,8 +503,12 @@ profile_add_hooks(td); td->thread_number = thread_number; + td->subjob_number = 0; - if (!parent || !parent->o.group_reporting) + if (jobname) + td->o.name = strdup(jobname); + + if (!parent->o.group_reporting || parent == &def_thread) stat_number++; return td; @@ -356,30 +526,36 @@ log_info("fio: %s\n", td->verror); fio_options_free(td); + fio_dump_options_free(td); if (td->io_ops) free_ioengine(td); + if (td->o.name) + free(td->o.name); + memset(&threads[td->thread_number - 1], 0, sizeof(*td)); thread_number--; } static int __setup_rate(struct thread_data *td, enum fio_ddir ddir) { - unsigned int bs = td->o.min_bs[ddir]; + unsigned long long bs = td->o.min_bs[ddir]; assert(ddir_rw(ddir)); if (td->o.rate[ddir]) td->rate_bps[ddir] = td->o.rate[ddir]; else - td->rate_bps[ddir] = td->o.rate_iops[ddir] * bs; + td->rate_bps[ddir] = (uint64_t) td->o.rate_iops[ddir] * bs; if (!td->rate_bps[ddir]) { log_err("rate lower than supported\n"); return -1; } - td->rate_pending_usleep[ddir] = 0; + td->rate_next_io_time[ddir] = 0; + td->rate_io_issue_bytes[ddir] = 0; + td->last_usec[ddir] = 0; return 0; } @@ -407,6 +583,17 @@ } /* + * <3 Johannes + */ +static unsigned int gcd(unsigned int m, unsigned int n) +{ + if (!n) + return m; + + return gcd(n, m % n); +} + +/* * Lazy way of fixing up options that depend on each other. We could also * define option callback handlers, but this is easier. 
*/ @@ -415,13 +602,19 @@ struct thread_options *o = &td->o; int ret = 0; -#ifndef FIO_HAVE_PSHARED_MUTEX + if (read_only && (td_write(td) || td_trim(td))) { + log_err("fio: trim and write operations are not allowed" + " with the --readonly parameter.\n"); + ret |= 1; + } + +#ifndef CONFIG_PSHARED if (!o->use_thread) { log_info("fio: this platform does not support process shared" " mutexes, forcing use of threads. Use the 'thread'" " option to get rid of this warning.\n"); o->use_thread = 1; - ret = warnings_fatal; + ret |= warnings_fatal; } #endif @@ -429,26 +622,43 @@ log_err("fio: read iolog overrides write_iolog\n"); free(o->write_iolog_file); o->write_iolog_file = NULL; - ret = warnings_fatal; + ret |= warnings_fatal; + } + + if (o->zone_mode == ZONE_MODE_NONE && o->zone_size) { + log_err("fio: --zonemode=none and --zonesize are not compatible.\n"); + ret |= 1; + } + + if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_size) { + log_err("fio: --zonesize must be specified when using --zonemode=strided.\n"); + ret |= 1; + } + + if (o->zone_mode == ZONE_MODE_NOT_SPECIFIED) { + if (o->zone_size) + o->zone_mode = ZONE_MODE_STRIDED; + else + o->zone_mode = ZONE_MODE_NONE; } /* - * only really works with 1 file + * Strided zone mode only really works with 1 file. */ - if (o->zone_size && o->open_files > 1) - o->zone_size = 0; + if (o->zone_mode == ZONE_MODE_STRIDED && o->open_files > 1) + o->zone_mode = ZONE_MODE_NONE; /* * If zone_range isn't specified, backward compatibility dictates it * should be made equal to zone_size. 
*/ - if (o->zone_size && !o->zone_range) + if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_range) o->zone_range = o->zone_size; /* * Reads can do overwrites, we always need to pre-create the file */ - if (td_read(td) || td_rw(td)) + if (td_read(td)) o->overwrite = 1; if (!o->min_bs[DDIR_READ]) @@ -464,7 +674,6 @@ if (!o->max_bs[DDIR_TRIM]) o->max_bs[DDIR_TRIM] = o->bs[DDIR_TRIM]; - o->rw_min_bs = min(o->min_bs[DDIR_READ], o->min_bs[DDIR_WRITE]); o->rw_min_bs = min(o->min_bs[DDIR_TRIM], o->rw_min_bs); @@ -484,20 +693,27 @@ !o->norandommap) { log_err("fio: Any use of blockalign= turns off randommap\n"); o->norandommap = 1; - ret = warnings_fatal; + ret |= warnings_fatal; } if (!o->file_size_high) o->file_size_high = o->file_size_low; + if (o->start_delay_high) { + if (!o->start_delay_orig) + o->start_delay_orig = o->start_delay; + o->start_delay = rand_between(&td->delay_state, + o->start_delay_orig, + o->start_delay_high); + } + if (o->norandommap && o->verify != VERIFY_NONE && !fixed_block_size(o)) { log_err("fio: norandommap given for variable block sizes, " - "verify disabled\n"); - o->verify = VERIFY_NONE; - ret = warnings_fatal; + "verify limited\n"); + ret |= warnings_fatal; } - if (o->bs_unaligned && (o->odirect || td->io_ops->flags & FIO_RAWIO)) + if (o->bs_unaligned && (o->odirect || td_ioengine_flagged(td, FIO_RAWIO))) log_err("fio: bs_unaligned may not work with raw io\n"); /* @@ -518,6 +734,23 @@ if (o->iodepth_batch > o->iodepth || !o->iodepth_batch) o->iodepth_batch = o->iodepth; + /* + * If max batch complete number isn't set or set incorrectly, + * default to the same as iodepth_batch_complete_min + */ + if (o->iodepth_batch_complete_min > o->iodepth_batch_complete_max) + o->iodepth_batch_complete_max = o->iodepth_batch_complete_min; + + /* + * There's no need to check for in-flight overlapping IOs if the job + * isn't changing data or the maximum iodepth is guaranteed to be 1 + * when we are not in offload mode + */ + if (o->serialize_overlap && 
!(td->flags & TD_F_READ_IOLOG) && + (!(td_write(td) || td_trim(td)) || o->iodepth == 1) && + o->io_submit_mode != IO_MODE_OFFLOAD) + o->serialize_overlap = 0; + if (o->nr_files > td->files_index) o->nr_files = td->files_index; @@ -529,57 +762,101 @@ ((o->ratemin[DDIR_READ] + o->ratemin[DDIR_WRITE] + o->ratemin[DDIR_TRIM]) && (o->rate_iops_min[DDIR_READ] + o->rate_iops_min[DDIR_WRITE] + o->rate_iops_min[DDIR_TRIM]))) { log_err("fio: rate and rate_iops are mutually exclusive\n"); - ret = 1; + ret |= 1; } - if ((o->rate[DDIR_READ] < o->ratemin[DDIR_READ]) || - (o->rate[DDIR_WRITE] < o->ratemin[DDIR_WRITE]) || - (o->rate[DDIR_TRIM] < o->ratemin[DDIR_TRIM]) || - (o->rate_iops[DDIR_READ] < o->rate_iops_min[DDIR_READ]) || - (o->rate_iops[DDIR_WRITE] < o->rate_iops_min[DDIR_WRITE]) || - (o->rate_iops[DDIR_TRIM] < o->rate_iops_min[DDIR_TRIM])) { + if ((o->rate[DDIR_READ] && (o->rate[DDIR_READ] < o->ratemin[DDIR_READ])) || + (o->rate[DDIR_WRITE] && (o->rate[DDIR_WRITE] < o->ratemin[DDIR_WRITE])) || + (o->rate[DDIR_TRIM] && (o->rate[DDIR_TRIM] < o->ratemin[DDIR_TRIM])) || + (o->rate_iops[DDIR_READ] && (o->rate_iops[DDIR_READ] < o->rate_iops_min[DDIR_READ])) || + (o->rate_iops[DDIR_WRITE] && (o->rate_iops[DDIR_WRITE] < o->rate_iops_min[DDIR_WRITE])) || + (o->rate_iops[DDIR_TRIM] && (o->rate_iops[DDIR_TRIM] < o->rate_iops_min[DDIR_TRIM]))) { log_err("fio: minimum rate exceeds rate\n"); - ret = 1; + ret |= 1; } if (!o->timeout && o->time_based) { log_err("fio: time_based requires a runtime/timeout setting\n"); o->time_based = 0; - ret = warnings_fatal; + ret |= warnings_fatal; } if (o->fill_device && !o->size) o->size = -1ULL; if (o->verify != VERIFY_NONE) { - if (td_write(td) && o->do_verify && o->numjobs > 1) { - log_info("Multiple writers may overwrite blocks that " - "belong to other jobs. 
This can cause " + if (td_write(td) && o->do_verify && o->numjobs > 1 && + (o->filename || + !(o->unique_filename && + strstr(o->filename_format, "$jobname") && + strstr(o->filename_format, "$jobnum") && + strstr(o->filename_format, "$filenum")))) { + log_info("fio: multiple writers may overwrite blocks " + "that belong to other jobs. This can cause " "verification failures.\n"); - ret = warnings_fatal; + ret |= warnings_fatal; + } + + /* + * Warn if verification is requested but no verification of any + * kind can be started due to time constraints + */ + if (td_write(td) && o->do_verify && o->timeout && + o->time_based && !td_read(td) && !o->verify_backlog) { + log_info("fio: verification read phase will never " + "start because write phase uses all of " + "runtime\n"); + ret |= warnings_fatal; } - o->refill_buffers = 1; + if (!fio_option_is_set(o, refill_buffers)) + o->refill_buffers = 1; + if (o->max_bs[DDIR_WRITE] != o->min_bs[DDIR_WRITE] && !o->verify_interval) o->verify_interval = o->min_bs[DDIR_WRITE]; + + /* + * Verify interval must be smaller or equal to the + * write size. + */ + if (o->verify_interval > o->min_bs[DDIR_WRITE]) + o->verify_interval = o->min_bs[DDIR_WRITE]; + else if (td_read(td) && o->verify_interval > o->min_bs[DDIR_READ]) + o->verify_interval = o->min_bs[DDIR_READ]; + + /* + * Verify interval must be a factor of both min and max + * write size + */ + if (!o->verify_interval || + (o->min_bs[DDIR_WRITE] % o->verify_interval) || + (o->max_bs[DDIR_WRITE] % o->verify_interval)) + o->verify_interval = gcd(o->min_bs[DDIR_WRITE], + o->max_bs[DDIR_WRITE]); } if (o->pre_read) { - o->invalidate_cache = 0; - if (td->io_ops->flags & FIO_PIPEIO) { + if (o->invalidate_cache) + o->invalidate_cache = 0; + if (td_ioengine_flagged(td, FIO_PIPEIO)) { log_info("fio: cannot pre-read files with an IO engine" " that isn't seekable. 
Pre-read disabled.\n"); - ret = warnings_fatal; + ret |= warnings_fatal; } } - if (!o->unit_base) { - if (td->io_ops->flags & FIO_BIT_BASED) - o->unit_base = 1; + if (o->unit_base == N2S_NONE) { + if (td_ioengine_flagged(td, FIO_BIT_BASED)) + o->unit_base = N2S_BITPERSEC; else - o->unit_base = 8; + o->unit_base = N2S_BYTEPERSEC; } +#ifndef FIO_HAVE_ANY_FALLOCATE + /* Platform doesn't support any fallocate so force it to none */ + o->fallocate_mode = FIO_FALLOCATE_NONE; +#endif + #ifndef CONFIG_FDATASYNC if (o->fdatasync_blocks) { log_info("fio: this platform does not support fdatasync()" @@ -588,7 +865,7 @@ " this warning\n"); o->fsync_blocks = o->fdatasync_blocks; o->fdatasync_blocks = 0; - ret = warnings_fatal; + ret |= warnings_fatal; } #endif @@ -597,102 +874,178 @@ * Windows doesn't support O_DIRECT or O_SYNC with the _open interface, * so fail if we're passed those flags */ - if ((td->io_ops->flags & FIO_SYNCIO) && (td->o.odirect || td->o.sync_io)) { + if (td_ioengine_flagged(td, FIO_SYNCIO) && (o->odirect || o->sync_io)) { log_err("fio: Windows does not support direct or non-buffered io with" " the synchronous ioengines. Use the 'windowsaio' ioengine" " with 'direct=1' and 'iodepth=1' instead.\n"); - ret = 1; + ret |= 1; } #endif /* * For fully compressible data, just zero them at init time. - * It's faster than repeatedly filling it. + * It's faster than repeatedly filling it. For non-zero + * compression, we should have refill_buffers set. Set it, unless + * the job file already changed it. 
*/ - if (td->o.compress_percentage == 100) { - td->o.zero_buffers = 1; - td->o.compress_percentage = 0; + if (o->compress_percentage) { + if (o->compress_percentage == 100) { + o->zero_buffers = 1; + o->compress_percentage = 0; + } else if (!fio_option_is_set(o, refill_buffers)) { + o->refill_buffers = 1; + td->flags |= TD_F_REFILL_BUFFERS; + } } /* * Using a non-uniform random distribution excludes usage of * a random map */ - if (td->o.random_distribution != FIO_RAND_DIST_RANDOM) - td->o.norandommap = 1; + if (o->random_distribution != FIO_RAND_DIST_RANDOM) + o->norandommap = 1; /* * If size is set but less than the min block size, complain */ if (o->size && o->size < td_min_bs(td)) { - log_err("fio: size too small, must be larger than the IO size: %llu\n", (unsigned long long) o->size); - ret = 1; + log_err("fio: size too small, must not be less than minimum block size: %llu < %llu\n", + (unsigned long long) o->size, td_min_bs(td)); + ret |= 1; } - return ret; -} + /* + * O_ATOMIC implies O_DIRECT + */ + if (o->oatomic) + o->odirect = 1; -/* - * This function leaks the buffer - */ -char *fio_uint_to_kmg(unsigned int val) -{ - char *buf = malloc(32); - char post[] = { 0, 'K', 'M', 'G', 'P', 'E', 0 }; - char *p = post; + /* + * If randseed is set, that overrides randrepeat + */ + if (fio_option_is_set(o, rand_seed)) + o->rand_repeatable = 0; - do { - if (val & 1023) - break; + if (td_ioengine_flagged(td, FIO_NOEXTEND) && o->file_append) { + log_err("fio: can't append/extent with IO engine %s\n", td->io_ops->name); + ret |= 1; + } + + if (fio_option_is_set(o, gtod_cpu)) { + fio_gtod_init(); + fio_gtod_set_cpu(o->gtod_cpu); + fio_gtod_offload = 1; + } - val >>= 10; - p++; - } while (*p); + td->loops = o->loops; + if (!td->loops) + td->loops = 1; + + if (o->block_error_hist && o->nr_files != 1) { + log_err("fio: block error histogram only available " + "with a single file per job, but %d files " + "provided\n", o->nr_files); + ret |= 1; + } + + if 
(fio_option_is_set(o, clat_percentiles) && + !fio_option_is_set(o, lat_percentiles)) { + o->lat_percentiles = !o->clat_percentiles; + } else if (fio_option_is_set(o, lat_percentiles) && + !fio_option_is_set(o, clat_percentiles)) { + o->clat_percentiles = !o->lat_percentiles; + } else if (fio_option_is_set(o, lat_percentiles) && + fio_option_is_set(o, clat_percentiles) && + o->lat_percentiles && o->clat_percentiles) { + log_err("fio: lat_percentiles and clat_percentiles are " + "mutually exclusive\n"); + ret |= 1; + } + + if (o->disable_lat) + o->lat_percentiles = 0; + if (o->disable_clat) + o->clat_percentiles = 0; - snprintf(buf, 32, "%u%c", val, *p); - return buf; + /* + * Fix these up to be nsec internally + */ + o->max_latency *= 1000ULL; + o->latency_target *= 1000ULL; + o->latency_window *= 1000ULL; + + return ret; } -/* External engines are specified by "external:name.o") */ -static const char *get_engine_name(const char *str) +static void init_rand_file_service(struct thread_data *td) { - char *p = strstr(str, ":"); + unsigned long nranges = td->o.nr_files << FIO_FSERVICE_SHIFT; + const unsigned int seed = td->rand_seeds[FIO_RAND_FILE_OFF]; - if (!p) - return str; - - p++; - strip_blank_front(&p); - strip_blank_end(p); - return p; + if (td->o.file_service_type == FIO_FSERVICE_ZIPF) { + zipf_init(&td->next_file_zipf, nranges, td->zipf_theta, seed); + zipf_disable_hash(&td->next_file_zipf); + } else if (td->o.file_service_type == FIO_FSERVICE_PARETO) { + pareto_init(&td->next_file_zipf, nranges, td->pareto_h, seed); + zipf_disable_hash(&td->next_file_zipf); + } else if (td->o.file_service_type == FIO_FSERVICE_GAUSS) { + gauss_init(&td->next_file_gauss, nranges, td->gauss_dev, seed); + gauss_disable_hash(&td->next_file_gauss); + } } -static int exists_and_not_file(const char *filename) +void td_fill_verify_state_seed(struct thread_data *td) { - struct stat sb; + bool use64; - if (lstat(filename, &sb) == -1) - return 0; - - /* \\.\ is the device namespace in 
Windows, where every file - * is a device node */ - if (S_ISREG(sb.st_mode) && strncmp(filename, "\\\\.\\", 4) != 0) - return 0; + if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) + use64 = true; + else + use64 = false; - return 1; + init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF], + use64); } -static void td_fill_rand_seeds_os(struct thread_data *td) +static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64) { - os_random_seed(td->rand_seeds[FIO_RAND_BS_OFF], &td->bsrange_state); - os_random_seed(td->rand_seeds[FIO_RAND_VER_OFF], &td->verify_state); - os_random_seed(td->rand_seeds[FIO_RAND_MIX_OFF], &td->rwmix_state); + unsigned int read_seed = td->rand_seeds[FIO_RAND_BS_OFF]; + unsigned int write_seed = td->rand_seeds[FIO_RAND_BS1_OFF]; + unsigned int trim_seed = td->rand_seeds[FIO_RAND_BS2_OFF]; + int i; - if (td->o.file_service_type == FIO_FSERVICE_RANDOM) - os_random_seed(td->rand_seeds[FIO_RAND_FILE_OFF], &td->next_file_state); + /* + * trimwrite is special in that we need to generate the same + * offsets to get the "write after trim" effect. If we are + * using bssplit to set buffer length distributions, ensure that + * we seed the trim and write generators identically. Ditto for + * verify, read and writes must have the same seed, if we are doing + * read verify. 
+ */ + if (td->o.verify != VERIFY_NONE) + write_seed = read_seed; + if (td_trimwrite(td)) + trim_seed = write_seed; + init_rand_seed(&td->bsrange_state[DDIR_READ], read_seed, use64); + init_rand_seed(&td->bsrange_state[DDIR_WRITE], write_seed, use64); + init_rand_seed(&td->bsrange_state[DDIR_TRIM], trim_seed, use64); + + td_fill_verify_state_seed(td); + init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], false); - os_random_seed(td->rand_seeds[FIO_RAND_FILE_SIZE_OFF], &td->file_size_state); - os_random_seed(td->rand_seeds[FIO_RAND_TRIM_OFF], &td->trim_state); + if (td->o.file_service_type == FIO_FSERVICE_RANDOM) + init_rand_seed(&td->next_file_state, td->rand_seeds[FIO_RAND_FILE_OFF], use64); + else if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM) + init_rand_file_service(td); + + init_rand_seed(&td->file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF], use64); + init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF], use64); + init_rand_seed(&td->delay_state, td->rand_seeds[FIO_RAND_START_DELAY], use64); + init_rand_seed(&td->poisson_state[0], td->rand_seeds[FIO_RAND_POISSON_OFF], 0); + init_rand_seed(&td->poisson_state[1], td->rand_seeds[FIO_RAND_POISSON2_OFF], 0); + init_rand_seed(&td->poisson_state[2], td->rand_seeds[FIO_RAND_POISSON3_OFF], 0); + init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false); + init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false); if (!td_random(td)) return; @@ -700,45 +1053,36 @@ if (td->o.rand_repeatable) td->rand_seeds[FIO_RAND_BLOCK_OFF] = FIO_RANDSEED * td->thread_number; - os_random_seed(td->rand_seeds[FIO_RAND_BLOCK_OFF], &td->random_state); + init_rand_seed(&td->random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF], use64); - os_random_seed(td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF], &td->seq_rand_state[DDIR_READ]); - os_random_seed(td->rand_seeds[FIO_RAND_SEQ_RAND_WRITE_OFF], &td->seq_rand_state[DDIR_WRITE]); - 
os_random_seed(td->rand_seeds[FIO_RAND_SEQ_RAND_TRIM_OFF], &td->seq_rand_state[DDIR_TRIM]); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + struct frand_state *s = &td->seq_rand_state[i]; + + init_rand_seed(s, td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF], false); + } } -static void td_fill_rand_seeds_internal(struct thread_data *td) +void td_fill_rand_seeds(struct thread_data *td) { - init_rand_seed(&td->__bsrange_state, td->rand_seeds[FIO_RAND_BS_OFF]); - init_rand_seed(&td->__verify_state, td->rand_seeds[FIO_RAND_VER_OFF]); - init_rand_seed(&td->__rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF]); - - if (td->o.file_service_type == FIO_FSERVICE_RANDOM) - init_rand_seed(&td->__next_file_state, td->rand_seeds[FIO_RAND_FILE_OFF]); - - init_rand_seed(&td->__file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF]); - init_rand_seed(&td->__trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF]); - - if (!td_random(td)) - return; + bool use64; - if (td->o.rand_repeatable) - td->rand_seeds[FIO_RAND_BLOCK_OFF] = FIO_RANDSEED * td->thread_number; + if (td->o.allrand_repeatable) { + unsigned int i; - init_rand_seed(&td->__random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF]); - init_rand_seed(&td->__seq_rand_state[DDIR_READ], td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF]); - init_rand_seed(&td->__seq_rand_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_SEQ_RAND_WRITE_OFF]); - init_rand_seed(&td->__seq_rand_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_SEQ_RAND_TRIM_OFF]); -} + for (i = 0; i < FIO_RAND_NR_OFFS; i++) + td->rand_seeds[i] = FIO_RANDSEED * td->thread_number + + i; + } -void td_fill_rand_seeds(struct thread_data *td) -{ - if (td->o.use_os_rand) - td_fill_rand_seeds_os(td); + if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) + use64 = true; else - td_fill_rand_seeds_internal(td); + use64 = false; + + td_fill_rand_seeds_internal(td, use64); - init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF]); + init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF], use64); + 
frand_copy(&td->buf_state_prev, &td->buf_state); } /* @@ -747,22 +1091,46 @@ */ int ioengine_load(struct thread_data *td) { - const char *engine; - - /* - * Engine has already been loaded. - */ - if (td->io_ops) - return 0; if (!td->o.ioengine) { log_err("fio: internal fault, no IO engine specified\n"); return 1; } - engine = get_engine_name(td->o.ioengine); - td->io_ops = load_ioengine(td, engine); + if (td->io_ops) { + struct ioengine_ops *ops; + void *dlhandle; + + /* An engine is loaded, but the requested ioengine + * may have changed. + */ + if (!strcmp(td->io_ops->name, td->o.ioengine)) { + /* The right engine is already loaded */ + return 0; + } + + /* + * Name of file and engine may be different, load ops + * for this name and see if they match. If they do, then + * the engine is unchanged. + */ + dlhandle = td->io_ops_dlhandle; + ops = load_ioengine(td); + if (ops == td->io_ops && dlhandle == td->io_ops_dlhandle) { + if (dlhandle) + dlclose(dlhandle); + return 0; + } + + if (dlhandle && dlhandle != td->io_ops_dlhandle) + dlclose(dlhandle); + + /* Unload the old engine. 
*/ + free_ioengine(td); + } + + td->io_ops = load_ioengine(td); if (!td->io_ops) { - log_err("fio: failed to load engine %s\n", engine); + log_err("fio: failed to load engine\n"); return 1; } @@ -790,7 +1158,7 @@ */ if (origeo) { memcpy(td->eo, origeo, td->io_ops->option_struct_size); - options_mem_dupe(td->eo, td->io_ops->options); + options_mem_dupe(td->io_ops->options, td->eo); } else { memset(td->eo, 0, td->io_ops->option_struct_size); fill_default_options(td->eo, td->io_ops->options); @@ -798,12 +1166,17 @@ *(struct thread_data **)td->eo = td; } + if (td->o.odirect) + td->io_ops->flags |= FIO_RAWIO; + + td_set_ioengine_flags(td); return 0; } static void init_flags(struct thread_data *td) { struct thread_options *o = &td->o; + int i; if (o->verify_backlog) td->flags |= TD_F_VER_BACKLOG; @@ -813,25 +1186,53 @@ td->flags |= TD_F_READ_IOLOG; if (o->refill_buffers) td->flags |= TD_F_REFILL_BUFFERS; - if (o->scramble_buffers) + /* + * Always scramble buffers if asked to + */ + if (o->scramble_buffers && fio_option_is_set(o, scramble_buffers)) + td->flags |= TD_F_SCRAMBLE_BUFFERS; + /* + * But also scramble buffers, unless we were explicitly asked + * to zero them. 
+ */ + if (o->scramble_buffers && !(o->zero_buffers && + fio_option_is_set(o, zero_buffers))) td->flags |= TD_F_SCRAMBLE_BUFFERS; if (o->verify != VERIFY_NONE) - td->flags |= TD_F_VER_NONE; + td->flags |= TD_F_DO_VERIFY; + + if (o->verify_async || o->io_submit_mode == IO_MODE_OFFLOAD) + td->flags |= TD_F_NEED_LOCK; + + if (o->mem_type == MEM_CUDA_MALLOC) + td->flags &= ~TD_F_SCRAMBLE_BUFFERS; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (option_check_rate(td, i)) { + td->flags |= TD_F_CHECK_RATE; + break; + } + } } static int setup_random_seeds(struct thread_data *td) { - unsigned long seed; + uint64_t seed; unsigned int i; - if (!td->o.rand_repeatable) - return init_random_state(td, td->rand_seeds, sizeof(td->rand_seeds)); + if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) { + int ret = init_random_seeds(td->rand_seeds, sizeof(td->rand_seeds)); + if (!ret) + td_fill_rand_seeds(td); + return ret; + } - for (seed = 0x89, i = 0; i < 4; i++) + seed = td->o.rand_seed; + for (i = 0; i < 4; i++) seed *= 0x9e370001UL; for (i = 0; i < FIO_RAND_NR_OFFS; i++) { - td->rand_seeds[i] = seed; + td->rand_seeds[i] = seed * td->thread_number + i; seed *= 0x9e370001UL; } @@ -857,21 +1258,23 @@ { .keyword = NULL, }, }; -static char *make_filename(char *buf, struct thread_options *o, +static char *make_filename(char *buf, size_t buf_size,struct thread_options *o, const char *jobname, int jobnum, int filenum) { struct fpre_keyword *f; char copy[PATH_MAX]; + size_t dst_left = PATH_MAX - 1; if (!o->filename_format || !strlen(o->filename_format)) { sprintf(buf, "%s.%d.%d", jobname, jobnum, filenum); - return NULL; + return buf; } for (f = &fpre_keywords[0]; f->keyword; f++) f->strlen = strlen(f->keyword); - strcpy(buf, o->filename_format); + snprintf(buf, buf_size, "%s", o->filename_format); + memset(copy, 0, sizeof(copy)); for (f = &fpre_keywords[0]; f->keyword; f++) { do { @@ -889,38 +1292,142 @@ if (pre_len) { strncpy(dst, buf, pre_len); dst += pre_len; + 
dst_left -= pre_len; } switch (f->key) { - case FPRE_JOBNAME: - dst += sprintf(dst, "%s", jobname); + case FPRE_JOBNAME: { + int ret; + + ret = snprintf(dst, dst_left, "%s", jobname); + if (ret < 0) + break; + else if (ret > dst_left) { + log_err("fio: truncated filename\n"); + dst += dst_left; + dst_left = 0; + } else { + dst += ret; + dst_left -= ret; + } break; - case FPRE_JOBNUM: - dst += sprintf(dst, "%d", jobnum); + } + case FPRE_JOBNUM: { + int ret; + + ret = snprintf(dst, dst_left, "%d", jobnum); + if (ret < 0) + break; + else if (ret > dst_left) { + log_err("fio: truncated filename\n"); + dst += dst_left; + dst_left = 0; + } else { + dst += ret; + dst_left -= ret; + } break; - case FPRE_FILENUM: - dst += sprintf(dst, "%d", filenum); + } + case FPRE_FILENUM: { + int ret; + + ret = snprintf(dst, dst_left, "%d", filenum); + if (ret < 0) + break; + else if (ret > dst_left) { + log_err("fio: truncated filename\n"); + dst += dst_left; + dst_left = 0; + } else { + dst += ret; + dst_left -= ret; + } break; + } default: assert(0); break; } if (post_start) - strcpy(dst, buf + post_start); + strncpy(dst, buf + post_start, dst_left); - strcpy(buf, copy); + snprintf(buf, buf_size, "%s", copy); } while (1); } return buf; } -int parse_dryrun(void) +bool parse_dryrun(void) { return dump_cmdline || parse_only; } +static void gen_log_name(char *name, size_t size, const char *logtype, + const char *logname, unsigned int num, + const char *suf, int per_job) +{ + if (per_job) + snprintf(name, size, "%s_%s.%d.%s", logname, logtype, num, suf); + else + snprintf(name, size, "%s_%s.%s", logname, logtype, suf); +} + +static int check_waitees(char *waitee) +{ + struct thread_data *td; + int i, ret = 0; + + for_each_td(td, i) { + if (td->subjob_number) + continue; + + ret += !strcmp(td->o.name, waitee); + } + + return ret; +} + +static bool wait_for_ok(const char *jobname, struct thread_options *o) +{ + int nw; + + if (!o->wait_for) + return true; + + if (!strcmp(jobname, 
o->wait_for)) { + log_err("%s: a job cannot wait for itself (wait_for=%s).\n", + jobname, o->wait_for); + return false; + } + + if (!(nw = check_waitees(o->wait_for))) { + log_err("%s: waitee job %s unknown.\n", jobname, o->wait_for); + return false; + } + + if (nw > 1) { + log_err("%s: multiple waitees %s found,\n" + "please avoid duplicates when using wait_for option.\n", + jobname, o->wait_for); + return false; + } + + return true; +} + +/* + * Treat an empty log file name the same as a one not given + */ +static const char *make_log_name(const char *logname, const char *jobname) +{ + if (logname && strcmp(logname, "")) + return logname; + + return jobname; +} + /* * Adds a job to the list of things todo. Sanitizes the various options * to make sure we don't have conflicts, and initializes various @@ -930,9 +1437,10 @@ int recursed, int client_type) { unsigned int i; - char fname[PATH_MAX]; + char fname[PATH_MAX + 1]; int numjobs, file_alloced; struct thread_options *o = &td->o; + char logname[PATH_MAX + 32]; /* * the def_thread is just for options, it's not a real job @@ -958,24 +1466,32 @@ if (ioengine_load(td)) goto err; - if (o->odirect) - td->io_ops->flags |= FIO_RAWIO; - file_alloced = 0; if (!o->filename && !td->files_index && !o->read_iolog_file) { file_alloced = 1; - if (o->nr_files == 1 && exists_and_not_file(jobname)) - add_file(td, jobname); + if (o->nr_files == 1 && exists_and_not_regfile(jobname)) + add_file(td, jobname, job_add_num, 0); else { for (i = 0; i < o->nr_files; i++) - add_file(td, make_filename(fname, o, jobname, td->thread_number, i)); + add_file(td, make_filename(fname, sizeof(fname), o, jobname, job_add_num, i), job_add_num, 0); } } + if (setup_random_seeds(td)) { + td_verror(td, errno, "setup_random_seeds"); + goto err; + } + if (fixup_options(td)) goto err; + /* + * Belongs to fixup_options, but o->name is not necessarily set as yet + */ + if (!wait_for_ok(jobname, o)) + goto err; + flow_init_job(td); /* @@ -985,88 +1501,213 @@ if 
(td->eo) *(struct thread_data **)td->eo = NULL; - if (td->io_ops->flags & FIO_DISKLESSIO) { + if (td_ioengine_flagged(td, FIO_DISKLESSIO)) { struct fio_file *f; for_each_file(td, f, i) f->real_file_size = -1ULL; } - td->mutex = fio_mutex_init(FIO_MUTEX_LOCKED); + td->sem = fio_sem_init(FIO_SEM_LOCKED); td->ts.clat_percentiles = o->clat_percentiles; + td->ts.lat_percentiles = o->lat_percentiles; td->ts.percentile_precision = o->percentile_precision; memcpy(td->ts.percentile_list, o->percentile_list, sizeof(o->percentile_list)); + td->ts.sig_figs = o->sig_figs; for (i = 0; i < DDIR_RWDIR_CNT; i++) { td->ts.clat_stat[i].min_val = ULONG_MAX; td->ts.slat_stat[i].min_val = ULONG_MAX; td->ts.lat_stat[i].min_val = ULONG_MAX; td->ts.bw_stat[i].min_val = ULONG_MAX; + td->ts.iops_stat[i].min_val = ULONG_MAX; } + td->ts.sync_stat.min_val = ULONG_MAX; td->ddir_seq_nr = o->ddir_seq_nr; if ((o->stonewall || o->new_group) && prev_group_jobs) { prev_group_jobs = 0; groupid++; + if (groupid == INT_MAX) { + log_err("fio: too many groups defined\n"); + goto err; + } } td->groupid = groupid; prev_group_jobs++; - if (setup_random_seeds(td)) { - td_verror(td, errno, "init_random_state"); + if (setup_rate(td)) goto err; + + if (o->write_lat_log) { + struct log_params p = { + .td = td, + .avg_msec = o->log_avg_msec, + .hist_msec = o->log_hist_msec, + .hist_coarseness = o->log_hist_coarseness, + .log_type = IO_LOG_TYPE_LAT, + .log_offset = o->log_offset, + .log_gz = o->log_gz, + .log_gz_store = o->log_gz_store, + }; + const char *pre = make_log_name(o->lat_log_file, o->name); + const char *suf; + + if (p.log_gz_store) + suf = "log.fz"; + else + suf = "log"; + + gen_log_name(logname, sizeof(logname), "lat", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->lat_log, &p, logname); + + gen_log_name(logname, sizeof(logname), "slat", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->slat_log, &p, logname); + + gen_log_name(logname, sizeof(logname), "clat", pre, + 
td->thread_number, suf, o->per_job_logs); + setup_log(&td->clat_log, &p, logname); + } - if (setup_rate(td)) - goto err; + if (o->write_hist_log) { + struct log_params p = { + .td = td, + .avg_msec = o->log_avg_msec, + .hist_msec = o->log_hist_msec, + .hist_coarseness = o->log_hist_coarseness, + .log_type = IO_LOG_TYPE_HIST, + .log_offset = o->log_offset, + .log_gz = o->log_gz, + .log_gz_store = o->log_gz_store, + }; + const char *pre = make_log_name(o->hist_log_file, o->name); + const char *suf; + +#ifndef CONFIG_ZLIB + if (td->client_type) { + log_err("fio: --write_hist_log requires zlib in client/server mode\n"); + goto err; + } +#endif + + if (p.log_gz_store) + suf = "log.fz"; + else + suf = "log"; + + gen_log_name(logname, sizeof(logname), "clat_hist", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->clat_hist_log, &p, logname); + } + + if (o->write_bw_log) { + struct log_params p = { + .td = td, + .avg_msec = o->log_avg_msec, + .hist_msec = o->log_hist_msec, + .hist_coarseness = o->log_hist_coarseness, + .log_type = IO_LOG_TYPE_BW, + .log_offset = o->log_offset, + .log_gz = o->log_gz, + .log_gz_store = o->log_gz_store, + }; + const char *pre = make_log_name(o->bw_log_file, o->name); + const char *suf; + + if (fio_option_is_set(o, bw_avg_time)) + p.avg_msec = min(o->log_avg_msec, o->bw_avg_time); + else + o->bw_avg_time = p.avg_msec; + + p.hist_msec = o->log_hist_msec; + p.hist_coarseness = o->log_hist_coarseness; + + if (p.log_gz_store) + suf = "log.fz"; + else + suf = "log"; - if (o->lat_log_file) { - setup_log(&td->lat_log, o->log_avg_msec, IO_LOG_TYPE_LAT); - setup_log(&td->slat_log, o->log_avg_msec, IO_LOG_TYPE_SLAT); - setup_log(&td->clat_log, o->log_avg_msec, IO_LOG_TYPE_CLAT); - } - if (o->bw_log_file) - setup_log(&td->bw_log, o->log_avg_msec, IO_LOG_TYPE_BW); - if (o->iops_log_file) - setup_log(&td->iops_log, o->log_avg_msec, IO_LOG_TYPE_IOPS); + gen_log_name(logname, sizeof(logname), "bw", pre, + td->thread_number, suf, 
o->per_job_logs); + setup_log(&td->bw_log, &p, logname); + } + if (o->write_iops_log) { + struct log_params p = { + .td = td, + .avg_msec = o->log_avg_msec, + .hist_msec = o->log_hist_msec, + .hist_coarseness = o->log_hist_coarseness, + .log_type = IO_LOG_TYPE_IOPS, + .log_offset = o->log_offset, + .log_gz = o->log_gz, + .log_gz_store = o->log_gz_store, + }; + const char *pre = make_log_name(o->iops_log_file, o->name); + const char *suf; + + if (fio_option_is_set(o, iops_avg_time)) + p.avg_msec = min(o->log_avg_msec, o->iops_avg_time); + else + o->iops_avg_time = p.avg_msec; + + p.hist_msec = o->log_hist_msec; + p.hist_coarseness = o->log_hist_coarseness; + + if (p.log_gz_store) + suf = "log.fz"; + else + suf = "log"; + + gen_log_name(logname, sizeof(logname), "iops", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->iops_log, &p, logname); + } if (!o->name) o->name = strdup(jobname); - if (output_format == FIO_OUTPUT_NORMAL) { + if (output_format & FIO_OUTPUT_NORMAL) { if (!job_add_num) { if (is_backend && !recursed) fio_server_send_add_job(td); - if (!(td->io_ops->flags & FIO_NOIO)) { + if (!td_ioengine_flagged(td, FIO_NOIO)) { char *c1, *c2, *c3, *c4; char *c5 = NULL, *c6 = NULL; + int i2p = is_power_of_2(o->kb_base); + struct buf_output out; - c1 = fio_uint_to_kmg(o->min_bs[DDIR_READ]); - c2 = fio_uint_to_kmg(o->max_bs[DDIR_READ]); - c3 = fio_uint_to_kmg(o->min_bs[DDIR_WRITE]); - c4 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]); + c1 = num2str(o->min_bs[DDIR_READ], o->sig_figs, 1, i2p, N2S_BYTE); + c2 = num2str(o->max_bs[DDIR_READ], o->sig_figs, 1, i2p, N2S_BYTE); + c3 = num2str(o->min_bs[DDIR_WRITE], o->sig_figs, 1, i2p, N2S_BYTE); + c4 = num2str(o->max_bs[DDIR_WRITE], o->sig_figs, 1, i2p, N2S_BYTE); if (!o->bs_is_seq_rand) { - c5 = fio_uint_to_kmg(o->min_bs[DDIR_TRIM]); - c6 = fio_uint_to_kmg(o->max_bs[DDIR_TRIM]); + c5 = num2str(o->min_bs[DDIR_TRIM], o->sig_figs, 1, i2p, N2S_BYTE); + c6 = num2str(o->max_bs[DDIR_TRIM], o->sig_figs, 1, i2p, 
N2S_BYTE); } - log_info("%s: (g=%d): rw=%s, ", td->o.name, + buf_output_init(&out); + __log_buf(&out, "%s: (g=%d): rw=%s, ", td->o.name, td->groupid, ddir_str(o->td_ddir)); if (o->bs_is_seq_rand) - log_info("bs(seq/rand)=%s-%s/%s-%s, ", + __log_buf(&out, "bs=(R) %s-%s, (W) %s-%s, bs_is_seq_rand, ", c1, c2, c3, c4); else - log_info("bs=%s-%s/%s-%s/%s-%s, ", + __log_buf(&out, "bs=(R) %s-%s, (W) %s-%s, (T) %s-%s, ", c1, c2, c3, c4, c5, c6); - log_info("ioengine=%s, iodepth=%u\n", + __log_buf(&out, "ioengine=%s, iodepth=%u\n", td->io_ops->name, o->iodepth); + log_info_buf(out.buf, out.buflen); + buf_output_free(&out); free(c1); free(c2); @@ -1079,13 +1720,24 @@ log_info("...\n"); } + if (td_steadystate_init(td)) + goto err; + + if (o->merge_blktrace_file && !merge_blktrace_iologs(td)) + goto err; + + if (merge_blktrace_only) { + put_job(td); + return 0; + } + /* * recurse add identical jobs, clear numjobs and stonewall options * as they don't apply to sub-jobs */ numjobs = o->numjobs; while (--numjobs) { - struct thread_data *td_new = get_new_job(0, td, 1); + struct thread_data *td_new = get_new_job(false, td, true, jobname); if (!td_new) goto err; @@ -1093,17 +1745,30 @@ td_new->o.numjobs = 1; td_new->o.stonewall = 0; td_new->o.new_group = 0; + td_new->subjob_number = numjobs; + td_new->o.ss_dur = o->ss_dur * 1000000l; + td_new->o.ss_limit = o->ss_limit; if (file_alloced) { - td_new->o.filename = NULL; + if (td_new->files) { + struct fio_file *f; + for_each_file(td_new, f, i) { + if (f->file_name) + sfree(f->file_name); + sfree(f); + } + free(td_new->files); + td_new->files = NULL; + } td_new->files_index = 0; td_new->files_size = 0; - td_new->files = NULL; + if (td_new->o.filename) { + free(td_new->o.filename); + td_new->o.filename = NULL; + } } - job_add_num = numjobs - 1; - - if (add_job(td_new, jobname, job_add_num, 1, client_type)) + if (add_job(td_new, jobname, numjobs, 1, client_type)) goto err; } @@ -1133,11 +1798,11 @@ sprintf(jobname, "%s", o[i] + 5); } if 
(in_global && !td_parent) - td_parent = get_new_job(1, &def_thread, 0); + td_parent = get_new_job(true, &def_thread, false, jobname); else if (!in_global && !td) { if (!td_parent) td_parent = &def_thread; - td = get_new_job(0, td_parent, 0); + td = get_new_job(false, td_parent, false, jobname); } if (in_global) fio_options_parse(td_parent, (char **) &o[i], 1); @@ -1185,11 +1850,12 @@ /* * This is our [ini] type file parser. */ -int parse_jobs_ini(char *file, int is_buf, int stonewall_flag, int type) +static int __parse_jobs_ini(struct thread_data *td, + char *file, int is_buf, int stonewall_flag, int type, + int nested, char *name, char ***popts, int *aopts, int *nopts) { - unsigned int global; - struct thread_data *td; - char *string, *name; + bool global = false; + char *string; FILE *f; char *p; int ret = 0, stonewall; @@ -1199,6 +1865,9 @@ char **opts; int i, alloc_opts, num_opts; + dprint(FD_PARSE, "Parsing ini file %s\n", file); + assert(td || !nested); + if (is_buf) f = NULL; else { @@ -1208,22 +1877,37 @@ f = fopen(file, "r"); if (!f) { - perror("fopen job file"); + int __err = errno; + + log_err("fio: unable to open '%s' job file\n", file); + if (td) + td_verror(td, __err, "job file open"); return 1; } } - string = malloc(4096); + string = malloc(OPT_LEN_MAX); /* * it's really 256 + small bit, 280 should suffice */ - name = malloc(280); - memset(name, 0, 280); + if (!nested) { + name = malloc(280); + memset(name, 0, 280); + } + + opts = NULL; + if (nested && popts) { + opts = *popts; + alloc_opts = *aopts; + num_opts = *nopts; + } - alloc_opts = 8; - opts = malloc(sizeof(char *) * alloc_opts); - num_opts = 0; + if (!opts) { + alloc_opts = 8; + opts = malloc(sizeof(char *) * alloc_opts); + num_opts = 0; + } stonewall = stonewall_flag; do { @@ -1235,7 +1919,7 @@ if (is_buf) p = strsep(&file, "\n"); else - p = fgets(string, 4096, f); + p = fgets(string, OPT_LEN_MAX, f); if (!p) break; } @@ -1244,58 +1928,73 @@ strip_blank_front(&p); strip_blank_end(p); + 
dprint(FD_PARSE, "%s\n", p); if (is_empty_or_comment(p)) continue; - if (sscanf(p, "[%255[^\n]]", name) != 1) { - if (inside_skip) + + if (!nested) { + if (sscanf(p, "[%255[^\n]]", name) != 1) { + if (inside_skip) + continue; + + log_err("fio: option <%s> outside of " + "[] job section\n", p); + ret = 1; + break; + } + + name[strlen(name) - 1] = '\0'; + + if (skip_this_section(name)) { + inside_skip = 1; continue; - log_err("fio: option <%s> outside of [] job section\n", - p); - break; - } + } else + inside_skip = 0; - name[strlen(name) - 1] = '\0'; + dprint(FD_PARSE, "Parsing section [%s]\n", name); - if (skip_this_section(name)) { - inside_skip = 1; - continue; - } else - inside_skip = 0; + global = !strncmp(name, "global", 6); - global = !strncmp(name, "global", 6); + if (dump_cmdline) { + if (first_sect) + log_info("fio "); + if (!global) + log_info("--name=%s ", name); + first_sect = 0; + } - if (dump_cmdline) { - if (first_sect) - log_info("fio "); - if (!global) - log_info("--name=%s ", name); - first_sect = 0; - } + td = get_new_job(global, &def_thread, false, name); + if (!td) { + ret = 1; + break; + } - td = get_new_job(global, &def_thread, 0); - if (!td) { - ret = 1; - break; - } + /* + * Separate multiple job files by a stonewall + */ + if (!global && stonewall) { + td->o.stonewall = stonewall; + stonewall = 0; + } - /* - * Seperate multiple job files by a stonewall - */ - if (!global && stonewall) { - td->o.stonewall = stonewall; - stonewall = 0; + num_opts = 0; + memset(opts, 0, alloc_opts * sizeof(char *)); } - - num_opts = 0; - memset(opts, 0, alloc_opts * sizeof(char *)); + else + skip_fgets = 1; while (1) { - if (is_buf) - p = strsep(&file, "\n"); + if (!skip_fgets) { + if (is_buf) + p = strsep(&file, "\n"); + else + p = fgets(string, OPT_LEN_MAX, f); + if (!p) + break; + dprint(FD_PARSE, "%s", p); + } else - p = fgets(string, 4096, f); - if (!p) - break; + skip_fgets = 0; if (is_empty_or_comment(p)) continue; @@ -1307,12 +2006,57 @@ * fgets() a 
new line at the top. */ if (p[0] == '[') { + if (nested) { + log_err("No new sections in included files\n"); + ret = 1; + goto out; + } + skip_fgets = 1; break; } strip_blank_end(p); + if (!strncmp(p, "include", strlen("include"))) { + char *filename = p + strlen("include") + 1, + *ts, *full_fn = NULL; + + /* + * Allow for the include filename + * specification to be relative. + */ + if (access(filename, F_OK) && + (ts = strrchr(file, '/'))) { + if (asprintf(&full_fn, "%.*s%s", + (int)(ts - file + 1), file, + filename) < 0) { + ret = ENOMEM; + break; + } + filename = full_fn; + } + + ret = __parse_jobs_ini(td, filename, is_buf, + stonewall_flag, type, 1, + name, &opts, + &alloc_opts, &num_opts); + + if (ret) { + log_err("Error %d while parsing " + "include file %s\n", + ret, filename); + } + + if (full_fn) + free(full_fn); + + if (ret) + break; + + continue; + } + if (num_opts == alloc_opts) { alloc_opts <<= 1; opts = realloc(opts, @@ -1323,11 +2067,17 @@ num_opts++; } + if (nested) { + *popts = opts; + *aopts = alloc_opts; + *nopts = num_opts; + goto out; + } + ret = fio_options_parse(td, opts, num_opts); if (!ret) { if (dump_cmdline) - for (i = 0; i < num_opts; i++) - log_info("--%s ", opts[i]); + dump_opt_list(td); ret = add_job(td, name, 0, 0, type); } else { @@ -1349,24 +2099,30 @@ i++; } - for (i = 0; i < num_opts; i++) - free(opts[i]); - - free(string); - free(name); free(opts); +out: + free(string); + if (!nested) + free(name); if (!is_buf && f != stdin) fclose(f); return ret; } +int parse_jobs_ini(char *file, int is_buf, int stonewall_flag, int type) +{ + return __parse_jobs_ini(NULL, file, is_buf, stonewall_flag, type, + 0, NULL, NULL, NULL, NULL); +} + static int fill_def_thread(void) { memset(&def_thread, 0, sizeof(def_thread)); + INIT_FLIST_HEAD(&def_thread.opt_list); fio_getaffinity(getpid(), &def_thread.o.cpumask); - def_thread.o.timeout = def_timeout; def_thread.o.error_dump = 1; + /* * fill default options */ @@ -1374,24 +2130,60 @@ return 0; } 
+static void show_debug_categories(void) +{ +#ifdef FIO_INC_DEBUG + const struct debug_level *dl = &debug_levels[0]; + int curlen, first = 1; + + curlen = 0; + while (dl->name) { + int has_next = (dl + 1)->name != NULL; + + if (first || curlen + strlen(dl->name) >= 80) { + if (!first) { + printf("\n"); + curlen = 0; + } + curlen += printf("\t\t\t%s", dl->name); + curlen += 3 * (8 - 1); + if (has_next) + curlen += printf(","); + } else { + curlen += printf("%s", dl->name); + if (has_next) + curlen += printf(","); + } + dl++; + first = 0; + } + printf("\n"); +#endif +} + +/* + * Following options aren't printed by usage(). + * --append-terse - Equivalent to --output-format=terse, see f6a7df53. + * --latency-log - Deprecated option. + */ static void usage(const char *name) { printf("%s\n", fio_version_string); printf("%s [options] [job options] \n", name); - printf(" --debug=options\tEnable debug logging. May be one/more of:\n" - "\t\t\tprocess,file,io,mem,blktrace,verify,random,parse,\n" - "\t\t\tdiskutil,job,mutex,profile,time,net\n"); + printf(" --debug=options\tEnable debug logging. 
May be one/more of:\n"); + show_debug_categories(); printf(" --parse-only\t\tParse options only, don't start any IO\n"); + printf(" --merge-blktrace-only\tMerge blktraces only, don't start any IO\n"); printf(" --output\t\tWrite output to file\n"); - printf(" --runtime\t\tRuntime in seconds\n"); - printf(" --latency-log\t\tGenerate per-job latency logs\n"); - printf(" --bandwidth-log\tGenerate per-job bandwidth logs\n"); + printf(" --bandwidth-log\tGenerate aggregate bandwidth logs\n"); printf(" --minimal\t\tMinimal (terse) output\n"); - printf(" --output-format=x\tOutput format (terse,json,normal)\n"); - printf(" --terse-version=x\tSet terse version output format to 'x'\n"); + printf(" --output-format=type\tOutput format (terse,json,json+,normal)\n"); + printf(" --terse-version=type\tSet terse version output format" + " (default 3, or 2 or 4)\n"); printf(" --version\t\tPrint version info and exit\n"); printf(" --help\t\tPrint this page\n"); printf(" --cpuclock-test\tPerform test/validation of CPU clock\n"); + printf(" --crctest=[type]\tTest speed of checksum functions\n"); printf(" --cmdhelp=cmd\t\tPrint command help, \"all\" for all of" " them\n"); printf(" --enghelp=engine\tPrint ioengine help, or list" @@ -1401,29 +2193,38 @@ printf(" --showcmd\t\tTurn a job file into command line options\n"); printf(" --eta=when\t\tWhen ETA estimate should be printed\n"); printf(" \t\tMay be \"always\", \"never\" or \"auto\"\n"); - printf(" --eta-newline=time\tForce a new line for every 'time'"); + printf(" --eta-newline=t\tForce a new line for every 't'"); printf(" period passed\n"); printf(" --status-interval=t\tForce full status dump every"); printf(" 't' period passed\n"); printf(" --readonly\t\tTurn on safety read-only checks, preventing" " writes\n"); - printf(" --section=name\tOnly run specified section in job file\n"); + printf(" --section=name\tOnly run specified section in job file," + " multiple sections can be specified\n"); printf(" --alloc-size=kb\tSet smalloc 
pool to this size in kb" - " (def 1024)\n"); + " (def 16384)\n"); printf(" --warnings-fatal\tFio parser warnings are fatal\n"); printf(" --max-jobs=nr\t\tMaximum number of threads/processes to support\n"); printf(" --server=args\t\tStart a backend fio server\n"); printf(" --daemonize=pidfile\tBackground fio server, write pid to file\n"); - printf(" --client=hostname\tTalk to remote backend fio server at hostname\n"); + printf(" --client=hostname\tTalk to remote backend(s) fio server at hostname\n"); + printf(" --remote-config=file\tTell fio server to load this local job file\n"); printf(" --idle-prof=option\tReport cpu idleness on a system or percpu basis\n" "\t\t\t(option=system,percpu) or run unit work\n" "\t\t\tcalibration only (option=calibrate)\n"); - printf("\nFio was written by Jens Axboe "); - printf("\n Jens Axboe \n"); +#ifdef CONFIG_ZLIB + printf(" --inflate-log=log\tInflate and output compressed log\n"); +#endif + printf(" --trigger-file=file\tExecute trigger cmd when file exists\n"); + printf(" --trigger-timeout=t\tExecute trigger at this time\n"); + printf(" --trigger=cmd\t\tSet this command as local trigger\n"); + printf(" --trigger-remote=cmd\tSet this command as remote trigger\n"); + printf(" --aux-path=path\tUse this path for fio state generated files\n"); + printf("\nFio was written by Jens Axboe \n"); } #ifdef FIO_INC_DEBUG -struct debug_level debug_levels[] = { +const struct debug_level debug_levels[] = { { .name = "process", .help = "Process creation/exit logging", .shift = FD_PROCESS, @@ -1480,16 +2281,39 @@ .help = "Network logging", .shift = FD_NET, }, + { .name = "rate", + .help = "Rate logging", + .shift = FD_RATE, + }, + { .name = "compress", + .help = "Log compression logging", + .shift = FD_COMPRESS, + }, + { .name = "steadystate", + .help = "Steady state detection logging", + .shift = FD_STEADYSTATE, + }, + { .name = "helperthread", + .help = "Helper thread logging", + .shift = FD_HELPERTHREAD, + }, + { .name = "zbd", + .help = "Zoned 
Block Device logging", + .shift = FD_ZBD, + }, { .name = NULL, }, }; static int set_debug(const char *string) { - struct debug_level *dl; + const struct debug_level *dl; char *p = (char *) string; char *opt; int i; + if (!string) + return 0; + if (!strcmp(string, "?") || !strcmp(string, "help")) { log_info("fio: dumping debug options:"); for (i = 0; debug_levels[i].name; i++) { @@ -1581,19 +2405,74 @@ return 0; } -void parse_cmd_client(void *client, char *opt) +static void parse_cmd_client(void *client, char *opt) { fio_client_add_cmd_option(client, opt); } +static void show_closest_option(const char *name) +{ + int best_option, best_distance; + int i, distance; + + while (*name == '-') + name++; + + best_option = -1; + best_distance = INT_MAX; + i = 0; + while (l_opts[i].name) { + distance = string_distance(name, l_opts[i].name); + if (distance < best_distance) { + best_distance = distance; + best_option = i; + } + i++; + } + + if (best_option != -1 && string_distance_ok(name, best_distance)) + log_err("Did you mean %s?\n", l_opts[best_option].name); +} + +static int parse_output_format(const char *optarg) +{ + char *p, *orig, *opt; + int ret = 0; + + p = orig = strdup(optarg); + + output_format = 0; + + while ((opt = strsep(&p, ",")) != NULL) { + if (!strcmp(opt, "minimal") || + !strcmp(opt, "terse") || + !strcmp(opt, "csv")) + output_format |= FIO_OUTPUT_TERSE; + else if (!strcmp(opt, "json")) + output_format |= FIO_OUTPUT_JSON; + else if (!strcmp(opt, "json+")) + output_format |= (FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS); + else if (!strcmp(opt, "normal")) + output_format |= FIO_OUTPUT_NORMAL; + else { + log_err("fio: invalid output format %s\n", opt); + ret = 1; + break; + } + } + + free(orig); + return ret; +} + int parse_cmd_line(int argc, char *argv[], int client_type) { struct thread_data *td = NULL; int c, ini_idx = 0, lidx, ret = 0, do_exit = 0, exit_val = 0; char *ostr = cmd_optstr; - void *pid_file = NULL; + char *pid_file = NULL; void *cur_client = 
NULL; - int backend = 0; + bool backend = false; /* * Reset optind handling, since we may call this multiple times @@ -1602,8 +2481,6 @@ optind = 1; while ((c = getopt_long_only(argc, argv, ostr, l_opts, &lidx)) != -1) { - did_arg = 1; - if ((c & FIO_CLIENT_FLAG) || client_flag_set(c)) { parse_cmd_client(cur_client, argv[optind - 1]); c &= ~FIO_CLIENT_FLAG; @@ -1612,62 +2489,77 @@ switch (c) { case 'a': smalloc_pool_size = atoi(optarg); - break; - case 't': - def_timeout = atoi(optarg); + smalloc_pool_size <<= 10; + sinit(); break; case 'l': - write_lat_log = 1; + log_err("fio: --latency-log is deprecated. Use per-job latency log options.\n"); + do_exit++; + exit_val = 1; break; case 'b': - write_bw_log = 1; + write_bw_log = true; break; - case 'o': - f_out = fopen(optarg, "w+"); - if (!f_out) { - perror("fopen output"); - exit(1); + case 'o': { + FILE *tmp; + + if (f_out && f_out != stdout) + fclose(f_out); + + tmp = fopen(optarg, "w+"); + if (!tmp) { + log_err("fio: output file open error: %s\n", strerror(errno)); + exit_val = 1; + do_exit++; + break; } - f_err = f_out; + f_err = f_out = tmp; break; + } case 'm': output_format = FIO_OUTPUT_TERSE; break; case 'F': - if (!strcmp(optarg, "minimal") || - !strcmp(optarg, "terse") || - !strcmp(optarg, "csv")) - output_format = FIO_OUTPUT_TERSE; - else if (!strcmp(optarg, "json")) - output_format = FIO_OUTPUT_JSON; - else - output_format = FIO_OUTPUT_NORMAL; + if (parse_output_format(optarg)) { + log_err("fio: failed parsing output-format\n"); + exit_val = 1; + do_exit++; + break; + } + break; + case 'f': + output_format |= FIO_OUTPUT_TERSE; break; case 'h': + did_arg = true; if (!cur_client) { usage(argv[0]); do_exit++; } break; case 'c': + did_arg = true; if (!cur_client) { fio_show_option_help(optarg); do_exit++; } break; case 'i': + did_arg = true; if (!cur_client) { fio_show_ioengine_help(optarg); do_exit++; } break; case 's': - dump_cmdline = 1; + did_arg = true; + dump_cmdline = true; break; case 'r': read_only = 
1; break; case 'v': + did_arg = true; if (!cur_client) { log_info("%s\n", fio_version_string); do_exit++; @@ -1675,8 +2567,7 @@ break; case 'V': terse_version = atoi(optarg); - if (!(terse_version == 2 || terse_version == 3 || - terse_version == 4)) { + if (!(terse_version >= 2 && terse_version <= 5)) { log_err("fio: bad terse version format\n"); exit_val = 1; do_exit++; @@ -1691,12 +2582,35 @@ case 'E': { long long t = 0; - if (str_to_decimal(optarg, &t, 0, NULL)) { + if (check_str_time(optarg, &t, 1)) { log_err("fio: failed parsing eta time %s\n", optarg); exit_val = 1; do_exit++; + break; + } + eta_new_line = t / 1000; + if (!eta_new_line) { + log_err("fio: eta new line time too short\n"); + exit_val = 1; + do_exit++; + } + break; + } + case 'O': { + long long t = 0; + + if (check_str_time(optarg, &t, 1)) { + log_err("fio: failed parsing eta interval %s\n", optarg); + exit_val = 1; + do_exit++; + break; + } + eta_interval_msec = t / 1000; + if (eta_interval_msec < DISK_UTIL_MSEC) { + log_err("fio: eta interval time too short (%umsec min)\n", DISK_UTIL_MSEC); + exit_val = 1; + do_exit++; } - eta_new_line = t; break; } case 'd': @@ -1704,7 +2618,8 @@ do_exit++; break; case 'P': - parse_only = 1; + did_arg = true; + parse_only = true; break; case 'x': { size_t new_size; @@ -1722,7 +2637,17 @@ nr_job_sections++; break; } +#ifdef CONFIG_ZLIB + case 'X': + exit_val = iolog_file_inflate(optarg); + did_arg = true; + do_exit++; + break; +#endif case 'p': + did_arg = true; + if (exec_profile) + free(exec_profile); exec_profile = strdup(optarg); break; case FIO_GETOPT_JOB: { @@ -1732,8 +2657,9 @@ if (!strncmp(opt, "name", 4) && td) { ret = add_job(td, td->o.name ?: "fio", 0, 0, client_type); if (ret) - return 0; + goto out_free; td = NULL; + did_arg = true; } if (!td) { int is_section = !strncmp(opt, "name", 4); @@ -1745,9 +2671,16 @@ if (is_section && skip_this_section(val)) continue; - td = get_new_job(global, &def_thread, 1); - if (!td || ioengine_load(td)) - return 0; 
+ td = get_new_job(global, &def_thread, true, NULL); + if (!td || ioengine_load(td)) { + if (td) { + put_job(td); + td = NULL; + } + do_exit++; + exit_val = 1; + break; + } fio_options_set_ioengine_opts(l_opts, td); } @@ -1764,12 +2697,17 @@ td = NULL; } do_exit++; + exit_val = 1; } if (!ret && !strcmp(opt, "ioengine")) { - free_ioengine(td); - if (ioengine_load(td)) - return 0; + if (ioengine_load(td)) { + put_job(td); + td = NULL; + do_exit++; + exit_val = 1; + break; + } fio_options_set_ioengine_opts(l_opts, td); } break; @@ -1777,6 +2715,10 @@ case FIO_GETOPT_IOENGINE: { const char *opt = l_opts[lidx].name; char *val = optarg; + + if (!td) + break; + ret = fio_cmd_ioengine_option_parse(td, opt, val); break; } @@ -1792,6 +2734,8 @@ } break; case 'S': + did_arg = true; +#ifndef CONFIG_NO_SHM if (nr_clients) { log_err("fio: can't be both client and server\n"); do_exit++; @@ -1800,27 +2744,65 @@ } if (optarg) fio_server_set_arg(optarg); - is_backend = 1; - backend = 1; + is_backend = true; + backend = true; +#else + log_err("fio: client/server requires SHM support\n"); + do_exit++; + exit_val = 1; +#endif break; case 'D': + if (pid_file) + free(pid_file); pid_file = strdup(optarg); break; case 'I': if ((ret = fio_idle_prof_parse_opt(optarg))) { /* exit on error and calibration only */ + did_arg = true; do_exit++; - if (ret == -1) + if (ret == -1) exit_val = 1; } break; case 'C': + did_arg = true; if (is_backend) { log_err("fio: can't be both client and server\n"); do_exit++; exit_val = 1; break; } + /* if --client parameter contains a pathname */ + if (0 == access(optarg, R_OK)) { + /* file contains a list of host addrs or names */ + char hostaddr[PATH_MAX] = {0}; + char formatstr[8]; + FILE * hostf = fopen(optarg, "r"); + if (!hostf) { + log_err("fio: could not open client list file %s for read\n", optarg); + do_exit++; + exit_val = 1; + break; + } + sprintf(formatstr, "%%%ds", PATH_MAX - 1); + /* + * read at most PATH_MAX-1 chars from each + * record in this file 
+ */ + while (fscanf(hostf, formatstr, hostaddr) == 1) { + /* expect EVERY host in file to be valid */ + if (fio_client_add(&fio_client_ops, hostaddr, &cur_client)) { + log_err("fio: failed adding client %s from file %s\n", hostaddr, optarg); + do_exit++; + exit_val = 1; + break; + } + } + fclose(hostf); + break; /* no possibility of job file for "this client only" */ + } if (fio_client_add(&fio_client_ops, optarg, &cur_client)) { log_err("fio: failed adding client %s\n", optarg); do_exit++; @@ -1836,29 +2818,88 @@ !strncmp(argv[optind], "-", 1)) break; - fio_client_add_ini_file(cur_client, argv[optind]); + if (fio_client_add_ini_file(cur_client, argv[optind], false)) + break; optind++; } break; + case 'R': + did_arg = true; + if (fio_client_add_ini_file(cur_client, optarg, true)) { + do_exit++; + exit_val = 1; + } + break; case 'T': + did_arg = true; + do_exit++; + exit_val = fio_monotonic_clocktest(1); + break; + case 'G': + did_arg = true; do_exit++; - exit_val = fio_monotonic_clocktest(); + exit_val = fio_crctest(optarg); + break; + case 'M': + did_arg = true; + do_exit++; + exit_val = fio_memcpy_test(optarg); break; case 'L': { long long val; - if (check_str_time(optarg, &val)) { + if (check_str_time(optarg, &val, 1)) { log_err("fio: failed parsing time %s\n", optarg); do_exit++; exit_val = 1; break; } - status_interval = val * 1000; + if (val < 1000) { + log_err("fio: status interval too small\n"); + do_exit++; + exit_val = 1; + } + status_interval = val / 1000; + break; + } + case 'W': + if (trigger_file) + free(trigger_file); + trigger_file = strdup(optarg); + break; + case 'H': + if (trigger_cmd) + free(trigger_cmd); + trigger_cmd = strdup(optarg); + break; + case 'J': + if (trigger_remote_cmd) + free(trigger_remote_cmd); + trigger_remote_cmd = strdup(optarg); + break; + case 'K': + if (aux_path) + free(aux_path); + aux_path = strdup(optarg); break; + case 'B': + if (check_str_time(optarg, &trigger_timeout, 1)) { + log_err("fio: failed parsing time %s\n", 
optarg); + do_exit++; + exit_val = 1; } + trigger_timeout /= 1000000; + break; + + case 'A': + did_arg = true; + merge_blktrace_only = true; + break; case '?': log_err("%s: unrecognized option '%s'\n", argv[0], argv[optind - 1]); + show_closest_option(argv[optind - 1]); + /* fall through */ default: do_exit++; exit_val = 1; @@ -1871,18 +2912,20 @@ if (do_exit && !(is_backend || nr_clients)) exit(exit_val); - if (nr_clients && fio_clients_connect()) { - do_exit++; - exit_val = 1; - return -1; - } + if (nr_clients && fio_clients_connect()) + exit(1); if (is_backend && backend) return fio_start_server(pid_file); + else if (pid_file) + free(pid_file); if (td) { - if (!ret) + if (!ret) { ret = add_job(td, td->o.name ?: "fio", 0, 0, client_type); + if (ret) + exit(1); + } } while (!ret && optind < argc) { @@ -1892,6 +2935,7 @@ optind++; } +out_free: return ini_idx; } @@ -1927,7 +2971,7 @@ if (job_files > 0) { for (i = 0; i < job_files; i++) { - if (fill_def_thread()) + if (i && fill_def_thread()) return 1; if (nr_clients) { if (fio_clients_send_ini(ini_file[i])) @@ -1948,6 +2992,7 @@ free(ini_file); fio_options_free(&def_thread); + filesetup_mem_free(); if (!thread_number) { if (parse_dryrun()) @@ -1959,23 +3004,12 @@ if (did_arg) return 0; - log_err("No jobs(s) defined\n\n"); - - if (!did_arg) { - usage(argv[0]); - return 1; - } - - return 0; - } - - if (def_thread.o.gtod_offload) { - fio_gtod_init(); - fio_gtod_offload = 1; - fio_gtod_cpu = def_thread.o.gtod_cpu; + log_err("No job(s) defined\n\n"); + usage(argv[0]); + return 1; } - if (output_format == FIO_OUTPUT_NORMAL) + if (output_format & FIO_OUTPUT_NORMAL) log_info("%s\n", fio_version_string); return 0; @@ -1985,3 +3019,8 @@ { memcpy(o, &def_thread.o, sizeof(*o)); } + +struct thread_data *get_global_options(void) +{ + return &def_thread; +} diff -Nru fio-2.1.3/io_ddir.h fio-3.16/io_ddir.h --- fio-2.1.3/io_ddir.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/io_ddir.h 2019-09-20 01:01:52.000000000 +0000 @@ -5,14 
+5,29 @@ DDIR_READ = 0, DDIR_WRITE = 1, DDIR_TRIM = 2, - DDIR_RWDIR_CNT = 3, DDIR_SYNC = 3, DDIR_DATASYNC, DDIR_SYNC_FILE_RANGE, DDIR_WAIT, + DDIR_LAST, DDIR_INVAL = -1, + + DDIR_RWDIR_CNT = 3, + DDIR_RWDIR_SYNC_CNT = 4, }; +static inline const char *io_ddir_name(enum fio_ddir ddir) +{ + static const char *name[] = { "read", "write", "trim", "sync", + "datasync", "sync_file_range", + "wait", }; + + if (ddir < DDIR_LAST) + return name[ddir]; + + return "invalid"; +} + enum td_ddir { TD_DDIR_READ = 1 << 0, TD_DDIR_WRITE = 1 << 1, @@ -23,6 +38,7 @@ TD_DDIR_RANDWRITE = TD_DDIR_WRITE | TD_DDIR_RAND, TD_DDIR_RANDRW = TD_DDIR_RW | TD_DDIR_RAND, TD_DDIR_RANDTRIM = TD_DDIR_TRIM | TD_DDIR_RAND, + TD_DDIR_TRIMWRITE = TD_DDIR_TRIM | TD_DDIR_WRITE, }; #define td_read(td) ((td)->o.td_ddir & TD_DDIR_READ) @@ -30,7 +46,9 @@ #define td_trim(td) ((td)->o.td_ddir & TD_DDIR_TRIM) #define td_rw(td) (((td)->o.td_ddir & TD_DDIR_RW) == TD_DDIR_RW) #define td_random(td) ((td)->o.td_ddir & TD_DDIR_RAND) -#define file_randommap(td, f) (!(td)->o.norandommap && (f)->io_axmap) +#define file_randommap(td, f) (!(td)->o.norandommap && fio_file_axmap((f))) +#define td_trimwrite(td) (((td)->o.td_ddir & TD_DDIR_TRIMWRITE) \ + == TD_DDIR_TRIMWRITE) static inline int ddir_sync(enum fio_ddir ddir) { @@ -45,15 +63,13 @@ static inline const char *ddir_str(enum td_ddir ddir) { - const char *ddir_str[] = { NULL, "read", "write", "rw", NULL, - "randread", "randwrite", "randrw", - "trim", NULL, NULL, NULL, "randtrim" }; + static const char *__str[] = { NULL, "read", "write", "rw", "rand", + "randread", "randwrite", "randrw", + "trim", NULL, "trimwrite", NULL, "randtrim" }; - return ddir_str[ddir]; + return __str[ddir]; } -#define ddir_trim(ddir) ((ddir) == DDIR_TRIM) - #define ddir_rw_sum(arr) \ ((arr)[DDIR_READ] + (arr)[DDIR_WRITE] + (arr)[DDIR_TRIM]) diff -Nru fio-2.1.3/ioengine.h fio-3.16/ioengine.h --- fio-2.1.3/ioengine.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/ioengine.h 1970-01-01 
00:00:00.000000000 +0000 @@ -1,238 +0,0 @@ -#ifndef FIO_IOENGINE_H -#define FIO_IOENGINE_H - -#include "compiler/compiler.h" -#include "os/os.h" -#include "log.h" -#include "io_ddir.h" -#include "debug.h" -#include "file.h" - -#ifdef CONFIG_LIBAIO -#include -#endif -#ifdef CONFIG_GUASI -#include -#endif - -#define FIO_IOOPS_VERSION 16 - -enum { - IO_U_F_FREE = 1 << 0, - IO_U_F_FLIGHT = 1 << 1, - IO_U_F_FREE_DEF = 1 << 2, - IO_U_F_IN_CUR_DEPTH = 1 << 3, - IO_U_F_BUSY_OK = 1 << 4, - IO_U_F_TRIMMED = 1 << 5, - IO_U_F_BARRIER = 1 << 6, - IO_U_F_VER_LIST = 1 << 7, -}; - -/* - * The io unit - */ -struct io_u { - struct timeval start_time; - struct timeval issue_time; - - struct fio_file *file; - unsigned int flags; - enum fio_ddir ddir; - - /* - * For replay workloads, we may want to account as a different - * IO type than what is being submitted. - */ - enum fio_ddir acct_ddir; - - /* - * Allocated/set buffer and length - */ - unsigned long buflen; - unsigned long long offset; - void *buf; - - /* - * Initial seed for generating the buffer contents - */ - unsigned long rand_seed; - - /* - * IO engine state, may be different from above when we get - * partial transfers / residual data counts - */ - void *xfer_buf; - unsigned long xfer_buflen; - - /* - * Parameter related to pre-filled buffers and - * their size to handle variable block sizes. 
- */ - unsigned long buf_filled_len; - - union { -#ifdef CONFIG_LIBAIO - struct iocb iocb; -#endif -#ifdef CONFIG_POSIXAIO - os_aiocb_t aiocb; -#endif -#ifdef FIO_HAVE_SGIO - struct sg_io_hdr hdr; -#endif -#ifdef CONFIG_GUASI - guasi_req_t greq; -#endif -#ifdef CONFIG_SOLARISAIO - aio_result_t resultp; -#endif -#ifdef FIO_HAVE_BINJECT - struct b_user_cmd buc; -#endif -#ifdef CONFIG_RDMA - struct ibv_mr *mr; -#endif - void *mmap_data; - }; - - unsigned int resid; - unsigned int error; - - /* - * io engine private data - */ - union { - unsigned int index; - unsigned int seen; - void *engine_data; - }; - - struct flist_head verify_list; - - /* - * Callback for io completion - */ - int (*end_io)(struct thread_data *, struct io_u *); -}; - -/* - * io_ops->queue() return values - */ -enum { - FIO_Q_COMPLETED = 0, /* completed sync */ - FIO_Q_QUEUED = 1, /* queued, will complete async */ - FIO_Q_BUSY = 2, /* no more room, call ->commit() */ -}; - -struct ioengine_ops { - struct flist_head list; - char name[16]; - int version; - int flags; - int (*setup)(struct thread_data *); - int (*init)(struct thread_data *); - int (*prep)(struct thread_data *, struct io_u *); - int (*queue)(struct thread_data *, struct io_u *); - int (*commit)(struct thread_data *); - int (*getevents)(struct thread_data *, unsigned int, unsigned int, struct timespec *); - struct io_u *(*event)(struct thread_data *, int); - int (*cancel)(struct thread_data *, struct io_u *); - void (*cleanup)(struct thread_data *); - int (*open_file)(struct thread_data *, struct fio_file *); - int (*close_file)(struct thread_data *, struct fio_file *); - int (*get_file_size)(struct thread_data *, struct fio_file *); - void (*terminate)(struct thread_data *); - int (*io_u_init)(struct thread_data *, struct io_u *); - void (*io_u_free)(struct thread_data *, struct io_u *); - int option_struct_size; - struct fio_option *options; - void *data; - void *dlhandle; -}; - -enum fio_ioengine_flags { - FIO_SYNCIO = 1 << 0, /* io 
engine has synchronous ->queue */ - FIO_RAWIO = 1 << 1, /* some sort of direct/raw io */ - FIO_DISKLESSIO = 1 << 2, /* no disk involved */ - FIO_NOEXTEND = 1 << 3, /* engine can't extend file */ - FIO_NODISKUTIL = 1 << 4, /* diskutil can't handle filename */ - FIO_UNIDIR = 1 << 5, /* engine is uni-directional */ - FIO_NOIO = 1 << 6, /* thread does only pseudo IO */ - FIO_PIPEIO = 1 << 7, /* input/output no seekable */ - FIO_BARRIER = 1 << 8, /* engine supports barriers */ - FIO_MEMALIGN = 1 << 9, /* engine wants aligned memory */ - FIO_BIT_BASED = 1 << 10, /* engine uses a bit base (e.g. uses Kbit as opposed to KB) */ -}; - -/* - * io engine entry points - */ -extern int __must_check td_io_init(struct thread_data *); -extern int __must_check td_io_prep(struct thread_data *, struct io_u *); -extern int __must_check td_io_queue(struct thread_data *, struct io_u *); -extern int __must_check td_io_sync(struct thread_data *, struct fio_file *); -extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, struct timespec *); -extern int __must_check td_io_commit(struct thread_data *); -extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *); -extern int td_io_close_file(struct thread_data *, struct fio_file *); -extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *); - -extern struct ioengine_ops *load_ioengine(struct thread_data *, const char *); -extern void register_ioengine(struct ioengine_ops *); -extern void unregister_ioengine(struct ioengine_ops *); -extern void free_ioengine(struct thread_data *); -extern void close_ioengine(struct thread_data *); - -extern int fio_show_ioengine_help(const char *engine); - -/* - * io unit handling - */ -#define queue_full(td) io_u_qempty(&(td)->io_u_freelist) -extern struct io_u *__get_io_u(struct thread_data *); -extern struct io_u *get_io_u(struct thread_data *); -extern void put_io_u(struct thread_data *, struct io_u *); -extern void 
clear_io_u(struct thread_data *, struct io_u *); -extern void requeue_io_u(struct thread_data *, struct io_u **); -extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *, uint64_t *); -extern int __must_check io_u_queued_complete(struct thread_data *, int, uint64_t *); -extern void io_u_queued(struct thread_data *, struct io_u *); -extern void io_u_quiesce(struct thread_data *); -extern void io_u_log_error(struct thread_data *, struct io_u *); -extern void io_u_mark_depth(struct thread_data *, unsigned int); -extern void fill_io_buffer(struct thread_data *, void *, unsigned int, unsigned int); -extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int); -void io_u_mark_complete(struct thread_data *, unsigned int); -void io_u_mark_submit(struct thread_data *, unsigned int); - -int do_io_u_sync(struct thread_data *, struct io_u *); -int do_io_u_trim(struct thread_data *, struct io_u *); - -#ifdef FIO_INC_DEBUG -static inline void dprint_io_u(struct io_u *io_u, const char *p) -{ - struct fio_file *f = io_u->file; - - dprint(FD_IO, "%s: io_u %p: off=%llu/len=%lu/ddir=%d", p, io_u, - (unsigned long long) io_u->offset, - io_u->buflen, io_u->ddir); - if (fio_debug & (1 << FD_IO)) { - if (f) - log_info("/%s", f->file_name); - - log_info("\n"); - } -} -#else -#define dprint_io_u(io_u, p) -#endif - -static inline enum fio_ddir acct_ddir(struct io_u *io_u) -{ - if (io_u->acct_ddir != -1) - return io_u->acct_ddir; - - return io_u->ddir; -} - -#endif diff -Nru fio-2.1.3/ioengines.c fio-3.16/ioengines.c --- fio-2.1.3/ioengines.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/ioengines.c 2019-09-20 01:01:52.000000000 +0000 @@ -9,7 +9,6 @@ * generic io engine that could be used for other projects. 
* */ -#include #include #include #include @@ -19,55 +18,46 @@ #include "fio.h" #include "diskutil.h" +#include "zbd.h" static FLIST_HEAD(engine_list); -static int check_engine_ops(struct ioengine_ops *ops) +static bool check_engine_ops(struct ioengine_ops *ops) { if (ops->version != FIO_IOOPS_VERSION) { log_err("bad ioops version %d (want %d)\n", ops->version, FIO_IOOPS_VERSION); - return 1; + return true; } if (!ops->queue) { log_err("%s: no queue handler\n", ops->name); - return 1; + return true; } /* * sync engines only need a ->queue() */ if (ops->flags & FIO_SYNCIO) - return 0; + return false; - if (!ops->event) { - log_err("%s: no event handler\n", ops->name); - return 1; - } - if (!ops->getevents) { - log_err("%s: no getevents handler\n", ops->name); - return 1; - } - if (!ops->queue) { - log_err("%s: no queue handler\n", ops->name); - return 1; + if (!ops->event || !ops->getevents) { + log_err("%s: no event/getevents handler\n", ops->name); + return true; } - return 0; + return false; } void unregister_ioengine(struct ioengine_ops *ops) { dprint(FD_IO, "ioengine %s unregistered\n", ops->name); - flist_del(&ops->list); - INIT_FLIST_HEAD(&ops->list); + flist_del_init(&ops->list); } void register_ioengine(struct ioengine_ops *ops) { dprint(FD_IO, "ioengine %s registered\n", ops->name); - INIT_FLIST_HEAD(&ops->list); flist_add_tail(&ops->list, &engine_list); } @@ -107,35 +97,75 @@ ops = dlsym(dlhandle, engine_lib); if (!ops) ops = dlsym(dlhandle, "ioengine"); + + /* + * For some external engines (like C++ ones) it is not that trivial + * to provide a non-static ionengine structure that we can reference. + * Instead we call a method which allocates the required ioengine + * structure. 
+ */ + if (!ops) { + get_ioengine_t get_ioengine = dlsym(dlhandle, "get_ioengine"); + + if (get_ioengine) + get_ioengine(&ops); + } + if (!ops) { td_vmsg(td, -1, dlerror(), "dlsym"); dlclose(dlhandle); return NULL; } - ops->dlhandle = dlhandle; + td->io_ops_dlhandle = dlhandle; return ops; } -struct ioengine_ops *load_ioengine(struct thread_data *td, const char *name) +static struct ioengine_ops *__load_ioengine(const char *name) { - struct ioengine_ops *ops, *ret; - char engine[16]; - - dprint(FD_IO, "load ioengine %s\n", name); + char engine[64]; - strncpy(engine, name, sizeof(engine) - 1); + snprintf(engine, sizeof(engine), "%s", name); /* * linux libaio has alias names, so convert to what we want */ - if (!strncmp(engine, "linuxaio", 8) || !strncmp(engine, "aio", 3)) + if (!strncmp(engine, "linuxaio", 8)) { + dprint(FD_IO, "converting ioengine name: %s -> libaio\n", name); strcpy(engine, "libaio"); + } - ops = find_ioengine(engine); + dprint(FD_IO, "load ioengine %s\n", engine); + return find_ioengine(engine); +} + +struct ioengine_ops *load_ioengine(struct thread_data *td) +{ + struct ioengine_ops *ops = NULL; + const char *name; + + /* + * Use ->ioengine_so_path if an external ioengine path is specified. + * In this case, ->ioengine is "external" which also means the prefix + * for external ioengines "external:" is properly used. + */ + name = td->o.ioengine_so_path ?: td->o.ioengine; + + /* + * Try to load ->ioengine first, and if failed try to dlopen(3) either + * ->ioengine or ->ioengine_so_path. This is redundant for an external + * ioengine with prefix, and also leaves the possibility of unexpected + * behavior (e.g. if the "external" ioengine exists), but we do this + * so as not to break job files not using the prefix. + */ + ops = __load_ioengine(td->o.ioengine); if (!ops) ops = dlopen_ioengine(td, name); + /* + * If ops is NULL, we failed to load ->ioengine, and also failed to + * dlopen(3) either ->ioengine or ->ioengine_so_path as a path. 
+ */ if (!ops) { log_err("fio: engine %s not loadable\n", name); return NULL; @@ -147,11 +177,7 @@ if (check_engine_ops(ops)) return NULL; - ret = malloc(sizeof(*ret)); - memcpy(ret, ops, sizeof(*ret)); - ret->data = NULL; - - return ret; + return ops; } /* @@ -167,10 +193,11 @@ td->eo = NULL; } - if (td->io_ops->dlhandle) - dlclose(td->io_ops->dlhandle); + if (td->io_ops_dlhandle) { + dlclose(td->io_ops_dlhandle); + td->io_ops_dlhandle = NULL; + } - free(td->io_ops); td->io_ops = NULL; } @@ -180,7 +207,7 @@ if (td->io_ops->cleanup) { td->io_ops->cleanup(td); - td->io_ops->data = NULL; + td->io_ops_data = NULL; } free_ioengine(td); @@ -196,7 +223,8 @@ if (td->io_ops->prep) { int ret = td->io_ops->prep(td, io_u); - dprint(FD_IO, "->prep(%p)=%d\n", io_u, ret); + dprint(FD_IO, "prep: io_u %p: ret=%d\n", io_u, ret); + if (ret) unlock_file(td, io_u->file); return ret; @@ -206,7 +234,7 @@ } int td_io_getevents(struct thread_data *td, unsigned int min, unsigned int max, - struct timespec *t) + const struct timespec *t) { int r = 0; @@ -248,15 +276,26 @@ return r; } -int td_io_queue(struct thread_data *td, struct io_u *io_u) +enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) { - int ret; + const enum fio_ddir ddir = acct_ddir(io_u); + unsigned long long buflen = io_u->xfer_buflen; + enum fio_q_status ret; dprint_io_u(io_u, "queue"); fio_ro_check(td, io_u); assert((io_u->flags & IO_U_F_FLIGHT) == 0); - io_u->flags |= IO_U_F_FLIGHT; + io_u_set(td, io_u, IO_U_F_FLIGHT); + + /* + * If overlap checking was enabled in offload mode we + * can release this lock that was acquired when we + * started the overlap check because the IO_U_F_FLIGHT + * flag is now set + */ + if (td_offload_overlap(td)) + pthread_mutex_unlock(&overlap_check); assert(fio_file_open(io_u->file)); @@ -268,7 +307,9 @@ io_u->error = 0; io_u->resid = 0; - if (td->io_ops->flags & FIO_SYNCIO) { + if (td_ioengine_flagged(td, FIO_SYNCIO) || + (td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) && 
+ io_u->ddir == DDIR_TRIM)) { if (fio_fill_issue_time(td)) fio_gettime(&io_u->issue_time, NULL); @@ -277,16 +318,29 @@ */ if (td->o.read_iolog_file) memcpy(&td->last_issue, &io_u->issue_time, - sizeof(struct timeval)); + sizeof(io_u->issue_time)); } - if (ddir_rw(acct_ddir(io_u))) - td->io_issues[acct_ddir(io_u)]++; + if (ddir_rw(ddir)) { + if (!(io_u->flags & IO_U_F_VER_LIST)) { + td->io_issues[ddir]++; + td->io_issue_bytes[ddir] += buflen; + } + td->rate_io_issue_bytes[ddir] += buflen; + } ret = td->io_ops->queue(td, io_u); + zbd_queue_io_u(io_u, ret); unlock_file(td, io_u->file); + if (ret == FIO_Q_BUSY && ddir_rw(ddir)) { + td->io_issues[ddir]--; + td->io_issue_bytes[ddir] -= buflen; + td->rate_io_issue_bytes[ddir] -= buflen; + io_u_clear(td, io_u, IO_U_F_FLIGHT); + } + /* * If an error was seen and the io engine didn't propagate it * back to 'td', do so. @@ -304,35 +358,41 @@ td->o.odirect) { log_info("fio: first direct IO errored. File system may not " - "support direct IO, or iomem_align= is bad.\n"); + "support direct IO, or iomem_align= is bad, or " + "invalid block size. Try setting direct=0.\n"); + } + + if (zbd_unaligned_write(io_u->error) && + td->io_issues[io_u->ddir & 1] == 1 && + td->o.zone_mode != ZONE_MODE_ZBD) { + log_info("fio: first I/O failed. 
If %s is a zoned block device, consider --zonemode=zbd\n", + io_u->file->file_name); } - if (!td->io_ops->commit || ddir_trim(io_u->ddir)) { + if (!td->io_ops->commit) { io_u_mark_submit(td, 1); io_u_mark_complete(td, 1); + zbd_put_io_u(io_u); } if (ret == FIO_Q_COMPLETED) { - if (ddir_rw(io_u->ddir)) { + if (ddir_rw(io_u->ddir) || ddir_sync(io_u->ddir)) { io_u_mark_depth(td, 1); td->ts.total_io_u[io_u->ddir]++; } } else if (ret == FIO_Q_QUEUED) { - int r; + td->io_u_queued++; - if (ddir_rw(io_u->ddir)) { - td->io_u_queued++; + if (ddir_rw(io_u->ddir) || ddir_sync(io_u->ddir)) td->ts.total_io_u[io_u->ddir]++; - } - if (td->io_u_queued >= td->o.iodepth_batch) { - r = td_io_commit(td); - if (r < 0) - return r; - } + if (td->io_u_queued >= td->o.iodepth_batch) + td_io_commit(td); } - if ((td->io_ops->flags & FIO_SYNCIO) == 0) { + if (!td_ioengine_flagged(td, FIO_SYNCIO) && + (!td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) || + io_u->ddir != DDIR_TRIM)) { if (fio_fill_issue_time(td)) fio_gettime(&io_u->issue_time, NULL); @@ -341,7 +401,7 @@ */ if (td->o.read_iolog_file) memcpy(&td->last_issue, &io_u->issue_time, - sizeof(struct timeval)); + sizeof(io_u->issue_time)); } return ret; @@ -353,10 +413,13 @@ if (td->io_ops->init) { ret = td->io_ops->init(td); - if (ret && td->o.iodepth > 1) { - log_err("fio: io engine init failed. Perhaps try" - " reducing io depth?\n"); - } + if (ret) + log_err("fio: io engine %s init failed.%s\n", + td->io_ops->name, + td->o.iodepth > 1 ? + " Perhaps try reducing io depth?" 
: ""); + else + td->io_ops_init = 1; if (!td->error) td->error = ret; } @@ -364,14 +427,14 @@ return ret; } -int td_io_commit(struct thread_data *td) +void td_io_commit(struct thread_data *td) { int ret; dprint(FD_IO, "calling ->commit(), depth %d\n", td->cur_depth); if (!td->cur_depth || !td->io_u_queued) - return 0; + return; io_u_mark_depth(td, td->io_u_queued); @@ -386,14 +449,21 @@ */ td->io_u_in_flight += td->io_u_queued; td->io_u_queued = 0; - - return 0; } int td_io_open_file(struct thread_data *td, struct fio_file *f) { + if (fio_file_closing(f)) { + /* + * Open translates to undo closing. + */ + fio_file_clear_closing(f); + get_file(f); + return 0; + } assert(!fio_file_open(f)); assert(f->fd == -1); + assert(td->io_ops->open_file); if (td->io_ops->open_file(td, f)) { if (td->error == EINVAL && td->o.odirect) @@ -424,43 +494,62 @@ } } - if (td->io_ops->flags & FIO_DISKLESSIO) + if (td_ioengine_flagged(td, FIO_DISKLESSIO)) goto done; if (td->o.invalidate_cache && file_invalidate_cache(td, f)) goto err; - if (td->o.fadvise_hint && - (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_FILE)) { + if (td->o.fadvise_hint != F_ADV_NONE && + (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) { int flags; - if (td_random(td)) + if (td->o.fadvise_hint == F_ADV_TYPE) { + if (td_random(td)) + flags = POSIX_FADV_RANDOM; + else + flags = POSIX_FADV_SEQUENTIAL; + } else if (td->o.fadvise_hint == F_ADV_RANDOM) flags = POSIX_FADV_RANDOM; - else + else if (td->o.fadvise_hint == F_ADV_SEQUENTIAL) flags = POSIX_FADV_SEQUENTIAL; + else { + log_err("fio: unknown fadvise type %d\n", + td->o.fadvise_hint); + flags = POSIX_FADV_NORMAL; + } if (posix_fadvise(f->fd, f->file_offset, f->io_size, flags) < 0) { - td_verror(td, errno, "fadvise"); - goto err; + if (!fio_did_warn(FIO_WARN_FADVISE)) + log_err("fio: fadvise hint failed\n"); } } +#ifdef FIO_HAVE_WRITE_HINT + if (fio_option_is_set(&td->o, write_hint) && + (f->filetype == FIO_TYPE_BLOCK || f->filetype == 
FIO_TYPE_FILE)) { + uint64_t hint = td->o.write_hint; + int cmd; -#ifdef FIO_OS_DIRECTIO - /* - * Some OS's have a distinct call to mark the file non-buffered, - * instead of using O_DIRECT (Solaris) - */ - if (td->o.odirect) { - int ret = fio_set_odirect(f->fd); + /* + * For direct IO, we just need/want to set the hint on + * the file descriptor. For buffered IO, we need to set + * it on the inode. + */ + if (td->o.odirect) + cmd = F_SET_FILE_RW_HINT; + else + cmd = F_SET_RW_HINT; - if (ret) { - td_verror(td, ret, "fio_set_odirect"); - log_err("fio: the file system does not seem to support direct IO\n"); + if (fcntl(f->fd, cmd, &hint) < 0) { + td_verror(td, errno, "fcntl write hint"); goto err; } } #endif + if (td->o.odirect && !OS_O_DIRECT && fio_set_directio(td, f)) + goto err; + done: log_file(td, f, FIO_LOG_OPEN_FILE); return 0; @@ -481,92 +570,45 @@ */ fio_file_set_closing(f); - disk_util_dec(f->du); - - if (td->o.file_lock_mode != FILE_LOCK_NONE) - unlock_file_all(td, f); - return put_file(td, f); } -int td_io_get_file_size(struct thread_data *td, struct fio_file *f) -{ - if (!td->io_ops->get_file_size) - return 0; - - return td->io_ops->get_file_size(td, f); -} - -static int do_sync_file_range(struct thread_data *td, struct fio_file *f) +int td_io_unlink_file(struct thread_data *td, struct fio_file *f) { - off64_t offset, nbytes; + if (td->io_ops->unlink_file) + return td->io_ops->unlink_file(td, f); + else { + int ret; - offset = f->first_write; - nbytes = f->last_write - f->first_write; + ret = unlink(f->file_name); + if (ret < 0) + return errno; - if (!nbytes) return 0; - - return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range); -} - -int do_io_u_sync(struct thread_data *td, struct io_u *io_u) -{ - int ret; - - if (io_u->ddir == DDIR_SYNC) { - ret = fsync(io_u->file->fd); - } else if (io_u->ddir == DDIR_DATASYNC) { -#ifdef CONFIG_FDATASYNC - ret = fdatasync(io_u->file->fd); -#else - ret = io_u->xfer_buflen; - io_u->error = EINVAL; -#endif - 
} else if (io_u->ddir == DDIR_SYNC_FILE_RANGE) - ret = do_sync_file_range(td, io_u->file); - else { - ret = io_u->xfer_buflen; - io_u->error = EINVAL; } - - if (ret < 0) - io_u->error = errno; - - return ret; } -int do_io_u_trim(struct thread_data *td, struct io_u *io_u) +int td_io_get_file_size(struct thread_data *td, struct fio_file *f) { -#ifndef FIO_HAVE_TRIM - io_u->error = EINVAL; - return 0; -#else - struct fio_file *f = io_u->file; - int ret; - - ret = os_trim(f->fd, io_u->offset, io_u->xfer_buflen); - if (!ret) - return io_u->xfer_buflen; + if (!td->io_ops->get_file_size) + return 0; - io_u->error = ret; - return 0; -#endif + return td->io_ops->get_file_size(td, f); } int fio_show_ioengine_help(const char *engine) { struct flist_head *entry; struct thread_data td; + struct ioengine_ops *io_ops; char *sep; int ret = 1; if (!engine || !*engine) { log_info("Available IO engines:\n"); flist_for_each(entry, &engine_list) { - td.io_ops = flist_entry(entry, struct ioengine_ops, - list); - log_info("\t%s\n", td.io_ops->name); + io_ops = flist_entry(entry, struct ioengine_ops, list); + log_info("\t%s\n", io_ops->name); } return 0; } @@ -576,20 +618,20 @@ sep++; } - memset(&td, 0, sizeof(td)); + memset(&td, 0, sizeof(struct thread_data)); + td.o.ioengine = (char *)engine; + io_ops = load_ioengine(&td); - td.io_ops = load_ioengine(&td, engine); - if (!td.io_ops) { + if (!io_ops) { log_info("IO engine %s not found\n", engine); return 1; } - if (td.io_ops->options) - ret = show_cmd_help(td.io_ops->options, sep); + if (io_ops->options) + ret = show_cmd_help(io_ops->options, sep); else - log_info("IO engine %s has no options\n", td.io_ops->name); + log_info("IO engine %s has no options\n", io_ops->name); free_ioengine(&td); - return ret; } diff -Nru fio-2.1.3/ioengines.h fio-3.16/ioengines.h --- fio-2.1.3/ioengines.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/ioengines.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,96 @@ +#ifndef FIO_IOENGINE_H +#define 
FIO_IOENGINE_H + +#include + +#include "compiler/compiler.h" +#include "flist.h" +#include "io_u.h" + +#define FIO_IOOPS_VERSION 25 + +/* + * io_ops->queue() return values + */ +enum fio_q_status { + FIO_Q_COMPLETED = 0, /* completed sync */ + FIO_Q_QUEUED = 1, /* queued, will complete async */ + FIO_Q_BUSY = 2, /* no more room, call ->commit() */ +}; + +struct ioengine_ops { + struct flist_head list; + const char *name; + int version; + int flags; + int (*setup)(struct thread_data *); + int (*init)(struct thread_data *); + int (*post_init)(struct thread_data *); + int (*prep)(struct thread_data *, struct io_u *); + enum fio_q_status (*queue)(struct thread_data *, struct io_u *); + int (*commit)(struct thread_data *); + int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *); + struct io_u *(*event)(struct thread_data *, int); + char *(*errdetails)(struct io_u *); + int (*cancel)(struct thread_data *, struct io_u *); + void (*cleanup)(struct thread_data *); + int (*open_file)(struct thread_data *, struct fio_file *); + int (*close_file)(struct thread_data *, struct fio_file *); + int (*invalidate)(struct thread_data *, struct fio_file *); + int (*unlink_file)(struct thread_data *, struct fio_file *); + int (*get_file_size)(struct thread_data *, struct fio_file *); + void (*terminate)(struct thread_data *); + int (*iomem_alloc)(struct thread_data *, size_t); + void (*iomem_free)(struct thread_data *); + int (*io_u_init)(struct thread_data *, struct io_u *); + void (*io_u_free)(struct thread_data *, struct io_u *); + int option_struct_size; + struct fio_option *options; +}; + +enum fio_ioengine_flags { + FIO_SYNCIO = 1 << 0, /* io engine has synchronous ->queue */ + FIO_RAWIO = 1 << 1, /* some sort of direct/raw io */ + FIO_DISKLESSIO = 1 << 2, /* no disk involved */ + FIO_NOEXTEND = 1 << 3, /* engine can't extend file */ + FIO_NODISKUTIL = 1 << 4, /* diskutil can't handle filename */ + FIO_UNIDIR = 1 << 5, /* engine is 
uni-directional */ + FIO_NOIO = 1 << 6, /* thread does only pseudo IO */ + FIO_PIPEIO = 1 << 7, /* input/output no seekable */ + FIO_BARRIER = 1 << 8, /* engine supports barriers */ + FIO_MEMALIGN = 1 << 9, /* engine wants aligned memory */ + FIO_BIT_BASED = 1 << 10, /* engine uses a bit base (e.g. uses Kbit as opposed to KB) */ + FIO_FAKEIO = 1 << 11, /* engine pretends to do IO */ + FIO_NOSTATS = 1 << 12, /* don't do IO stats */ + FIO_NOFILEHASH = 1 << 13, /* doesn't hash the files for lookup later. */ + FIO_ASYNCIO_SYNC_TRIM + = 1 << 14 /* io engine has async ->queue except for trim */ +}; + +/* + * External engine defined symbol to fill in the engine ops structure + */ +typedef void (*get_ioengine_t)(struct ioengine_ops **); + +/* + * io engine entry points + */ +extern int __must_check td_io_init(struct thread_data *); +extern int __must_check td_io_prep(struct thread_data *, struct io_u *); +extern enum fio_q_status __must_check td_io_queue(struct thread_data *, struct io_u *); +extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, const struct timespec *); +extern void td_io_commit(struct thread_data *); +extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *); +extern int td_io_close_file(struct thread_data *, struct fio_file *); +extern int td_io_unlink_file(struct thread_data *, struct fio_file *); +extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *); + +extern struct ioengine_ops *load_ioengine(struct thread_data *); +extern void register_ioengine(struct ioengine_ops *); +extern void unregister_ioengine(struct ioengine_ops *); +extern void free_ioengine(struct thread_data *); +extern void close_ioengine(struct thread_data *); + +extern int fio_show_ioengine_help(const char *engine); + +#endif diff -Nru fio-2.1.3/iolog.c fio-3.16/iolog.c --- fio-2.1.3/iolog.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/iolog.c 2019-09-20 01:01:52.000000000 +0000 @@ -4,12 
+4,30 @@ */ #include #include -#include #include +#include +#include +#include +#ifdef CONFIG_ZLIB +#include +#endif + #include "flist.h" #include "fio.h" -#include "verify.h" #include "trim.h" +#include "filelock.h" +#include "smalloc.h" +#include "blktrace.h" +#include "pshared.h" + +#include +#include +#include +#include +#include +#include + +static int iolog_flush(struct io_log *log); static const char iolog_ver2[] = "fio version 2 iolog"; @@ -19,19 +37,14 @@ td->total_io_size += ipo->len; } -void log_io_u(struct thread_data *td, struct io_u *io_u) +void log_io_u(const struct thread_data *td, const struct io_u *io_u) { - const char *act[] = { "read", "write", "sync", "datasync", - "sync_file_range", "wait", "trim" }; - - assert(io_u->ddir <= 6); - if (!td->o.write_iolog_file) return; - fprintf(td->iolog_f, "%s %s %llu %lu\n", io_u->file->file_name, - act[io_u->ddir], io_u->offset, - io_u->buflen); + fprintf(td->iolog_f, "%s %s %llu %llu\n", io_u->file->file_name, + io_ddir_name(io_u->ddir), + io_u->offset, io_u->buflen); } void log_file(struct thread_data *td, struct fio_file *f, @@ -56,20 +69,37 @@ static void iolog_delay(struct thread_data *td, unsigned long delay) { - unsigned long usec = utime_since_now(&td->last_issue); + uint64_t usec = utime_since_now(&td->last_issue); + unsigned long orig_delay = delay; + uint64_t this_delay; + struct timespec ts; + if (delay < td->time_offset) { + td->time_offset = 0; + return; + } + + delay -= td->time_offset; if (delay < usec) return; delay -= usec; - /* - * less than 100 usec delay, just regard it as noise - */ - if (delay < 100) - return; + fio_gettime(&ts, NULL); + while (delay && !td->terminate) { + this_delay = delay; + if (this_delay > 500000) + this_delay = 500000; + + usec_sleep(td, this_delay); + delay -= this_delay; + } - usec_sleep(td, delay); + usec = utime_since_now(&ts); + if (usec > orig_delay) + td->time_offset = usec - orig_delay; + else + td->time_offset = 0; } static int ipo_special(struct 
thread_data *td, struct io_piece *ipo) @@ -87,6 +117,11 @@ switch (ipo->file_action) { case FIO_LOG_OPEN_FILE: + if (td->o.replay_redirect && fio_file_open(f)) { + dprint(FD_FILE, "iolog: ignoring re-open of file %s\n", + f->file_name); + break; + } ret = td_io_open_file(td, f); if (!ret) break; @@ -96,7 +131,7 @@ td_io_close_file(td, f); break; case FIO_LOG_UNLINK_FILE: - unlink(f->file_name); + td_io_unlink_file(td, f); break; default: log_err("fio: bad file action %d\n", ipo->file_action); @@ -106,6 +141,8 @@ return 1; } +static bool read_iolog2(struct thread_data *td); + int read_iolog_get(struct thread_data *td, struct io_u *io_u) { struct io_piece *ipo; @@ -113,8 +150,14 @@ while (!flist_empty(&td->io_log_list)) { int ret; - - ipo = flist_entry(td->io_log_list.next, struct io_piece, list); + if (td->o.read_iolog_chunked) { + if (td->io_log_checkmark == td->io_log_current) { + if (!read_iolog2(td)) + return 1; + } + td->io_log_current--; + } + ipo = flist_first_entry(&td->io_log_list, struct io_piece, list); flist_del(&ipo->list); remove_trim_entry(td, ipo); @@ -133,7 +176,7 @@ io_u->buflen = ipo->len; io_u->file = td->files[ipo->fileno]; get_file(io_u->file); - dprint(FD_IO, "iolog: get %llu/%lu/%s\n", io_u->offset, + dprint(FD_IO, "iolog: get %llu/%llu/%s\n", io_u->offset, io_u->buflen, io_u->file->file_name); if (ipo->delay) iolog_delay(td, ipo->delay); @@ -156,7 +199,7 @@ void prune_io_piece_log(struct thread_data *td) { struct io_piece *ipo; - struct rb_node *n; + struct fio_rb_node *n; while ((n = rb_first(&td->io_hist_tree)) != NULL) { ipo = rb_entry(n, struct io_piece, rb_node); @@ -167,7 +210,7 @@ } while (!flist_empty(&td->io_hist_list)) { - ipo = flist_entry(td->io_hist_list.next, struct io_piece, list); + ipo = flist_first_entry(&td->io_hist_list, struct io_piece, list); flist_del(&ipo->list); remove_trim_entry(td, ipo); td->io_hist_len--; @@ -180,14 +223,18 @@ */ void log_io_piece(struct thread_data *td, struct io_u *io_u) { - struct rb_node **p, 
*parent; + struct fio_rb_node **p, *parent; struct io_piece *ipo, *__ipo; - ipo = malloc(sizeof(struct io_piece)); + ipo = calloc(1, sizeof(struct io_piece)); init_ipo(ipo); ipo->file = io_u->file; ipo->offset = io_u->offset; ipo->len = io_u->buflen; + ipo->numberio = io_u->numberio; + ipo->flags = IP_F_IN_FLIGHT; + + io_u->ipo = ipo; if (io_u_should_trim(td, io_u)) { flist_add_tail(&ipo->trim_list, &td->trim_list); @@ -195,21 +242,11 @@ } /* - * We don't need to sort the entries, if: - * - * Sequential writes, or - * Random writes that lay out the file as it goes along - * - * For both these cases, just reading back data in the order we - * wrote it out is the fastest. - * - * One exception is if we don't have a random map AND we are doing - * verifies, in that case we need to check for duplicate blocks and - * drop the old one, which we rely on the rb insert/lookup for - * handling. + * Only sort writes if we don't have a random map in which case we need + * to check for duplicate blocks and drop the old one, which we rely on + * the rb insert/lookup for handling. 
*/ - if ((!td_random(td) || !td->o.overwrite) && - (file_randommap(td, ipo->file) || td->o.verify == VERIFY_NONE)) { + if (file_randommap(td, ipo->file)) { INIT_FLIST_HEAD(&ipo->list); flist_add_tail(&ipo->list, &td->io_hist_list); ipo->flags |= IP_F_ONLIST; @@ -226,6 +263,7 @@ p = &td->io_hist_tree.rb_node; parent = NULL; while (*p) { + int overlap = 0; parent = *p; __ipo = rb_entry(parent, struct io_piece, rb_node); @@ -233,18 +271,26 @@ p = &(*p)->rb_left; else if (ipo->file > __ipo->file) p = &(*p)->rb_right; - else if (ipo->offset < __ipo->offset) + else if (ipo->offset < __ipo->offset) { p = &(*p)->rb_left; - else if (ipo->offset > __ipo->offset) + overlap = ipo->offset + ipo->len > __ipo->offset; + } + else if (ipo->offset > __ipo->offset) { p = &(*p)->rb_right; - else { - dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu", + overlap = __ipo->offset + __ipo->len > ipo->offset; + } + else + overlap = 1; + + if (overlap) { + dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu\n", __ipo->offset, __ipo->len, ipo->offset, ipo->len); td->io_hist_len--; rb_erase(parent, &td->io_hist_tree); remove_trim_entry(td, __ipo); - free(__ipo); + if (!(__ipo->flags & IP_F_IN_FLIGHT)) + free(__ipo); goto restart; } } @@ -255,6 +301,45 @@ td->io_hist_len++; } +void unlog_io_piece(struct thread_data *td, struct io_u *io_u) +{ + struct io_piece *ipo = io_u->ipo; + + if (td->ts.nr_block_infos) { + uint32_t *info = io_u_block_info(td, io_u); + if (BLOCK_INFO_STATE(*info) < BLOCK_STATE_TRIM_FAILURE) { + if (io_u->ddir == DDIR_TRIM) + *info = BLOCK_INFO_SET_STATE(*info, + BLOCK_STATE_TRIM_FAILURE); + else if (io_u->ddir == DDIR_WRITE) + *info = BLOCK_INFO_SET_STATE(*info, + BLOCK_STATE_WRITE_FAILURE); + } + } + + if (!ipo) + return; + + if (ipo->flags & IP_F_ONRB) + rb_erase(&ipo->rb_node, &td->io_hist_tree); + else if (ipo->flags & IP_F_ONLIST) + flist_del(&ipo->list); + + free(ipo); + io_u->ipo = NULL; + td->io_hist_len--; +} + +void trim_io_piece(const struct io_u *io_u) +{ + struct 
io_piece *ipo = io_u->ipo; + + if (!ipo) + return; + + ipo->len = io_u->xfer_buflen - io_u->resid; +} + void write_iolog_close(struct thread_data *td) { fflush(td->iolog_f); @@ -264,36 +349,74 @@ td->iolog_buf = NULL; } +static int64_t iolog_items_to_fetch(struct thread_data *td) +{ + struct timespec now; + uint64_t elapsed; + uint64_t for_1s; + int64_t items_to_fetch; + + if (!td->io_log_highmark) + return 10; + + + fio_gettime(&now, NULL); + elapsed = ntime_since(&td->io_log_highmark_time, &now); + if (elapsed) { + for_1s = (td->io_log_highmark - td->io_log_current) * 1000000000 / elapsed; + items_to_fetch = for_1s - td->io_log_current; + if (items_to_fetch < 0) + items_to_fetch = 0; + } else + items_to_fetch = 0; + + td->io_log_highmark = td->io_log_current + items_to_fetch; + td->io_log_checkmark = (td->io_log_highmark + 1) / 2; + fio_gettime(&td->io_log_highmark_time, NULL); + + return items_to_fetch; +} + /* * Read version 2 iolog data. It is enhanced to include per-file logging, * syncs, etc. */ -static int read_iolog2(struct thread_data *td, FILE *f) +static bool read_iolog2(struct thread_data *td) { unsigned long long offset; unsigned int bytes; int reads, writes, waits, fileno = 0, file_action = 0; /* stupid gcc */ - char *fname, *act; + char *rfname, *fname, *act; char *str, *p; enum fio_ddir rw; + bool realloc = false; + int64_t items_to_fetch = 0; - free_release_files(td); + if (td->o.read_iolog_chunked) { + items_to_fetch = iolog_items_to_fetch(td); + if (!items_to_fetch) + return true; + } /* * Read in the read iolog and store it, reuse the infrastructure * for doing verifications. 
*/ str = malloc(4096); - fname = malloc(256+16); + rfname = fname = malloc(256+16); act = malloc(256+16); reads = writes = waits = 0; - while ((p = fgets(str, 4096, f)) != NULL) { + while ((p = fgets(str, 4096, td->io_log_rfile)) != NULL) { struct io_piece *ipo; int r; - r = sscanf(p, "%256s %256s %llu %u", fname, act, &offset, + r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset, &bytes); + + if (td->o.replay_redirect) + fname = td->o.replay_redirect; + if (r == 4) { /* * Check action first @@ -319,9 +442,14 @@ } else if (r == 2) { rw = DDIR_INVAL; if (!strcmp(act, "add")) { - td->o.nr_files++; - fileno = add_file(td, fname); - file_action = FIO_LOG_ADD_FILE; + if (td->o.replay_redirect && + get_fileno(td, fname) != -1) { + dprint(FD_FILE, "iolog: ignoring" + " re-add of file %s\n", fname); + } else { + fileno = add_file(td, fname, td->subjob_number, 1); + file_action = FIO_LOG_ADD_FILE; + } continue; } else if (!strcmp(act, "open")) { fileno = get_fileno(td, fname); @@ -335,7 +463,7 @@ continue; } } else { - log_err("bad iolog2: %s", p); + log_err("bad iolog2: %s\n", p); continue; } @@ -349,6 +477,8 @@ continue; writes++; } else if (rw == DDIR_WAIT) { + if (td->o.no_stall) + continue; waits++; } else if (rw == DDIR_INVAL) { } else if (!ddir_sync(rw)) { @@ -359,26 +489,47 @@ /* * Make note of file */ - ipo = malloc(sizeof(*ipo)); + ipo = calloc(1, sizeof(*ipo)); init_ipo(ipo); ipo->ddir = rw; if (rw == DDIR_WAIT) { ipo->delay = offset; } else { - ipo->offset = offset; + if (td->o.replay_scale) + ipo->offset = offset / td->o.replay_scale; + else + ipo->offset = offset; + ipo_bytes_align(td->o.replay_align, ipo); + ipo->len = bytes; - if (bytes > td->o.max_bs[rw]) + if (rw != DDIR_INVAL && bytes > td->o.max_bs[rw]) { + realloc = true; td->o.max_bs[rw] = bytes; + } ipo->fileno = fileno; ipo->file_action = file_action; + td->o.size += bytes; } queue_io_piece(td, ipo); + + if (td->o.read_iolog_chunked) { + td->io_log_current++; + items_to_fetch--; + if 
(items_to_fetch == 0) + break; + } } free(str); free(act); - free(fname); + free(rfname); + + if (td->o.read_iolog_chunked) { + td->io_log_highmark = td->io_log_current; + td->io_log_checkmark = (td->io_log_highmark + 1) / 2; + fio_gettime(&td->io_log_highmark_time, NULL); + } if (writes && read_only) { log_err("fio: <%s> skips replay of %d writes due to" @@ -386,8 +537,22 @@ writes = 0; } + if (td->o.read_iolog_chunked) { + if (td->io_log_current == 0) { + return false; + } + td->o.td_ddir = TD_DDIR_RW; + if (realloc && td->orig_buffer) + { + io_u_quiesce(td); + free_io_mem(td); + init_io_u_buffers(td); + } + return true; + } + if (!reads && !writes && !waits) - return 1; + return false; else if (reads && !writes) td->o.td_ddir = TD_DDIR_READ; else if (!reads && writes) @@ -395,22 +560,70 @@ else td->o.td_ddir = TD_DDIR_RW; - return 0; + return true; +} + +static bool is_socket(const char *path) +{ + struct stat buf; + int r; + + r = stat(path, &buf); + if (r == -1) + return false; + + return S_ISSOCK(buf.st_mode); +} + +static int open_socket(const char *path) +{ + struct sockaddr_un addr; + int ret, fd; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return fd; + + addr.sun_family = AF_UNIX; + if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path) >= + sizeof(addr.sun_path)) { + log_err("%s: path name %s is too long for a Unix socket\n", + __func__, path); + } + + ret = connect(fd, (const struct sockaddr *)&addr, strlen(path) + sizeof(addr.sun_family)); + if (!ret) + return fd; + + close(fd); + return -1; } /* * open iolog, check version, and call appropriate parser */ -static int init_iolog_read(struct thread_data *td) +static bool init_iolog_read(struct thread_data *td) { - char buffer[256], *p; - FILE *f; - int ret; + char buffer[256], *p, *fname; + FILE *f = NULL; + + fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number); + dprint(FD_IO, "iolog: name=%s\n", fname); + + if (is_socket(fname)) { + int fd; + + fd = 
open_socket(fname); + if (fd >= 0) + f = fdopen(fd, "r"); + } else + f = fopen(fname, "r"); + + free(fname); - f = fopen(td->o.read_iolog_file, "r"); if (!f) { perror("fopen read iolog"); - return 1; + return false; } p = fgets(buffer, sizeof(buffer), f); @@ -418,28 +631,28 @@ td_verror(td, errno, "iolog read"); log_err("fio: unable to read iolog\n"); fclose(f); - return 1; + return false; } /* * version 2 of the iolog stores a specific string as the * first line, check for that */ - if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2))) - ret = read_iolog2(td, f); - else { - log_err("fio: iolog version 1 is no longer supported\n"); - ret = 1; + if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2))) { + free_release_files(td); + td->io_log_rfile = f; + return read_iolog2(td); } + log_err("fio: iolog version 1 is no longer supported\n"); fclose(f); - return ret; + return false; } /* * Set up a log for storing io patterns. */ -static int init_iolog_write(struct thread_data *td) +static bool init_iolog_write(struct thread_data *td) { struct fio_file *ff; FILE *f; @@ -448,7 +661,7 @@ f = fopen(td->o.write_iolog_file, "a"); if (!f) { perror("fopen write iolog"); - return 1; + return false; } /* @@ -463,7 +676,7 @@ */ if (fprintf(f, "%s\n", iolog_ver2) < 0) { perror("iolog init\n"); - return 1; + return false; } /* @@ -472,81 +685,1059 @@ for_each_file(td, ff, i) log_file(td, ff, FIO_LOG_ADD_FILE); - return 0; + return true; } -int init_iolog(struct thread_data *td) +bool init_iolog(struct thread_data *td) { - int ret = 0; + bool ret; if (td->o.read_iolog_file) { + int need_swap; + /* * Check if it's a blktrace file and load that if possible. * Otherwise assume it's a normal log file and load that. 
*/ - if (is_blktrace(td->o.read_iolog_file)) - ret = load_blktrace(td, td->o.read_iolog_file); + if (is_blktrace(td->o.read_iolog_file, &need_swap)) + ret = load_blktrace(td, td->o.read_iolog_file, need_swap); else ret = init_iolog_read(td); } else if (td->o.write_iolog_file) ret = init_iolog_write(td); + else + ret = true; + + if (!ret) + td_verror(td, EINVAL, "failed initializing iolog"); return ret; } -void setup_log(struct io_log **log, unsigned long avg_msec, int log_type) +void setup_log(struct io_log **log, struct log_params *p, + const char *filename) { - struct io_log *l = malloc(sizeof(*l)); + struct io_log *l; + int i; + struct io_u_plat_entry *entry; + struct flist_head *list; + + l = scalloc(1, sizeof(*l)); + INIT_FLIST_HEAD(&l->io_logs); + l->log_type = p->log_type; + l->log_offset = p->log_offset; + l->log_gz = p->log_gz; + l->log_gz_store = p->log_gz_store; + l->avg_msec = p->avg_msec; + l->hist_msec = p->hist_msec; + l->hist_coarseness = p->hist_coarseness; + l->filename = strdup(filename); + l->td = p->td; + + /* Initialize histogram lists for each r/w direction, + * with initial io_u_plat of all zeros: + */ + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + list = &l->hist_window[i].list; + INIT_FLIST_HEAD(list); + entry = calloc(1, sizeof(struct io_u_plat_entry)); + flist_add(&entry->list, list); + } + + if (l->td && l->td->o.io_submit_mode != IO_MODE_OFFLOAD) { + struct io_logs *__p; + + __p = calloc(1, sizeof(*l->pending)); + __p->max_samples = DEF_LOG_ENTRIES; + __p->log = calloc(__p->max_samples, log_entry_sz(l)); + l->pending = __p; + } + + if (l->log_offset) + l->log_ddir_mask = LOG_OFFSET_SAMPLE_BIT; + + INIT_FLIST_HEAD(&l->chunk_list); + + if (l->log_gz && !p->td) + l->log_gz = 0; + else if (l->log_gz || l->log_gz_store) { + mutex_init_pshared(&l->chunk_lock); + mutex_init_pshared(&l->deferred_free_lock); + p->td->flags |= TD_F_COMPRESS_LOG; + } - memset(l, 0, sizeof(*l)); - l->nr_samples = 0; - l->max_samples = 1024; - l->log_type = log_type; - 
l->log = malloc(l->max_samples * sizeof(struct io_sample)); - l->avg_msec = avg_msec; *log = l; } -void __finish_log(struct io_log *log, const char *name) +#ifdef CONFIG_SETVBUF +static void *set_file_buffer(FILE *f) { - unsigned int i; + size_t size = 1048576; + void *buf; + + buf = malloc(size); + setvbuf(f, buf, _IOFBF, size); + return buf; +} + +static void clear_file_buffer(void *buf) +{ + free(buf); +} +#else +static void *set_file_buffer(FILE *f) +{ + return NULL; +} + +static void clear_file_buffer(void *buf) +{ +} +#endif + +void free_log(struct io_log *log) +{ + while (!flist_empty(&log->io_logs)) { + struct io_logs *cur_log; + + cur_log = flist_first_entry(&log->io_logs, struct io_logs, list); + flist_del_init(&cur_log->list); + free(cur_log->log); + sfree(cur_log); + } + + if (log->pending) { + free(log->pending->log); + free(log->pending); + log->pending = NULL; + } + + free(log->pending); + free(log->filename); + sfree(log); +} + +uint64_t hist_sum(int j, int stride, uint64_t *io_u_plat, + uint64_t *io_u_plat_last) +{ + uint64_t sum; + int k; + + if (io_u_plat_last) { + for (k = sum = 0; k < stride; k++) + sum += io_u_plat[j + k] - io_u_plat_last[j + k]; + } else { + for (k = sum = 0; k < stride; k++) + sum += io_u_plat[j + k]; + } + + return sum; +} + +static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples, + uint64_t sample_size) +{ + struct io_sample *s; + int log_offset; + uint64_t i, j, nr_samples; + struct io_u_plat_entry *entry, *entry_before; + uint64_t *io_u_plat; + uint64_t *io_u_plat_before; + + int stride = 1 << hist_coarseness; + + if (!sample_size) + return; + + s = __get_sample(samples, 0, 0); + log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0; + + nr_samples = sample_size / __log_entry_sz(log_offset); + + for (i = 0; i < nr_samples; i++) { + s = __get_sample(samples, log_offset, i); + + entry = s->data.plat_entry; + io_u_plat = entry->io_u_plat; + + entry_before = flist_first_entry(&entry->list, struct 
io_u_plat_entry, list); + io_u_plat_before = entry_before->io_u_plat; + + fprintf(f, "%lu, %u, %llu, ", (unsigned long) s->time, + io_sample_ddir(s), (unsigned long long) s->bs); + for (j = 0; j < FIO_IO_U_PLAT_NR - stride; j += stride) { + fprintf(f, "%llu, ", (unsigned long long) + hist_sum(j, stride, io_u_plat, io_u_plat_before)); + } + fprintf(f, "%llu\n", (unsigned long long) + hist_sum(FIO_IO_U_PLAT_NR - stride, stride, io_u_plat, + io_u_plat_before)); + + flist_del(&entry_before->list); + free(entry_before); + } +} + +void flush_samples(FILE *f, void *samples, uint64_t sample_size) +{ + struct io_sample *s; + int log_offset; + uint64_t i, nr_samples; + + if (!sample_size) + return; + + s = __get_sample(samples, 0, 0); + log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0; + + nr_samples = sample_size / __log_entry_sz(log_offset); + + for (i = 0; i < nr_samples; i++) { + s = __get_sample(samples, log_offset, i); + + if (!log_offset) { + fprintf(f, "%lu, %" PRId64 ", %u, %llu\n", + (unsigned long) s->time, + s->data.val, + io_sample_ddir(s), (unsigned long long) s->bs); + } else { + struct io_sample_offset *so = (void *) s; + + fprintf(f, "%lu, %" PRId64 ", %u, %llu, %llu\n", + (unsigned long) s->time, + s->data.val, + io_sample_ddir(s), (unsigned long long) s->bs, + (unsigned long long) so->offset); + } + } +} + +#ifdef CONFIG_ZLIB + +struct iolog_flush_data { + struct workqueue_work work; + struct io_log *log; + void *samples; + uint32_t nr_samples; + bool free; +}; + +#define GZ_CHUNK 131072 + +static struct iolog_compress *get_new_chunk(unsigned int seq) +{ + struct iolog_compress *c; + + c = malloc(sizeof(*c)); + INIT_FLIST_HEAD(&c->list); + c->buf = malloc(GZ_CHUNK); + c->len = 0; + c->seq = seq; + return c; +} + +static void free_chunk(struct iolog_compress *ic) +{ + free(ic->buf); + free(ic); +} + +static int z_stream_init(z_stream *stream, int gz_hdr) +{ + int wbits = 15; + + memset(stream, 0, sizeof(*stream)); + stream->zalloc = Z_NULL; + 
stream->zfree = Z_NULL; + stream->opaque = Z_NULL; + stream->next_in = Z_NULL; + + /* + * zlib magic - add 32 for auto-detection of gz header or not, + * if we decide to store files in a gzip friendly format. + */ + if (gz_hdr) + wbits += 32; + + if (inflateInit2(stream, wbits) != Z_OK) + return 1; + + return 0; +} + +struct inflate_chunk_iter { + unsigned int seq; + int err; + void *buf; + size_t buf_size; + size_t buf_used; + size_t chunk_sz; +}; + +static void finish_chunk(z_stream *stream, FILE *f, + struct inflate_chunk_iter *iter) +{ + int ret; + + ret = inflateEnd(stream); + if (ret != Z_OK) + log_err("fio: failed to end log inflation seq %d (%d)\n", + iter->seq, ret); + + flush_samples(f, iter->buf, iter->buf_used); + free(iter->buf); + iter->buf = NULL; + iter->buf_size = iter->buf_used = 0; +} + +/* + * Iterative chunk inflation. Handles cases where we cross into a new + * sequence, doing flush finish of previous chunk if needed. + */ +static size_t inflate_chunk(struct iolog_compress *ic, int gz_hdr, FILE *f, + z_stream *stream, struct inflate_chunk_iter *iter) +{ + size_t ret; + + dprint(FD_COMPRESS, "inflate chunk size=%lu, seq=%u\n", + (unsigned long) ic->len, ic->seq); + + if (ic->seq != iter->seq) { + if (iter->seq) + finish_chunk(stream, f, iter); + + z_stream_init(stream, gz_hdr); + iter->seq = ic->seq; + } + + stream->avail_in = ic->len; + stream->next_in = ic->buf; + + if (!iter->buf_size) { + iter->buf_size = iter->chunk_sz; + iter->buf = malloc(iter->buf_size); + } + + while (stream->avail_in) { + size_t this_out = iter->buf_size - iter->buf_used; + int err; + + stream->avail_out = this_out; + stream->next_out = iter->buf + iter->buf_used; + + err = inflate(stream, Z_NO_FLUSH); + if (err < 0) { + log_err("fio: failed inflating log: %d\n", err); + iter->err = err; + break; + } + + iter->buf_used += this_out - stream->avail_out; + + if (!stream->avail_out) { + iter->buf_size += iter->chunk_sz; + iter->buf = realloc(iter->buf, iter->buf_size); + 
continue; + } + + if (err == Z_STREAM_END) + break; + } + + ret = (void *) stream->next_in - ic->buf; + + dprint(FD_COMPRESS, "inflated to size=%lu\n", (unsigned long) iter->buf_size); + + return ret; +} + +/* + * Inflate stored compressed chunks, or write them directly to the log + * file if so instructed. + */ +static int inflate_gz_chunks(struct io_log *log, FILE *f) +{ + struct inflate_chunk_iter iter = { .chunk_sz = log->log_gz, }; + z_stream stream; + + while (!flist_empty(&log->chunk_list)) { + struct iolog_compress *ic; + + ic = flist_first_entry(&log->chunk_list, struct iolog_compress, list); + flist_del(&ic->list); + + if (log->log_gz_store) { + size_t ret; + + dprint(FD_COMPRESS, "log write chunk size=%lu, " + "seq=%u\n", (unsigned long) ic->len, ic->seq); + + ret = fwrite(ic->buf, ic->len, 1, f); + if (ret != 1 || ferror(f)) { + iter.err = errno; + log_err("fio: error writing compressed log\n"); + } + } else + inflate_chunk(ic, log->log_gz_store, f, &stream, &iter); + + free_chunk(ic); + } + + if (iter.seq) { + finish_chunk(&stream, f, &iter); + free(iter.buf); + } + + return iter.err; +} + +/* + * Open compressed log file and decompress the stored chunks and + * write them to stdout. The chunks are stored sequentially in the + * file, so we iterate over them and do them one-by-one. 
+ */ +int iolog_file_inflate(const char *file) +{ + struct inflate_chunk_iter iter = { .chunk_sz = 64 * 1024 * 1024, }; + struct iolog_compress ic; + z_stream stream; + struct stat sb; + size_t ret; + size_t total; + void *buf; + FILE *f; + + f = fopen(file, "r"); + if (!f) { + perror("fopen"); + return 1; + } + + if (stat(file, &sb) < 0) { + fclose(f); + perror("stat"); + return 1; + } + + ic.buf = buf = malloc(sb.st_size); + ic.len = sb.st_size; + ic.seq = 1; + + ret = fread(ic.buf, ic.len, 1, f); + if (ret == 0 && ferror(f)) { + perror("fread"); + fclose(f); + free(buf); + return 1; + } else if (ferror(f) || (!feof(f) && ret != 1)) { + log_err("fio: short read on reading log\n"); + fclose(f); + free(buf); + return 1; + } + + fclose(f); + + /* + * Each chunk will return Z_STREAM_END. We don't know how many + * chunks are in the file, so we just keep looping and incrementing + * the sequence number until we have consumed the whole compressed + * file. + */ + total = ic.len; + do { + size_t iret; + + iret = inflate_chunk(&ic, 1, stdout, &stream, &iter); + total -= iret; + if (!total) + break; + if (iter.err) + break; + + ic.seq++; + ic.len -= iret; + ic.buf += iret; + } while (1); + + if (iter.seq) { + finish_chunk(&stream, stdout, &iter); + free(iter.buf); + } + + free(buf); + return iter.err; +} + +#else + +static int inflate_gz_chunks(struct io_log *log, FILE *f) +{ + return 0; +} + +int iolog_file_inflate(const char *file) +{ + log_err("fio: log inflation not possible without zlib\n"); + return 1; +} + +#endif + +void flush_log(struct io_log *log, bool do_append) +{ + void *buf; FILE *f; - f = fopen(name, "a"); + if (!do_append) + f = fopen(log->filename, "w"); + else + f = fopen(log->filename, "a"); if (!f) { perror("fopen log"); return; } - for (i = 0; i < log->nr_samples; i++) { - fprintf(f, "%lu, %lu, %u, %u\n", - (unsigned long) log->log[i].time, - (unsigned long) log->log[i].val, - log->log[i].ddir, log->log[i].bs); + buf = set_file_buffer(f); + + 
inflate_gz_chunks(log, f); + + while (!flist_empty(&log->io_logs)) { + struct io_logs *cur_log; + + cur_log = flist_first_entry(&log->io_logs, struct io_logs, list); + flist_del_init(&cur_log->list); + + if (log->td && log == log->td->clat_hist_log) + flush_hist_samples(f, log->hist_coarseness, cur_log->log, + log_sample_sz(log, cur_log)); + else + flush_samples(f, cur_log->log, log_sample_sz(log, cur_log)); + + sfree(cur_log); } fclose(f); - free(log->log); - free(log); + clear_file_buffer(buf); } -void finish_log_named(struct thread_data *td, struct io_log *log, - const char *prefix, const char *postfix) +static int finish_log(struct thread_data *td, struct io_log *log, int trylock) { - char file_name[256], *p; - - snprintf(file_name, sizeof(file_name), "%s_%s.log", prefix, postfix); - p = basename(file_name); + if (td->flags & TD_F_COMPRESS_LOG) + iolog_flush(log); - if (td->client_type == FIO_CLIENT_TYPE_GUI) { - fio_send_iolog(td, log, p); - free(log->log); - free(log); + if (trylock) { + if (fio_trylock_file(log->filename)) + return 1; } else - __finish_log(log, p); + fio_lock_file(log->filename); + + if (td->client_type == FIO_CLIENT_TYPE_GUI || is_backend) + fio_send_iolog(td, log, log->filename); + else + flush_log(log, !td->o.per_job_logs); + + fio_unlock_file(log->filename); + free_log(log); + return 0; +} + +size_t log_chunk_sizes(struct io_log *log) +{ + struct flist_head *entry; + size_t ret; + + if (flist_empty(&log->chunk_list)) + return 0; + + ret = 0; + pthread_mutex_lock(&log->chunk_lock); + flist_for_each(entry, &log->chunk_list) { + struct iolog_compress *c; + + c = flist_entry(entry, struct iolog_compress, list); + ret += c->len; + } + pthread_mutex_unlock(&log->chunk_lock); + return ret; +} + +#ifdef CONFIG_ZLIB + +static void iolog_put_deferred(struct io_log *log, void *ptr) +{ + if (!ptr) + return; + + pthread_mutex_lock(&log->deferred_free_lock); + if (log->deferred < IOLOG_MAX_DEFER) { + log->deferred_items[log->deferred] = ptr; + 
log->deferred++; + } else if (!fio_did_warn(FIO_WARN_IOLOG_DROP)) + log_err("fio: had to drop log entry free\n"); + pthread_mutex_unlock(&log->deferred_free_lock); +} + +static void iolog_free_deferred(struct io_log *log) +{ + int i; + + if (!log->deferred) + return; + + pthread_mutex_lock(&log->deferred_free_lock); + + for (i = 0; i < log->deferred; i++) { + free(log->deferred_items[i]); + log->deferred_items[i] = NULL; + } + + log->deferred = 0; + pthread_mutex_unlock(&log->deferred_free_lock); +} + +static int gz_work(struct iolog_flush_data *data) +{ + struct iolog_compress *c = NULL; + struct flist_head list; + unsigned int seq; + z_stream stream; + size_t total = 0; + int ret; + + INIT_FLIST_HEAD(&list); + + memset(&stream, 0, sizeof(stream)); + stream.zalloc = Z_NULL; + stream.zfree = Z_NULL; + stream.opaque = Z_NULL; + + ret = deflateInit(&stream, Z_DEFAULT_COMPRESSION); + if (ret != Z_OK) { + log_err("fio: failed to init gz stream\n"); + goto err; + } + + seq = ++data->log->chunk_seq; + + stream.next_in = (void *) data->samples; + stream.avail_in = data->nr_samples * log_entry_sz(data->log); + + dprint(FD_COMPRESS, "deflate input size=%lu, seq=%u, log=%s\n", + (unsigned long) stream.avail_in, seq, + data->log->filename); + do { + if (c) + dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq, + (unsigned long) c->len); + c = get_new_chunk(seq); + stream.avail_out = GZ_CHUNK; + stream.next_out = c->buf; + ret = deflate(&stream, Z_NO_FLUSH); + if (ret < 0) { + log_err("fio: deflate log (%d)\n", ret); + free_chunk(c); + goto err; + } + + c->len = GZ_CHUNK - stream.avail_out; + flist_add_tail(&c->list, &list); + total += c->len; + } while (stream.avail_in); + + stream.next_out = c->buf + c->len; + stream.avail_out = GZ_CHUNK - c->len; + + ret = deflate(&stream, Z_FINISH); + if (ret < 0) { + /* + * Z_BUF_ERROR is special, it just means we need more + * output space. We'll handle that below. Treat any other + * error as fatal. 
+ */ + if (ret != Z_BUF_ERROR) { + log_err("fio: deflate log (%d)\n", ret); + flist_del(&c->list); + free_chunk(c); + goto err; + } + } + + total -= c->len; + c->len = GZ_CHUNK - stream.avail_out; + total += c->len; + dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq, (unsigned long) c->len); + + if (ret != Z_STREAM_END) { + do { + c = get_new_chunk(seq); + stream.avail_out = GZ_CHUNK; + stream.next_out = c->buf; + ret = deflate(&stream, Z_FINISH); + c->len = GZ_CHUNK - stream.avail_out; + total += c->len; + flist_add_tail(&c->list, &list); + dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq, + (unsigned long) c->len); + } while (ret != Z_STREAM_END); + } + + dprint(FD_COMPRESS, "deflated to size=%lu\n", (unsigned long) total); + + ret = deflateEnd(&stream); + if (ret != Z_OK) + log_err("fio: deflateEnd %d\n", ret); + + iolog_put_deferred(data->log, data->samples); + + if (!flist_empty(&list)) { + pthread_mutex_lock(&data->log->chunk_lock); + flist_splice_tail(&list, &data->log->chunk_list); + pthread_mutex_unlock(&data->log->chunk_lock); + } + + ret = 0; +done: + if (data->free) + sfree(data); + return ret; +err: + while (!flist_empty(&list)) { + c = flist_first_entry(list.next, struct iolog_compress, list); + flist_del(&c->list); + free_chunk(c); + } + ret = 1; + goto done; +} + +/* + * Invoked from our compress helper thread, when logging would have exceeded + * the specified memory limitation. Compresses the previously stored + * entries. 
+ */ +static int gz_work_async(struct submit_worker *sw, struct workqueue_work *work) +{ + return gz_work(container_of(work, struct iolog_flush_data, work)); +} + +static int gz_init_worker(struct submit_worker *sw) +{ + struct thread_data *td = sw->wq->td; + + if (!fio_option_is_set(&td->o, log_gz_cpumask)) + return 0; + + if (fio_setaffinity(gettid(), td->o.log_gz_cpumask) == -1) { + log_err("gz: failed to set CPU affinity\n"); + return 1; + } + + return 0; +} + +static struct workqueue_ops log_compress_wq_ops = { + .fn = gz_work_async, + .init_worker_fn = gz_init_worker, + .nice = 1, +}; + +int iolog_compress_init(struct thread_data *td, struct sk_out *sk_out) +{ + if (!(td->flags & TD_F_COMPRESS_LOG)) + return 0; + + workqueue_init(td, &td->log_compress_wq, &log_compress_wq_ops, 1, sk_out); + return 0; +} + +void iolog_compress_exit(struct thread_data *td) +{ + if (!(td->flags & TD_F_COMPRESS_LOG)) + return; + + workqueue_exit(&td->log_compress_wq); +} + +/* + * Queue work item to compress the existing log entries. We reset the + * current log to a small size, and reference the existing log in the + * data that we queue for compression. Once compression has been done, + * this old log is freed. 
If called with finish == true, will not return + * until the log compression has completed, and will flush all previous + * logs too + */ +static int iolog_flush(struct io_log *log) +{ + struct iolog_flush_data *data; + + data = malloc(sizeof(*data)); + if (!data) + return 1; + + data->log = log; + data->free = false; + + while (!flist_empty(&log->io_logs)) { + struct io_logs *cur_log; + + cur_log = flist_first_entry(&log->io_logs, struct io_logs, list); + flist_del_init(&cur_log->list); + + data->samples = cur_log->log; + data->nr_samples = cur_log->nr_samples; + + sfree(cur_log); + + gz_work(data); + } + + free(data); + return 0; +} + +int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log) +{ + struct iolog_flush_data *data; + + data = smalloc(sizeof(*data)); + if (!data) + return 1; + + data->log = log; + + data->samples = cur_log->log; + data->nr_samples = cur_log->nr_samples; + data->free = true; + + cur_log->nr_samples = cur_log->max_samples = 0; + cur_log->log = NULL; + + workqueue_enqueue(&log->td->log_compress_wq, &data->work); + + iolog_free_deferred(log); + + return 0; +} +#else + +static int iolog_flush(struct io_log *log) +{ + return 1; +} + +int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log) +{ + return 1; +} + +int iolog_compress_init(struct thread_data *td, struct sk_out *sk_out) +{ + return 0; +} + +void iolog_compress_exit(struct thread_data *td) +{ +} + +#endif + +struct io_logs *iolog_cur_log(struct io_log *log) +{ + if (flist_empty(&log->io_logs)) + return NULL; + + return flist_last_entry(&log->io_logs, struct io_logs, list); +} + +uint64_t iolog_nr_samples(struct io_log *iolog) +{ + struct flist_head *entry; + uint64_t ret = 0; + + flist_for_each(entry, &iolog->io_logs) { + struct io_logs *cur_log; + + cur_log = flist_entry(entry, struct io_logs, list); + ret += cur_log->nr_samples; + } + + return ret; +} + +static int __write_log(struct thread_data *td, struct io_log *log, int try) +{ + if (log) + return 
finish_log(td, log, try); + + return 0; +} + +static int write_iops_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (per_unit_log(td->iops_log) != unit_log) + return 0; + + ret = __write_log(td, td->iops_log, try); + if (!ret) + td->iops_log = NULL; + + return ret; +} + +static int write_slat_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (!unit_log) + return 0; + + ret = __write_log(td, td->slat_log, try); + if (!ret) + td->slat_log = NULL; + + return ret; +} + +static int write_clat_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (!unit_log) + return 0; + + ret = __write_log(td, td->clat_log, try); + if (!ret) + td->clat_log = NULL; + + return ret; +} + +static int write_clat_hist_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (!unit_log) + return 0; + + ret = __write_log(td, td->clat_hist_log, try); + if (!ret) + td->clat_hist_log = NULL; + + return ret; +} + +static int write_lat_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (!unit_log) + return 0; + + ret = __write_log(td, td->lat_log, try); + if (!ret) + td->lat_log = NULL; + + return ret; +} + +static int write_bandw_log(struct thread_data *td, int try, bool unit_log) +{ + int ret; + + if (per_unit_log(td->bw_log) != unit_log) + return 0; + + ret = __write_log(td, td->bw_log, try); + if (!ret) + td->bw_log = NULL; + + return ret; } -void finish_log(struct thread_data *td, struct io_log *log, const char *name) +enum { + BW_LOG_MASK = 1, + LAT_LOG_MASK = 2, + SLAT_LOG_MASK = 4, + CLAT_LOG_MASK = 8, + IOPS_LOG_MASK = 16, + CLAT_HIST_LOG_MASK = 32, + + ALL_LOG_NR = 6, +}; + +struct log_type { + unsigned int mask; + int (*fn)(struct thread_data *, int, bool); +}; + +static struct log_type log_types[] = { + { + .mask = BW_LOG_MASK, + .fn = write_bandw_log, + }, + { + .mask = LAT_LOG_MASK, + .fn = write_lat_log, + }, + { + .mask = SLAT_LOG_MASK, + .fn = write_slat_log, + }, + { + .mask = 
CLAT_LOG_MASK, + .fn = write_clat_log, + }, + { + .mask = IOPS_LOG_MASK, + .fn = write_iops_log, + }, + { + .mask = CLAT_HIST_LOG_MASK, + .fn = write_clat_hist_log, + } +}; + +void td_writeout_logs(struct thread_data *td, bool unit_logs) { - finish_log_named(td, log, td->o.name, name); + unsigned int log_mask = 0; + unsigned int log_left = ALL_LOG_NR; + int old_state, i; + + old_state = td_bump_runstate(td, TD_FINISHING); + + finalize_logs(td, unit_logs); + + while (log_left) { + int prev_log_left = log_left; + + for (i = 0; i < ALL_LOG_NR && log_left; i++) { + struct log_type *lt = &log_types[i]; + int ret; + + if (!(log_mask & lt->mask)) { + ret = lt->fn(td, log_left != 1, unit_logs); + if (!ret) { + log_left--; + log_mask |= lt->mask; + } + } + } + + if (prev_log_left == log_left) + usleep(5000); + } + + td_restore_runstate(td, old_state); +} + +void fio_writeout_logs(bool unit_logs) +{ + struct thread_data *td; + int i; + + for_each_td(td, i) + td_writeout_logs(td, unit_logs); } diff -Nru fio-2.1.3/iolog.h fio-3.16/iolog.h --- fio-2.1.3/iolog.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/iolog.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,10 +1,12 @@ #ifndef FIO_IOLOG_H #define FIO_IOLOG_H +#include + #include "lib/rbtree.h" #include "lib/ieee754.h" #include "flist.h" -#include "ioengine.h" +#include "ioengines.h" /* * Use for maintaining statistics @@ -18,14 +20,34 @@ fio_fp64_t S; }; +struct io_hist { + uint64_t samples; + unsigned long hist_last; + struct flist_head list; +}; + + +union io_sample_data { + uint64_t val; + struct io_u_plat_entry *plat_entry; +}; + +#define sample_val(value) ((union io_sample_data) { .val = value }) +#define sample_plat(plat) ((union io_sample_data) { .plat_entry = plat }) + /* * A single data sample */ struct io_sample { uint64_t time; - uint64_t val; - uint32_t ddir; - uint32_t bs; + union io_sample_data data; + uint32_t __ddir; + uint64_t bs; +}; + +struct io_sample_offset { + struct io_sample s; + uint64_t offset; }; 
enum { @@ -34,6 +56,17 @@ IO_LOG_TYPE_SLAT, IO_LOG_TYPE_BW, IO_LOG_TYPE_IOPS, + IO_LOG_TYPE_HIST, +}; + +#define DEF_LOG_ENTRIES 1024 +#define MAX_LOG_ENTRIES (1024 * DEF_LOG_ENTRIES) + +struct io_logs { + struct flist_head list; + uint64_t nr_samples; + uint64_t max_samples; + void *log; }; /* @@ -43,25 +76,124 @@ /* * Entries already logged */ - unsigned long nr_samples; - unsigned long max_samples; - struct io_sample *log; + struct flist_head io_logs; + uint32_t cur_log_max; + + /* + * When the current log runs out of space, store events here until + * we have a chance to regrow + */ + struct io_logs *pending; + + unsigned int log_ddir_mask; + + char *filename; + + struct thread_data *td; unsigned int log_type; /* + * If we fail extending the log, stop collecting more entries. + */ + bool disabled; + + /* + * Log offsets + */ + unsigned int log_offset; + + /* + * Max size of log entries before a chunk is compressed + */ + unsigned int log_gz; + + /* + * Don't deflate for storing, just store the compressed bits + */ + unsigned int log_gz_store; + + /* * Windowed average, for logging single entries average over some * period of time. */ struct io_stat avg_window[DDIR_RWDIR_CNT]; unsigned long avg_msec; - unsigned long avg_last; + unsigned long avg_last[DDIR_RWDIR_CNT]; + + /* + * Windowed latency histograms, for keeping track of when we need to + * save a copy of the histogram every approximately hist_msec + * milliseconds. 
+ */ + struct io_hist hist_window[DDIR_RWDIR_CNT]; + unsigned long hist_msec; + unsigned int hist_coarseness; + + pthread_mutex_t chunk_lock; + unsigned int chunk_seq; + struct flist_head chunk_list; + + pthread_mutex_t deferred_free_lock; +#define IOLOG_MAX_DEFER 8 + void *deferred_items[IOLOG_MAX_DEFER]; + unsigned int deferred; }; +/* + * If the upper bit is set, then we have the offset as well + */ +#define LOG_OFFSET_SAMPLE_BIT 0x80000000U +#define io_sample_ddir(io) ((io)->__ddir & ~LOG_OFFSET_SAMPLE_BIT) + +static inline void io_sample_set_ddir(struct io_log *log, + struct io_sample *io, + enum fio_ddir ddir) +{ + io->__ddir = ddir | log->log_ddir_mask; +} + +static inline size_t __log_entry_sz(int log_offset) +{ + if (log_offset) + return sizeof(struct io_sample_offset); + else + return sizeof(struct io_sample); +} + +static inline size_t log_entry_sz(struct io_log *log) +{ + return __log_entry_sz(log->log_offset); +} + +static inline size_t log_sample_sz(struct io_log *log, struct io_logs *cur_log) +{ + return cur_log->nr_samples * log_entry_sz(log); +} + +static inline struct io_sample *__get_sample(void *samples, int log_offset, + uint64_t sample) +{ + uint64_t sample_offset = sample * __log_entry_sz(log_offset); + return (struct io_sample *) ((char *) samples + sample_offset); +} + +struct io_logs *iolog_cur_log(struct io_log *); +uint64_t iolog_nr_samples(struct io_log *); +void regrow_logs(struct thread_data *); + +static inline struct io_sample *get_sample(struct io_log *iolog, + struct io_logs *cur_log, + uint64_t sample) +{ + return __get_sample(cur_log->log, iolog->log_offset, sample); +} + enum { IP_F_ONRB = 1, IP_F_ONLIST = 2, IP_F_TRIMMED = 4, + IP_F_IN_FLIGHT = 8, }; /* @@ -69,7 +201,7 @@ */ struct io_piece { union { - struct rb_node rb_node; + struct fio_rb_node rb_node; struct flist_head list; }; struct flist_head trim_list; @@ -78,6 +210,7 @@ struct fio_file *file; }; unsigned long long offset; + unsigned short numberio; unsigned long len; 
unsigned int flags; enum fio_ddir ddir; @@ -99,41 +232,80 @@ struct io_u; extern int __must_check read_iolog_get(struct thread_data *, struct io_u *); -extern void log_io_u(struct thread_data *, struct io_u *); +extern void log_io_u(const struct thread_data *, const struct io_u *); extern void log_file(struct thread_data *, struct fio_file *, enum file_log_act); -extern int __must_check init_iolog(struct thread_data *td); +extern bool __must_check init_iolog(struct thread_data *td); extern void log_io_piece(struct thread_data *, struct io_u *); +extern void unlog_io_piece(struct thread_data *, struct io_u *); +extern void trim_io_piece(const struct io_u *); extern void queue_io_piece(struct thread_data *, struct io_piece *); extern void prune_io_piece_log(struct thread_data *); extern void write_iolog_close(struct thread_data *); +extern int iolog_compress_init(struct thread_data *, struct sk_out *); +extern void iolog_compress_exit(struct thread_data *); +extern size_t log_chunk_sizes(struct io_log *); +extern int init_io_u_buffers(struct thread_data *); + +#ifdef CONFIG_ZLIB +extern int iolog_file_inflate(const char *); +#endif /* * Logging */ -extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long, - unsigned int); -extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long, - unsigned int); -extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long, - unsigned int); -extern void add_bw_sample(struct thread_data *, enum fio_ddir, unsigned int, - struct timeval *); -extern void add_iops_sample(struct thread_data *, enum fio_ddir, unsigned int, - struct timeval *); -extern void init_disk_util(struct thread_data *); -extern void update_rusage_stat(struct thread_data *); -extern void setup_log(struct io_log **, unsigned long, int); -extern void finish_log(struct thread_data *, struct io_log *, const char *); -extern void finish_log_named(struct thread_data *, struct io_log *, const char *, const char 
*); -extern void __finish_log(struct io_log *, const char *); -extern struct io_log *agg_io_log[DDIR_RWDIR_CNT]; -extern int write_bw_log; -extern void add_agg_sample(unsigned long, enum fio_ddir, unsigned int); +struct log_params { + struct thread_data *td; + unsigned long avg_msec; + unsigned long hist_msec; + int hist_coarseness; + int log_type; + int log_offset; + int log_gz; + int log_gz_store; + int log_compress; +}; + +static inline bool per_unit_log(struct io_log *log) +{ + return log && (!log->avg_msec || log->log_gz || log->log_gz_store); +} + +static inline bool inline_log(struct io_log *log) +{ + return log->log_type == IO_LOG_TYPE_LAT || + log->log_type == IO_LOG_TYPE_CLAT || + log->log_type == IO_LOG_TYPE_SLAT; +} + +static inline void ipo_bytes_align(unsigned int replay_align, struct io_piece *ipo) +{ + if (!replay_align) + return; + + ipo->offset &= ~(replay_align - (uint64_t)1); +} + +extern void finalize_logs(struct thread_data *td, bool); +extern void setup_log(struct io_log **, struct log_params *, const char *); +extern void flush_log(struct io_log *, bool); +extern void flush_samples(FILE *, void *, uint64_t); +extern uint64_t hist_sum(int, int, uint64_t *, uint64_t *); +extern void free_log(struct io_log *); +extern void fio_writeout_logs(bool); +extern void td_writeout_logs(struct thread_data *, bool); +extern int iolog_cur_flush(struct io_log *, struct io_logs *); static inline void init_ipo(struct io_piece *ipo) { - memset(ipo, 0, sizeof(*ipo)); + INIT_FLIST_HEAD(&ipo->list); INIT_FLIST_HEAD(&ipo->trim_list); } +struct iolog_compress { + struct flist_head list; + void *buf; + size_t len; + unsigned int seq; +}; + #endif diff -Nru fio-2.1.3/io_u.c fio-3.16/io_u.c --- fio-2.1.3/io_u.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/io_u.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,30 +1,30 @@ #include -#include #include -#include -#include #include #include "fio.h" -#include "hash.h" #include "verify.h" #include "trim.h" #include 
"lib/rand.h" #include "lib/axmap.h" +#include "err.h" +#include "lib/pow2.h" +#include "minmax.h" +#include "zbd.h" struct io_completion_data { int nr; /* input */ int error; /* output */ uint64_t bytes_done[DDIR_RWDIR_CNT]; /* output */ - struct timeval time; /* output */ + struct timespec time; /* output */ }; /* * The ->io_axmap contains a map of blocks we have or have not done io * to yet. Used to make sure we cover the entire range in a fair fashion. */ -static int random_map_free(struct fio_file *f, const uint64_t block) +static bool random_map_free(struct fio_file *f, const uint64_t block) { return !axmap_isset(f->io_axmap, block); } @@ -32,21 +32,27 @@ /* * Mark a given offset as used in the map. */ -static void mark_random_map(struct thread_data *td, struct io_u *io_u) +static uint64_t mark_random_map(struct thread_data *td, struct io_u *io_u, + uint64_t offset, uint64_t buflen) { - unsigned int min_bs = td->o.rw_min_bs; + unsigned long long min_bs = td->o.min_bs[io_u->ddir]; struct fio_file *f = io_u->file; - unsigned int nr_blocks; + unsigned long long nr_blocks; uint64_t block; - block = (io_u->offset - f->file_offset) / (uint64_t) min_bs; - nr_blocks = (io_u->buflen + min_bs - 1) / min_bs; + block = (offset - f->file_offset) / (uint64_t) min_bs; + nr_blocks = (buflen + min_bs - 1) / min_bs; + assert(nr_blocks > 0); - if (!(io_u->flags & IO_U_F_BUSY_OK)) + if (!(io_u->flags & IO_U_F_BUSY_OK)) { nr_blocks = axmap_set_nr(f->io_axmap, block, nr_blocks); + assert(nr_blocks > 0); + } + + if ((nr_blocks * min_bs) < buflen) + buflen = nr_blocks * min_bs; - if ((nr_blocks * min_bs) < io_u->buflen) - io_u->buflen = nr_blocks * min_bs; + return buflen; } static uint64_t last_block(struct thread_data *td, struct fio_file *f, @@ -59,14 +65,18 @@ /* * Hmm, should we make sure that ->io_size <= ->real_file_size? + * -> not for now since there is code assuming it could go either. 
*/ max_size = f->io_size; if (max_size > f->real_file_size) max_size = f->real_file_size; - if (td->o.zone_range) + if (td->o.zone_mode == ZONE_MODE_STRIDED && td->o.zone_range) max_size = td->o.zone_range; + if (td->o.min_bs[ddir] > td->o.ba[ddir]) + max_size -= td->o.min_bs[ddir] - td->o.ba[ddir]; + max_blocks = max_size / (uint64_t) td->o.ba[ddir]; if (!max_blocks) return 0; @@ -74,40 +84,26 @@ return max_blocks; } -struct rand_off { - struct flist_head list; - uint64_t off; -}; - static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f, - enum fio_ddir ddir, uint64_t *b) + enum fio_ddir ddir, uint64_t *b, + uint64_t lastb) { - uint64_t r, lastb; - - lastb = last_block(td, f, ddir); - if (!lastb) - return 1; + uint64_t r; - if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE) { - uint64_t rmax; + if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE || + td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) { - rmax = td->o.use_os_rand ? OS_RAND_MAX : FRAND_MAX; - - if (td->o.use_os_rand) { - rmax = OS_RAND_MAX; - r = os_random_long(&td->random_state); - } else { - rmax = FRAND_MAX; - r = __rand(&td->__random_state); - } + r = __rand(&td->random_state); dprint(FD_RANDOM, "off rand %llu\n", (unsigned long long) r); - *b = (lastb - 1) * (r / ((uint64_t) rmax + 1.0)); + *b = lastb * (r / (rand_max(&td->random_state) + 1.0)); } else { uint64_t off = 0; - if (lfsr_next(&f->lfsr, &off, lastb)) + assert(fio_file_lfsr(f)); + + if (lfsr_next(&f->lfsr, &off)) return 1; *b = off; @@ -151,102 +147,180 @@ return 0; } -static int flist_cmp(void *data, struct flist_head *a, struct flist_head *b) +static int __get_next_rand_offset_gauss(struct thread_data *td, + struct fio_file *f, enum fio_ddir ddir, + uint64_t *b) { - struct rand_off *r1 = flist_entry(a, struct rand_off, list); - struct rand_off *r2 = flist_entry(b, struct rand_off, list); - - return r1->off - r2->off; + *b = gauss_next(&f->gauss); + return 0; } -static int get_off_from_method(struct 
thread_data *td, struct fio_file *f, - enum fio_ddir ddir, uint64_t *b) +static int __get_next_rand_offset_zoned_abs(struct thread_data *td, + struct fio_file *f, + enum fio_ddir ddir, uint64_t *b) { - if (td->o.random_distribution == FIO_RAND_DIST_RANDOM) - return __get_next_rand_offset(td, f, ddir, b); - else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) - return __get_next_rand_offset_zipf(td, f, ddir, b); - else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) - return __get_next_rand_offset_pareto(td, f, ddir, b); + struct zone_split_index *zsi; + uint64_t lastb, send, stotal; + unsigned int v; - log_err("fio: unknown random distribution: %d\n", td->o.random_distribution); - return 1; -} + lastb = last_block(td, f, ddir); + if (!lastb) + return 1; -/* - * Sort the reads for a verify phase in batches of verifysort_nr, if - * specified. - */ -static inline int should_sort_io(struct thread_data *td) -{ - if (!td->o.verifysort_nr || !td->o.do_verify) - return 0; - if (!td_random(td)) - return 0; - if (td->runstate != TD_VERIFYING) - return 0; - if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE) - return 0; + if (!td->o.zone_split_nr[ddir]) { +bail: + return __get_next_rand_offset(td, f, ddir, b, lastb); + } - return 1; + /* + * Generate a value, v, between 1 and 100, both inclusive + */ + v = rand_between(&td->zone_state, 1, 100); + + /* + * Find our generated table. 'send' is the end block of this zone, + * 'stotal' is our start offset. + */ + zsi = &td->zone_state_index[ddir][v - 1]; + stotal = zsi->size_prev / td->o.ba[ddir]; + send = zsi->size / td->o.ba[ddir]; + + /* + * Should never happen + */ + if (send == -1U) { + if (!fio_did_warn(FIO_WARN_ZONED_BUG)) + log_err("fio: bug in zoned generation\n"); + goto bail; + } else if (send > lastb) { + /* + * This happens if the user specifies ranges that exceed + * the file/device size. We can't handle that gracefully, + * so error and exit. 
+ */ + log_err("fio: zoned_abs sizes exceed file size\n"); + return 1; + } + + /* + * Generate index from 0..send-stotal + */ + if (__get_next_rand_offset(td, f, ddir, b, send - stotal) == 1) + return 1; + + *b += stotal; + return 0; } -static int should_do_random(struct thread_data *td, enum fio_ddir ddir) -{ - unsigned int v; - unsigned long r; +static int __get_next_rand_offset_zoned(struct thread_data *td, + struct fio_file *f, enum fio_ddir ddir, + uint64_t *b) +{ + unsigned int v, send, stotal; + uint64_t offset, lastb; + struct zone_split_index *zsi; - if (td->o.perc_rand[ddir] == 100) + lastb = last_block(td, f, ddir); + if (!lastb) return 1; - if (td->o.use_os_rand) { - r = os_random_long(&td->seq_rand_state[ddir]); - v = 1 + (int) (100.0 * (r / (OS_RAND_MAX + 1.0))); - } else { - r = __rand(&td->__seq_rand_state[ddir]); - v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0))); + if (!td->o.zone_split_nr[ddir]) { +bail: + return __get_next_rand_offset(td, f, ddir, b, lastb); } - return v <= td->o.perc_rand[ddir]; + /* + * Generate a value, v, between 1 and 100, both inclusive + */ + v = rand_between(&td->zone_state, 1, 100); + + zsi = &td->zone_state_index[ddir][v - 1]; + stotal = zsi->size_perc_prev; + send = zsi->size_perc; + + /* + * Should never happen + */ + if (send == -1U) { + if (!fio_did_warn(FIO_WARN_ZONED_BUG)) + log_err("fio: bug in zoned generation\n"); + goto bail; + } + + /* + * 'send' is some percentage below or equal to 100 that + * marks the end of the current IO range. 'stotal' marks + * the start, in percent. 
+ */ + if (stotal) + offset = stotal * lastb / 100ULL; + else + offset = 0; + + lastb = lastb * (send - stotal) / 100ULL; + + /* + * Generate index from 0..send-of-lastb + */ + if (__get_next_rand_offset(td, f, ddir, b, lastb) == 1) + return 1; + + /* + * Add our start offset, if any + */ + if (offset) + *b += offset; + + return 0; } static int get_next_rand_offset(struct thread_data *td, struct fio_file *f, enum fio_ddir ddir, uint64_t *b) { - struct rand_off *r; - int i, ret = 1; + if (td->o.random_distribution == FIO_RAND_DIST_RANDOM) { + uint64_t lastb; - if (!should_sort_io(td)) - return get_off_from_method(td, f, ddir, b); + lastb = last_block(td, f, ddir); + if (!lastb) + return 1; - if (!flist_empty(&td->next_rand_list)) { - struct rand_off *r; -fetch: - r = flist_entry(td->next_rand_list.next, struct rand_off, list); - flist_del(&r->list); - *b = r->off; - free(r); - return 0; - } + return __get_next_rand_offset(td, f, ddir, b, lastb); + } else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) + return __get_next_rand_offset_zipf(td, f, ddir, b); + else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) + return __get_next_rand_offset_pareto(td, f, ddir, b); + else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS) + return __get_next_rand_offset_gauss(td, f, ddir, b); + else if (td->o.random_distribution == FIO_RAND_DIST_ZONED) + return __get_next_rand_offset_zoned(td, f, ddir, b); + else if (td->o.random_distribution == FIO_RAND_DIST_ZONED_ABS) + return __get_next_rand_offset_zoned_abs(td, f, ddir, b); + + log_err("fio: unknown random distribution: %d\n", td->o.random_distribution); + return 1; +} - for (i = 0; i < td->o.verifysort_nr; i++) { - r = malloc(sizeof(*r)); +static bool should_do_random(struct thread_data *td, enum fio_ddir ddir) +{ + unsigned int v; - ret = get_off_from_method(td, f, ddir, &r->off); - if (ret) { - free(r); - break; - } + if (td->o.perc_rand[ddir] == 100) + return true; - flist_add(&r->list, &td->next_rand_list); - } 
+ v = rand_between(&td->seq_rand_state[ddir], 1, 100); - if (ret && !i) - return ret; + return v <= td->o.perc_rand[ddir]; +} + +static void loop_cache_invalidate(struct thread_data *td, struct fio_file *f) +{ + struct thread_options *o = &td->o; - assert(!flist_empty(&td->next_rand_list)); - flist_sort(NULL, &td->next_rand_list, flist_cmp); - goto fetch; + if (o->invalidate_cache && !o->odirect) { + int fio_unused ret; + + ret = file_invalidate_cache(td, f); + } } static int get_next_rand_block(struct thread_data *td, struct fio_file *f, @@ -255,14 +329,16 @@ if (!get_next_rand_offset(td, f, ddir, b)) return 0; - if (td->o.time_based) { + if (td->o.time_based || + (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)) { fio_file_reset(td, f); + loop_cache_invalidate(td, f); if (!get_next_rand_offset(td, f, ddir, b)) return 0; } dprint(FD_IO, "%s: rand offset failed, last=%llu, size=%llu\n", - f->file_name, (unsigned long long) f->last_pos, + f->file_name, (unsigned long long) f->last_pos[ddir], (unsigned long long) f->real_file_size); return 1; } @@ -270,20 +346,57 @@ static int get_next_seq_offset(struct thread_data *td, struct fio_file *f, enum fio_ddir ddir, uint64_t *offset) { + struct thread_options *o = &td->o; + assert(ddir_rw(ddir)); - if (f->last_pos >= f->io_size + get_start_offset(td) && td->o.time_based) - f->last_pos = f->last_pos - f->io_size; + /* + * If we reach the end for a time based run, reset us back to 0 + * and invalidate the cache, if we need to. 
+ */ + if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) && + o->time_based) { + f->last_pos[ddir] = f->file_offset; + loop_cache_invalidate(td, f); + } - if (f->last_pos < f->real_file_size) { + if (f->last_pos[ddir] < f->real_file_size) { uint64_t pos; - if (f->last_pos == f->file_offset && td->o.ddir_seq_add < 0) - f->last_pos = f->real_file_size; + /* + * Only rewind if we already hit the end + */ + if (f->last_pos[ddir] == f->file_offset && + f->file_offset && o->ddir_seq_add < 0) { + if (f->real_file_size > f->io_size) + f->last_pos[ddir] = f->io_size; + else + f->last_pos[ddir] = f->real_file_size; + } - pos = f->last_pos - f->file_offset; - if (pos) - pos += td->o.ddir_seq_add; + pos = f->last_pos[ddir] - f->file_offset; + if (pos && o->ddir_seq_add) { + pos += o->ddir_seq_add; + + /* + * If we reach beyond the end of the file + * with holed IO, wrap around to the + * beginning again. If we're doing backwards IO, + * wrap to the end. + */ + if (pos >= f->real_file_size) { + if (o->ddir_seq_add > 0) + pos = f->file_offset; + else { + if (f->real_file_size > f->io_size) + pos = f->io_size; + else + pos = f->real_file_size; + + pos += o->ddir_seq_add; + } + } + } *offset = pos; return 0; @@ -294,7 +407,7 @@ static int get_next_block(struct thread_data *td, struct io_u *io_u, enum fio_ddir ddir, int rw_seq, - unsigned int *is_random) + bool *is_random) { struct fio_file *f = io_u->file; uint64_t b, offset; @@ -308,31 +421,31 @@ if (td_random(td)) { if (should_do_random(td, ddir)) { ret = get_next_rand_block(td, f, ddir, &b); - *is_random = 1; + *is_random = true; } else { - *is_random = 0; - io_u->flags |= IO_U_F_BUSY_OK; + *is_random = false; + io_u_set(td, io_u, IO_U_F_BUSY_OK); ret = get_next_seq_offset(td, f, ddir, &offset); if (ret) ret = get_next_rand_block(td, f, ddir, &b); } } else { - *is_random = 0; + *is_random = false; ret = get_next_seq_offset(td, f, ddir, &offset); } } else { - io_u->flags |= IO_U_F_BUSY_OK; - *is_random = 0; + 
io_u_set(td, io_u, IO_U_F_BUSY_OK); + *is_random = false; if (td->o.rw_seq == RW_SEQ_SEQ) { ret = get_next_seq_offset(td, f, ddir, &offset); if (ret) { ret = get_next_rand_block(td, f, ddir, &b); - *is_random = 0; + *is_random = false; } } else if (td->o.rw_seq == RW_SEQ_IDENT) { - if (f->last_start != -1ULL) - offset = f->last_start - f->file_offset; + if (f->last_start[ddir] != -1ULL) + offset = f->last_start[ddir] - f->file_offset; else offset = 0; ret = 0; @@ -361,8 +474,8 @@ * until we find a free one. For sequential io, just return the end of * the last io issued. */ -static int __get_next_offset(struct thread_data *td, struct io_u *io_u, - unsigned int *is_random) +static int get_next_offset(struct thread_data *td, struct io_u *io_u, + bool *is_random) { struct fio_file *f = io_u->file; enum fio_ddir ddir = io_u->ddir; @@ -396,41 +509,27 @@ return 0; } -static int get_next_offset(struct thread_data *td, struct io_u *io_u, - unsigned int *is_random) -{ - if (td->flags & TD_F_PROFILE_OPS) { - struct prof_io_ops *ops = &td->prof_io_ops; - - if (ops->fill_io_u_off) - return ops->fill_io_u_off(td, io_u, is_random); - } - - return __get_next_offset(td, io_u, is_random); -} - -static inline int io_u_fits(struct thread_data *td, struct io_u *io_u, - unsigned int buflen) +static inline bool io_u_fits(struct thread_data *td, struct io_u *io_u, + unsigned long long buflen) { struct fio_file *f = io_u->file; - return io_u->offset + buflen <= f->io_size + get_start_offset(td); + return io_u->offset + buflen <= f->io_size + get_start_offset(td, f); } -static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u, - unsigned int is_random) +static unsigned long long get_next_buflen(struct thread_data *td, struct io_u *io_u, + bool is_random) { int ddir = io_u->ddir; - unsigned int buflen = 0; - unsigned int minbs, maxbs; - unsigned long r, rand_max; + unsigned long long buflen = 0; + unsigned long long minbs, maxbs; + uint64_t frand_max, r; + bool 
power_2; - assert(ddir_rw(io_u->ddir)); + assert(ddir_rw(ddir)); if (td->o.bs_is_seq_rand) - ddir = is_random ? DDIR_WRITE: DDIR_READ; - else - ddir = io_u->ddir; + ddir = is_random ? DDIR_WRITE : DDIR_READ; minbs = td->o.min_bs[ddir]; maxbs = td->o.max_bs[ddir]; @@ -444,62 +543,42 @@ if (!io_u_fits(td, io_u, minbs)) return 0; - if (td->o.use_os_rand) - rand_max = OS_RAND_MAX; - else - rand_max = FRAND_MAX; - + frand_max = rand_max(&td->bsrange_state[ddir]); do { - if (td->o.use_os_rand) - r = os_random_long(&td->bsrange_state); - else - r = __rand(&td->__bsrange_state); + r = __rand(&td->bsrange_state[ddir]); if (!td->o.bssplit_nr[ddir]) { - buflen = 1 + (unsigned int) ((double) maxbs * - (r / (rand_max + 1.0))); - if (buflen < minbs) - buflen = minbs; + buflen = minbs + (unsigned long long) ((double) maxbs * + (r / (frand_max + 1.0))); } else { - long perc = 0; + long long perc = 0; unsigned int i; for (i = 0; i < td->o.bssplit_nr[ddir]; i++) { struct bssplit *bsp = &td->o.bssplit[ddir][i]; + if (!bsp->perc) + continue; buflen = bsp->bs; perc += bsp->perc; - if ((r <= ((rand_max / 100L) * perc)) && + if ((r / perc <= frand_max / 100ULL) && io_u_fits(td, io_u, buflen)) break; } } - if (td->o.do_verify && td->o.verify != VERIFY_NONE) - buflen = (buflen + td->o.verify_interval - 1) & - ~(td->o.verify_interval - 1); - - if (!td->o.bs_unaligned && is_power_of_2(minbs)) - buflen = (buflen + minbs - 1) & ~(minbs - 1); - + power_2 = is_power_of_2(minbs); + if (!td->o.bs_unaligned && power_2) + buflen &= ~(minbs - 1); + else if (!td->o.bs_unaligned && !power_2) + buflen -= buflen % minbs; + if (buflen > maxbs) + buflen = maxbs; } while (!io_u_fits(td, io_u, buflen)); return buflen; } -static unsigned int get_next_buflen(struct thread_data *td, struct io_u *io_u, - unsigned int is_random) -{ - if (td->flags & TD_F_PROFILE_OPS) { - struct prof_io_ops *ops = &td->prof_io_ops; - - if (ops->fill_io_u_size) - return ops->fill_io_u_size(td, io_u, is_random); - } - - return 
__get_next_buflen(td, io_u, is_random); -} - static void set_rwmix_bytes(struct thread_data *td) { unsigned int diff; @@ -516,15 +595,8 @@ static inline enum fio_ddir get_rand_ddir(struct thread_data *td) { unsigned int v; - unsigned long r; - if (td->o.use_os_rand) { - r = os_random_long(&td->rwmix_state); - v = 1 + (int) (100.0 * (r / (OS_RAND_MAX + 1.0))); - } else { - r = __rand(&td->__rwmix_state); - v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0))); - } + v = rand_between(&td->rwmix_state, 1, 100); if (v <= td->o.rwmix[DDIR_READ]) return DDIR_READ; @@ -532,8 +604,10 @@ return DDIR_WRITE; } -void io_u_quiesce(struct thread_data *td) +int io_u_quiesce(struct thread_data *td) { + int ret = 0, completed = 0; + /* * We are going to sleep, ensure that we flush anything pending as * not to skew our latency numbers. @@ -543,64 +617,70 @@ * io's that have been actually submitted to an async engine, * and cur_depth is meaningless for sync engines. */ - while (td->io_u_in_flight) { - int fio_unused ret; + if (td->io_u_queued || td->cur_depth) + td_io_commit(td); - ret = io_u_queued_complete(td, 1, NULL); + while (td->io_u_in_flight) { + ret = io_u_queued_complete(td, 1); + if (ret > 0) + completed += ret; + else if (ret < 0) + break; } + + if (td->flags & TD_F_REGROW_LOGS) + regrow_logs(td); + + if (completed) + return completed; + + return ret; } static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir) { enum fio_ddir odir = ddir ^ 1; - struct timeval t; - long usec; + uint64_t usec; + uint64_t now; assert(ddir_rw(ddir)); + now = utime_since_now(&td->start); - if (td->rate_pending_usleep[ddir] <= 0) + /* + * if rate_next_io_time is in the past, need to catch up to rate + */ + if (td->rate_next_io_time[ddir] <= now) return ddir; /* - * We have too much pending sleep in this direction. See if we + * We are ahead of rate in this direction. See if we * should switch. 
*/ if (td_rw(td) && td->o.rwmix[odir]) { /* - * Other direction does not have too much pending, switch + * Other direction is behind rate, switch */ - if (td->rate_pending_usleep[odir] < 100000) + if (td->rate_next_io_time[odir] <= now) return odir; /* - * Both directions have pending sleep. Sleep the minimum time - * and deduct from both. + * Both directions are ahead of rate. sleep the min, + * switch if necessary */ - if (td->rate_pending_usleep[ddir] <= - td->rate_pending_usleep[odir]) { - usec = td->rate_pending_usleep[ddir]; + if (td->rate_next_io_time[ddir] <= + td->rate_next_io_time[odir]) { + usec = td->rate_next_io_time[ddir] - now; } else { - usec = td->rate_pending_usleep[odir]; + usec = td->rate_next_io_time[odir] - now; ddir = odir; } } else - usec = td->rate_pending_usleep[ddir]; + usec = td->rate_next_io_time[ddir] - now; - io_u_quiesce(td); + if (td->o.io_submit_mode == IO_MODE_INLINE) + io_u_quiesce(td); - fio_gettime(&t, NULL); usec_sleep(td, usec); - usec = utime_since_now(&t); - - td->rate_pending_usleep[ddir] -= usec; - - odir = ddir ^ 1; - if (td_rw(td) && __should_check_rate(td, odir)) - td->rate_pending_usleep[odir] -= usec; - - if (ddir_trim(ddir)) - return ddir; - return ddir; } @@ -614,28 +694,22 @@ enum fio_ddir ddir; /* - * see if it's time to fsync + * See if it's time to fsync/fdatasync/sync_file_range first, + * and if not then move on to check regular I/Os. 
*/ - if (td->o.fsync_blocks && - !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) && - td->io_issues[DDIR_WRITE] && should_fsync(td)) - return DDIR_SYNC; - - /* - * see if it's time to fdatasync - */ - if (td->o.fdatasync_blocks && - !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) && - td->io_issues[DDIR_WRITE] && should_fsync(td)) - return DDIR_DATASYNC; - - /* - * see if it's time to sync_file_range - */ - if (td->sync_file_range_nr && - !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) && - td->io_issues[DDIR_WRITE] && should_fsync(td)) - return DDIR_SYNC_FILE_RANGE; + if (should_fsync(td)) { + if (td->o.fsync_blocks && td->io_issues[DDIR_WRITE] && + !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks)) + return DDIR_SYNC; + + if (td->o.fdatasync_blocks && td->io_issues[DDIR_WRITE] && + !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks)) + return DDIR_DATASYNC; + + if (td->sync_file_range_nr && td->io_issues[DDIR_WRITE] && + !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr)) + return DDIR_SYNC_FILE_RANGE; + } if (td_rw(td)) { /* @@ -659,8 +733,10 @@ ddir = DDIR_READ; else if (td_write(td)) ddir = DDIR_WRITE; - else + else if (td_trim(td)) ddir = DDIR_TRIM; + else + ddir = DDIR_INVAL; td->rwmix_ddir = rate_ddir(td, ddir); return td->rwmix_ddir; @@ -668,18 +744,28 @@ static void set_rw_ddir(struct thread_data *td, struct io_u *io_u) { - io_u->ddir = io_u->acct_ddir = get_rw_ddir(td); + enum fio_ddir ddir = get_rw_ddir(td); + + if (td_trimwrite(td)) { + struct fio_file *f = io_u->file; + if (f->last_pos[DDIR_WRITE] == f->last_pos[DDIR_TRIM]) + ddir = DDIR_TRIM; + else + ddir = DDIR_WRITE; + } + + io_u->ddir = io_u->acct_ddir = ddir; - if (io_u->ddir == DDIR_WRITE && (td->io_ops->flags & FIO_BARRIER) && + if (io_u->ddir == DDIR_WRITE && td_ioengine_flagged(td, FIO_BARRIER) && td->o.barrier_blocks && !(td->io_issues[DDIR_WRITE] % td->o.barrier_blocks) && td->io_issues[DDIR_WRITE]) - io_u->flags |= IO_U_F_BARRIER; + io_u_set(td, io_u, IO_U_F_BARRIER); } 
void put_file_log(struct thread_data *td, struct fio_file *f) { - int ret = put_file(td, f); + unsigned int ret = put_file(td, f); if (ret) td_verror(td, ret, "file close"); @@ -687,54 +773,122 @@ void put_io_u(struct thread_data *td, struct io_u *io_u) { - td_io_u_lock(td); + const bool needs_lock = td_async_processing(td); + + zbd_put_io_u(io_u); + + if (td->parent) + td = td->parent; - if (io_u->file && !(io_u->flags & IO_U_F_FREE_DEF)) + if (needs_lock) + __td_io_u_lock(td); + + if (io_u->file && !(io_u->flags & IO_U_F_NO_FILE_PUT)) put_file_log(td, io_u->file); + io_u->file = NULL; - io_u->flags &= ~IO_U_F_FREE_DEF; - io_u->flags |= IO_U_F_FREE; + io_u_set(td, io_u, IO_U_F_FREE); - if (io_u->flags & IO_U_F_IN_CUR_DEPTH) + if (io_u->flags & IO_U_F_IN_CUR_DEPTH) { td->cur_depth--; + assert(!(td->flags & TD_F_CHILD)); + } io_u_qpush(&td->io_u_freelist, io_u); - td_io_u_unlock(td); td_io_u_free_notify(td); + + if (needs_lock) + __td_io_u_unlock(td); } void clear_io_u(struct thread_data *td, struct io_u *io_u) { - io_u->flags &= ~IO_U_F_FLIGHT; + io_u_clear(td, io_u, IO_U_F_FLIGHT); put_io_u(td, io_u); } void requeue_io_u(struct thread_data *td, struct io_u **io_u) { + const bool needs_lock = td_async_processing(td); struct io_u *__io_u = *io_u; enum fio_ddir ddir = acct_ddir(__io_u); dprint(FD_IO, "requeue %p\n", __io_u); - td_io_u_lock(td); + if (td->parent) + td = td->parent; - __io_u->flags |= IO_U_F_FREE; + if (needs_lock) + __td_io_u_lock(td); + + io_u_set(td, __io_u, IO_U_F_FREE); if ((__io_u->flags & IO_U_F_FLIGHT) && ddir_rw(ddir)) td->io_issues[ddir]--; - __io_u->flags &= ~IO_U_F_FLIGHT; - if (__io_u->flags & IO_U_F_IN_CUR_DEPTH) + io_u_clear(td, __io_u, IO_U_F_FLIGHT); + if (__io_u->flags & IO_U_F_IN_CUR_DEPTH) { td->cur_depth--; + assert(!(td->flags & TD_F_CHILD)); + } io_u_rpush(&td->io_u_requeues, __io_u); - td_io_u_unlock(td); + td_io_u_free_notify(td); + + if (needs_lock) + __td_io_u_unlock(td); + *io_u = NULL; } +static void 
setup_strided_zone_mode(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + + assert(td->o.zone_mode == ZONE_MODE_STRIDED); + assert(td->o.zone_size); + assert(td->o.zone_range); + + /* + * See if it's time to switch to a new zone + */ + if (td->zone_bytes >= td->o.zone_size && td->o.zone_skip) { + td->zone_bytes = 0; + f->file_offset += td->o.zone_range + td->o.zone_skip; + + /* + * Wrap from the beginning, if we exceed the file size + */ + if (f->file_offset >= f->real_file_size) + f->file_offset = get_start_offset(td, f); + + f->last_pos[io_u->ddir] = f->file_offset; + td->io_skip_bytes += td->o.zone_skip; + } + + /* + * If zone_size > zone_range, then maintain the same zone until + * zone_bytes >= zone_size. + */ + if (f->last_pos[io_u->ddir] >= (f->file_offset + td->o.zone_range)) { + dprint(FD_IO, "io_u maintain zone offset=%" PRIu64 "/last_pos=%" PRIu64 "\n", + f->file_offset, f->last_pos[io_u->ddir]); + f->last_pos[io_u->ddir] = f->file_offset; + } + + /* + * For random: if 'norandommap' is not set and zone_size > zone_range, + * map needs to be reset as it's done with zone_range everytime. 
+ */ + if ((td->zone_bytes % td->o.zone_range) == 0) + fio_file_reset(td, f); +} + static int fill_io_u(struct thread_data *td, struct io_u *io_u) { - unsigned int is_random; + bool is_random; + uint64_t offset; + enum io_u_action ret; - if (td->io_ops->flags & FIO_NOIO) + if (td_ioengine_flagged(td, FIO_NOIO)) goto out; set_rw_ddir(td, io_u); @@ -745,15 +899,10 @@ if (!ddir_rw(io_u->ddir)) goto out; - /* - * See if it's time to switch to a new zone - */ - if (td->zone_bytes >= td->o.zone_size && td->o.zone_skip) { - td->zone_bytes = 0; - io_u->file->file_offset += td->o.zone_range + td->o.zone_skip; - io_u->file->last_pos = io_u->file->file_offset; - td->io_skip_bytes += td->o.zone_skip; - } + if (td->o.zone_mode == ZONE_MODE_STRIDED) + setup_strided_zone_mode(td, io_u); + else if (td->o.zone_mode == ZONE_MODE_ZBD) + setup_zbd_zone_mode(td, io_u); /* * No log, let the seq/rand engine retrieve the next buflen and @@ -770,9 +919,16 @@ return 1; } + offset = io_u->offset; + if (td->o.zone_mode == ZONE_MODE_ZBD) { + ret = zbd_adjust_block(td, io_u); + if (ret == io_u_eof) + return 1; + } + if (io_u->offset + io_u->buflen > io_u->file->real_file_size) { - dprint(FD_IO, "io_u %p, offset too large\n", io_u); - dprint(FD_IO, " off=%llu/%lu > %llu\n", + dprint(FD_IO, "io_u %p, off=0x%llx + len=0x%llx exceeds file size=0x%llx\n", + io_u, (unsigned long long) io_u->offset, io_u->buflen, (unsigned long long) io_u->file->real_file_size); return 1; @@ -782,15 +938,15 @@ * mark entry before potentially trimming io_u */ if (td_random(td) && file_randommap(td, io_u->file)) - mark_random_map(td, io_u); + io_u->buflen = mark_random_map(td, io_u, offset, io_u->buflen); out: - dprint_io_u(io_u, "fill_io_u"); + dprint_io_u(io_u, "fill"); td->zone_bytes += io_u->buflen; return 0; } -static void __io_u_mark_map(unsigned int *map, unsigned int nr) +static void __io_u_mark_map(uint64_t *map, unsigned int nr) { int idx = 0; @@ -860,11 +1016,52 @@ td->ts.io_u_map[idx] += nr; } -static void 
io_u_mark_lat_usec(struct thread_data *td, unsigned long usec) +static void io_u_mark_lat_nsec(struct thread_data *td, unsigned long long nsec) +{ + int idx = 0; + + assert(nsec < 1000); + + switch (nsec) { + case 750 ... 999: + idx = 9; + break; + case 500 ... 749: + idx = 8; + break; + case 250 ... 499: + idx = 7; + break; + case 100 ... 249: + idx = 6; + break; + case 50 ... 99: + idx = 5; + break; + case 20 ... 49: + idx = 4; + break; + case 10 ... 19: + idx = 3; + break; + case 4 ... 9: + idx = 2; + break; + case 2 ... 3: + idx = 1; + case 0 ... 1: + break; + } + + assert(idx < FIO_IO_U_LAT_N_NR); + td->ts.io_u_lat_n[idx]++; +} + +static void io_u_mark_lat_usec(struct thread_data *td, unsigned long long usec) { int idx = 0; - assert(usec < 1000); + assert(usec < 1000 && usec >= 1); switch (usec) { case 750 ... 999: @@ -901,10 +1098,12 @@ td->ts.io_u_lat_u[idx]++; } -static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec) +static void io_u_mark_lat_msec(struct thread_data *td, unsigned long long msec) { int idx = 0; + assert(msec >= 1); + switch (msec) { default: idx = 11; @@ -946,12 +1145,42 @@ td->ts.io_u_lat_m[idx]++; } -static void io_u_mark_latency(struct thread_data *td, unsigned long usec) +static void io_u_mark_latency(struct thread_data *td, unsigned long long nsec) { - if (usec < 1000) - io_u_mark_lat_usec(td, usec); + if (nsec < 1000) + io_u_mark_lat_nsec(td, nsec); + else if (nsec < 1000000) + io_u_mark_lat_usec(td, nsec / 1000); else - io_u_mark_lat_msec(td, usec / 1000); + io_u_mark_lat_msec(td, nsec / 1000000); +} + +static unsigned int __get_next_fileno_rand(struct thread_data *td) +{ + unsigned long fileno; + + if (td->o.file_service_type == FIO_FSERVICE_RANDOM) { + uint64_t frand_max = rand_max(&td->next_file_state); + unsigned long r; + + r = __rand(&td->next_file_state); + return (unsigned int) ((double) td->o.nr_files + * (r / (frand_max + 1.0))); + } + + if (td->o.file_service_type == FIO_FSERVICE_ZIPF) + fileno = 
zipf_next(&td->next_file_zipf); + else if (td->o.file_service_type == FIO_FSERVICE_PARETO) + fileno = pareto_next(&td->next_file_zipf); + else if (td->o.file_service_type == FIO_FSERVICE_GAUSS) + fileno = gauss_next(&td->next_file_gauss); + else { + log_err("fio: bad file service type: %d\n", td->o.file_service_type); + assert(0); + return 0; + } + + return fileno >> FIO_FSERVICE_SHIFT; } /* @@ -966,17 +1195,8 @@ do { int opened = 0; - unsigned long r; - if (td->o.use_os_rand) { - r = os_random_long(&td->next_file_state); - fno = (unsigned int) ((double) td->o.nr_files - * (r / (OS_RAND_MAX + 1.0))); - } else { - r = __rand(&td->__next_file_state); - fno = (unsigned int) ((double) td->o.nr_files - * (r / (FRAND_MAX + 1.0))); - } + fno = __get_next_fileno_rand(td); f = td->files[fno]; if (fio_file_done(f)) @@ -985,6 +1205,9 @@ if (!fio_file_open(f)) { int err; + if (td->nr_open_files >= td->o.open_files) + return ERR_PTR(-EBUSY); + err = td_io_open_file(td, f); if (err) continue; @@ -1027,6 +1250,9 @@ if (!fio_file_open(f)) { int err; + if (td->nr_open_files >= td->o.open_files) + return ERR_PTR(-EBUSY); + err = td_io_open_file(td, f); if (err) { dprint(FD_FILE, "error %d on open of %s\n", @@ -1080,33 +1306,32 @@ else f = get_next_file_rand(td, FIO_FILE_open, FIO_FILE_closing); + if (IS_ERR(f)) + return f; + td->file_service_file = f; td->file_service_left = td->file_service_nr - 1; out: - dprint(FD_FILE, "get_next_file: %p [%s]\n", f, f->file_name); + if (f) + dprint(FD_FILE, "get_next_file: %p [%s]\n", f, f->file_name); + else + dprint(FD_FILE, "get_next_file: NULL\n"); return f; } static struct fio_file *get_next_file(struct thread_data *td) { - if (!(td->flags & TD_F_PROFILE_OPS)) { - struct prof_io_ops *ops = &td->prof_io_ops; - - if (ops->get_next_file) - return ops->get_next_file(td); - } - return __get_next_file(td); } -static int set_io_u_file(struct thread_data *td, struct io_u *io_u) +static long set_io_u_file(struct thread_data *td, struct io_u *io_u) { 
struct fio_file *f; do { f = get_next_file(td); - if (!f) - return 1; + if (IS_ERR_OR_NULL(f)) + return PTR_ERR(f); io_u->file = f; get_file(f); @@ -1114,90 +1339,261 @@ if (!fill_io_u(td, io_u)) break; + zbd_put_io_u(io_u); + put_file_log(td, f); td_io_close_file(td, f); io_u->file = NULL; - fio_file_set_done(f); - td->nr_done_files++; - dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name, + if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM) + fio_file_reset(td, f); + else { + fio_file_set_done(f); + td->nr_done_files++; + dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name, td->nr_done_files, td->o.nr_files); + } } while (1); return 0; } +static void lat_fatal(struct thread_data *td, struct io_completion_data *icd, + unsigned long long tnsec, unsigned long long max_nsec) +{ + if (!td->error) + log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec)\n", tnsec, max_nsec); + td_verror(td, ETIMEDOUT, "max latency exceeded"); + icd->error = ETIMEDOUT; +} + +static void lat_new_cycle(struct thread_data *td) +{ + fio_gettime(&td->latency_ts, NULL); + td->latency_ios = ddir_rw_sum(td->io_blocks); + td->latency_failed = 0; +} + +/* + * We had an IO outside the latency target. Reduce the queue depth. If we + * are at QD=1, then it's time to give up. + */ +static bool __lat_target_failed(struct thread_data *td) +{ + if (td->latency_qd == 1) + return true; + + td->latency_qd_high = td->latency_qd; + + if (td->latency_qd == td->latency_qd_low) + td->latency_qd_low--; + + td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2; + + dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); + + /* + * When we ramp QD down, quiesce existing IO to prevent + * a storm of ramp downs due to pending higher depth. 
+ */ + io_u_quiesce(td); + lat_new_cycle(td); + return false; +} + +static bool lat_target_failed(struct thread_data *td) +{ + if (td->o.latency_percentile.u.f == 100.0) + return __lat_target_failed(td); + + td->latency_failed++; + return false; +} + +void lat_target_init(struct thread_data *td) +{ + td->latency_end_run = 0; + + if (td->o.latency_target) { + dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target); + fio_gettime(&td->latency_ts, NULL); + td->latency_qd = 1; + td->latency_qd_high = td->o.iodepth; + td->latency_qd_low = 1; + td->latency_ios = ddir_rw_sum(td->io_blocks); + } else + td->latency_qd = td->o.iodepth; +} + +void lat_target_reset(struct thread_data *td) +{ + if (!td->latency_end_run) + lat_target_init(td); +} + +static void lat_target_success(struct thread_data *td) +{ + const unsigned int qd = td->latency_qd; + struct thread_options *o = &td->o; + + td->latency_qd_low = td->latency_qd; + + /* + * If we haven't failed yet, we double up to a failing value instead + * of bisecting from highest possible queue depth. If we have set + * a limit other than td->o.iodepth, bisect between that. + */ + if (td->latency_qd_high != o->iodepth) + td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2; + else + td->latency_qd *= 2; + + if (td->latency_qd > o->iodepth) + td->latency_qd = o->iodepth; + + dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); + + /* + * Same as last one, we are done. Let it run a latency cycle, so + * we get only the results from the targeted depth. 
+ */ + if (td->latency_qd == qd) { + if (td->latency_end_run) { + dprint(FD_RATE, "We are done\n"); + td->done = 1; + } else { + dprint(FD_RATE, "Quiesce and final run\n"); + io_u_quiesce(td); + td->latency_end_run = 1; + reset_all_stats(td); + reset_io_stats(td); + } + } + + lat_new_cycle(td); +} + +/* + * Check if we can bump the queue depth + */ +void lat_target_check(struct thread_data *td) +{ + uint64_t usec_window; + uint64_t ios; + double success_ios; + + usec_window = utime_since_now(&td->latency_ts); + if (usec_window < td->o.latency_window) + return; + + ios = ddir_rw_sum(td->io_blocks) - td->latency_ios; + success_ios = (double) (ios - td->latency_failed) / (double) ios; + success_ios *= 100.0; + + dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f); + + if (success_ios >= td->o.latency_percentile.u.f) + lat_target_success(td); + else + __lat_target_failed(td); +} + +/* + * If latency target is enabled, we might be ramping up or down and not + * using the full queue depth available. 
+ */ +bool queue_full(const struct thread_data *td) +{ + const int qempty = io_u_qempty(&td->io_u_freelist); + + if (qempty) + return true; + if (!td->o.latency_target) + return false; + + return td->cur_depth >= td->latency_qd; +} struct io_u *__get_io_u(struct thread_data *td) { - struct io_u *io_u; + const bool needs_lock = td_async_processing(td); + struct io_u *io_u = NULL; + int ret; + + if (td->stop_io) + return NULL; - td_io_u_lock(td); + if (needs_lock) + __td_io_u_lock(td); again: if (!io_u_rempty(&td->io_u_requeues)) io_u = io_u_rpop(&td->io_u_requeues); - else if (!io_u_qempty(&td->io_u_freelist)) { + else if (!queue_full(td)) { io_u = io_u_qpop(&td->io_u_freelist); + io_u->file = NULL; io_u->buflen = 0; io_u->resid = 0; - io_u->file = NULL; io_u->end_io = NULL; } if (io_u) { assert(io_u->flags & IO_U_F_FREE); - io_u->flags &= ~(IO_U_F_FREE | IO_U_F_FREE_DEF); - io_u->flags &= ~(IO_U_F_TRIMMED | IO_U_F_BARRIER); - io_u->flags &= ~IO_U_F_VER_LIST; + io_u_clear(td, io_u, IO_U_F_FREE | IO_U_F_NO_FILE_PUT | + IO_U_F_TRIMMED | IO_U_F_BARRIER | + IO_U_F_VER_LIST); io_u->error = 0; io_u->acct_ddir = -1; td->cur_depth++; - io_u->flags |= IO_U_F_IN_CUR_DEPTH; - } else if (td->o.verify_async) { + assert(!(td->flags & TD_F_CHILD)); + io_u_set(td, io_u, IO_U_F_IN_CUR_DEPTH); + io_u->ipo = NULL; + } else if (td_async_processing(td)) { /* * We ran out, wait for async verify threads to finish and * return one */ - pthread_cond_wait(&td->free_cond, &td->io_u_lock); - goto again; + assert(!(td->flags & TD_F_CHILD)); + ret = pthread_cond_wait(&td->free_cond, &td->io_u_lock); + assert(ret == 0); + if (!td->error) + goto again; } - td_io_u_unlock(td); + if (needs_lock) + __td_io_u_unlock(td); + return io_u; } -static int check_get_trim(struct thread_data *td, struct io_u *io_u) +static bool check_get_trim(struct thread_data *td, struct io_u *io_u) { if (!(td->flags & TD_F_TRIM_BACKLOG)) - return 0; - - if (td->trim_entries) { - int get_trim = 0; - - if (td->trim_batch) { - 
td->trim_batch--; - get_trim = 1; - } else if (!(td->io_hist_len % td->o.trim_backlog) && - td->last_ddir != DDIR_READ) { - td->trim_batch = td->o.trim_batch; - if (!td->trim_batch) - td->trim_batch = td->o.trim_backlog; - get_trim = 1; - } - - if (get_trim && !get_next_trim(td, io_u)) - return 1; + return false; + if (!td->trim_entries) + return false; + + if (td->trim_batch) { + td->trim_batch--; + if (get_next_trim(td, io_u)) + return true; + } else if (!(td->io_hist_len % td->o.trim_backlog) && + td->last_ddir != DDIR_READ) { + td->trim_batch = td->o.trim_batch; + if (!td->trim_batch) + td->trim_batch = td->o.trim_backlog; + if (get_next_trim(td, io_u)) + return true; } - return 0; + return false; } -static int check_get_verify(struct thread_data *td, struct io_u *io_u) +static bool check_get_verify(struct thread_data *td, struct io_u *io_u) { if (!(td->flags & TD_F_VER_BACKLOG)) - return 0; + return false; if (td->io_hist_len) { int get_verify = 0; @@ -1214,11 +1610,11 @@ if (get_verify && !get_next_verify(td, io_u)) { td->verify_batch--; - return 1; + return true; } } - return 0; + return false; } /* @@ -1229,32 +1625,40 @@ */ static void small_content_scramble(struct io_u *io_u) { - unsigned int i, nr_blocks = io_u->buflen / 512; - uint64_t boffset; + unsigned long long i, nr_blocks = io_u->buflen >> 9; unsigned int offset; - void *p, *end; + uint64_t boffset, *iptr; + char *p; if (!nr_blocks) return; p = io_u->xfer_buf; boffset = io_u->offset; - io_u->buf_filled_len = 0; + + if (io_u->buf_filled_len) + io_u->buf_filled_len = 0; + + /* + * Generate random index between 0..7. We do chunks of 512b, if + * we assume a cacheline is 64 bytes, then we have 8 of those. + * Scramble content within the blocks in the same cacheline to + * speed things up. 
+ */ + offset = (io_u->start_time.tv_nsec ^ boffset) & 7; for (i = 0; i < nr_blocks; i++) { /* - * Fill the byte offset into a "random" start offset of - * the buffer, given by the product of the usec time - * and the actual offset. + * Fill offset into start of cacheline, time into end + * of cacheline */ - offset = (io_u->start_time.tv_usec ^ boffset) & 511; - offset &= ~(sizeof(uint64_t) - 1); - if (offset >= 512 - sizeof(uint64_t)) - offset -= sizeof(uint64_t); - memcpy(p + offset, &boffset, sizeof(boffset)); + iptr = (void *) p + (offset << 6); + *iptr = boffset; + + iptr = (void *) p + 64 - 2 * sizeof(uint64_t); + iptr[0] = io_u->start_time.tv_sec; + iptr[1] = io_u->start_time.tv_nsec; - end = p + 512 - sizeof(io_u->start_time); - memcpy(end, &io_u->start_time, sizeof(io_u->start_time)); p += 512; boffset += 512; } @@ -1262,13 +1666,14 @@ /* * Return an io_u to be processed. Gets a buflen and offset, sets direction, - * etc. The returned io_u is fully ready to be prepped and submitted. + * etc. The returned io_u is fully ready to be prepped, populated and submitted. 
*/ struct io_u *get_io_u(struct thread_data *td) { struct fio_file *f; struct io_u *io_u; int do_scramble = 0; + long ret = 0; io_u = __get_io_u(td); if (!io_u) { @@ -1294,32 +1699,37 @@ if (read_iolog_get(td, io_u)) goto err_put; } else if (set_io_u_file(td, io_u)) { + ret = -EBUSY; dprint(FD_IO, "io_u %p, setting file failed\n", io_u); goto err_put; } f = io_u->file; + if (!f) { + dprint(FD_IO, "io_u %p, setting file failed\n", io_u); + goto err_put; + } + assert(fio_file_open(f)); if (ddir_rw(io_u->ddir)) { - if (!io_u->buflen && !(td->io_ops->flags & FIO_NOIO)) { + if (!io_u->buflen && !td_ioengine_flagged(td, FIO_NOIO)) { dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u); goto err_put; } - f->last_start = io_u->offset; - f->last_pos = io_u->offset + io_u->buflen; + f->last_start[io_u->ddir] = io_u->offset; + f->last_pos[io_u->ddir] = io_u->offset + io_u->buflen; if (io_u->ddir == DDIR_WRITE) { if (td->flags & TD_F_REFILL_BUFFERS) { io_u_fill_buffer(td, io_u, - io_u->xfer_buflen, io_u->xfer_buflen); - } else if (td->flags & TD_F_SCRAMBLE_BUFFERS) + td->o.min_bs[DDIR_WRITE], + io_u->buflen); + } else if ((td->flags & TD_F_SCRAMBLE_BUFFERS) && + !(td->flags & TD_F_COMPRESS) && + !(td->flags & TD_F_DO_VERIFY)) do_scramble = 1; - if (td->flags & TD_F_VER_NONE) { - populate_verify_io_u(td, io_u); - do_scramble = 0; - } } else if (io_u->ddir == DDIR_READ) { /* * Reset the buf_filled parameters so next time if the @@ -1338,171 +1748,215 @@ out: assert(io_u->file); if (!td_io_prep(td, io_u)) { - if (!td->o.disable_slat) + if (!td->o.disable_lat) fio_gettime(&io_u->start_time, NULL); + if (do_scramble) small_content_scramble(io_u); + return io_u; } err_put: dprint(FD_IO, "get_io_u failed\n"); put_io_u(td, io_u); - return NULL; + return ERR_PTR(ret); } -void io_u_log_error(struct thread_data *td, struct io_u *io_u) +static void __io_u_log_error(struct thread_data *td, struct io_u *io_u) { enum error_type_bit eb = td_error_type(io_u->ddir, io_u->error); - const char 
*msg[] = { "read", "write", "sync", "datasync", - "sync_file_range", "wait", "trim" }; if (td_non_fatal_error(td, eb, io_u->error) && !td->o.error_dump) return; - log_err("fio: io_u error"); + log_err("fio: io_u error%s%s: %s: %s offset=%llu, buflen=%llu\n", + io_u->file ? " on file " : "", + io_u->file ? io_u->file->file_name : "", + strerror(io_u->error), + io_ddir_name(io_u->ddir), + io_u->offset, io_u->xfer_buflen); - if (io_u->file) - log_err(" on file %s", io_u->file->file_name); - - log_err(": %s\n", strerror(io_u->error)); + if (td->io_ops->errdetails) { + char *err = td->io_ops->errdetails(io_u); - log_err(" %s offset=%llu, buflen=%lu\n", msg[io_u->ddir], - io_u->offset, io_u->xfer_buflen); + log_err("fio: %s\n", err); + free(err); + } if (!td->error) td_verror(td, io_u->error, "io_u error"); } +void io_u_log_error(struct thread_data *td, struct io_u *io_u) +{ + __io_u_log_error(td, io_u); + if (td->parent) + __io_u_log_error(td->parent, io_u); +} + +static inline bool gtod_reduce(struct thread_data *td) +{ + return (td->o.disable_clat && td->o.disable_slat && td->o.disable_bw) + || td->o.gtod_reduce; +} + +static void trim_block_info(struct thread_data *td, struct io_u *io_u) +{ + uint32_t *info = io_u_block_info(td, io_u); + + if (BLOCK_INFO_STATE(*info) >= BLOCK_STATE_TRIM_FAILURE) + return; + + *info = BLOCK_INFO(BLOCK_STATE_TRIMMED, BLOCK_INFO_TRIMS(*info) + 1); +} + static void account_io_completion(struct thread_data *td, struct io_u *io_u, struct io_completion_data *icd, const enum fio_ddir idx, unsigned int bytes) { - unsigned long lusec = 0; + const int no_reduce = !gtod_reduce(td); + unsigned long long llnsec = 0; + + if (td->parent) + td = td->parent; + + if (!td->o.stats || td_ioengine_flagged(td, FIO_NOSTATS)) + return; - if (!td->o.disable_clat || !td->o.disable_bw) - lusec = utime_since(&io_u->issue_time, &icd->time); + if (no_reduce) + llnsec = ntime_since(&io_u->issue_time, &icd->time); if (!td->o.disable_lat) { - unsigned long tusec; + 
unsigned long long tnsec; - tusec = utime_since(&io_u->start_time, &icd->time); - add_lat_sample(td, idx, tusec, bytes); + tnsec = ntime_since(&io_u->start_time, &icd->time); + add_lat_sample(td, idx, tnsec, bytes, io_u->offset); if (td->flags & TD_F_PROFILE_OPS) { struct prof_io_ops *ops = &td->prof_io_ops; if (ops->io_u_lat) - icd->error = ops->io_u_lat(td, tusec); + icd->error = ops->io_u_lat(td, tnsec); } - if (td->o.max_latency && tusec > td->o.max_latency) { - if (!td->error) - log_err("fio: latency of %lu usec exceeds specified max (%u usec)\n", tusec, td->o.max_latency); - td_verror(td, ETIMEDOUT, "max latency exceeded"); - icd->error = ETIMEDOUT; + if (td->o.max_latency && tnsec > td->o.max_latency) + lat_fatal(td, icd, tnsec, td->o.max_latency); + if (td->o.latency_target && tnsec > td->o.latency_target) { + if (lat_target_failed(td)) + lat_fatal(td, icd, tnsec, td->o.latency_target); } } - if (!td->o.disable_clat) { - add_clat_sample(td, idx, lusec, bytes); - io_u_mark_latency(td, lusec); - } + if (ddir_rw(idx)) { + if (!td->o.disable_clat) { + add_clat_sample(td, idx, llnsec, bytes, io_u->offset); + io_u_mark_latency(td, llnsec); + } - if (!td->o.disable_bw) - add_bw_sample(td, idx, bytes, &icd->time); + if (!td->o.disable_bw && per_unit_log(td->bw_log)) + add_bw_sample(td, io_u, bytes, llnsec); - add_iops_sample(td, idx, bytes, &icd->time); + if (no_reduce && per_unit_log(td->iops_log)) + add_iops_sample(td, io_u, bytes); + } else if (ddir_sync(idx) && !td->o.disable_clat) + add_sync_clat_sample(&td->ts, llnsec); - if (td->o.number_ios && !--td->o.number_ios) - td->done = 1; + if (td->ts.nr_block_infos && io_u->ddir == DDIR_TRIM) + trim_block_info(td, io_u); } -static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir) +static void file_log_write_comp(const struct thread_data *td, struct fio_file *f, + uint64_t offset, unsigned int bytes) { - uint64_t secs, remainder, bps, bytes; + int idx; + + if (!f) + return; - bytes = 
td->this_io_bytes[ddir]; - bps = td->rate_bps[ddir]; - secs = bytes / bps; - remainder = bytes % bps; - return remainder * 1000000 / bps + secs * 1000000; + if (f->first_write == -1ULL || offset < f->first_write) + f->first_write = offset; + if (f->last_write == -1ULL || ((offset + bytes) > f->last_write)) + f->last_write = offset + bytes; + + if (!f->last_write_comp) + return; + + idx = f->last_write_idx++; + f->last_write_comp[idx] = offset; + if (f->last_write_idx == td->o.iodepth) + f->last_write_idx = 0; } -static void io_completed(struct thread_data *td, struct io_u *io_u, +static bool should_account(struct thread_data *td) +{ + return ramp_time_over(td) && (td->runstate == TD_RUNNING || + td->runstate == TD_VERIFYING); +} + +static void io_completed(struct thread_data *td, struct io_u **io_u_ptr, struct io_completion_data *icd) { - struct fio_file *f; + struct io_u *io_u = *io_u_ptr; + enum fio_ddir ddir = io_u->ddir; + struct fio_file *f = io_u->file; - dprint_io_u(io_u, "io complete"); + dprint_io_u(io_u, "complete"); - td_io_u_lock(td); assert(io_u->flags & IO_U_F_FLIGHT); - io_u->flags &= ~(IO_U_F_FLIGHT | IO_U_F_BUSY_OK); - td_io_u_unlock(td); + io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK); - if (ddir_sync(io_u->ddir)) { - td->last_was_sync = 1; - f = io_u->file; + /* + * Mark IO ok to verify + */ + if (io_u->ipo) { + /* + * Remove errored entry from the verification list + */ + if (io_u->error) + unlog_io_piece(td, io_u); + else { + io_u->ipo->flags &= ~IP_F_IN_FLIGHT; + write_barrier(); + } + } + + if (ddir_sync(ddir)) { + td->last_was_sync = true; if (f) { f->first_write = -1ULL; f->last_write = -1ULL; } + if (should_account(td)) + account_io_completion(td, io_u, icd, ddir, io_u->buflen); return; } - td->last_was_sync = 0; - td->last_ddir = io_u->ddir; + td->last_was_sync = false; + td->last_ddir = ddir; - if (!io_u->error && ddir_rw(io_u->ddir)) { - unsigned int bytes = io_u->buflen - io_u->resid; - const enum fio_ddir idx = io_u->ddir; - 
const enum fio_ddir odx = io_u->ddir ^ 1; + if (!io_u->error && ddir_rw(ddir)) { + unsigned long long bytes = io_u->buflen - io_u->resid; int ret; - td->io_blocks[idx]++; - td->this_io_blocks[idx]++; - td->io_bytes[idx] += bytes; - - if (!(io_u->flags & IO_U_F_VER_LIST)) - td->this_io_bytes[idx] += bytes; - - if (idx == DDIR_WRITE) { - f = io_u->file; - if (f) { - if (f->first_write == -1ULL || - io_u->offset < f->first_write) - f->first_write = io_u->offset; - if (f->last_write == -1ULL || - ((io_u->offset + bytes) > f->last_write)) - f->last_write = io_u->offset + bytes; - } - } + td->io_blocks[ddir]++; + td->io_bytes[ddir] += bytes; - if (ramp_time_over(td) && (td->runstate == TD_RUNNING || - td->runstate == TD_VERIFYING)) { - account_io_completion(td, io_u, icd, idx, bytes); - - if (__should_check_rate(td, idx)) { - td->rate_pending_usleep[idx] = - (usec_for_io(td, idx) - - utime_since_now(&td->start)); - } - if (idx != DDIR_TRIM && __should_check_rate(td, odx)) - td->rate_pending_usleep[odx] = - (usec_for_io(td, odx) - - utime_since_now(&td->start)); + if (!(io_u->flags & IO_U_F_VER_LIST)) { + td->this_io_blocks[ddir]++; + td->this_io_bytes[ddir] += bytes; } - if (td_write(td) && idx == DDIR_WRITE && - td->o.do_verify && - td->o.verify != VERIFY_NONE && - !td->o.experimental_verify) - log_io_piece(td, io_u); + if (ddir == DDIR_WRITE) + file_log_write_comp(td, f, io_u->offset, bytes); + + if (should_account(td)) + account_io_completion(td, io_u, icd, ddir, bytes); - icd->bytes_done[idx] += bytes; + icd->bytes_done[ddir] += bytes; if (io_u->end_io) { - ret = io_u->end_io(td, io_u); + ret = io_u->end_io(td, io_u_ptr); + io_u = *io_u_ptr; if (ret && !icd->error) icd->error = ret; } @@ -1511,9 +1965,11 @@ io_u_log_error(td, io_u); } if (icd->error) { - enum error_type_bit eb = td_error_type(io_u->ddir, icd->error); + enum error_type_bit eb = td_error_type(ddir, icd->error); + if (!td_non_fatal_error(td, eb, icd->error)) return; + /* * If there is a non_fatal error, 
then add to the error count * and clear all the errors. @@ -1521,7 +1977,8 @@ update_error_count(td, icd->error); td_clear_error(td); icd->error = 0; - io_u->error = 0; + if (io_u) + io_u->error = 0; } } @@ -1529,13 +1986,14 @@ int nr) { int ddir; - if (!td->o.disable_clat || !td->o.disable_bw) + + if (!gtod_reduce(td)) fio_gettime(&icd->time, NULL); icd->nr = nr; icd->error = 0; - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) icd->bytes_done[ddir] = 0; } @@ -1548,9 +2006,9 @@ for (i = 0; i < icd->nr; i++) { io_u = td->io_ops->event(td, i); - io_completed(td, io_u, icd); + io_completed(td, &io_u, icd); - if (!(io_u->flags & IO_U_F_FREE_DEF)) + if (io_u) put_io_u(td, io_u); } } @@ -1558,15 +2016,15 @@ /* * Complete a single io_u for the sync engines. */ -int io_u_sync_complete(struct thread_data *td, struct io_u *io_u, - uint64_t *bytes) +int io_u_sync_complete(struct thread_data *td, struct io_u *io_u) { struct io_completion_data icd; + int ddir; init_icd(td, &icd, 1); - io_completed(td, io_u, &icd); + io_completed(td, &io_u, &icd); - if (!(io_u->flags & IO_U_F_FREE_DEF)) + if (io_u) put_io_u(td, io_u); if (icd.error) { @@ -1574,12 +2032,8 @@ return -1; } - if (bytes) { - int ddir; - - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) - bytes[ddir] += icd.bytes_done[ddir]; - } + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) + td->bytes_done[ddir] += icd.bytes_done[ddir]; return 0; } @@ -1587,20 +2041,23 @@ /* * Called to complete min_events number of io for the async engines. 
*/ -int io_u_queued_complete(struct thread_data *td, int min_evts, - uint64_t *bytes) +int io_u_queued_complete(struct thread_data *td, int min_evts) { struct io_completion_data icd; struct timespec *tvp = NULL; - int ret; + int ret, ddir; struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, }; - dprint(FD_IO, "io_u_queued_completed: min=%d\n", min_evts); + dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts); if (!min_evts) tvp = &ts; + else if (min_evts > td->cur_depth) + min_evts = td->cur_depth; - ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete, tvp); + /* No worries, td_io_getevents fixes min and max if they are + * set incorrectly */ + ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete_max, tvp); if (ret < 0) { td_verror(td, -ret, "td_io_getevents"); return ret; @@ -1614,14 +2071,10 @@ return -1; } - if (bytes) { - int ddir; + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) + td->bytes_done[ddir] += icd.bytes_done[ddir]; - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) - bytes[ddir] += icd.bytes_done[ddir]; - } - - return 0; + return ret; } /* @@ -1629,41 +2082,157 @@ */ void io_u_queued(struct thread_data *td, struct io_u *io_u) { - if (!td->o.disable_slat) { + if (!td->o.disable_slat && ramp_time_over(td) && td->o.stats) { unsigned long slat_time; - slat_time = utime_since(&io_u->start_time, &io_u->issue_time); - add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen); + slat_time = ntime_since(&io_u->start_time, &io_u->issue_time); + + if (td->parent) + td = td->parent; + + add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen, + io_u->offset); + } +} + +/* + * See if we should reuse the last seed, if dedupe is enabled + */ +static struct frand_state *get_buf_state(struct thread_data *td) +{ + unsigned int v; + + if (!td->o.dedupe_percentage) + return &td->buf_state; + else if (td->o.dedupe_percentage == 100) { + frand_copy(&td->buf_state_prev, &td->buf_state); + return &td->buf_state; } + + v = 
rand_between(&td->dedupe_state, 1, 100); + + if (v <= td->o.dedupe_percentage) + return &td->buf_state_prev; + + return &td->buf_state; } -void fill_io_buffer(struct thread_data *td, void *buf, unsigned int min_write, - unsigned int max_bs) +static void save_buf_state(struct thread_data *td, struct frand_state *rs) { - if (!td->o.zero_buffers) { - unsigned int perc = td->o.compress_percentage; + if (td->o.dedupe_percentage == 100) + frand_copy(rs, &td->buf_state_prev); + else if (rs == &td->buf_state) + frand_copy(&td->buf_state_prev, rs); +} - if (perc) { - unsigned int seg = min_write; +void fill_io_buffer(struct thread_data *td, void *buf, unsigned long long min_write, + unsigned long long max_bs) +{ + struct thread_options *o = &td->o; - seg = min(min_write, td->o.compress_chunk); - if (!seg) - seg = min_write; - - fill_random_buf_percentage(&td->buf_state, buf, - perc, seg, max_bs); - } else - fill_random_buf(&td->buf_state, buf, max_bs); - } else + if (o->mem_type == MEM_CUDA_MALLOC) + return; + + if (o->compress_percentage || o->dedupe_percentage) { + unsigned int perc = td->o.compress_percentage; + struct frand_state *rs; + unsigned long long left = max_bs; + unsigned long long this_write; + + do { + rs = get_buf_state(td); + + min_write = min(min_write, left); + + if (perc) { + this_write = min_not_zero(min_write, + (unsigned long long) td->o.compress_chunk); + + fill_random_buf_percentage(rs, buf, perc, + this_write, this_write, + o->buffer_pattern, + o->buffer_pattern_bytes); + } else { + fill_random_buf(rs, buf, min_write); + this_write = min_write; + } + + buf += this_write; + left -= this_write; + save_buf_state(td, rs); + } while (left); + } else if (o->buffer_pattern_bytes) + fill_buffer_pattern(td, buf, max_bs); + else if (o->zero_buffers) memset(buf, 0, max_bs); + else + fill_random_buf(get_buf_state(td), buf, max_bs); } /* * "randomly" fill the buffer contents */ void io_u_fill_buffer(struct thread_data *td, struct io_u *io_u, - unsigned int 
min_write, unsigned int max_bs) + unsigned long long min_write, unsigned long long max_bs) { io_u->buf_filled_len = 0; fill_io_buffer(td, io_u->buf, min_write, max_bs); } + +static int do_sync_file_range(const struct thread_data *td, + struct fio_file *f) +{ + off64_t offset, nbytes; + + offset = f->first_write; + nbytes = f->last_write - f->first_write; + + if (!nbytes) + return 0; + + return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range); +} + +int do_io_u_sync(const struct thread_data *td, struct io_u *io_u) +{ + int ret; + + if (io_u->ddir == DDIR_SYNC) { + ret = fsync(io_u->file->fd); + } else if (io_u->ddir == DDIR_DATASYNC) { +#ifdef CONFIG_FDATASYNC + ret = fdatasync(io_u->file->fd); +#else + ret = io_u->xfer_buflen; + io_u->error = EINVAL; +#endif + } else if (io_u->ddir == DDIR_SYNC_FILE_RANGE) + ret = do_sync_file_range(td, io_u->file); + else { + ret = io_u->xfer_buflen; + io_u->error = EINVAL; + } + + if (ret < 0) + io_u->error = errno; + + return ret; +} + +int do_io_u_trim(const struct thread_data *td, struct io_u *io_u) +{ +#ifndef FIO_HAVE_TRIM + io_u->error = EINVAL; + return 0; +#else + struct fio_file *f = io_u->file; + int ret; + + ret = os_trim(f, io_u->offset, io_u->xfer_buflen); + if (!ret) + return io_u->xfer_buflen; + + io_u->error = ret; + return 0; +#endif +} diff -Nru fio-2.1.3/io_u.h fio-3.16/io_u.h --- fio-2.1.3/io_u.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/io_u.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,197 @@ +#ifndef FIO_IO_U +#define FIO_IO_U + +#include "compiler/compiler.h" +#include "os/os.h" +#include "io_ddir.h" +#include "debug.h" +#include "file.h" +#include "workqueue.h" + +#ifdef CONFIG_LIBAIO +#include +#endif +#ifdef CONFIG_GUASI +#include +#endif + +enum { + IO_U_F_FREE = 1 << 0, + IO_U_F_FLIGHT = 1 << 1, + IO_U_F_NO_FILE_PUT = 1 << 2, + IO_U_F_IN_CUR_DEPTH = 1 << 3, + IO_U_F_BUSY_OK = 1 << 4, + IO_U_F_TRIMMED = 1 << 5, + IO_U_F_BARRIER = 1 << 6, + IO_U_F_VER_LIST = 1 << 7, +}; + +/* + * 
The io unit + */ +struct io_u { + struct timespec start_time; + struct timespec issue_time; + + struct fio_file *file; + unsigned int flags; + enum fio_ddir ddir; + + /* + * For replay workloads, we may want to account as a different + * IO type than what is being submitted. + */ + enum fio_ddir acct_ddir; + + /* + * Write generation + */ + unsigned short numberio; + + /* + * Allocated/set buffer and length + */ + unsigned long long buflen; + unsigned long long offset; + void *buf; + + /* + * Initial seed for generating the buffer contents + */ + uint64_t rand_seed; + + /* + * IO engine state, may be different from above when we get + * partial transfers / residual data counts + */ + void *xfer_buf; + unsigned long long xfer_buflen; + + /* + * Parameter related to pre-filled buffers and + * their size to handle variable block sizes. + */ + unsigned long long buf_filled_len; + + struct io_piece *ipo; + + unsigned long long resid; + unsigned int error; + + /* + * io engine private data + */ + union { + unsigned int index; + unsigned int seen; + void *engine_data; + }; + + union { + struct flist_head verify_list; + struct workqueue_work work; + }; + +#ifdef CONFIG_LINUX_BLKZONED + /* + * ZBD mode zbd_queue_io callback: called after engine->queue operation + * to advance a zone write pointer and eventually unlock the I/O zone. + * @q indicates the I/O queue status (busy, queued or completed). + * @success == true means that the I/O operation has been queued or + * completed successfully. + */ + void (*zbd_queue_io)(struct io_u *, int q, bool success); + + /* + * ZBD mode zbd_put_io callback: called in after completion of an I/O + * or commit of an async I/O to unlock the I/O target zone. 
+ */ + void (*zbd_put_io)(const struct io_u *); +#endif + + /* + * Callback for io completion + */ + int (*end_io)(struct thread_data *, struct io_u **); + + union { +#ifdef CONFIG_LIBAIO + struct iocb iocb; +#endif +#ifdef CONFIG_POSIXAIO + os_aiocb_t aiocb; +#endif +#ifdef FIO_HAVE_SGIO + struct sg_io_hdr hdr; +#endif +#ifdef CONFIG_GUASI + guasi_req_t greq; +#endif +#ifdef CONFIG_SOLARISAIO + aio_result_t resultp; +#endif +#ifdef CONFIG_RDMA + struct ibv_mr *mr; +#endif + void *mmap_data; + }; +}; + +/* + * io unit handling + */ +extern struct io_u *__get_io_u(struct thread_data *); +extern struct io_u *get_io_u(struct thread_data *); +extern void put_io_u(struct thread_data *, struct io_u *); +extern void clear_io_u(struct thread_data *, struct io_u *); +extern void requeue_io_u(struct thread_data *, struct io_u **); +extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *); +extern int __must_check io_u_queued_complete(struct thread_data *, int); +extern void io_u_queued(struct thread_data *, struct io_u *); +extern int io_u_quiesce(struct thread_data *); +extern void io_u_log_error(struct thread_data *, struct io_u *); +extern void io_u_mark_depth(struct thread_data *, unsigned int); +extern void fill_io_buffer(struct thread_data *, void *, unsigned long long, unsigned long long); +extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned long long, unsigned long long); +void io_u_mark_complete(struct thread_data *, unsigned int); +void io_u_mark_submit(struct thread_data *, unsigned int); +bool queue_full(const struct thread_data *); + +int do_io_u_sync(const struct thread_data *, struct io_u *); +int do_io_u_trim(const struct thread_data *, struct io_u *); + +#ifdef FIO_INC_DEBUG +static inline void dprint_io_u(struct io_u *io_u, const char *p) +{ + struct fio_file *f = io_u->file; + + if (f) + dprint(FD_IO, "%s: io_u %p: off=0x%llx,len=0x%llx,ddir=%d,file=%s\n", + p, io_u, + (unsigned long long) io_u->offset, + 
io_u->buflen, io_u->ddir, + f->file_name); + else + dprint(FD_IO, "%s: io_u %p: off=0x%llx,len=0x%llx,ddir=%d\n", + p, io_u, + (unsigned long long) io_u->offset, + io_u->buflen, io_u->ddir); +} +#else +#define dprint_io_u(io_u, p) +#endif + +static inline enum fio_ddir acct_ddir(struct io_u *io_u) +{ + if (io_u->acct_ddir != -1) + return io_u->acct_ddir; + + return io_u->ddir; +} + +#define io_u_clear(td, io_u, val) \ + td_flags_clear((td), &(io_u->flags), (val)) +#define io_u_set(td, io_u, val) \ + td_flags_set((td), &(io_u)->flags, (val)) + +#endif diff -Nru fio-2.1.3/io_u_queue.c fio-3.16/io_u_queue.c --- fio-2.1.3/io_u_queue.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/io_u_queue.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,22 +1,32 @@ #include +#include #include "io_u_queue.h" +#include "smalloc.h" -int io_u_qinit(struct io_u_queue *q, unsigned int nr) +bool io_u_qinit(struct io_u_queue *q, unsigned int nr, bool shared) { - q->io_us = calloc(sizeof(struct io_u *), nr); + if (shared) + q->io_us = smalloc(nr * sizeof(struct io_u *)); + else + q->io_us = calloc(nr, sizeof(struct io_u *)); + if (!q->io_us) - return 1; + return false; q->nr = 0; - return 0; + q->max = nr; + return true; } -void io_u_qexit(struct io_u_queue *q) +void io_u_qexit(struct io_u_queue *q, bool shared) { - free(q->io_us); + if (shared) + sfree(q->io_us); + else + free(q->io_us); } -int io_u_rinit(struct io_u_ring *ring, unsigned int nr) +bool io_u_rinit(struct io_u_ring *ring, unsigned int nr) { ring->max = nr + 1; if (ring->max & (ring->max - 1)) { @@ -29,12 +39,12 @@ ring->max++; } - ring->ring = calloc(sizeof(struct io_u *), ring->max); + ring->ring = calloc(ring->max, sizeof(struct io_u *)); if (!ring->ring) - return 1; + return false; ring->head = ring->tail = 0; - return 0; + return true; } void io_u_rexit(struct io_u_ring *ring) diff -Nru fio-2.1.3/io_u_queue.h fio-3.16/io_u_queue.h --- fio-2.1.3/io_u_queue.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/io_u_queue.h 
2019-09-20 01:01:52.000000000 +0000 @@ -2,37 +2,51 @@ #define FIO_IO_U_QUEUE #include +#include + +#include "lib/types.h" struct io_u; struct io_u_queue { struct io_u **io_us; unsigned int nr; + unsigned int max; }; static inline struct io_u *io_u_qpop(struct io_u_queue *q) { - if (q->nr) - return q->io_us[--q->nr]; + if (q->nr) { + const unsigned int next = --q->nr; + struct io_u *io_u = q->io_us[next]; + + q->io_us[next] = NULL; + return io_u; + } return NULL; } static inline void io_u_qpush(struct io_u_queue *q, struct io_u *io_u) { - q->io_us[q->nr++] = io_u; + if (q->nr < q->max) { + q->io_us[q->nr++] = io_u; + return; + } + + assert(0); } -static inline int io_u_qempty(struct io_u_queue *q) +static inline int io_u_qempty(const struct io_u_queue *q) { return !q->nr; } #define io_u_qiter(q, io_u, i) \ - for (i = 0, io_u = (q)->io_us[0]; i < (q)->nr; i++, io_u = (q)->io_us[i]) + for (i = 0; i < (q)->nr && (io_u = (q)->io_us[i]); i++) -int io_u_qinit(struct io_u_queue *q, unsigned int nr); -void io_u_qexit(struct io_u_queue *q); +bool io_u_qinit(struct io_u_queue *q, unsigned int nr, bool shared); +void io_u_qexit(struct io_u_queue *q, bool shared); struct io_u_ring { unsigned int head; @@ -41,7 +55,7 @@ struct io_u **ring; }; -int io_u_rinit(struct io_u_ring *ring, unsigned int nr); +bool io_u_rinit(struct io_u_ring *ring, unsigned int nr); void io_u_rexit(struct io_u_ring *ring); static inline void io_u_rpush(struct io_u_ring *r, struct io_u *io_u) diff -Nru fio-2.1.3/json.c fio-3.16/json.c --- fio-2.1.3/json.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/json.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,6 +1,5 @@ #include #include -#include #include #include #include "json.h" @@ -8,18 +7,12 @@ struct json_object *json_create_object(void) { - struct json_object *obj = malloc(sizeof(struct json_object)); - if (obj) - memset(obj, 0, sizeof(struct json_object)); - return obj; + return calloc(1, sizeof(struct json_object)); } struct json_array 
*json_create_array(void) { - struct json_array *array = malloc(sizeof(struct json_array)); - if (array) - memset(array, 0, sizeof(struct json_array)); - return array; + return calloc(1, sizeof(struct json_array)); } static struct json_pair *json_create_pair(const char *name, struct json_value *value) @@ -35,7 +28,7 @@ return pair; } -static struct json_value *json_create_value_int(long number) +static struct json_value *json_create_value_int(long long number) { struct json_value *value = malloc(sizeof(struct json_value)); @@ -46,7 +39,7 @@ return value; } -static struct json_value *json_create_value_float(float number) +static struct json_value *json_create_value_float(double number) { struct json_value *value = malloc(sizeof(struct json_value)); @@ -84,7 +77,7 @@ } /* - * Valid JSON strings must escape '"' and '/' with a preceeding '/' + * Valid JSON strings must escape '"' and '/' with a preceding '/' */ static struct json_value *json_create_value_string(const char *str) { @@ -212,7 +205,7 @@ if (type == JSON_TYPE_STRING) value = json_create_value_string(va_arg(args, char *)); else if (type == JSON_TYPE_INTEGER) - value = json_create_value_int(va_arg(args, long)); + value = json_create_value_int(va_arg(args, long long)); else if (type == JSON_TYPE_FLOAT) value = json_create_value_float(va_arg(args, double)); else if (type == JSON_TYPE_OBJECT) @@ -237,7 +230,7 @@ return 0; } -static void json_print_array(struct json_array *array); +static void json_print_array(struct json_array *array, struct buf_output *); int json_array_add_value_type(struct json_array *array, int type, ...) 
{ struct json_value *value; @@ -248,7 +241,7 @@ if (type == JSON_TYPE_STRING) value = json_create_value_string(va_arg(args, char *)); else if (type == JSON_TYPE_INTEGER) - value = json_create_value_int(va_arg(args, long)); + value = json_create_value_int(va_arg(args, long long)); else if (type == JSON_TYPE_FLOAT) value = json_create_value_float(va_arg(args, double)); else if (type == JSON_TYPE_OBJECT) @@ -296,70 +289,70 @@ return json_array_level(value->parent_array) + 1; } -static void json_print_level(int level) +static void json_print_level(int level, struct buf_output *out) { while (level-- > 0) - log_info(" "); + log_buf(out, " "); } -static void json_print_pair(struct json_pair *pair); -static void json_print_array(struct json_array *array); -static void json_print_value(struct json_value *value); -void json_print_object(struct json_object *obj) +static void json_print_pair(struct json_pair *pair, struct buf_output *); +static void json_print_array(struct json_array *array, struct buf_output *); +static void json_print_value(struct json_value *value, struct buf_output *); +void json_print_object(struct json_object *obj, struct buf_output *out) { int i; - log_info("{\n"); + log_buf(out, "{\n"); for (i = 0; i < obj->pair_cnt; i++) { if (i > 0) - log_info(",\n"); - json_print_pair(obj->pairs[i]); + log_buf(out, ",\n"); + json_print_pair(obj->pairs[i], out); } - log_info("\n"); - json_print_level(json_object_level(obj)); - log_info("}"); + log_buf(out, "\n"); + json_print_level(json_object_level(obj), out); + log_buf(out, "}"); } -static void json_print_pair(struct json_pair *pair) +static void json_print_pair(struct json_pair *pair, struct buf_output *out) { - json_print_level(json_pair_level(pair)); - log_info("\"%s\" : ", pair->name); - json_print_value(pair->value); + json_print_level(json_pair_level(pair), out); + log_buf(out, "\"%s\" : ", pair->name); + json_print_value(pair->value, out); } -static void json_print_array(struct json_array *array) +static 
void json_print_array(struct json_array *array, struct buf_output *out) { int i; - log_info("[\n"); + log_buf(out, "[\n"); for (i = 0; i < array->value_cnt; i++) { if (i > 0) - log_info(",\n"); - json_print_level(json_value_level(array->values[i])); - json_print_value(array->values[i]); + log_buf(out, ",\n"); + json_print_level(json_value_level(array->values[i]), out); + json_print_value(array->values[i], out); } - log_info("\n"); - json_print_level(json_array_level(array)); - log_info("]"); + log_buf(out, "\n"); + json_print_level(json_array_level(array), out); + log_buf(out, "]"); } -static void json_print_value(struct json_value *value) +static void json_print_value(struct json_value *value, struct buf_output *out) { switch (value->type) { case JSON_TYPE_STRING: - log_info("\"%s\"", value->string); + log_buf(out, "\"%s\"", value->string); break; case JSON_TYPE_INTEGER: - log_info("%ld", value->integer_number); + log_buf(out, "%lld", value->integer_number); break; case JSON_TYPE_FLOAT: - log_info("%.2f", value->float_number); + log_buf(out, "%f", value->float_number); break; case JSON_TYPE_OBJECT: - json_print_object(value->object); + json_print_object(value->object, out); break; case JSON_TYPE_ARRAY: - json_print_array(value->array); + json_print_array(value->array, out); break; } } diff -Nru fio-2.1.3/json.h fio-3.16/json.h --- fio-2.1.3/json.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/json.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,8 +1,7 @@ #ifndef __JSON__H #define __JSON__H -struct json_object; -struct json_array; -struct json_pair; + +#include "lib/output_buffer.h" #define JSON_TYPE_STRING 0 #define JSON_TYPE_INTEGER 1 @@ -14,7 +13,7 @@ struct json_value { int type; union { - long integer_number; + long long integer_number; double float_number; char *string; struct json_object *object; @@ -52,7 +51,7 @@ int json_object_add_value_type(struct json_object *obj, const char *name, int type, ...); #define json_object_add_value_int(obj, name, val) \ - 
json_object_add_value_type((obj), name, JSON_TYPE_INTEGER, (val)) + json_object_add_value_type((obj), name, JSON_TYPE_INTEGER, (long long) (val)) #define json_object_add_value_float(obj, name, val) \ json_object_add_value_type((obj), name, JSON_TYPE_FLOAT, (val)) #define json_object_add_value_string(obj, name, val) \ @@ -73,5 +72,8 @@ #define json_array_add_value_array(obj, val) \ json_array_add_value_type((obj), JSON_TYPE_ARRAY, (val)) -void json_print_object(struct json_object *obj); +#define json_array_last_value_object(obj) \ + (obj->values[obj->value_cnt - 1]->object) + +void json_print_object(struct json_object *obj, struct buf_output *out); #endif diff -Nru fio-2.1.3/lib/axmap.c fio-3.16/lib/axmap.c --- fio-2.1.3/lib/axmap.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/axmap.c 2019-09-20 01:01:52.000000000 +0000 @@ -5,7 +5,7 @@ * nothing to do with anything remotely narcissistic. * * A set bit at layer N indicates a full word at layer N-1, and so forth. As - * the bitmap becomes progressively more full, checking for existance + * the bitmap becomes progressively more full, checking for existence * becomes cheaper (since fewer layers are walked, making it a lot more * cache friendly) and locating the next free space likewise. 
* @@ -22,7 +22,6 @@ #include "../arch/arch.h" #include "axmap.h" -#include "../smalloc.h" #include "../minmax.h" #if BITS_PER_LONG == 64 @@ -33,32 +32,58 @@ #error "Number of arch bits unknown" #endif -#define BLOCKS_PER_UNIT (1UL << UNIT_SHIFT) +#define BLOCKS_PER_UNIT (1U << UNIT_SHIFT) #define BLOCKS_PER_UNIT_MASK (BLOCKS_PER_UNIT - 1) -#define firstfree_valid(b) ((b)->first_free != (uint64_t) -1) +static const unsigned long bit_masks[] = { + 0x0000000000000000, 0x0000000000000001, 0x0000000000000003, 0x0000000000000007, + 0x000000000000000f, 0x000000000000001f, 0x000000000000003f, 0x000000000000007f, + 0x00000000000000ff, 0x00000000000001ff, 0x00000000000003ff, 0x00000000000007ff, + 0x0000000000000fff, 0x0000000000001fff, 0x0000000000003fff, 0x0000000000007fff, + 0x000000000000ffff, 0x000000000001ffff, 0x000000000003ffff, 0x000000000007ffff, + 0x00000000000fffff, 0x00000000001fffff, 0x00000000003fffff, 0x00000000007fffff, + 0x0000000000ffffff, 0x0000000001ffffff, 0x0000000003ffffff, 0x0000000007ffffff, + 0x000000000fffffff, 0x000000001fffffff, 0x000000003fffffff, 0x000000007fffffff, + 0x00000000ffffffff, +#if BITS_PER_LONG == 64 + 0x00000001ffffffff, 0x00000003ffffffff, 0x00000007ffffffff, 0x0000000fffffffff, + 0x0000001fffffffff, 0x0000003fffffffff, 0x0000007fffffffff, 0x000000ffffffffff, + 0x000001ffffffffff, 0x000003ffffffffff, 0x000007ffffffffff, 0x00000fffffffffff, + 0x00001fffffffffff, 0x00003fffffffffff, 0x00007fffffffffff, 0x0000ffffffffffff, + 0x0001ffffffffffff, 0x0003ffffffffffff, 0x0007ffffffffffff, 0x000fffffffffffff, + 0x001fffffffffffff, 0x003fffffffffffff, 0x007fffffffffffff, 0x00ffffffffffffff, + 0x01ffffffffffffff, 0x03ffffffffffffff, 0x07ffffffffffffff, 0x0fffffffffffffff, + 0x1fffffffffffffff, 0x3fffffffffffffff, 0x7fffffffffffffff, 0xffffffffffffffff +#endif +}; +/** + * struct axmap_level - a bitmap used to implement struct axmap + * @level: Level index. Each map has at least one level with index zero. 
The + * higher the level index, the fewer bits a struct axmap_level contains. + * @map_size: Number of elements of the @map array. + * @map: A bitmap with @map_size elements. + */ struct axmap_level { int level; unsigned long map_size; unsigned long *map; }; +/** + * struct axmap - a set that can store numbers 0 .. @nr_bits - 1 + * @nr_level: Number of elements of the @levels array. + * @levels: struct axmap_level array in which lower levels contain more bits + * than higher levels. + * @nr_bits: One more than the highest value stored in the set. + */ struct axmap { unsigned int nr_levels; struct axmap_level *levels; - uint64_t first_free; uint64_t nr_bits; }; -static unsigned long ulog64(unsigned long val, unsigned int log) -{ - while (log-- && val) - val >>= UNIT_SHIFT; - - return val; -} - +/* Remove all elements from the @axmap set */ void axmap_reset(struct axmap *axmap) { int i; @@ -68,8 +93,6 @@ memset(al->map, 0, al->map_size * sizeof(unsigned long)); } - - axmap->first_free = 0; } void axmap_free(struct axmap *axmap) @@ -80,18 +103,19 @@ return; for (i = 0; i < axmap->nr_levels; i++) - sfree(axmap->levels[i].map); + free(axmap->levels[i].map); - sfree(axmap->levels); - sfree(axmap); + free(axmap->levels); + free(axmap); } -struct axmap *axmap_new(unsigned long nr_bits) +/* Allocate memory for a set that can store the numbers 0 .. @nr_bits - 1. 
*/ +struct axmap *axmap_new(uint64_t nr_bits) { struct axmap *axmap; unsigned int i, levels; - axmap = smalloc(sizeof(*axmap)); + axmap = malloc(sizeof(*axmap)); if (!axmap) return NULL; @@ -103,87 +127,87 @@ } axmap->nr_levels = levels; - axmap->levels = smalloc(axmap->nr_levels * sizeof(struct axmap_level)); + axmap->levels = calloc(axmap->nr_levels, sizeof(struct axmap_level)); + if (!axmap->levels) + goto free_axmap; axmap->nr_bits = nr_bits; for (i = 0; i < axmap->nr_levels; i++) { struct axmap_level *al = &axmap->levels[i]; + nr_bits = (nr_bits + BLOCKS_PER_UNIT - 1) >> UNIT_SHIFT; + al->level = i; - al->map_size = (nr_bits + BLOCKS_PER_UNIT - 1) >> UNIT_SHIFT; - al->map = smalloc(al->map_size * sizeof(unsigned long)); + al->map_size = nr_bits; + al->map = malloc(al->map_size * sizeof(unsigned long)); if (!al->map) - goto err; + goto free_levels; - nr_bits = (nr_bits + BLOCKS_PER_UNIT - 1) >> UNIT_SHIFT; } axmap_reset(axmap); return axmap; -err: + +free_levels: for (i = 0; i < axmap->nr_levels; i++) - if (axmap->levels[i].map) - sfree(axmap->levels[i].map); + free(axmap->levels[i].map); - sfree(axmap->levels); + free(axmap->levels); + +free_axmap: + free(axmap); return NULL; } -static int axmap_handler(struct axmap *axmap, uint64_t bit_nr, - int (*func)(struct axmap_level *, unsigned long, unsigned int, +/* + * Call @func for each level, starting at level zero, until a level is found + * for which @func returns true. Return false if none of the @func calls + * returns true. 
+ */ +static bool axmap_handler(struct axmap *axmap, uint64_t bit_nr, + bool (*func)(struct axmap_level *, uint64_t, unsigned int, void *), void *data) { struct axmap_level *al; + uint64_t index = bit_nr; int i; for (i = 0; i < axmap->nr_levels; i++) { - unsigned long index = ulog64(bit_nr, i); unsigned long offset = index >> UNIT_SHIFT; unsigned int bit = index & BLOCKS_PER_UNIT_MASK; al = &axmap->levels[i]; if (func(al, offset, bit, data)) - return 1; + return true; + + if (index) + index >>= UNIT_SHIFT; } - return 0; + return false; } -static int axmap_handler_topdown(struct axmap *axmap, uint64_t bit_nr, - int (*func)(struct axmap_level *, unsigned long, unsigned int, void *), - void *data) +/* + * Call @func for each level, starting at the highest level, until a level is + * found for which @func returns true. Return false if none of the @func calls + * returns true. + */ +static bool axmap_handler_topdown(struct axmap *axmap, uint64_t bit_nr, + bool (*func)(struct axmap_level *, uint64_t, unsigned int, void *)) { - struct axmap_level *al; - int i, level = axmap->nr_levels; + int i; for (i = axmap->nr_levels - 1; i >= 0; i--) { - unsigned long index = ulog64(bit_nr, --level); + uint64_t index = bit_nr >> (UNIT_SHIFT * i); unsigned long offset = index >> UNIT_SHIFT; unsigned int bit = index & BLOCKS_PER_UNIT_MASK; - al = &axmap->levels[i]; - - if (func(al, offset, bit, data)) - return 1; + if (func(&axmap->levels[i], offset, bit, NULL)) + return true; } - return 0; -} - -static int axmap_clear_fn(struct axmap_level *al, unsigned long offset, - unsigned int bit, void *unused) -{ - if (!(al->map[offset] & (1UL << bit))) - return 1; - - al->map[offset] &= ~(1UL << bit); - return 0; -} - -void axmap_clear(struct axmap *axmap, uint64_t bit_nr) -{ - axmap_handler(axmap, bit_nr, axmap_clear_fn, NULL); + return false; } struct axmap_set_data { @@ -191,29 +215,12 @@ unsigned int set_bits; }; -static unsigned long bit_masks[] = { - 0x0000000000000000, 0x0000000000000001, 
0x0000000000000003, 0x0000000000000007, - 0x000000000000000f, 0x000000000000001f, 0x000000000000003f, 0x000000000000007f, - 0x00000000000000ff, 0x00000000000001ff, 0x00000000000003ff, 0x00000000000007ff, - 0x0000000000000fff, 0x0000000000001fff, 0x0000000000003fff, 0x0000000000007fff, - 0x000000000000ffff, 0x000000000001ffff, 0x000000000003ffff, 0x000000000007ffff, - 0x00000000000fffff, 0x00000000001fffff, 0x00000000003fffff, 0x00000000007fffff, - 0x0000000000ffffff, 0x0000000001ffffff, 0x0000000003ffffff, 0x0000000007ffffff, - 0x000000000fffffff, 0x000000001fffffff, 0x000000003fffffff, 0x000000007fffffff, - 0x00000000ffffffff, -#if BITS_PER_LONG == 64 - 0x00000001ffffffff, 0x00000003ffffffff, 0x00000007ffffffff, 0x0000000fffffffff, - 0x0000001fffffffff, 0x0000003fffffffff, 0x0000007fffffffff, 0x000000ffffffffff, - 0x000001ffffffffff, 0x000003ffffffffff, 0x000007ffffffffff, 0x00000fffffffffff, - 0x00001fffffffffff, 0x00003fffffffffff, 0x00007fffffffffff, 0x0000ffffffffffff, - 0x0001ffffffffffff, 0x0003ffffffffffff, 0x0007ffffffffffff, 0x000fffffffffffff, - 0x001fffffffffffff, 0x003fffffffffffff, 0x007fffffffffffff, 0x00ffffffffffffff, - 0x01ffffffffffffff, 0x03ffffffffffffff, 0x07ffffffffffffff, 0x0fffffffffffffff, - 0x1fffffffffffffff, 0x3fffffffffffffff, 0x7fffffffffffffff, 0xffffffffffffffff -#endif -}; - -static int axmap_set_fn(struct axmap_level *al, unsigned long offset, +/* + * Set at most @__data->nr_bits bits in @al at offset @offset. Do not exceed + * the boundary of the element at offset @offset. Return the number of bits + * that have been set in @__data->set_bits if @al->level == 0. 
+ */ +static bool axmap_set_fn(struct axmap_level *al, uint64_t offset, unsigned int bit, void *__data) { struct axmap_set_data *data = __data; @@ -228,59 +235,52 @@ * Mask off any potential overlap, only sets contig regions */ overlap = al->map[offset] & mask; - if (overlap == mask) - return 1; - - while (overlap) { - unsigned long clear_mask = ~(1UL << ffz(~overlap)); + if (overlap == mask) { + data->set_bits = 0; + return true; + } - mask &= clear_mask; - overlap &= clear_mask; - nr_bits--; + if (overlap) { + nr_bits = ffz(~overlap) - bit; + if (!nr_bits) + return true; + mask = bit_masks[nr_bits] << bit; } assert(mask); assert(!(al->map[offset] & mask)); - al->map[offset] |= mask; if (!al->level) data->set_bits = nr_bits; + /* For the next level */ data->nr_bits = 1; + return al->map[offset] != -1UL; } +/* + * Set up to @data->nr_bits starting from @bit_nr in @axmap. Start at + * @bit_nr. If that bit has not yet been set then set it and continue until + * either @data->nr_bits have been set or a 1 bit is found. Store the number + * of bits that have been set in @data->set_bits. It is guaranteed that all + * bits that have been requested to set fit in the same unsigned long word of + * level 0 of @axmap. 
+ */ static void __axmap_set(struct axmap *axmap, uint64_t bit_nr, struct axmap_set_data *data) { - unsigned int set_bits, nr_bits = data->nr_bits; - - if (axmap->first_free >= bit_nr && - axmap->first_free < bit_nr + data->nr_bits) - axmap->first_free = -1ULL; + unsigned int nr_bits = data->nr_bits; if (bit_nr > axmap->nr_bits) return; else if (bit_nr + nr_bits > axmap->nr_bits) nr_bits = axmap->nr_bits - bit_nr; - set_bits = 0; - while (nr_bits) { - axmap_handler(axmap, bit_nr, axmap_set_fn, data); - set_bits += data->set_bits; - - if (!data->set_bits || - data->set_bits != (BLOCKS_PER_UNIT - nr_bits)) - break; - - nr_bits -= data->set_bits; - bit_nr += data->set_bits; - - data->nr_bits = nr_bits; - } + assert(nr_bits <= BLOCKS_PER_UNIT); - data->set_bits = set_bits; + axmap_handler(axmap, bit_nr, axmap_set_fn, data); } void axmap_set(struct axmap *axmap, uint64_t bit_nr) @@ -290,7 +290,14 @@ __axmap_set(axmap, bit_nr, &data); } -unsigned int axmap_set_nr(struct axmap *axmap, uint64_t bit_nr, unsigned int nr_bits) +/* + * Set up to @nr_bits starting from @bit in @axmap. Start at @bit. If that + * bit has not yet been set then set it and continue until either @nr_bits + * have been set or a 1 bit is found. Return the number of bits that have been + * set. 
+ */ +unsigned int axmap_set_nr(struct axmap *axmap, uint64_t bit_nr, + unsigned int nr_bits) { unsigned int set_bits = 0; @@ -299,7 +306,7 @@ unsigned int max_bits, this_set; max_bits = BLOCKS_PER_UNIT - (bit_nr & BLOCKS_PER_UNIT_MASK); - if (max_bits < nr_bits) + if (nr_bits > max_bits) data.nr_bits = max_bits; this_set = data.nr_bits; @@ -315,118 +322,133 @@ return set_bits; } -static int axmap_isset_fn(struct axmap_level *al, unsigned long offset, - unsigned int bit, void *unused) +static bool axmap_isset_fn(struct axmap_level *al, uint64_t offset, + unsigned int bit, void *unused) { - return (al->map[offset] & (1UL << bit)) != 0; + return (al->map[offset] & (1ULL << bit)) != 0; } -int axmap_isset(struct axmap *axmap, uint64_t bit_nr) +bool axmap_isset(struct axmap *axmap, uint64_t bit_nr) { if (bit_nr <= axmap->nr_bits) - return axmap_handler_topdown(axmap, bit_nr, axmap_isset_fn, NULL); + return axmap_handler_topdown(axmap, bit_nr, axmap_isset_fn); - return 0; + return false; } -static uint64_t axmap_find_first_free(struct axmap *axmap, unsigned int level, - uint64_t index) +/* + * Find the first free bit that is at least as large as bit_nr. Return + * -1 if no free bit is found before the end of the map. + */ +static uint64_t axmap_find_first_free(struct axmap *axmap, uint64_t bit_nr) { - uint64_t ret = -1ULL; - unsigned long j; int i; + unsigned long temp; + unsigned int bit; + uint64_t offset, base_index, index; + struct axmap_level *al; - /* - * Start at the bottom, then converge towards first free bit at the top - */ - for (i = level; i >= 0; i--) { - struct axmap_level *al = &axmap->levels[i]; + index = 0; + for (i = axmap->nr_levels - 1; i >= 0; i--) { + al = &axmap->levels[i]; + + /* Shift previously calculated index for next level */ + index <<= UNIT_SHIFT; /* - * Clear 'ret', this is a bug condition. + * Start from an index that's at least as large as the + * originally passed in bit number. 
*/ - if (index >= al->map_size) { - ret = -1ULL; - break; - } - - for (j = index; j < al->map_size; j++) { - if (al->map[j] == -1UL) - continue; - - /* - * First free bit here is our index into the first - * free bit at the next higher level - */ - ret = index = (j << UNIT_SHIFT) + ffz(al->map[j]); - break; - } - } - - if (ret < axmap->nr_bits) - return ret; - - return (uint64_t) -1ULL; -} + base_index = bit_nr >> (UNIT_SHIFT * i); + if (index < base_index) + index = base_index; + + /* Get the offset and bit for this level */ + offset = index >> UNIT_SHIFT; + bit = index & BLOCKS_PER_UNIT_MASK; -uint64_t axmap_first_free(struct axmap *axmap) -{ - if (firstfree_valid(axmap)) - return axmap->first_free; - - axmap->first_free = axmap_find_first_free(axmap, axmap->nr_levels - 1, 0); - return axmap->first_free; -} + /* + * If the previous level had unused bits in its last + * word, the offset could be bigger than the map at + * this level. That means no free bits exist before the + * end of the map, so return -1. + */ + if (offset >= al->map_size) + return -1ULL; -struct axmap_next_free_data { - unsigned int level; - unsigned long offset; - uint64_t bit; -}; + /* Check the first word starting with the specific bit */ + temp = ~bit_masks[bit] & ~al->map[offset]; + if (temp) + goto found; -static int axmap_next_free_fn(struct axmap_level *al, unsigned long offset, - unsigned int bit, void *__data) -{ - struct axmap_next_free_data *data = __data; - uint64_t mask = ~bit_masks[(data->bit + 1) & BLOCKS_PER_UNIT_MASK]; + /* + * No free bit in the first word, so iterate + * looking for a word with one or more free bits. 
+ */ + for (offset++; offset < al->map_size; offset++) { + temp = ~al->map[offset]; + if (temp) + goto found; + } - if (!(mask & ~al->map[offset])) - return 0; + /* Did not find a free bit */ + return -1ULL; - if (al->map[offset] != -1UL) { - data->level = al->level; - data->offset = offset; - return 1; +found: + /* Compute the index of the free bit just found */ + index = (offset << UNIT_SHIFT) + ffz(~temp); } - data->bit = (data->bit + BLOCKS_PER_UNIT - 1) / BLOCKS_PER_UNIT; - return 0; + /* If found an unused bit in the last word of level 0, return -1 */ + if (index >= axmap->nr_bits) + return -1ULL; + + return index; } /* * 'bit_nr' is already set. Find the next free bit after this one. + * Return -1 if no free bits found. */ uint64_t axmap_next_free(struct axmap *axmap, uint64_t bit_nr) { - struct axmap_next_free_data data = { .level = -1U, .bit = bit_nr, }; uint64_t ret; + uint64_t next_bit = bit_nr + 1; + unsigned long temp; + uint64_t offset; + unsigned int bit; + + if (bit_nr >= axmap->nr_bits) + return -1ULL; + + /* If at the end of the map, wrap-around */ + if (next_bit == axmap->nr_bits) + next_bit = 0; - if (firstfree_valid(axmap) && bit_nr < axmap->first_free) - return axmap->first_free; + offset = next_bit >> UNIT_SHIFT; + bit = next_bit & BLOCKS_PER_UNIT_MASK; - if (!axmap_handler(axmap, bit_nr, axmap_next_free_fn, &data)) - return axmap_first_free(axmap); + /* + * As an optimization, do a quick check for a free bit + * in the current word at level 0. If not found, do + * a topdown search. + */ + temp = ~bit_masks[bit] & ~axmap->levels[0].map[offset]; + if (temp) { + ret = (offset << UNIT_SHIFT) + ffz(~temp); - assert(data.level != -1U); + /* Might have found an unused bit at level 0 */ + if (ret >= axmap->nr_bits) + ret = -1ULL; + } else + ret = axmap_find_first_free(axmap, next_bit); /* - * In the rare case that the map is unaligned, we might end up - * finding an offset that's beyond the valid end. 
For that case, - * find the first free one, the map is practically full. + * If there are no free bits starting at next_bit and going + * to the end of the map, wrap around by searching again + * starting at bit 0. */ - ret = axmap_find_first_free(axmap, data.level, data.offset); - if (ret != -1ULL) - return ret; - - return axmap_first_free(axmap); + if (ret == -1ULL && next_bit != 0) + ret = axmap_find_first_free(axmap, 0); + return ret; } diff -Nru fio-2.1.3/lib/axmap.h fio-3.16/lib/axmap.h --- fio-2.1.3/lib/axmap.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/axmap.h 2019-09-20 01:01:52.000000000 +0000 @@ -2,16 +2,15 @@ #define FIO_BITMAP_H #include +#include "types.h" struct axmap; -struct axmap *axmap_new(unsigned long nr_bits); +struct axmap *axmap_new(uint64_t nr_bits); void axmap_free(struct axmap *bm); -void axmap_clear(struct axmap *axmap, uint64_t bit_nr); void axmap_set(struct axmap *axmap, uint64_t bit_nr); unsigned int axmap_set_nr(struct axmap *axmap, uint64_t bit_nr, unsigned int nr_bits); -int axmap_isset(struct axmap *axmap, uint64_t bit_nr); -uint64_t axmap_first_free(struct axmap *axmap); +bool axmap_isset(struct axmap *axmap, uint64_t bit_nr); uint64_t axmap_next_free(struct axmap *axmap, uint64_t bit_nr); void axmap_reset(struct axmap *axmap); diff -Nru fio-2.1.3/lib/bloom.c fio-3.16/lib/bloom.c --- fio-2.1.3/lib/bloom.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/bloom.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,123 @@ +#include + +#include "bloom.h" +#include "../hash.h" +#include "../crc/xxhash.h" +#include "../crc/murmur3.h" +#include "../crc/crc32c.h" +#include "../crc/fnv.h" + +struct bloom { + uint64_t nentries; + + uint32_t *map; +}; + +#define BITS_PER_INDEX (sizeof(uint32_t) * 8) +#define BITS_INDEX_MASK (BITS_PER_INDEX - 1) + +struct bloom_hash { + unsigned int seed; + uint32_t (*fn)(const void *, uint32_t, uint32_t); +}; + +static uint32_t bloom_crc32c(const void *buf, uint32_t len, uint32_t seed) +{ + return 
fio_crc32c(buf, len); +} + +static uint32_t bloom_fnv(const void *buf, uint32_t len, uint32_t seed) +{ + return fnv(buf, len, seed); +} + +#define BLOOM_SEED 0x8989 + +static struct bloom_hash hashes[] = { + { + .seed = BLOOM_SEED, + .fn = jhash, + }, + { + .seed = BLOOM_SEED, + .fn = XXH32, + }, + { + .seed = BLOOM_SEED, + .fn = murmurhash3, + }, + { + .seed = BLOOM_SEED, + .fn = bloom_crc32c, + }, + { + .seed = BLOOM_SEED, + .fn = bloom_fnv, + }, +}; + +#define N_HASHES 5 + +struct bloom *bloom_new(uint64_t entries) +{ + struct bloom *b; + size_t no_uints; + + crc32c_arm64_probe(); + crc32c_intel_probe(); + + b = malloc(sizeof(*b)); + b->nentries = entries; + no_uints = (entries + BITS_PER_INDEX - 1) / BITS_PER_INDEX; + b->map = calloc(no_uints, sizeof(uint32_t)); + if (!b->map) { + free(b); + return NULL; + } + + return b; +} + +void bloom_free(struct bloom *b) +{ + free(b->map); + free(b); +} + +static bool __bloom_check(struct bloom *b, const void *data, unsigned int len, + bool set) +{ + uint32_t hash[N_HASHES]; + int i, was_set; + + for (i = 0; i < N_HASHES; i++) { + hash[i] = hashes[i].fn(data, len, hashes[i].seed); + hash[i] = hash[i] % b->nentries; + } + + was_set = 0; + for (i = 0; i < N_HASHES; i++) { + const unsigned int index = hash[i] / BITS_PER_INDEX; + const unsigned int bit = hash[i] & BITS_INDEX_MASK; + + if (b->map[index] & (1U << bit)) + was_set++; + else if (set) + b->map[index] |= 1U << bit; + else + break; + } + + return was_set == N_HASHES; +} + +bool bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords) +{ + return __bloom_check(b, data, nwords * sizeof(uint32_t), true); +} + +bool bloom_string(struct bloom *b, const char *data, unsigned int len, + bool set) +{ + return __bloom_check(b, data, len, set); +} diff -Nru fio-2.1.3/lib/bloom.h fio-3.16/lib/bloom.h --- fio-2.1.3/lib/bloom.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/bloom.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,14 @@ +#ifndef FIO_BLOOM_H +#define 
FIO_BLOOM_H + +#include +#include "../lib/types.h" + +struct bloom; + +struct bloom *bloom_new(uint64_t entries); +void bloom_free(struct bloom *b); +bool bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords); +bool bloom_string(struct bloom *b, const char *data, unsigned int len, bool); + +#endif diff -Nru fio-2.1.3/lib/ffz.h fio-3.16/lib/ffz.h --- fio-2.1.3/lib/ffz.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/ffz.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,16 +1,16 @@ #ifndef FIO_FFZ_H #define FIO_FFZ_H -static inline int __ffs(unsigned long word) +#include + +static inline int ffs64(uint64_t word) { int r = 0; -#if BITS_PER_LONG == 64 if ((word & 0xffffffff) == 0) { r += 32; word >>= 32; } -#endif if (!(word & 0xffff)) { word >>= 16; r += 16; @@ -27,17 +27,26 @@ word >>= 2; r += 2; } - if (!(word & 1)) { - word >>= 1; + if (!(word & 1)) r += 1; - } return r; } +#ifndef ARCH_HAVE_FFZ + static inline int ffz(unsigned long bitmask) { - return __ffs(~bitmask); + return ffs64(~bitmask); +} + +#else +#define ffz(bitmask) arch_ffz(bitmask) +#endif + +static inline int ffz64(uint64_t bitmask) +{ + return ffs64(~bitmask); } #endif diff -Nru fio-2.1.3/lib/gauss.c fio-3.16/lib/gauss.c --- fio-2.1.3/lib/gauss.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/gauss.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,63 @@ +#include +#include +#include "../hash.h" +#include "gauss.h" + +#define GAUSS_ITERS 12 + +static int gauss_dev(struct gauss_state *gs) +{ + unsigned int r; + int vr; + + if (!gs->stddev) + return 0; + + r = __rand(&gs->r); + vr = gs->stddev * (r / (FRAND32_MAX + 1.0)); + + return vr - gs->stddev / 2; +} + +unsigned long long gauss_next(struct gauss_state *gs) +{ + unsigned long long sum = 0; + int i; + + for (i = 0; i < GAUSS_ITERS; i++) + sum += __rand(&gs->r) % (gs->nranges + 1); + + sum = (sum + GAUSS_ITERS - 1) / GAUSS_ITERS; + + if (gs->stddev) { + int dev = gauss_dev(gs); + + while (dev + sum >= gs->nranges) + dev /= 2; + sum += 
dev; + } + + if (!gs->disable_hash) + sum = __hash_u64(sum); + + return sum % gs->nranges; +} + +void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev, + unsigned int seed) +{ + memset(gs, 0, sizeof(*gs)); + init_rand_seed(&gs->r, seed, 0); + gs->nranges = nranges; + + if (dev != 0.0) { + gs->stddev = ceil((double) (nranges * 100.0) / dev); + if (gs->stddev > nranges / 2) + gs->stddev = nranges / 2; + } +} + +void gauss_disable_hash(struct gauss_state *gs) +{ + gs->disable_hash = true; +} diff -Nru fio-2.1.3/lib/gauss.h fio-3.16/lib/gauss.h --- fio-2.1.3/lib/gauss.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/gauss.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,19 @@ +#ifndef FIO_GAUSS_H +#define FIO_GAUSS_H + +#include +#include "rand.h" + +struct gauss_state { + struct frand_state r; + uint64_t nranges; + unsigned int stddev; + bool disable_hash; +}; + +void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev, + unsigned int seed); +unsigned long long gauss_next(struct gauss_state *gs); +void gauss_disable_hash(struct gauss_state *gs); + +#endif diff -Nru fio-2.1.3/lib/getopt.h fio-3.16/lib/getopt.h --- fio-2.1.3/lib/getopt.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/getopt.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -#ifdef CONFIG_GETOPT_LONG_ONLY - -#include - -#else - -#ifndef _GETOPT_H -#define _GETOPT_H - -struct option { - const char *name; - int has_arg; - int *flag; - int val; -}; - -enum { - no_argument = 0, - required_argument = 1, - optional_argument = 2, -}; - -int getopt_long_only(int, char *const *, const char *, const struct option *, int *); - -#endif -#endif diff -Nru fio-2.1.3/lib/getopt_long.c fio-3.16/lib/getopt_long.c --- fio-2.1.3/lib/getopt_long.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/getopt_long.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,167 +0,0 @@ -/* - * getopt.c - * - * getopt_long(), or at least a common subset thereof: - * - * - Option reordering is 
not supported - * - -W foo is not supported - * - First optstring character "-" not supported. - * - * This file was imported from the klibc library from hpa - */ - -#include -#include -#include - -#include "getopt.h" - -char *optarg = NULL; -int optind = 0, opterr = 0, optopt = 0; - -static struct getopt_private_state { - const char *optptr; - const char *last_optstring; - char *const *last_argv; -} pvt; - -static inline const char *option_matches(const char *arg_str, - const char *opt_name) -{ - while (*arg_str != '\0' && *arg_str != '=') { - if (*arg_str++ != *opt_name++) - return NULL; - } - - if (*opt_name) - return NULL; - - return arg_str; -} - -int getopt_long_only(int argc, char *const *argv, const char *optstring, - const struct option *longopts, int *longindex) -{ - const char *carg; - const char *osptr; - int opt; - - optarg = NULL; - - /* getopt() relies on a number of different global state - variables, which can make this really confusing if there is - more than one use of getopt() in the same program. This - attempts to detect that situation by detecting if the - "optstring" or "argv" argument have changed since last time - we were called; if so, reinitialize the query state. 
*/ - - if (optstring != pvt.last_optstring || argv != pvt.last_argv || - optind < 1 || optind > argc) { - /* optind doesn't match the current query */ - pvt.last_optstring = optstring; - pvt.last_argv = argv; - optind = 1; - pvt.optptr = NULL; - } - - carg = argv[optind]; - - /* First, eliminate all non-option cases */ - - if (!carg || carg[0] != '-' || !carg[1]) - return -1; - - if (carg[1] == '-') { - const struct option *lo; - const char *opt_end = NULL; - - optind++; - - /* Either it's a long option, or it's -- */ - if (!carg[2]) { - /* It's -- */ - return -1; - } - - for (lo = longopts; lo->name; lo++) { - if ((opt_end = option_matches(carg+2, lo->name))) - break; - } - if (!opt_end) - return '?'; - - if (longindex) - *longindex = lo-longopts; - - if (*opt_end == '=') { - if (lo->has_arg) - optarg = (char *)opt_end+1; - else - return '?'; - } else if (lo->has_arg == 1) { - if (!(optarg = argv[optind])) - return '?'; - optind++; - } - - if (lo->flag) { - *lo->flag = lo->val; - return 0; - } else { - return lo->val; - } - } - - if ((uintptr_t) (pvt.optptr - carg) > (uintptr_t) strlen(carg)) { - /* Someone frobbed optind, change to new opt. */ - pvt.optptr = carg + 1; - } - - opt = *pvt.optptr++; - - if (opt != ':' && (osptr = strchr(optstring, opt))) { - if (osptr[1] == ':') { - if (*pvt.optptr) { - /* Argument-taking option with attached - argument */ - optarg = (char *)pvt.optptr; - optind++; - } else { - /* Argument-taking option with non-attached - argument */ - if (osptr[2] == ':') { - if (argv[optind + 1]) { - optarg = (char *)argv[optind+1]; - optind += 2; - } else { - optarg = NULL; - optind++; - } - return opt; - } else if (argv[optind + 1]) { - optarg = (char *)argv[optind+1]; - optind += 2; - } else { - /* Missing argument */ - optind++; - return (optstring[0] == ':') - ? 
':' : '?'; - } - } - return opt; - } else { - /* Non-argument-taking option */ - /* pvt.optptr will remember the exact position to - resume at */ - if (!*pvt.optptr) - optind++; - return opt; - } - } else { - /* Unknown option */ - optopt = opt; - if (!*pvt.optptr) - optind++; - return '?'; - } -} diff -Nru fio-2.1.3/lib/ieee754.c fio-3.16/lib/ieee754.c --- fio-2.1.3/lib/ieee754.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/ieee754.c 2019-09-20 01:01:52.000000000 +0000 @@ -5,7 +5,6 @@ * * Below code was granted to the public domain. */ -#include #include "ieee754.h" uint64_t pack754(long double f, unsigned bits, unsigned expbits) diff -Nru fio-2.1.3/lib/inet_aton.c fio-3.16/lib/inet_aton.c --- fio-2.1.3/lib/inet_aton.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/inet_aton.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,6 +0,0 @@ -#include "inet_aton.h" - -int inet_aton(const char *cp, struct in_addr *inp) -{ - return inet_pton(AF_INET, cp, inp); -} diff -Nru fio-2.1.3/lib/inet_aton.h fio-3.16/lib/inet_aton.h --- fio-2.1.3/lib/inet_aton.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/inet_aton.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -#ifndef FIO_INET_ATON_LIB_H -#define FIO_INET_ATON_LIB_H - -#include - -int inet_aton(const char *cp, struct in_addr *inp); - -#endif diff -Nru fio-2.1.3/lib/lfsr.c fio-3.16/lib/lfsr.c --- fio-2.1.3/lib/lfsr.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/lfsr.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,7 +1,7 @@ #include -#include #include "lfsr.h" +#include "../compiler/compiler.h" /* * LFSR taps retrieved from: @@ -10,7 +10,7 @@ * The memory overhead of the following tap table should be relatively small, * no more than 400 bytes. 
*/ -static uint8_t taps[64][FIO_MAX_TAPS] = +static uint8_t lfsr_taps[64][FIO_MAX_TAPS] = { {0}, {0}, {0}, //LFSRs with less that 3-bits cannot exist {3, 2}, //Tap position for 3-bit LFSR @@ -78,7 +78,7 @@ #define __LFSR_NEXT(__fl, __v) \ __v = ((__v >> 1) | __fl->cached_bit) ^ \ - (((__v & 1UL) - 1UL) & __fl->xormask); + (((__v & 1ULL) - 1ULL) & __fl->xormask); static inline void __lfsr_next(struct fio_lfsr *fl, unsigned int spin) { @@ -88,21 +88,37 @@ */ switch (spin) { case 15: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 14: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 13: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 12: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 11: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 10: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 9: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 8: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 7: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 6: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 5: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 4: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 3: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 2: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 1: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ case 0: __LFSR_NEXT(fl, fl->last_val); + /* fall through */ default: break; } } @@ -123,7 +139,7 @@ * c. Check if the calculated value exceeds the desirable range. In this case, * go back to b, else return. 
*/ -int lfsr_next(struct fio_lfsr *fl, uint64_t *off, uint64_t last) +int lfsr_next(struct fio_lfsr *fl, uint64_t *off) { if (fl->num_vals++ > fl->max_val) return 1; @@ -132,11 +148,9 @@ if (fl->cycle_length && !--fl->cycle_length) { __lfsr_next(fl, fl->spin + 1); fl->cycle_length = fl->cached_cycle_length; - goto check; - } - __lfsr_next(fl, fl->spin); -check: ; - } while (fl->last_val > fl->max_val); + } else + __lfsr_next(fl, fl->spin); + } while (fio_unlikely(fl->last_val > fl->max_val)); *off = fl->last_val; return 0; @@ -148,7 +162,7 @@ uint64_t xormask = 0; for(i = 0; i < FIO_MAX_TAPS && taps[i] != 0; i++) - xormask |= 1UL << (taps[i] - 1); + xormask |= 1ULL << (taps[i] - 1); return xormask; } @@ -159,12 +173,12 @@ /* * For an LFSR, there is always a prohibited state (all ones). - * Thus, if we need to find the proper LFSR for our size, we must take that - * into account. + * Thus, if we need to find the proper LFSR for our size, we must + * take that into account. */ for (i = 3; i < 64; i++) - if ((1UL << i) > size) - return taps[i]; + if ((1ULL << i) > size) + return lfsr_taps[i]; return NULL; } @@ -187,7 +201,7 @@ * Thus, [1] is equivalent to (y * i) % (spin + 1) == 0; * Also, the cycle's length will be (x * i) + (y * i) / (spin + 1) */ -int prepare_spin(struct fio_lfsr *fl, unsigned int spin) +static int prepare_spin(struct fio_lfsr *fl, unsigned int spin) { uint64_t max = (fl->cached_bit << 1) - 1; uint64_t x, y; @@ -218,7 +232,7 @@ return 0; } -int lfsr_reset(struct fio_lfsr *fl, unsigned long seed) +int lfsr_reset(struct fio_lfsr *fl, uint64_t seed) { uint64_t bitmask = (fl->cached_bit << 1) - 1; @@ -232,18 +246,18 @@ return 0; } -int lfsr_init(struct fio_lfsr *fl, uint64_t nums, unsigned long seed, - unsigned int spin) +int lfsr_init(struct fio_lfsr *fl, uint64_t nums, uint64_t seed, + unsigned int spin) { - uint8_t *lfsr_taps; + uint8_t *taps; - lfsr_taps = find_lfsr(nums); - if (!lfsr_taps) + taps = find_lfsr(nums); + if (!taps) return 1; 
fl->max_val = nums - 1; - fl->xormask = lfsr_create_xormask(lfsr_taps); - fl->cached_bit = 1UL << (lfsr_taps[0] - 1); + fl->xormask = lfsr_create_xormask(taps); + fl->cached_bit = 1ULL << (taps[0] - 1); if (prepare_spin(fl, spin)) return 1; diff -Nru fio-2.1.3/lib/lfsr.h fio-3.16/lib/lfsr.h --- fio-2.1.3/lib/lfsr.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/lfsr.h 2019-09-20 01:01:52.000000000 +0000 @@ -22,9 +22,9 @@ unsigned int spin; }; -int lfsr_next(struct fio_lfsr *fl, uint64_t *off, uint64_t); +int lfsr_next(struct fio_lfsr *fl, uint64_t *off); int lfsr_init(struct fio_lfsr *fl, uint64_t size, - unsigned long seed, unsigned int spin); -int lfsr_reset(struct fio_lfsr *fl, unsigned long seed); + uint64_t seed, unsigned int spin); +int lfsr_reset(struct fio_lfsr *fl, uint64_t seed); #endif diff -Nru fio-2.1.3/lib/memalign.c fio-3.16/lib/memalign.c --- fio-2.1.3/lib/memalign.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/memalign.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,43 @@ +#include +#include + +#include "memalign.h" +#include "smalloc.h" + +#define PTR_ALIGN(ptr, mask) \ + (char *)((uintptr_t)((ptr) + (mask)) & ~(mask)) + +struct align_footer { + unsigned int offset; +}; + +void *fio_memalign(size_t alignment, size_t size, bool shared) +{ + struct align_footer *f; + void *ptr, *ret = NULL; + + assert(!(alignment & (alignment - 1))); + + if (shared) + ptr = smalloc(size + alignment + sizeof(*f) - 1); + else + ptr = malloc(size + alignment + sizeof(*f) - 1); + + if (ptr) { + ret = PTR_ALIGN(ptr, alignment - 1); + f = ret + size; + f->offset = (uintptr_t) ret - (uintptr_t) ptr; + } + + return ret; +} + +void fio_memfree(void *ptr, size_t size, bool shared) +{ + struct align_footer *f = ptr + size; + + if (shared) + sfree(ptr - f->offset); + else + free(ptr - f->offset); +} diff -Nru fio-2.1.3/lib/memalign.h fio-3.16/lib/memalign.h --- fio-2.1.3/lib/memalign.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/memalign.h 2019-09-20 
01:01:52.000000000 +0000 @@ -0,0 +1,10 @@ +#ifndef FIO_MEMALIGN_H +#define FIO_MEMALIGN_H + +#include +#include + +extern void *fio_memalign(size_t alignment, size_t size, bool shared); +extern void fio_memfree(void *ptr, size_t size, bool shared); + +#endif diff -Nru fio-2.1.3/lib/memcpy.c fio-3.16/lib/memcpy.c --- fio-2.1.3/lib/memcpy.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/memcpy.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,287 @@ +#include +#include +#include +#include + +#include "memcpy.h" +#include "rand.h" +#include "../fio_time.h" +#include "../gettime.h" +#include "../os/os.h" + +#define BUF_SIZE 32 * 1024 * 1024ULL + +#define NR_ITERS 64 + +struct memcpy_test { + const char *name; + void *src; + void *dst; + size_t size; +}; + +static struct memcpy_test tests[] = { + { + .name = "8 bytes", + .size = 8, + }, + { + .name = "16 bytes", + .size = 16, + }, + { + .name = "96 bytes", + .size = 96, + }, + { + .name = "128 bytes", + .size = 128, + }, + { + .name = "256 bytes", + .size = 256, + }, + { + .name = "512 bytes", + .size = 512, + }, + { + .name = "2048 bytes", + .size = 2048, + }, + { + .name = "8192 bytes", + .size = 8192, + }, + { + .name = "131072 bytes", + .size = 131072, + }, + { + .name = "262144 bytes", + .size = 262144, + }, + { + .name = "524288 bytes", + .size = 524288, + }, + { + .name = NULL, + }, +}; + +struct memcpy_type { + const char *name; + unsigned int mask; + void (*fn)(struct memcpy_test *); +}; + +enum { + T_MEMCPY = 1U << 0, + T_MEMMOVE = 1U << 1, + T_SIMPLE = 1U << 2, + T_HYBRID = 1U << 3, +}; + +#define do_test(test, fn) do { \ + size_t left, this; \ + void *src, *dst; \ + int i; \ + \ + for (i = 0; i < NR_ITERS; i++) { \ + left = BUF_SIZE; \ + src = test->src; \ + dst = test->dst; \ + while (left) { \ + this = test->size; \ + if (this > left) \ + this = left; \ + (fn)(dst, src, this); \ + left -= this; \ + src += this; \ + dst += this; \ + } \ + } \ +} while (0) + +static void t_memcpy(struct memcpy_test 
*test) +{ + do_test(test, memcpy); +} + +static void t_memmove(struct memcpy_test *test) +{ + do_test(test, memmove); +} + +static void simple_memcpy(void *dst, void const *src, size_t len) +{ + char *d = dst; + const char *s = src; + + while (len--) + *d++ = *s++; +} + +static void t_simple(struct memcpy_test *test) +{ + do_test(test, simple_memcpy); +} + +static void t_hybrid(struct memcpy_test *test) +{ + if (test->size >= 64) + do_test(test, simple_memcpy); + else + do_test(test, memcpy); +} + +static struct memcpy_type t[] = { + { + .name = "memcpy", + .mask = T_MEMCPY, + .fn = t_memcpy, + }, + { + .name = "memmove", + .mask = T_MEMMOVE, + .fn = t_memmove, + }, + { + .name = "simple", + .mask = T_SIMPLE, + .fn = t_simple, + }, + { + .name = "hybrid", + .mask = T_HYBRID, + .fn = t_hybrid, + }, + { + .name = NULL, + }, +}; + +static unsigned int get_test_mask(const char *type) +{ + char *ostr, *str = strdup(type); + unsigned int mask; + char *name; + int i; + + ostr = str; + mask = 0; + while ((name = strsep(&str, ",")) != NULL) { + for (i = 0; t[i].name; i++) { + if (!strcmp(t[i].name, name)) { + mask |= t[i].mask; + break; + } + } + } + + free(ostr); + return mask; +} + +static int list_types(void) +{ + int i; + + for (i = 0; t[i].name; i++) + printf("%s\n", t[i].name); + + return 1; +} + +static int setup_tests(void) +{ + struct memcpy_test *test; + struct frand_state state; + void *src, *dst; + int i; + + src = malloc(BUF_SIZE); + dst = malloc(BUF_SIZE); + if (!src || !dst) { + free(src); + free(dst); + return 1; + } + + init_rand_seed(&state, 0x8989, 0); + fill_random_buf(&state, src, BUF_SIZE); + + for (i = 0; tests[i].name; i++) { + test = &tests[i]; + test->src = src; + test->dst = dst; + } + + return 0; +} + +static void free_tests(void) +{ + free(tests[0].src); + free(tests[0].dst); +} + +int fio_memcpy_test(const char *type) +{ + unsigned int test_mask = 0; + int j, i; + + if (!type) + test_mask = ~0U; + else if (!strcmp(type, "help") || !strcmp(type, 
"list")) + return list_types(); + else + test_mask = get_test_mask(type); + + if (!test_mask) { + fprintf(stderr, "fio: unknown hash `%s`. Available:\n", type); + return list_types(); + } + + if (setup_tests()) { + fprintf(stderr, "setting up mem regions failed\n"); + return 1; + } + + for (i = 0; t[i].name; i++) { + struct timespec ts; + double mb_sec; + uint64_t usec; + + if (!(t[i].mask & test_mask)) + continue; + + /* + * For first run, make sure CPUs are spun up and that + * we've touched the data. + */ + usec_spin(100000); + t[i].fn(&tests[0]); + + printf("%s\n", t[i].name); + + for (j = 0; tests[j].name; j++) { + fio_gettime(&ts, NULL); + t[i].fn(&tests[j]); + usec = utime_since_now(&ts); + + if (usec) { + unsigned long long mb = NR_ITERS * BUF_SIZE; + + mb_sec = (double) mb / (double) usec; + mb_sec /= (1.024 * 1.024); + printf("\t%s:\t%8.2f MiB/sec\n", tests[j].name, mb_sec); + } else + printf("\t%s:inf MiB/sec\n", tests[j].name); + } + } + + free_tests(); + return 0; +} diff -Nru fio-2.1.3/lib/memcpy.h fio-3.16/lib/memcpy.h --- fio-2.1.3/lib/memcpy.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/memcpy.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,6 @@ +#ifndef FIO_MEMCPY_H +#define FIO_MEMCPY_H + +int fio_memcpy_test(const char *type); + +#endif diff -Nru fio-2.1.3/lib/mountcheck.c fio-3.16/lib/mountcheck.c --- fio-2.1.3/lib/mountcheck.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/mountcheck.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,85 @@ +#include +#include + +#ifdef CONFIG_GETMNTENT +#include + +#include "mountcheck.h" + +#define MTAB "/etc/mtab" + +int device_is_mounted(const char *dev) +{ + FILE *mtab; + struct mntent *mnt; + int ret = 0; + + mtab = setmntent(MTAB, "r"); + if (!mtab) + return 0; + + while ((mnt = getmntent(mtab)) != NULL) { + if (!mnt->mnt_fsname) + continue; + if (!strcmp(mnt->mnt_fsname, dev)) { + ret = 1; + break; + } + } + + endmntent(mtab); + return ret; +} + +#elif defined(CONFIG_GETMNTINFO) +/* for 
most BSDs */ +#include +#include + +int device_is_mounted(const char *dev) +{ + struct statfs *st; + int i, ret; + + ret = getmntinfo(&st, MNT_NOWAIT); + if (ret <= 0) + return 0; + + for (i = 0; i < ret; i++) { + if (!strcmp(st[i].f_mntfromname, dev)) + return 1; + } + + return 0; +} + +#elif defined(CONFIG_GETMNTINFO_STATVFS) +/* for NetBSD */ +#include + +int device_is_mounted(const char *dev) +{ + struct statvfs *st; + int i, ret; + + ret = getmntinfo(&st, MNT_NOWAIT); + if (ret <= 0) + return 0; + + for (i = 0; i < ret; i++) { + if (!strcmp(st[i].f_mntfromname, dev)) + return 1; + } + + return 0; +} + +#else +/* others */ + +int device_is_mounted(const char *dev) +{ + return 0; +} + +#endif diff -Nru fio-2.1.3/lib/mountcheck.h fio-3.16/lib/mountcheck.h --- fio-2.1.3/lib/mountcheck.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/mountcheck.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,6 @@ +#ifndef FIO_MOUNT_CHECK_H +#define FIO_MOUNT_CHECK_H + +extern int device_is_mounted(const char *); + +#endif diff -Nru fio-2.1.3/lib/nowarn_snprintf.h fio-3.16/lib/nowarn_snprintf.h --- fio-2.1.3/lib/nowarn_snprintf.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/nowarn_snprintf.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,27 @@ +#ifndef _NOWARN_SNPRINTF_H_ +#define _NOWARN_SNPRINTF_H_ + +#include +#include + +static inline int nowarn_snprintf(char *str, size_t size, const char *format, + ...) 
+{ + va_list args; + int res; + + va_start(args, format); +#if __GNUC__ -0 >= 8 +#pragma GCC diagnostic push "-Wformat-truncation" +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif + res = vsnprintf(str, size, format, args); +#if __GNUC__ -0 >= 8 +#pragma GCC diagnostic pop "-Wformat-truncation" +#endif + va_end(args); + + return res; +} + +#endif diff -Nru fio-2.1.3/lib/num2str.c fio-3.16/lib/num2str.c --- fio-2.1.3/lib/num2str.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/num2str.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,71 +1,120 @@ +#include #include #include #include -/* - * Cheesy number->string conversion, complete with carry rounding error. +#include "../compiler/compiler.h" +#include "num2str.h" + +#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0]))) + +/** + * num2str() - Cheesy number->string conversion, complete with carry rounding error. + * @num: quantity (e.g., number of blocks, bytes or bits) + * @maxlen: max number of digits in the output string (not counting prefix and units, but counting .) 
+ * @base: multiplier for num (e.g., if num represents Ki, use 1024) + * @pow2: select unit prefix - 0=power-of-10 decimal SI, nonzero=power-of-2 binary IEC + * @units: select units - N2S_* constants defined in num2str.h + * @returns a malloc'd buffer containing "number[][]" */ -char *num2str(unsigned long num, int maxlen, int base, int pow2, int unit_base) +char *num2str(uint64_t num, int maxlen, int base, int pow2, enum n2s_unit units) { - const char *postfix[] = { "", "K", "M", "G", "P", "E" }; - const char *byte_postfix[] = { "", "B", "bit" }; - const unsigned int thousand[] = { 1000, 1024 }; - unsigned int modulo, decimals; - int byte_post_index = 0, post_index, carry = 0; - char tmp[32]; + const char *sistr[] = { "", "k", "M", "G", "T", "P" }; + const char *iecstr[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi" }; + const char **unitprefix; + static const char *const unitstr[] = { + [N2S_NONE] = "", + [N2S_PERSEC] = "/s", + [N2S_BYTE] = "B", + [N2S_BIT] = "bit", + [N2S_BYTEPERSEC]= "B/s", + [N2S_BITPERSEC] = "bit/s" + }; + const unsigned int thousand = pow2 ? 1024 : 1000; + unsigned int modulo; + int post_index, carry = 0; + char tmp[32], fmt[32]; char *buf; + compiletime_assert(sizeof(sistr) == sizeof(iecstr), "unit prefix arrays must be identical sizes"); + assert(units < ARRAY_SIZE(unitstr)); + buf = malloc(128); + if (!buf) + return NULL; + + if (pow2) + unitprefix = iecstr; + else + unitprefix = sistr; for (post_index = 0; base > 1; post_index++) - base /= thousand[!!pow2]; + base /= thousand; - switch (unit_base) { - case 1: - byte_post_index = 2; + switch (units) { + case N2S_NONE: + break; + case N2S_PERSEC: + break; + case N2S_BYTE: + break; + case N2S_BIT: num *= 8; break; - case 8: - byte_post_index = 1; + case N2S_BYTEPERSEC: + break; + case N2S_BITPERSEC: + num *= 8; break; } + /* + * Divide by K/Ki until string length of num <= maxlen. 
+ */ modulo = -1U; - while (post_index < sizeof(postfix)) { - sprintf(tmp, "%lu", num); + while (post_index < ARRAY_SIZE(sistr)) { + sprintf(tmp, "%llu", (unsigned long long) num); if (strlen(tmp) <= maxlen) break; - modulo = num % thousand[!!pow2]; - num /= thousand[!!pow2]; - carry = modulo >= thousand[!!pow2] / 2; + modulo = num % thousand; + num /= thousand; + carry = modulo >= thousand / 2; post_index++; } + /* + * If no modulo, then we're done. + */ if (modulo == -1U) { done: - sprintf(buf, "%lu%s%s", num, postfix[post_index], - byte_postfix[byte_post_index]); + if (post_index >= ARRAY_SIZE(sistr)) + post_index = 0; + + sprintf(buf, "%llu%s%s", (unsigned long long) num, + unitprefix[post_index], unitstr[units]); return buf; } - sprintf(tmp, "%lu", num); - decimals = maxlen - strlen(tmp); - if (decimals <= 1) { + /* + * If no room for decimals, then we're done. + */ + sprintf(tmp, "%llu", (unsigned long long) num); + if ((int)(maxlen - strlen(tmp)) <= 1) { if (carry) num++; goto done; } - do { - sprintf(tmp, "%u", modulo); - if (strlen(tmp) <= decimals - 1) - break; - - modulo = (modulo + 9) / 10; - } while (1); + /* + * Fill in everything and return the result. 
+ */ + assert(maxlen - strlen(tmp) - 1 > 0); + assert(modulo < thousand); + sprintf(fmt, "%%.%df", (int)(maxlen - strlen(tmp) - 1)); + sprintf(tmp, fmt, (double)modulo / (double)thousand); - sprintf(buf, "%lu.%u%s%s", num, modulo, postfix[post_index], - byte_postfix[byte_post_index]); + sprintf(buf, "%llu.%s%s%s", (unsigned long long) num, &tmp[2], + unitprefix[post_index], unitstr[units]); return buf; } diff -Nru fio-2.1.3/lib/num2str.h fio-3.16/lib/num2str.h --- fio-2.1.3/lib/num2str.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/num2str.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,17 @@ +#ifndef FIO_NUM2STR_H +#define FIO_NUM2STR_H + +#include + +enum n2s_unit { + N2S_NONE = 0, + N2S_PERSEC = 1, + N2S_BYTE = 2, + N2S_BIT = 3, + N2S_BYTEPERSEC = 4, + N2S_BITPERSEC = 5, +}; + +extern char *num2str(uint64_t, int, int, int, enum n2s_unit); + +#endif diff -Nru fio-2.1.3/lib/output_buffer.c fio-3.16/lib/output_buffer.c --- fio-2.1.3/lib/output_buffer.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/output_buffer.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,41 @@ +#include +#include + +#include "output_buffer.h" +#include "../minmax.h" + +#define BUF_INC 1024 + +void buf_output_init(struct buf_output *out) +{ + out->max_buflen = 0; + out->buflen = 0; + out->buf = NULL; +} + +void buf_output_free(struct buf_output *out) +{ + free(out->buf); + buf_output_init(out); +} + +size_t buf_output_add(struct buf_output *out, const char *buf, size_t len) +{ + if (out->max_buflen - out->buflen < len) { + size_t need = len - (out->max_buflen - out->buflen); + size_t old_max = out->max_buflen; + + need = max((size_t) BUF_INC, need); + out->max_buflen += need; + out->buf = realloc(out->buf, out->max_buflen); + + old_max = max(old_max, out->buflen + len); + if (old_max + need > out->max_buflen) + need = out->max_buflen - old_max; + memset(&out->buf[old_max], 0, need); + } + + memcpy(&out->buf[out->buflen], buf, len); + out->buflen += len; + return len; +} diff 
-Nru fio-2.1.3/lib/output_buffer.h fio-3.16/lib/output_buffer.h --- fio-2.1.3/lib/output_buffer.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/output_buffer.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,16 @@ +#ifndef FIO_OUTPUT_BUFFER_H +#define FIO_OUTPUT_BUFFER_H + +#include + +struct buf_output { + char *buf; + size_t buflen; + size_t max_buflen; +}; + +void buf_output_init(struct buf_output *out); +void buf_output_free(struct buf_output *out); +size_t buf_output_add(struct buf_output *out, const char *buf, size_t len); + +#endif diff -Nru fio-2.1.3/lib/pattern.c fio-3.16/lib/pattern.c --- fio-2.1.3/lib/pattern.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/pattern.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,536 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "strntol.h" +#include "pattern.h" +#include "../minmax.h" +#include "../oslib/strcasestr.h" +#include "../oslib/strndup.h" + +/** + * parse_file() - parses binary file to fill buffer + * @beg - string input, extract filename from this + * @out - output buffer where parsed number should be put + * @out_len - length of the output buffer + * @filled - pointer where number of bytes successfully + * parsed will be put + * + * Returns the end pointer where parsing has been stopped. + * In case of parsing error or lack of bytes in output buffer + * NULL will be returned. 
+ */ +static const char *parse_file(const char *beg, char *out, + unsigned int out_len, + unsigned int *filled) +{ + const char *end; + char *file; + int fd; + ssize_t count; + + if (!out_len) + goto err_out; + + assert(*beg == '\''); + beg++; + end = strchr(beg, '\''); + if (!end) + goto err_out; + + file = strndup(beg, end - beg); + if (file == NULL) + goto err_out; + + fd = open(file, O_RDONLY); + if (fd < 0) + goto err_free_out; + + count = read(fd, out, out_len); + if (count == -1) + goto err_free_close_out; + + *filled = count; + close(fd); + free(file); + + /* Catch up quote */ + return end + 1; + +err_free_close_out: + close(fd); +err_free_out: + free(file); +err_out: + return NULL; + +} + +/** + * parse_string() - parses string in double quotes, like "abc" + * @beg - string input + * @out - output buffer where parsed number should be put + * @out_len - length of the output buffer + * @filled - pointer where number of bytes successfully + * parsed will be put + * + * Returns the end pointer where parsing has been stopped. + * In case of parsing error or lack of bytes in output buffer + * NULL will be returned. + */ +static const char *parse_string(const char *beg, char *out, + unsigned int out_len, + unsigned int *filled) +{ + const char *end; + + if (!out_len) + return NULL; + + assert(*beg == '"'); + beg++; + end = strchr(beg, '"'); + if (!end) + return NULL; + if (end - beg > out_len) + return NULL; + + memcpy(out, beg, end - beg); + *filled = end - beg; + + /* Catch up quote */ + return end + 1; +} + +/** + * parse_number() - parses numbers + * @beg - string input + * @out - output buffer where parsed number should be put + * @out_len - length of the output buffer + * @filled - pointer where number of bytes successfully + * parsed will be put + * + * Supports decimals in the range [INT_MIN, INT_MAX] and + * hexidecimals of any size, which should be started with + * prefix 0x or 0X. + * + * Returns the end pointer where parsing has been stopped. 
+ * In case of parsing error or lack of bytes in output buffer + * NULL will be returned. + */ +static const char *parse_number(const char *beg, char *out, + unsigned int out_len, + unsigned int *filled) +{ + const char *end; + unsigned int val; + long lval; + int num, i; + + if (!out_len) + return NULL; + + num = 0; + sscanf(beg, "0%*[xX]%*[0-9a-fA-F]%n", &num); + if (num == 0) { + /* Here we are trying to parse decimal */ + + char *_end; + + /* Looking ahead */ + _end = strcasestr(beg, "0x"); + if (_end) + num = _end - beg; + if (num) + lval = strntol(beg, num, &_end, 10); + else + lval = strtol(beg, &_end, 10); + if (beg == _end || lval > INT_MAX || lval < INT_MIN) + return NULL; + end = _end; + i = 0; + if (!lval) { + num = 0; + out[i] = 0x00; + i = 1; + } else { + val = (unsigned int)lval; + for (; val && out_len; out_len--, i++, val >>= 8) + out[i] = val & 0xff; + if (val) + return NULL; + } + } else { + assert(num > 2); + + /* Catch up 0x prefix */ + num -= 2; + beg += 2; + + /* Look back, handle this combined string: 0xff0x14 */ + if (beg[num] && !strncasecmp(&beg[num - 1], "0x", 2)) + num--; + + end = beg + num; + + for (i = 0; num && out_len; + out_len--, i++, num -= 2, beg += 2) { + const char *fmt; + + fmt = (num & 1 ? 
"%1hhx" : "%2hhx"); + sscanf(beg, fmt, &out[i]); + if (num & 1) { + num++; + beg--; + } + } + if (num) + return NULL; + } + + *filled = i; + return end; + +} + +/** + * parse_format() - parses formats, like %o, etc + * @in - string input + * @out - output buffer where space for format should be reserved + * @parsed - number of bytes which were already parsed so far + * @out_len - length of the output buffer + * @fmt_desc - format descritor array, what we expect to find + * @fmt_desc_sz - size of the format descritor array + * @fmt - format array, the output + * @fmt_sz - size of format array + * + * This function tries to find formats, e.g.: + * %o - offset of the block + * + * In case of successfull parsing it fills the format param + * with proper offset and the size of the expected value, which + * should be pasted into buffer using the format 'func' callback. + * + * Returns the end pointer where parsing has been stopped. + * In case of parsing error or lack of bytes in output buffer + * NULL will be returned. + */ +static const char *parse_format(const char *in, char *out, unsigned int parsed, + unsigned int out_len, unsigned int *filled, + const struct pattern_fmt_desc *fmt_desc, + unsigned int fmt_desc_sz, + struct pattern_fmt *fmt, unsigned int fmt_sz) +{ + int i; + struct pattern_fmt *f = NULL; + unsigned int len = 0; + + if (!out_len || !fmt_desc || !fmt_desc_sz || !fmt || !fmt_sz) + return NULL; + + assert(*in == '%'); + + for (i = 0; i < fmt_desc_sz; i++) { + const struct pattern_fmt_desc *desc; + + desc = &fmt_desc[i]; + len = strlen(desc->fmt); + if (0 == strncmp(in, desc->fmt, len)) { + fmt->desc = desc; + fmt->off = parsed; + f = fmt; + break; + } + } + + if (!f) + return NULL; + if (f->desc->len > out_len) + return NULL; + + memset(out, '\0', f->desc->len); + *filled = f->desc->len; + + return in + len; +} + +/** + * parse_and_fill_pattern() - Parses combined input, which consists of strings, + * numbers and pattern formats. 
+ * @in - string input + * @in_len - size of the input string + * @out - output buffer where parsed result will be put + * @out_len - lengths of the output buffer + * @fmt_desc - array of pattern format descriptors [input] + * @fmt_desc_sz - size of the format descriptor array + * @fmt - array of pattern formats [output] + * @fmt_sz - pointer where the size of pattern formats array stored [input], + * after successfull parsing this pointer will contain the number + * of parsed formats if any [output]. + * + * strings: + * bytes sequence in double quotes, e.g. "123". + * NOTE: there is no way to escape quote, so "123\"abc" does not work. + * + * numbers: + * hexidecimal - sequence of hex bytes starting from 0x or 0X prefix, + * e.g. 0xff12ceff1100ff + * decimal - decimal number in range [INT_MIN, INT_MAX] + * + * formats: + * %o - offset of block, reserved 8 bytes. + * + * Explicit examples of combined string: + * #1 #2 #3 #4 + * in="abcd" in=-1024 in=66 in=0xFF0X1 + * out=61 62 63 64 out=00 fc ff ff out=42 out=ff 01 + * + * #5 #6 + * in=%o in="123"0xFFeeCC + * out=00 00 00 00 00 00 00 00 out=31 32 33 ff ec cc + * + * #7 + * in=-100xab"1"%o"2" + * out=f6 ff ff ff ab 31 00 00 00 00 00 00 00 00 32 + * + * #9 + * in=%o0xdeadbeef%o + * out=00 00 00 00 00 00 00 00 de ad be ef 00 00 00 00 00 00 00 00 + * + * #10 + * in=0xfefefefefefefefefefefefefefefefefefefefefe + * out=fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe + * + * Returns number of bytes filled or err < 0 in case of failure. 
+ */ +int parse_and_fill_pattern(const char *in, unsigned int in_len, + char *out, unsigned int out_len, + const struct pattern_fmt_desc *fmt_desc, + unsigned int fmt_desc_sz, + struct pattern_fmt *fmt, + unsigned int *fmt_sz_out) +{ + const char *beg, *end, *out_beg = out; + unsigned int total = 0, fmt_rem = 0; + + if (!in || !in_len || !out || !out_len) + return -EINVAL; + if (fmt_sz_out) + fmt_rem = *fmt_sz_out; + + beg = in; + do { + unsigned int filled; + int parsed_fmt; + + filled = 0; + parsed_fmt = 0; + + switch (*beg) { + case '\'': + end = parse_file(beg, out, out_len, &filled); + break; + case '"': + end = parse_string(beg, out, out_len, &filled); + break; + case '%': + end = parse_format(beg, out, out - out_beg, out_len, + &filled, fmt_desc, fmt_desc_sz, + fmt, fmt_rem); + parsed_fmt = 1; + break; + default: + end = parse_number(beg, out, out_len, &filled); + break; + } + + if (!end) + return -EINVAL; + + if (parsed_fmt) { + assert(fmt_rem); + fmt_rem--; + fmt++; + } + + assert(end - beg <= in_len); + in_len -= end - beg; + beg = end; + + assert(filled); + assert(filled <= out_len); + out_len -= filled; + out += filled; + total += filled; + + } while (in_len); + + if (fmt_sz_out) + *fmt_sz_out -= fmt_rem; + return total; +} + +/** + * dup_pattern() - Duplicates part of the pattern all over the buffer. + * + * Returns 0 in case of success or errno < 0 in case of failure. + */ +static int dup_pattern(char *out, unsigned int out_len, unsigned int pattern_len) +{ + unsigned int left, len, off; + + if (out_len <= pattern_len) + /* Normal case */ + return 0; + + off = pattern_len; + left = (out_len - off); + len = min(left, off); + + /* Duplicate leftover */ + while (left) { + memcpy(out + off, out, len); + left -= len; + off <<= 1; + len = min(left, off); + } + + return 0; +} + +/** + * cpy_pattern() - Copies pattern to the buffer. + * + * Function copies pattern along the whole buffer. + * + * Returns 0 in case of success or errno < 0 in case of failure. 
+ */ +int cpy_pattern(const char *pattern, unsigned int pattern_len, + char *out, unsigned int out_len) +{ + unsigned int len; + + if (!pattern || !pattern_len || !out || !out_len) + return -EINVAL; + + /* Copy pattern */ + len = min(pattern_len, out_len); + memcpy(out, pattern, len); + + /* Spread filled chunk all over the buffer */ + return dup_pattern(out, out_len, pattern_len); +} + +/** + * cmp_pattern() - Compares pattern and buffer. + * + * For the sake of performance this function avoids any loops. + * Firstly it tries to compare the buffer itself, checking that + * buffer consists of repeating patterns along the buffer size. + * + * If the difference is not found then the function tries to compare + * buffer and pattern. + * + * Returns 0 in case of success or errno < 0 in case of failure. + */ +int cmp_pattern(const char *pattern, unsigned int pattern_size, + unsigned int off, const char *buf, unsigned int len) +{ + int rc; + unsigned int size; + + /* Find the difference in buffer */ + if (len > pattern_size) { + rc = memcmp(buf, buf + pattern_size, len - pattern_size); + if (rc) + return -EILSEQ; + } + /* Compare second part of the pattern with buffer */ + if (off) { + size = min(len, pattern_size - off); + rc = memcmp(buf, pattern + off, size); + if (rc) + return -EILSEQ; + buf += size; + len -= size; + } + /* Compare first part of the pattern or the whole pattern + * with buffer */ + if (len) { + size = min(len, (off ? off : pattern_size)); + rc = memcmp(buf, pattern, size); + if (rc) + return -EILSEQ; + } + + return 0; +} + +/** + * paste_format_inplace() - Pastes parsed formats to the pattern. + * + * This function pastes formats to the pattern. If @fmt_sz is 0 + * function does nothing and pattern buffer is left untouched. + * + * Returns 0 in case of success or errno < 0 in case of failure. 
+ */ +int paste_format_inplace(char *pattern, unsigned int pattern_len, + struct pattern_fmt *fmt, unsigned int fmt_sz, + void *priv) +{ + int i, rc; + unsigned int len; + + if (!pattern || !pattern_len || !fmt) + return -EINVAL; + + /* Paste formats for first pattern chunk */ + for (i = 0; i < fmt_sz; i++) { + struct pattern_fmt *f; + + f = &fmt[i]; + if (pattern_len <= f->off) + break; + len = min(pattern_len - f->off, f->desc->len); + rc = f->desc->paste(pattern + f->off, len, priv); + if (rc) + return rc; + } + + return 0; +} + +/** + * paste_format() - Pastes parsed formats to the buffer. + * + * This function copies pattern to the buffer, pastes format + * into it and then duplicates pattern all over the buffer size. + * + * Returns 0 in case of success or errno < 0 in case of failure. + */ +int paste_format(const char *pattern, unsigned int pattern_len, + struct pattern_fmt *fmt, unsigned int fmt_sz, + char *out, unsigned int out_len, void *priv) +{ + int rc; + unsigned int len; + + if (!pattern || !pattern_len || !out || !out_len) + return -EINVAL; + + /* Copy pattern */ + len = min(pattern_len, out_len); + memcpy(out, pattern, len); + + rc = paste_format_inplace(out, len, fmt, fmt_sz, priv); + if (rc) + return rc; + + /* Spread filled chunk all over the buffer */ + return dup_pattern(out, out_len, pattern_len); +} diff -Nru fio-2.1.3/lib/pattern.h fio-3.16/lib/pattern.h --- fio-2.1.3/lib/pattern.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/pattern.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,45 @@ +#ifndef FIO_PARSE_PATTERN_H +#define FIO_PARSE_PATTERN_H + +/** + * Pattern format description. The input for 'parse_pattern'. + * Describes format with its name and callback, which should + * be called to paste something inside the buffer. + */ +struct pattern_fmt_desc { + const char *fmt; + unsigned int len; + int (*paste)(char *buf, unsigned int len, void *priv); +}; + +/** + * Pattern format. The output of 'parse_pattern'. 
+ * Describes the exact position inside the xbuffer. + */ +struct pattern_fmt { + unsigned int off; + const struct pattern_fmt_desc *desc; +}; + +int parse_and_fill_pattern(const char *in, unsigned int in_len, + char *out, unsigned int out_len, + const struct pattern_fmt_desc *fmt_desc, + unsigned int fmt_desc_sz, + struct pattern_fmt *fmt, + unsigned int *fmt_sz_out); + +int paste_format_inplace(char *pattern, unsigned int pattern_len, + struct pattern_fmt *fmt, unsigned int fmt_sz, + void *priv); + +int paste_format(const char *pattern, unsigned int pattern_len, + struct pattern_fmt *fmt, unsigned int fmt_sz, + char *out, unsigned int out_len, void *priv); + +int cpy_pattern(const char *pattern, unsigned int pattern_len, + char *out, unsigned int out_len); + +int cmp_pattern(const char *pattern, unsigned int pattern_size, + unsigned int off, const char *buf, unsigned int len); + +#endif diff -Nru fio-2.1.3/lib/pow2.h fio-3.16/lib/pow2.h --- fio-2.1.3/lib/pow2.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/pow2.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,12 @@ +#ifndef FIO_POW2_H +#define FIO_POW2_H + +#include +#include "types.h" + +static inline bool is_power_of_2(uint64_t val) +{ + return (val != 0 && ((val & (val - 1)) == 0)); +} + +#endif diff -Nru fio-2.1.3/lib/prio_tree.c fio-3.16/lib/prio_tree.c --- fio-2.1.3/lib/prio_tree.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/prio_tree.c 2019-09-20 01:01:52.000000000 +0000 @@ -11,11 +11,15 @@ * 02Feb2004 Initial version */ +#include #include #include -#include "../fio.h" + +#include "../compiler/compiler.h" #include "prio_tree.h" +#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0]))) + /* * A clever mix of heap and radix trees forms a radix priority search tree (PST) * which is useful for storing intervals, e.g, we can consider a vma as a closed @@ -49,7 +53,7 @@ static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; -void fio_init prio_tree_init(void) +static void fio_init 
prio_tree_init(void) { unsigned int i; diff -Nru fio-2.1.3/lib/prio_tree.h fio-3.16/lib/prio_tree.h --- fio-2.1.3/lib/prio_tree.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/prio_tree.h 2019-09-20 01:01:52.000000000 +0000 @@ -2,7 +2,6 @@ #define _LINUX_PRIO_TREE_H #include -#include "../hash.h" struct prio_tree_node { struct prio_tree_node *left; diff -Nru fio-2.1.3/lib/rand.c fio-3.16/lib/rand.c --- fio-2.1.3/lib/rand.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/rand.c 2019-09-20 01:01:52.000000000 +0000 @@ -35,14 +35,17 @@ #include #include "rand.h" +#include "pattern.h" #include "../hash.h" -static inline int __seed(unsigned int x, unsigned int m) +int arch_random; + +static inline uint64_t __seed(uint64_t x, uint64_t m) { return (x < m) ? x + m : x; } -static void __init_rand(struct frand_state *state, unsigned int seed) +static void __init_rand32(struct taus88_state *state, unsigned int seed) { int cranks = 6; @@ -53,35 +56,76 @@ state->s3 = __seed(LCG(state->s2, seed), 15); while (cranks--) - __rand(state); + __rand32(state); } -void init_rand(struct frand_state *state) +static void __init_rand64(struct taus258_state *state, uint64_t seed) { - __init_rand(state, 1); + int cranks = 6; + +#define LCG64(x, seed) ((x) * 6906969069ULL ^ (seed)) + + state->s1 = __seed(LCG64((2^31) + (2^17) + (2^7), seed), 1); + state->s2 = __seed(LCG64(state->s1, seed), 7); + state->s3 = __seed(LCG64(state->s2, seed), 15); + state->s4 = __seed(LCG64(state->s3, seed), 33); + state->s5 = __seed(LCG64(state->s4, seed), 49); + + while (cranks--) + __rand64(state); } -void init_rand_seed(struct frand_state *state, unsigned int seed) +void init_rand(struct frand_state *state, bool use64) { - __init_rand(state, seed); + state->use64 = use64; + + if (!use64) + __init_rand32(&state->state32, 1); + else + __init_rand64(&state->state64, 1); } -void __fill_random_buf(void *buf, unsigned int len, unsigned long seed) +void init_rand_seed(struct frand_state *state, unsigned int 
seed, bool use64) { - long *ptr = buf; + state->use64 = use64; + + if (!use64) + __init_rand32(&state->state32, seed); + else + __init_rand64(&state->state64, seed); +} + +void __fill_random_buf(void *buf, unsigned int len, uint64_t seed) +{ + void *ptr = buf; + + while (len) { + int this_len; - while ((void *) ptr - buf < len) { - *ptr = seed; - ptr++; + if (len >= sizeof(int64_t)) { + *((int64_t *) ptr) = seed; + this_len = sizeof(int64_t); + } else if (len >= sizeof(int32_t)) { + *((int32_t *) ptr) = seed; + this_len = sizeof(int32_t); + } else if (len >= sizeof(int16_t)) { + *((int16_t *) ptr) = seed; + this_len = sizeof(int16_t); + } else { + *((int8_t *) ptr) = seed; + this_len = sizeof(int8_t); + } + ptr += this_len; + len -= this_len; seed *= GOLDEN_RATIO_PRIME; seed >>= 3; } } -unsigned long fill_random_buf(struct frand_state *fs, void *buf, - unsigned int len) +uint64_t fill_random_buf(struct frand_state *fs, void *buf, + unsigned int len) { - unsigned long r = __rand(fs); + uint64_t r = __rand(fs); if (sizeof(int) != sizeof(long *)) r *= (unsigned long) __rand(fs); @@ -90,44 +134,66 @@ return r; } -unsigned long fill_random_buf_percentage(struct frand_state *fs, void *buf, - unsigned int percentage, - unsigned int segment, unsigned int len) +void __fill_random_buf_percentage(uint64_t seed, void *buf, + unsigned int percentage, + unsigned int segment, unsigned int len, + char *pattern, unsigned int pbytes) { - unsigned long r = __rand(fs); unsigned int this_len; if (percentage == 100) { - memset(buf, 0, len); - return 0; + if (pbytes) + (void)cpy_pattern(pattern, pbytes, buf, len); + else + memset(buf, 0, len); + return; } if (segment > len) segment = len; - if (sizeof(int) != sizeof(long *)) - r *= (unsigned long) __rand(fs); - while (len) { /* * Fill random chunk */ - this_len = (segment * (100 - percentage)) / 100; + this_len = ((unsigned long long)segment * (100 - percentage)) / 100; if (this_len > len) this_len = len; - __fill_random_buf(buf, 
this_len, r); + __fill_random_buf(buf, this_len, seed); len -= this_len; + if (!len) + break; buf += this_len; + this_len = segment - this_len; if (this_len > len) this_len = len; + else if (len - this_len <= sizeof(long)) + this_len = len; + + if (pbytes) + (void)cpy_pattern(pattern, pbytes, buf, this_len); + else + memset(buf, 0, this_len); - memset(buf, 0, this_len); len -= this_len; buf += this_len; } +} + +uint64_t fill_random_buf_percentage(struct frand_state *fs, void *buf, + unsigned int percentage, + unsigned int segment, unsigned int len, + char *pattern, unsigned int pbytes) +{ + uint64_t r = __rand(fs); + + if (sizeof(int) != sizeof(long *)) + r *= (unsigned long) __rand(fs); + __fill_random_buf_percentage(r, buf, percentage, segment, len, + pattern, pbytes); return r; } diff -Nru fio-2.1.3/lib/rand.h fio-3.16/lib/rand.h --- fio-2.1.3/lib/rand.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/rand.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,13 +1,66 @@ #ifndef FIO_RAND_H #define FIO_RAND_H -#define FRAND_MAX (-1U) +#include +#include +#include "types.h" -struct frand_state { +#define FRAND32_MAX (-1U) +#define FRAND64_MAX (-1ULL) + +struct taus88_state { unsigned int s1, s2, s3; }; -static inline unsigned int __rand(struct frand_state *state) +struct taus258_state { + uint64_t s1, s2, s3, s4, s5; +}; + +struct frand_state { + unsigned int use64; + union { + struct taus88_state state32; + struct taus258_state state64; + }; +}; + +static inline uint64_t rand_max(struct frand_state *state) +{ + if (state->use64) + return FRAND64_MAX; + else + return FRAND32_MAX; +} + +static inline void __frand32_copy(struct taus88_state *dst, + struct taus88_state *src) +{ + dst->s1 = src->s1; + dst->s2 = src->s2; + dst->s3 = src->s3; +} + +static inline void __frand64_copy(struct taus258_state *dst, + struct taus258_state *src) +{ + dst->s1 = src->s1; + dst->s2 = src->s2; + dst->s3 = src->s3; + dst->s4 = src->s4; + dst->s5 = src->s5; +} + +static inline void 
frand_copy(struct frand_state *dst, struct frand_state *src) +{ + if (src->use64) + __frand64_copy(&dst->state64, &src->state64); + else + __frand32_copy(&dst->state32, &src->state32); + + dst->use64 = src->use64; +} + +static inline unsigned int __rand32(struct taus88_state *state) { #define TAUSWORTHE(s,a,b,c,d) ((s&c)<>b) @@ -18,10 +71,88 @@ return (state->s1 ^ state->s2 ^ state->s3); } -extern void init_rand(struct frand_state *); -extern void init_rand_seed(struct frand_state *, unsigned int seed); -extern void __fill_random_buf(void *buf, unsigned int len, unsigned long seed); -extern unsigned long fill_random_buf(struct frand_state *, void *buf, unsigned int len); -extern unsigned long fill_random_buf_percentage(struct frand_state *, void *buf, unsigned int percentage, unsigned int segment, unsigned int len); +static inline uint64_t __rand64(struct taus258_state *state) +{ + uint64_t xval; + + xval = ((state->s1 << 1) ^ state->s1) >> 53; + state->s1 = ((state->s1 & 18446744073709551614ULL) << 10) ^ xval; + + xval = ((state->s2 << 24) ^ state->s2) >> 50; + state->s2 = ((state->s2 & 18446744073709551104ULL) << 5) ^ xval; + + xval = ((state->s3 << 3) ^ state->s3) >> 23; + state->s3 = ((state->s3 & 18446744073709547520ULL) << 29) ^ xval; + + xval = ((state->s4 << 5) ^ state->s4) >> 24; + state->s4 = ((state->s4 & 18446744073709420544ULL) << 23) ^ xval; + + xval = ((state->s5 << 3) ^ state->s5) >> 33; + state->s5 = ((state->s5 & 18446744073701163008ULL) << 8) ^ xval; + + return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4 ^ state->s5); +} + +static inline uint64_t __rand(struct frand_state *state) +{ + if (state->use64) + return __rand64(&state->state64); + else + return __rand32(&state->state32); +} + +static inline double __rand_0_1(struct frand_state *state) +{ + if (state->use64) { + uint64_t val = __rand64(&state->state64); + + return (val + 1.0) / (FRAND64_MAX + 1.0); + } else { + uint32_t val = __rand32(&state->state32); + + return (val + 1.0) / 
(FRAND32_MAX + 1.0); + } +} + +static inline uint32_t rand32_upto(struct frand_state *state, uint32_t end) +{ + uint32_t r; + + assert(!state->use64); + + r = __rand32(&state->state32); + end++; + return (int) ((double)end * (r / (FRAND32_MAX + 1.0))); +} + +static inline uint64_t rand64_upto(struct frand_state *state, uint64_t end) +{ + uint64_t r; + + assert(state->use64); + + r = __rand64(&state->state64); + end++; + return (uint64_t) ((double)end * (r / (FRAND64_MAX + 1.0))); +} + +/* + * Generate a random value between 'start' and 'end', both inclusive + */ +static inline uint64_t rand_between(struct frand_state *state, uint64_t start, + uint64_t end) +{ + if (state->use64) + return start + rand64_upto(state, end - start); + else + return start + rand32_upto(state, end - start); +} + +extern void init_rand(struct frand_state *, bool); +extern void init_rand_seed(struct frand_state *, unsigned int seed, bool); +extern void __fill_random_buf(void *buf, unsigned int len, uint64_t seed); +extern uint64_t fill_random_buf(struct frand_state *, void *buf, unsigned int len); +extern void __fill_random_buf_percentage(uint64_t, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int); +extern uint64_t fill_random_buf_percentage(struct frand_state *, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int); #endif diff -Nru fio-2.1.3/lib/rbtree.c fio-3.16/lib/rbtree.c --- fio-2.1.3/lib/rbtree.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/rbtree.c 2019-09-20 01:01:52.000000000 +0000 @@ -15,17 +15,17 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
linux/lib/rbtree.c */ #include "rbtree.h" -static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) +static void __rb_rotate_left(struct fio_rb_node *node, struct rb_root *root) { - struct rb_node *right = node->rb_right; - struct rb_node *parent = rb_parent(node); + struct fio_rb_node *right = node->rb_right; + struct fio_rb_node *parent = rb_parent(node); if ((node->rb_right = right->rb_left)) rb_set_parent(right->rb_left, node); @@ -45,10 +45,10 @@ rb_set_parent(node, right); } -static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) +static void __rb_rotate_right(struct fio_rb_node *node, struct rb_root *root) { - struct rb_node *left = node->rb_left; - struct rb_node *parent = rb_parent(node); + struct fio_rb_node *left = node->rb_left; + struct fio_rb_node *parent = rb_parent(node); if ((node->rb_left = left->rb_right)) rb_set_parent(left->rb_right, node); @@ -68,9 +68,9 @@ rb_set_parent(node, left); } -void rb_insert_color(struct rb_node *node, struct rb_root *root) +void rb_insert_color(struct fio_rb_node *node, struct rb_root *root) { - struct rb_node *parent, *gparent; + struct fio_rb_node *parent, *gparent; while ((parent = rb_parent(node)) && rb_is_red(parent)) { @@ -79,7 +79,7 @@ if (parent == gparent->rb_left) { { - register struct rb_node *uncle = gparent->rb_right; + register struct fio_rb_node *uncle = gparent->rb_right; if (uncle && rb_is_red(uncle)) { rb_set_black(uncle); @@ -92,7 +92,7 @@ if (parent->rb_right == node) { - register struct rb_node *tmp; + register struct fio_rb_node *tmp; __rb_rotate_left(parent, root); tmp = parent; parent = node; @@ -104,7 +104,7 @@ __rb_rotate_right(gparent, root); } else { { - register struct rb_node *uncle = gparent->rb_left; + register struct fio_rb_node *uncle = gparent->rb_left; if (uncle && rb_is_red(uncle)) { rb_set_black(uncle); @@ -117,7 +117,7 @@ if (parent->rb_left == node) { - register struct rb_node *tmp; + register struct fio_rb_node *tmp; 
__rb_rotate_right(parent, root); tmp = parent; parent = node; @@ -133,10 +133,11 @@ rb_set_black(root->rb_node); } -static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, +static void __rb_erase_color(struct fio_rb_node *node, + struct fio_rb_node *parent, struct rb_root *root) { - struct rb_node *other; + struct fio_rb_node *other; while ((!node || rb_is_black(node)) && node != root->rb_node) { @@ -161,7 +162,7 @@ { if (!other->rb_right || rb_is_black(other->rb_right)) { - struct rb_node *o_left; + struct fio_rb_node *o_left; if ((o_left = other->rb_left)) rb_set_black(o_left); rb_set_red(other); @@ -198,7 +199,7 @@ { if (!other->rb_left || rb_is_black(other->rb_left)) { - register struct rb_node *o_right; + register struct fio_rb_node *o_right; if ((o_right = other->rb_right)) rb_set_black(o_right); rb_set_red(other); @@ -219,9 +220,9 @@ rb_set_black(node); } -void rb_erase(struct rb_node *node, struct rb_root *root) +void rb_erase(struct fio_rb_node *node, struct rb_root *root) { - struct rb_node *child, *parent; + struct fio_rb_node *child, *parent; int color; if (!node->rb_left) @@ -230,7 +231,7 @@ child = node->rb_left; else { - struct rb_node *old = node, *left; + struct fio_rb_node *old = node, *left; node = node->rb_right; while ((left = node->rb_left) != NULL) @@ -289,9 +290,9 @@ /* * This function returns the first node (in sort order) of the tree. 
*/ -struct rb_node *rb_first(struct rb_root *root) +struct fio_rb_node *rb_first(struct rb_root *root) { - struct rb_node *n; + struct fio_rb_node *n; n = root->rb_node; if (!n) @@ -301,9 +302,9 @@ return n; } -struct rb_node *rb_next(const struct rb_node *node) +struct fio_rb_node *rb_next(const struct fio_rb_node *node) { - struct rb_node *parent; + struct fio_rb_node *parent; if (RB_EMPTY_NODE(node)) return NULL; @@ -316,7 +317,7 @@ node = node->rb_right; while (node->rb_left) node=node->rb_left; - return (struct rb_node *)node; + return (struct fio_rb_node *)node; } /* diff -Nru fio-2.1.3/lib/rbtree.h fio-3.16/lib/rbtree.h --- fio-2.1.3/lib/rbtree.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/rbtree.h 2019-09-20 01:01:52.000000000 +0000 @@ -14,7 +14,7 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
linux/include/linux/rbtree.h @@ -34,7 +34,7 @@ static inline struct page * rb_search_page_cache(struct inode * inode, unsigned long offset) { - struct rb_node * n = inode->i_rb_page_cache.rb_node; + struct fio_rb_node * n = inode->i_rb_page_cache.rb_node; struct page * page; while (n) @@ -53,10 +53,10 @@ static inline struct page * __rb_insert_page_cache(struct inode * inode, unsigned long offset, - struct rb_node * node) + struct fio_rb_node * node) { - struct rb_node ** p = &inode->i_rb_page_cache.rb_node; - struct rb_node * parent = NULL; + struct fio_rb_node ** p = &inode->i_rb_page_cache.rb_node; + struct fio_rb_node * parent = NULL; struct page * page; while (*p) @@ -79,7 +79,7 @@ static inline struct page * rb_insert_page_cache(struct inode * inode, unsigned long offset, - struct rb_node * node) + struct fio_rb_node * node) { struct page * ret; if ((ret = __rb_insert_page_cache(inode, offset, node))) @@ -97,34 +97,34 @@ #include #include -struct rb_node +struct fio_rb_node { intptr_t rb_parent_color; #define RB_RED 0 #define RB_BLACK 1 - struct rb_node *rb_right; - struct rb_node *rb_left; + struct fio_rb_node *rb_right; + struct fio_rb_node *rb_left; } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ struct rb_root { - struct rb_node *rb_node; + struct fio_rb_node *rb_node; }; -#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) +#define rb_parent(r) ((struct fio_rb_node *)((r)->rb_parent_color & ~3)) #define rb_color(r) ((r)->rb_parent_color & 1) #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) rb_color(r) #define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) #define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) -static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +static inline void rb_set_parent(struct fio_rb_node *rb, struct fio_rb_node *p) { rb->rb_parent_color = (rb->rb_parent_color & 3) | (uintptr_t)p; } -static 
inline void rb_set_color(struct rb_node *rb, int color) +static inline void rb_set_color(struct fio_rb_node *rb, int color) { rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; } @@ -136,15 +136,16 @@ #define RB_EMPTY_NODE(node) (rb_parent(node) == node) #define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) -extern void rb_insert_color(struct rb_node *, struct rb_root *); -extern void rb_erase(struct rb_node *, struct rb_root *); +extern void rb_insert_color(struct fio_rb_node *, struct rb_root *); +extern void rb_erase(struct fio_rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ -extern struct rb_node *rb_first(struct rb_root *); -extern struct rb_node *rb_next(const struct rb_node *); +extern struct fio_rb_node *rb_first(struct rb_root *); +extern struct fio_rb_node *rb_next(const struct fio_rb_node *); -static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, - struct rb_node ** rb_link) +static inline void rb_link_node(struct fio_rb_node * node, + struct fio_rb_node * parent, + struct fio_rb_node ** rb_link) { node->rb_parent_color = (uintptr_t)parent; node->rb_left = node->rb_right = NULL; diff -Nru fio-2.1.3/lib/seqlock.h fio-3.16/lib/seqlock.h --- fio-2.1.3/lib/seqlock.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/seqlock.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,49 @@ +#ifndef FIO_SEQLOCK_H +#define FIO_SEQLOCK_H + +#include "types.h" +#include "../arch/arch.h" + +struct seqlock { + volatile int sequence; +}; + +static inline void seqlock_init(struct seqlock *s) +{ + s->sequence = 0; +} + +static inline unsigned int read_seqlock_begin(struct seqlock *s) +{ + unsigned int seq; + + do { + seq = s->sequence; + if (!(seq & 1)) + break; + nop; + } while (1); + + read_barrier(); + return seq; +} + +static inline bool read_seqlock_retry(struct seqlock *s, unsigned int seq) +{ + read_barrier(); + return s->sequence != seq; +} + +static inline void write_seqlock_begin(struct seqlock *s) 
+{ + s->sequence++; + write_barrier(); +} + +static inline void write_seqlock_end(struct seqlock *s) +{ + write_barrier(); + s->sequence++; +} + +#endif diff -Nru fio-2.1.3/lib/strcasestr.c fio-3.16/lib/strcasestr.c --- fio-2.1.3/lib/strcasestr.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/strcasestr.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,25 +0,0 @@ -#include -#include - -char *strcasestr(const char *s1, const char *s2) -{ - const char *s = s1; - const char *p = s2; - - do { - if (!*p) - return (char *) s1; - if ((*p == *s) || - (tolower(*p) == tolower(*s))) { - ++p; - ++s; - } else { - p = s2; - if (!*s) - return NULL; - s = ++s1; - } - } while (1); - - return *p ? NULL : (char *) s1; -} diff -Nru fio-2.1.3/lib/strcasestr.h fio-3.16/lib/strcasestr.h --- fio-2.1.3/lib/strcasestr.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/strcasestr.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,13 +0,0 @@ -#ifdef CONFIG_STRCASESTR - -#include - -#else - -#ifndef FIO_STRCASESTR_H -#define FIO_STRCASESTR_H - -char *strcasestr(const char *haystack, const char *needle); - -#endif -#endif diff -Nru fio-2.1.3/lib/strntol.c fio-3.16/lib/strntol.c --- fio-2.1.3/lib/strntol.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/strntol.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,33 @@ +#include +#include +#include + +#include "strntol.h" + +long strntol(const char *str, size_t sz, char **end, int base) +{ + /* Expect that digit representation of LONG_MAX/MIN + * not greater than this buffer */ + char buf[24]; + long ret; + const char *beg = str; + + /* Catch up leading spaces */ + for (; beg && sz && *beg == ' '; beg++, sz--) + ; + + if (!sz || sz >= sizeof(buf)) { + if (end) + *end = (char *)str; + return 0; + } + + memcpy(buf, beg, sz); + buf[sz] = '\0'; + ret = strtol(buf, end, base); + if (ret == LONG_MIN || ret == LONG_MAX) + return ret; + if (end) + *end = (char *)beg + (*end - buf); + return ret; +} diff -Nru fio-2.1.3/lib/strntol.h fio-3.16/lib/strntol.h 
--- fio-2.1.3/lib/strntol.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/strntol.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,8 @@ +#ifndef FIO_STRNTOL_H +#define FIO_STRNTOL_H + +#include + +long strntol(const char *str, size_t sz, char **end, int base); + +#endif diff -Nru fio-2.1.3/lib/strsep.c fio-3.16/lib/strsep.c --- fio-2.1.3/lib/strsep.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/strsep.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,29 +0,0 @@ -#include - -char *strsep(char **stringp, const char *delim) -{ - char *s, *tok; - const char *spanp; - int c, sc; - - s = *stringp; - if (!s) - return NULL; - - tok = s; - do { - c = *s++; - spanp = delim; - do { - sc = *spanp++; - if (sc == c) { - if (c == 0) - s = NULL; - else - s[-1] = 0; - *stringp = s; - return tok; - } - } while (sc != 0); - } while (1); -} diff -Nru fio-2.1.3/lib/strsep.h fio-3.16/lib/strsep.h --- fio-2.1.3/lib/strsep.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/strsep.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,6 +0,0 @@ -#ifndef FIO_STRSEP_LIB_H -#define FIO_STRSEP_LIB_H - -char *strsep(char **, const char *); - -#endif diff -Nru fio-2.1.3/lib/types.h fio-3.16/lib/types.h --- fio-2.1.3/lib/types.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/lib/types.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,20 @@ +#ifndef FIO_TYPES_H +#define FIO_TYPES_H + +#if !defined(CONFIG_HAVE_BOOL) && !defined(__cplusplus) +typedef int bool; +#ifndef false +#define false 0 +#endif +#ifndef true +#define true 1 +#endif +#else +#include /* IWYU pragma: export */ +#endif + +#if !defined(CONFIG_HAVE_KERNEL_RWF_T) +typedef int __kernel_rwf_t; +#endif + +#endif diff -Nru fio-2.1.3/lib/zipf.c fio-3.16/lib/zipf.c --- fio-2.1.3/lib/zipf.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/zipf.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,21 +1,14 @@ #include #include -#include -#include -#include -#include -#include -#include "ieee754.h" -#include "../log.h" #include "zipf.h" #include 
"../minmax.h" #include "../hash.h" -#define ZIPF_MAX_GEN 10000000 +#define ZIPF_MAX_GEN 10000000UL static void zipf_update(struct zipf_state *zs) { - unsigned long to_gen; + uint64_t to_gen; unsigned int i; /* @@ -23,23 +16,23 @@ * 10M max, that should be doable in 1-2s on even slow machines. * Precision will take a slight hit, but nothing major. */ - to_gen = min(zs->nranges, ZIPF_MAX_GEN); + to_gen = min(zs->nranges, (uint64_t) ZIPF_MAX_GEN); for (i = 0; i < to_gen; i++) zs->zetan += pow(1.0 / (double) (i + 1), zs->theta); } -static void shared_rand_init(struct zipf_state *zs, unsigned long nranges, +static void shared_rand_init(struct zipf_state *zs, uint64_t nranges, unsigned int seed) { memset(zs, 0, sizeof(*zs)); zs->nranges = nranges; - init_rand_seed(&zs->rand, seed); + init_rand_seed(&zs->rand, seed, 0); zs->rand_off = __rand(&zs->rand); } -void zipf_init(struct zipf_state *zs, unsigned long nranges, double theta, +void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta, unsigned int seed) { shared_rand_init(zs, nranges, seed); @@ -50,7 +43,7 @@ zipf_update(zs); } -unsigned long long zipf_next(struct zipf_state *zs) +uint64_t zipf_next(struct zipf_state *zs) { double alpha, eta, rand_uni, rand_z; unsigned long long n = zs->nranges; @@ -59,7 +52,7 @@ alpha = 1.0 / (1.0 - zs->theta); eta = (1.0 - pow(2.0 / n, 1.0 - zs->theta)) / (1.0 - zs->zeta2 / zs->zetan); - rand_uni = (double) __rand(&zs->rand) / (double) FRAND_MAX; + rand_uni = (double) __rand(&zs->rand) / (double) FRAND32_MAX; rand_z = rand_uni * zs->zetan; if (rand_z < 1.0) @@ -69,20 +62,35 @@ else val = 1 + (unsigned long long)(n * pow(eta*rand_uni - eta + 1.0, alpha)); - return (__hash_u64(val - 1) + zs->rand_off) % zs->nranges; + val--; + + if (!zs->disable_hash) + val = __hash_u64(val); + + return (val + zs->rand_off) % zs->nranges; } -void pareto_init(struct zipf_state *zs, unsigned long nranges, double h, +void pareto_init(struct zipf_state *zs, uint64_t nranges, double h, unsigned 
int seed) { shared_rand_init(zs, nranges, seed); zs->pareto_pow = log(h) / log(1.0 - h); } -unsigned long long pareto_next(struct zipf_state *zs) +uint64_t pareto_next(struct zipf_state *zs) { - double rand = (double) __rand(&zs->rand) / (double) FRAND_MAX; - unsigned long long n = zs->nranges - 1; + double rand = (double) __rand(&zs->rand) / (double) FRAND32_MAX; + unsigned long long n; + + n = (zs->nranges - 1) * pow(rand, zs->pareto_pow); + + if (!zs->disable_hash) + n = __hash_u64(n); - return (__hash_u64(n * pow(rand, zs->pareto_pow)) + zs->rand_off) % zs->nranges; + return (n + zs->rand_off) % zs->nranges; +} + +void zipf_disable_hash(struct zipf_state *zs) +{ + zs->disable_hash = true; } diff -Nru fio-2.1.3/lib/zipf.h fio-3.16/lib/zipf.h --- fio-2.1.3/lib/zipf.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/lib/zipf.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,6 +3,7 @@ #include #include "rand.h" +#include "types.h" struct zipf_state { uint64_t nranges; @@ -12,12 +13,14 @@ double pareto_pow; struct frand_state rand; uint64_t rand_off; + bool disable_hash; }; -void zipf_init(struct zipf_state *zs, unsigned long nranges, double theta, unsigned int seed); -unsigned long long zipf_next(struct zipf_state *zs); +void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta, unsigned int seed); +uint64_t zipf_next(struct zipf_state *zs); -void pareto_init(struct zipf_state *zs, unsigned long nranges, double h, unsigned int seed); -unsigned long long pareto_next(struct zipf_state *zs); +void pareto_init(struct zipf_state *zs, uint64_t nranges, double h, unsigned int seed); +uint64_t pareto_next(struct zipf_state *zs); +void zipf_disable_hash(struct zipf_state *zs); #endif diff -Nru fio-2.1.3/libfio.c fio-3.16/libfio.c --- fio-2.1.3/libfio.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/libfio.c 2019-09-20 01:01:52.000000000 +0000 @@ -18,32 +18,31 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, 
write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * */ #include -#include #include #include #include +#include #include "fio.h" #include "smalloc.h" #include "os/os.h" +#include "filelock.h" +#include "helper_thread.h" +#include "filehash.h" -/* - * Just expose an empty list, if the OS does not support disk util stats - */ -#ifndef FIO_HAVE_DISK_UTIL FLIST_HEAD(disk_list); -#endif unsigned long arch_flags = 0; uintptr_t page_mask = 0; uintptr_t page_size = 0; +/* see os/os.h */ static const char *fio_os_strings[os_nr] = { "Invalid", "Linux", @@ -52,10 +51,14 @@ "HP-UX", "OSX", "NetBSD", + "OpenBSD", "Solaris", - "Windows" + "Windows", + "Android", + "DragonFly", }; +/* see arch/arch.h */ static const char *fio_arch_strings[arch_nr] = { "Invalid", "x86-64", @@ -69,25 +72,33 @@ "arm", "sh", "hppa", + "mips", + "aarch64", "generic" }; -static void reset_io_counters(struct thread_data *td) +static void reset_io_counters(struct thread_data *td, int all) { int ddir; - for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { - td->stat_io_bytes[ddir] = 0; - td->this_io_bytes[ddir] = 0; - td->stat_io_blocks[ddir] = 0; - td->this_io_blocks[ddir] = 0; - td->rate_bytes[ddir] = 0; - td->rate_blocks[ddir] = 0; - td->io_issues[ddir] = 0; + if (all) { + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + td->stat_io_bytes[ddir] = 0; + td->this_io_bytes[ddir] = 0; + td->stat_io_blocks[ddir] = 0; + td->this_io_blocks[ddir] = 0; + td->rate_bytes[ddir] = 0; + td->rate_blocks[ddir] = 0; + td->bytes_done[ddir] = 0; + td->rate_io_issue_bytes[ddir] = 0; + td->rate_next_io_time[ddir] = 0; + td->last_usec[ddir] = 0; + } } + td->zone_bytes = 0; - td->last_was_sync = 0; + td->last_was_sync = false; td->rwmix_issues = 0; /* @@ -97,29 +108,31 @@ td->nr_done_files = 0; } -void clear_io_state(struct thread_data *td) +void clear_io_state(struct thread_data *td, int all) { 
struct fio_file *f; unsigned int i; - reset_io_counters(td); + reset_io_counters(td, all); close_files(td); - for_each_file(td, f, i) + for_each_file(td, f, i) { fio_file_clear_done(f); + f->file_offset = get_start_offset(td, f); + } /* - * Set the same seed to get repeatable runs + * Re-Seed random number generator if rand_repeatable is true */ - td_fill_rand_seeds(td); + if (td->o.rand_repeatable) + td_fill_rand_seeds(td); } void reset_all_stats(struct thread_data *td) { - struct timeval tv; int i; - reset_io_counters(td); + reset_io_counters(td, 1); for (i = 0; i < DDIR_RWDIR_CNT; i++) { td->io_bytes[i] = 0; @@ -130,9 +143,15 @@ td->rwmix_issues = 0; } - fio_gettime(&tv, NULL); - memcpy(&td->epoch, &tv, sizeof(tv)); - memcpy(&td->start, &tv, sizeof(tv)); + set_epoch_time(td, td->o.log_unix_epoch); + memcpy(&td->start, &td->epoch, sizeof(td->epoch)); + memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch)); + memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch)); + + lat_target_reset(td); + clear_rusage_stat(td); + helper_reset(); } void reset_fio_state(void) @@ -159,17 +178,62 @@ return NULL; } +static const char *td_runstates[] = { + "NOT_CREATED", + "CREATED", + "INITIALIZED", + "RAMP", + "SETTING_UP", + "RUNNING", + "PRE_READING", + "VERIFYING", + "FSYNCING", + "FINISHING", + "EXITED", + "REAPED", +}; + +const char *runstate_to_name(int runstate) +{ + compiletime_assert(TD_LAST == 12, "td runstate list"); + if (runstate >= 0 && runstate < TD_LAST) + return td_runstates[runstate]; + + return "invalid"; +} + void td_set_runstate(struct thread_data *td, int runstate) { if (td->runstate == runstate) return; - dprint(FD_PROCESS, "pid=%d: runstate %d -> %d\n", (int) td->pid, - td->runstate, runstate); + dprint(FD_PROCESS, "pid=%d: runstate %s -> %s\n", (int) td->pid, + runstate_to_name(td->runstate), + runstate_to_name(runstate)); td->runstate = runstate; } -void fio_terminate_threads(int 
group_id) +int td_bump_runstate(struct thread_data *td, int new_state) +{ + int old_state = td->runstate; + + td_set_runstate(td, new_state); + return old_state; +} + +void td_restore_runstate(struct thread_data *td, int old_state) +{ + td_set_runstate(td, old_state); +} + +void fio_mark_td_terminate(struct thread_data *td) +{ + fio_gettime(&td->terminate_time, NULL); + write_barrier(); + td->terminate = true; +} + +void fio_terminate_threads(unsigned int group_id) { struct thread_data *td; pid_t pid = getpid(); @@ -178,10 +242,14 @@ dprint(FD_PROCESS, "terminate group_id=%d\n", group_id); for_each_td(td, i) { - if (group_id == TERMINATE_ALL || groupid == td->groupid) { + if (group_id == TERMINATE_ALL || group_id == td->groupid) { dprint(FD_PROCESS, "setting terminate on %s/%d\n", td->o.name, (int) td->pid); - td->terminate = 1; + + if (td->terminate) + continue; + + fio_mark_td_terminate(td); td->o.start_delay = 0; /* @@ -201,6 +269,50 @@ } } +int fio_running_or_pending_io_threads(void) +{ + struct thread_data *td; + int i; + int nr_io_threads = 0; + + for_each_td(td, i) { + if (td->io_ops_init && td_ioengine_flagged(td, FIO_NOIO)) + continue; + nr_io_threads++; + if (td->runstate < TD_EXITED) + return 1; + } + + if (!nr_io_threads) + return -1; /* we only had cpuio threads to begin with */ + return 0; +} + +int fio_set_fd_nonblocking(int fd, const char *who) +{ + int flags; + + flags = fcntl(fd, F_GETFL); + if (flags < 0) + log_err("fio: %s failed to get file flags: %s\n", who, strerror(errno)); + else { + int new_flags = flags | O_NONBLOCK; + + new_flags = fcntl(fd, F_SETFL, new_flags); + if (new_flags < 0) + log_err("fio: %s failed to get file flags: %s\n", who, strerror(errno)); + } + + return flags; +} + +enum { + ENDIAN_INVALID_BE = 1, + ENDIAN_INVALID_LE, + ENDIAN_INVALID_CONFIG, + ENDIAN_BROKEN, +}; + static int endian_check(void) { union { @@ -217,16 +329,16 @@ #if defined(CONFIG_LITTLE_ENDIAN) if (be) - return 1; + return ENDIAN_INVALID_BE; #elif 
defined(CONFIG_BIG_ENDIAN) if (le) - return 1; + return ENDIAN_INVALID_LE; #else - return 1; + return ENDIAN_INVALID_CONFIG; #endif if (!le && !be) - return 1; + return ENDIAN_BROKEN; return 0; } @@ -234,9 +346,48 @@ int initialize_fio(char *envp[]) { long ps; + int err; - if (endian_check()) { + /* + * We need these to be properly 64-bit aligned, otherwise we + * can run into problems on archs that fault on unaligned fp + * access (ARM). + */ + compiletime_assert((offsetof(struct thread_data, ts) % sizeof(void *)) == 0, "ts"); + compiletime_assert((offsetof(struct thread_stat, percentile_list) % 8) == 0, "stat percentile_list"); + compiletime_assert((offsetof(struct thread_stat, total_run_time) % 8) == 0, "total_run_time"); + compiletime_assert((offsetof(struct thread_stat, total_err_count) % 8) == 0, "total_err_count"); + compiletime_assert((offsetof(struct thread_stat, latency_percentile) % 8) == 0, "stat latency_percentile"); + compiletime_assert((offsetof(struct thread_data, ts.clat_stat) % 8) == 0, "ts.clat_stat"); + compiletime_assert((offsetof(struct thread_options_pack, zipf_theta) % 8) == 0, "zipf_theta"); + compiletime_assert((offsetof(struct thread_options_pack, pareto_h) % 8) == 0, "pareto_h"); + compiletime_assert((offsetof(struct thread_options_pack, percentile_list) % 8) == 0, "percentile_list"); + compiletime_assert((offsetof(struct thread_options_pack, latency_percentile) % 8) == 0, "latency_percentile"); + compiletime_assert((offsetof(struct jobs_eta, m_rate) % 8) == 0, "m_rate"); + + compiletime_assert(__TD_F_LAST <= TD_ENG_FLAG_SHIFT, "TD_ENG_FLAG_SHIFT"); + compiletime_assert(BSSPLIT_MAX <= ZONESPLIT_MAX, "bsssplit/zone max"); + + err = endian_check(); + if (err) { log_err("fio: endianness settings appear wrong.\n"); + switch (err) { + case ENDIAN_INVALID_BE: + log_err("fio: got big-endian when configured for little\n"); + break; + case ENDIAN_INVALID_LE: + log_err("fio: got little-endian when configured for big\n"); + break; + case 
ENDIAN_INVALID_CONFIG: + log_err("fio: not configured to any endianness\n"); + break; + case ENDIAN_BROKEN: + log_err("fio: failed to detect endianness\n"); + break; + default: + assert(0); + break; + } log_err("fio: please report this to fio@vger.kernel.org\n"); return 1; } @@ -249,6 +400,13 @@ sinit(); + if (fio_filelock_init()) { + log_err("fio: failed initializing filelock subsys\n"); + return 1; + } + + file_hash_init(); + /* * We need locale for number printing, if it isn't set then just * go with the US format. @@ -268,3 +426,8 @@ fio_keywords_init(); return 0; } + +void deinitialize_fio(void) +{ + fio_keywords_exit(); +} diff -Nru fio-2.1.3/LICENSE fio-3.16/LICENSE --- fio-2.1.3/LICENSE 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/LICENSE 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -As specified by the COPYING file, fio is free software published under version -2 of the GPL license. That covers the copying part of the license. By using fio, -you are also promising to uphold the following moral obligations: - -- If you publish results that are done using fio, it must be clearly stated - that fio was used. The specific version should also be listed. - -- If you develop features or bug fixes for fio, they should be sent upstream - for inclusion into the main repository. This isn't specific to fio, that - is a general rule for any open source project. It's just the Right Thing - to do. Plus it means that you don't have to maintain the feature or change - internally. In the long run, this is saving you a lot of time. - -I would consider the above to fall under "common courtesy", but since -people tend to have differing opinions of that, it doesn't hurt to spell out -my expectations clearly. 
- diff -Nru fio-2.1.3/log.c fio-3.16/log.c --- fio-2.1.3/log.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/log.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,76 +1,97 @@ #include -#include #include #include #include #include "fio.h" +#include "oslib/asprintf.h" -int log_valist(const char *str, va_list args) +size_t log_info_buf(const char *buf, size_t len) { - char buffer[1024]; - size_t len; - - len = vsnprintf(buffer, sizeof(buffer), str, args); - len = min(len, sizeof(buffer) - 1); + /* + * buf could be NULL (not just ""). + */ + if (!buf) + return 0; - if (is_backend) - len = fio_server_text_output(FIO_LOG_INFO, buffer, len); - if (log_syslog) - syslog(LOG_INFO, "%s", buffer); - else - len = fwrite(buffer, len, 1, f_out); + if (is_backend) { + ssize_t ret = fio_server_text_output(FIO_LOG_INFO, buf, len); + if (ret != -1) + return ret; + } - return len; + if (log_syslog) { + syslog(LOG_INFO, "%s", buf); + return len; + } else + return fwrite(buf, len, 1, f_out); } -int log_local_buf(const char *buf, size_t len) +size_t log_valist(const char *fmt, va_list args) { - if (log_syslog) - syslog(LOG_INFO, "%s", buf); - else - len = fwrite(buf, len, 1, f_out); + char *buffer; + int len; + + len = vasprintf(&buffer, fmt, args); + if (len < 0) + return 0; + len = log_info_buf(buffer, len); + free(buffer); return len; } -int log_local(const char *format, ...) +/* add prefix for the specified type in front of the valist */ +void log_prevalist(int type, const char *fmt, va_list args) +{ + char *buf1, *buf2; + int len; + pid_t pid; + + pid = gettid(); + if (fio_debug_jobp && *fio_debug_jobp != -1U + && pid != *fio_debug_jobp) + return; + + len = vasprintf(&buf1, fmt, args); + if (len < 0) + return; + len = asprintf(&buf2, "%-8s %-5u %s", debug_levels[type].name, + (int) pid, buf1); + free(buf1); + if (len < 0) + return; + len = log_info_buf(buf2, len); + free(buf2); +} + +ssize_t log_info(const char *format, ...) 
{ - char buffer[1024]; va_list args; - size_t len; + ssize_t ret; va_start(args, format); - len = vsnprintf(buffer, sizeof(buffer), format, args); + ret = log_valist(format, args); va_end(args); - len = min(len, sizeof(buffer) - 1); - if (log_syslog) - syslog(LOG_INFO, "%s", buffer); - else - len = fwrite(buffer, len, 1, f_out); - - return len; + return ret; } -int log_info(const char *format, ...) +size_t __log_buf(struct buf_output *buf, const char *format, ...) { - char buffer[1024]; + char *buffer; va_list args; - size_t len; + int len; va_start(args, format); - len = vsnprintf(buffer, sizeof(buffer), format, args); + len = vasprintf(&buffer, format, args); va_end(args); - len = min(len, sizeof(buffer) - 1); + if (len < 0) + return 0; + len = buf_output_add(buf, buffer, len); + free(buffer); - if (is_backend) - return fio_server_text_output(FIO_LOG_INFO, buffer, len); - else if (log_syslog) { - syslog(LOG_INFO, "%s", buffer); - return len; - } else - return fwrite(buffer, len, 1, f_out); + return len; } int log_info_flush(void) @@ -81,31 +102,38 @@ return fflush(f_out); } -int log_err(const char *format, ...) +ssize_t log_err(const char *format, ...) 
{ - char buffer[1024]; + ssize_t ret; + int len; + char *buffer; va_list args; - size_t len; va_start(args, format); - len = vsnprintf(buffer, sizeof(buffer), format, args); + len = vasprintf(&buffer, format, args); va_end(args); - len = min(len, sizeof(buffer) - 1); + if (len < 0) + return len; + + if (is_backend) { + ret = fio_server_text_output(FIO_LOG_ERR, buffer, len); + if (ret != -1) + goto done; + } - if (is_backend) - return fio_server_text_output(FIO_LOG_ERR, buffer, len); - else if (log_syslog) { + if (log_syslog) { syslog(LOG_INFO, "%s", buffer); - return len; + ret = len; } else { - if (f_err != stderr) { - int fio_unused ret; - + if (f_err != stderr) ret = fwrite(buffer, len, 1, stderr); - } - return fwrite(buffer, len, 1, f_err); + ret = fwrite(buffer, len, 1, f_err); } + +done: + free(buffer); + return ret; } const char *log_get_level(int level) diff -Nru fio-2.1.3/log.h fio-3.16/log.h --- fio-2.1.3/log.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/log.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,16 +3,31 @@ #include #include +#include + +#include "lib/output_buffer.h" extern FILE *f_out; extern FILE *f_err; -extern int log_err(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2))); -extern int log_info(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2))); -extern int log_valist(const char *str, va_list); -extern int log_local_buf(const char *buf, size_t); +extern ssize_t log_err(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2))); +extern ssize_t log_info(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2))); +extern size_t __log_buf(struct buf_output *, const char *format, ...) 
__attribute__ ((__format__ (__printf__, 2, 3))); +extern size_t log_valist(const char *str, va_list); +extern void log_prevalist(int type, const char *str, va_list); +extern size_t log_info_buf(const char *buf, size_t len); extern int log_info_flush(void); +#define log_buf(buf, format, args...) \ +({ \ + size_t __ret; \ + if ((buf) != NULL) \ + __ret = __log_buf(buf, format, ##args); \ + else \ + __ret = log_info(format, ##args); \ + __ret; \ +}) + enum { FIO_LOG_DEBUG = 1, FIO_LOG_INFO = 2, diff -Nru fio-2.1.3/Makefile fio-3.16/Makefile --- fio-2.1.3/Makefile 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/Makefile 2019-09-20 01:01:52.000000000 +0000 @@ -1,45 +1,76 @@ -ifneq ($(wildcard config-host.mak),) -all: -include config-host.mak -config-host-mak: configure - @echo $@ is out-of-date, running configure - @sed -n "/.*Configured with/s/[^:]*: //p" $@ | sh -else -config-host.mak: -ifneq ($(MAKECMDGOALS),clean) - @echo "Running configure for you..." - @./configure +ifeq ($(SRCDIR),) +SRCDIR := . endif -all: + +VPATH := $(SRCDIR) + +all: fio + +config-host.mak: configure + @if [ ! -e "$@" ]; then \ + echo "Running configure ..."; \ + ./configure; \ + else \ + echo "$@ is out-of-date, running configure"; \ + sed -n "/.*Configured with/s/[^:]*: //p" "$@" | sh; \ + fi + +ifneq ($(MAKECMDGOALS),clean) include config-host.mak endif -DEBUGFLAGS = -D_FORTIFY_SOURCE=2 -DFIO_INC_DEBUG -CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 $(DEBUGFLAGS) -OPTFLAGS= -O3 -g -ffast-math -CFLAGS = -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) +DEBUGFLAGS = -DFIO_INC_DEBUG +CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS) +OPTFLAGS= -g -ffast-math +CFLAGS = -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. 
-I$(SRCDIR) LIBS += -lm $(EXTLIBS) PROGS = fio -SCRIPTS = tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio +SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/fio_jsonplus_clat2csv) + +ifndef CONFIG_FIO_NO_OPT + CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 +endif +ifdef CONFIG_BUILD_NATIVE + CFLAGS += -march=native +endif ifdef CONFIG_GFIO PROGS += gfio endif -SOURCE := gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \ - eta.c verify.c memory.c io_u.c parse.c mutex.c options.c \ - lib/rbtree.c smalloc.c filehash.c profile.c debug.c lib/rand.c \ - lib/num2str.c lib/ieee754.c $(wildcard crc/*.c) engines/cpu.c \ +SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \ + $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c))) \ + gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \ + eta.c verify.c memory.c io_u.c parse.c fio_sem.c rwlock.c \ + pshared.c options.c \ + smalloc.c filehash.c profile.c debug.c engines/cpu.c \ engines/mmap.c engines/sync.c engines/null.c engines/net.c \ - memalign.c server.c client.c iolog.c backend.c libfio.c flow.c \ - cconv.c lib/prio_tree.c json.c lib/zipf.c lib/axmap.c \ - lib/lfsr.c gettime-thread.c helpers.c lib/flist_sort.c \ - lib/hweight.c lib/getrusage.c idletime.c td_error.c \ - profiles/tiobench.c profiles/act.c io_u_queue.c - -ifdef CONFIG_64BIT_LLP64 - CFLAGS += -DBITS_PER_LONG=32 + engines/ftruncate.c engines/filecreate.c \ + server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \ + gettime-thread.c helpers.c json.c idletime.c td_error.c \ + profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \ + workqueue.c rate-submit.c optgroup.c helper_thread.c \ + steadystate.c zone-dist.c + +ifdef CONFIG_LIBHDFS + HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE) + HDFSLIB= -Wl,-rpath 
$(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server -L$(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server $(FIO_LIBHDFS_LIB)/libhdfs.a -ljvm + CFLAGS += $(HDFSFLAGS) + SOURCE += engines/libhdfs.c +endif + +ifdef CONFIG_LIBISCSI + CFLAGS += $(LIBISCSI_CFLAGS) + LIBS += $(LIBISCSI_LIBS) + SOURCE += engines/libiscsi.c +endif + +ifdef CONFIG_LIBNBD + CFLAGS += $(LIBNBD_CFLAGS) + LIBS += $(LIBNBD_LIBS) + SOURCE += engines/nbd.c endif + ifdef CONFIG_64BIT CFLAGS += -DBITS_PER_LONG=64 endif @@ -67,36 +98,78 @@ ifdef CONFIG_GUASI SOURCE += engines/guasi.c endif -ifdef CONFIG_FUSION_AW - SOURCE += engines/fusion-aw.c -endif ifdef CONFIG_SOLARISAIO SOURCE += engines/solarisaio.c endif ifdef CONFIG_WINDOWSAIO SOURCE += engines/windowsaio.c endif +ifdef CONFIG_RADOS + SOURCE += engines/rados.c +endif +ifdef CONFIG_RBD + SOURCE += engines/rbd.c +endif +ifdef CONFIG_HTTP + SOURCE += engines/http.c +endif +SOURCE += oslib/asprintf.c ifndef CONFIG_STRSEP - SOURCE += lib/strsep.c + SOURCE += oslib/strsep.c endif ifndef CONFIG_STRCASESTR - SOURCE += lib/strcasestr.c + SOURCE += oslib/strcasestr.c +endif +ifndef CONFIG_STRLCAT + SOURCE += oslib/strlcat.c +endif +ifndef CONFIG_HAVE_STRNDUP + SOURCE += oslib/strndup.c endif ifndef CONFIG_GETOPT_LONG_ONLY - SOURCE += lib/getopt_long.c + SOURCE += oslib/getopt_long.c endif ifndef CONFIG_INET_ATON - SOURCE += lib/inet_aton.c + SOURCE += oslib/inet_aton.c +endif +ifdef CONFIG_GFAPI + SOURCE += engines/glusterfs.c + SOURCE += engines/glusterfs_sync.c + SOURCE += engines/glusterfs_async.c + ifdef CONFIG_GF_FADVISE + CFLAGS += "-DGFAPI_USE_FADVISE" + endif +endif +ifdef CONFIG_MTD + SOURCE += engines/mtd.c + SOURCE += oslib/libmtd.c + SOURCE += oslib/libmtd_legacy.c +endif +ifdef CONFIG_PMEMBLK + SOURCE += engines/pmemblk.c +endif +ifdef CONFIG_LINUX_DEVDAX + SOURCE += engines/dev-dax.c +endif +ifdef CONFIG_LIBPMEM + SOURCE += engines/libpmem.c +endif +ifdef CONFIG_IME + SOURCE += engines/ime.c +endif +ifdef CONFIG_LINUX_BLKZONED + SOURCE += zbd.c endif ifeq 
($(CONFIG_TARGET_OS), Linux) SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \ - engines/binject.c + oslib/linux-dev-lookup.c engines/io_uring.c LIBS += -lpthread -ldl LDFLAGS += -rdynamic endif ifeq ($(CONFIG_TARGET_OS), Android) - SOURCE += diskutil.c fifo.c blktrace.c trim.c profiles/tiobench.c + SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \ + oslib/linux-dev-lookup.c LIBS += -ldl LDFLAGS += -rdynamic endif @@ -105,13 +178,23 @@ CPPFLAGS += -D__EXTENSIONS__ endif ifeq ($(CONFIG_TARGET_OS), FreeBSD) + SOURCE += trim.c LIBS += -lpthread -lrt LDFLAGS += -rdynamic endif +ifeq ($(CONFIG_TARGET_OS), OpenBSD) + LIBS += -lpthread + LDFLAGS += -rdynamic +endif ifeq ($(CONFIG_TARGET_OS), NetBSD) LIBS += -lpthread -lrt LDFLAGS += -rdynamic endif +ifeq ($(CONFIG_TARGET_OS), DragonFly) + SOURCE += trim.c + LIBS += -lpthread -lrt + LDFLAGS += -rdynamic +endif ifeq ($(CONFIG_TARGET_OS), AIX) LIBS += -lpthread -ldl -lrt CPPFLAGS += -D_LARGE_FILES -D__ppc__ @@ -125,22 +208,28 @@ LIBS += -lpthread -ldl endif ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS))) - SOURCE := $(filter-out engines/mmap.c,$(SOURCE)) SOURCE += os/windows/posix.c LIBS += -lpthread -lpsapi -lws2_32 CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format endif -OBJS = $(SOURCE:.c=.o) +OBJS := $(SOURCE:.c=.o) FIO_OBJS = $(OBJS) fio.o + GFIO_OBJS = $(OBJS) gfio.o graph.o tickmarks.o ghelpers.o goptions.o gerror.o \ gclient.o gcompat.o cairo_text_helpers.o printing.o +ifdef CONFIG_ARITHMETIC +FIO_OBJS += lex.yy.o y.tab.o +GFIO_OBJS += lex.yy.o y.tab.o +endif + -include $(OBJS:.o=.d) T_SMALLOC_OBJS = t/stest.o -T_SMALLOC_OBJS += gettime.o mutex.o smalloc.o t/log.o +T_SMALLOC_OBJS += gettime.o fio_sem.o pshared.o smalloc.o t/log.o t/debug.o \ + t/arch.o T_SMALLOC_PROGS = t/stest T_IEEE_OBJS = t/ieee754.o @@ -148,34 +237,109 @@ T_IEEE_PROGS = t/ieee754 T_ZIPF_OBS = t/genzipf.o -T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/zipf.o t/genzipf.o 
-T_ZIPF_PROGS = t/genzipf +T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/pattern.o lib/zipf.o \ + lib/strntol.o lib/gauss.o t/genzipf.o oslib/strcasestr.o \ + oslib/strndup.o +T_ZIPF_PROGS = t/fio-genzipf T_AXMAP_OBJS = t/axmap.o T_AXMAP_OBJS += lib/lfsr.o lib/axmap.o T_AXMAP_PROGS = t/axmap T_LFSR_TEST_OBJS = t/lfsr-test.o -T_LFSR_TEST_OBJS += lib/lfsr.o +T_LFSR_TEST_OBJS += lib/lfsr.o gettime.o fio_sem.o pshared.o \ + t/log.o t/debug.o t/arch.o T_LFSR_TEST_PROGS = t/lfsr-test +T_GEN_RAND_OBJS = t/gen-rand.o +T_GEN_RAND_OBJS += t/log.o t/debug.o lib/rand.o lib/pattern.o lib/strntol.o \ + oslib/strcasestr.o oslib/strndup.o +T_GEN_RAND_PROGS = t/gen-rand + +ifeq ($(CONFIG_TARGET_OS), Linux) +T_BTRACE_FIO_OBJS = t/btrace2fio.o +T_BTRACE_FIO_OBJS += fifo.o lib/flist_sort.o t/log.o oslib/linux-dev-lookup.o +T_BTRACE_FIO_PROGS = t/fio-btrace2fio +endif + +T_DEDUPE_OBJS = t/dedupe.o +T_DEDUPE_OBJS += lib/rbtree.o t/log.o fio_sem.o pshared.o smalloc.o gettime.o \ + crc/md5.o lib/memalign.o lib/bloom.o t/debug.o crc/xxhash.o \ + t/arch.o crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o \ + crc/crc32c-arm64.o crc/fnv.o +T_DEDUPE_PROGS = t/fio-dedupe + +T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o t/debug.o +T_VS_PROGS = t/fio-verify-state + +T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o +T_PIPE_ASYNC_PROGS = t/read-to-pipe-async + +T_IOU_RING_OBJS = t/io_uring.o +T_IOU_RING_PROGS = t/io_uring + +T_MEMLOCK_OBJS = t/memlock.o +T_MEMLOCK_PROGS = t/memlock + +T_TT_OBJS = t/time-test.o +T_TT_PROGS = t/time-test + T_OBJS = $(T_SMALLOC_OBJS) T_OBJS += $(T_IEEE_OBJS) T_OBJS += $(T_ZIPF_OBJS) T_OBJS += $(T_AXMAP_OBJS) T_OBJS += $(T_LFSR_TEST_OBJS) +T_OBJS += $(T_GEN_RAND_OBJS) +T_OBJS += $(T_BTRACE_FIO_OBJS) +T_OBJS += $(T_DEDUPE_OBJS) +T_OBJS += $(T_VS_OBJS) +T_OBJS += $(T_PIPE_ASYNC_OBJS) +T_OBJS += $(T_MEMLOCK_OBJS) +T_OBJS += $(T_TT_OBJS) +T_OBJS += $(T_IOU_RING_OBJS) + +ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS))) + T_DEDUPE_OBJS 
+= os/windows/posix.o lib/hweight.o + T_SMALLOC_OBJS += os/windows/posix.o lib/hweight.o + T_LFSR_TEST_OBJS += os/windows/posix.o lib/hweight.o +endif -T_PROGS = $(T_SMALLOC_PROGS) -T_PROGS += $(T_IEEE_PROGS) +T_TEST_PROGS = $(T_SMALLOC_PROGS) +T_TEST_PROGS += $(T_IEEE_PROGS) T_PROGS += $(T_ZIPF_PROGS) -T_PROGS += $(T_AXMAP_PROGS) -T_PROGS += $(T_LFSR_TEST_PROGS) +T_TEST_PROGS += $(T_AXMAP_PROGS) +T_TEST_PROGS += $(T_LFSR_TEST_PROGS) +T_TEST_PROGS += $(T_GEN_RAND_PROGS) +T_PROGS += $(T_BTRACE_FIO_PROGS) +T_PROGS += $(T_DEDUPE_PROGS) +T_PROGS += $(T_VS_PROGS) + +PROGS += $(T_PROGS) + +ifdef CONFIG_HAVE_CUNIT +UT_OBJS = unittests/unittest.o +UT_OBJS += unittests/lib/memalign.o +UT_OBJS += unittests/lib/strntol.o +UT_OBJS += unittests/oslib/strlcat.o +UT_OBJS += unittests/oslib/strndup.o +UT_TARGET_OBJS = lib/memalign.o +UT_TARGET_OBJS += lib/strntol.o +UT_TARGET_OBJS += oslib/strlcat.o +UT_TARGET_OBJS += oslib/strndup.o +UT_PROGS = unittests/unittest +else +UT_OBJS = +UT_TARGET_OBJS = +UT_PROGS = +endif ifneq ($(findstring $(MAKEFLAGS),s),s) ifndef V QUIET_CC = @echo ' ' CC $@; - QUIET_LINK = @echo ' ' LINK $@; - QUIET_DEP = @echo ' ' DEP $@; + QUIET_LINK = @echo ' ' LINK $@; + QUIET_DEP = @echo ' ' DEP $@; + QUIET_YACC = @echo ' ' YACC $@; + QUIET_LEX = @echo ' ' LEX $@; endif endif @@ -184,7 +348,7 @@ else INSTALL = install endif -prefix = /usr/local +prefix = $(INSTALL_PREFIX) bindir = $(prefix)/bin ifeq ($(CONFIG_TARGET_OS), Darwin) @@ -195,55 +359,112 @@ sharedir = $(prefix)/share/fio endif -all: $(PROGS) $(SCRIPTS) FORCE +all: $(PROGS) $(T_TEST_PROGS) $(UT_PROGS) $(SCRIPTS) FORCE -.PHONY: all install clean +.PHONY: all install clean test .PHONY: FORCE cscope FIO-VERSION-FILE: FORCE - @$(SHELL) ./FIO-VERSION-GEN + @$(SHELL) $(SRCDIR)/FIO-VERSION-GEN -include FIO-VERSION-FILE override CFLAGS += -DFIO_VERSION='"$(FIO_VERSION)"' -.c.o: FORCE FIO-VERSION-FILE +%.o : %.c + @mkdir -p $(dir $@) $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< - @$(CC) -MM $(CFLAGS) 
$(CPPFLAGS) $*.c > $*.d + @$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d @mv -f $*.d $*.d.tmp @sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d - @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -1 | \ +ifeq ($(CONFIG_TARGET_OS), NetBSD) + @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | tr -cs "[:graph:]" "\n" | \ + sed -e 's/^ *//' -e '/^$$/ d' -e 's/$$/:/' >> $*.d +else + @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $*.d +endif @rm -f $*.d.tmp -init.o: FIO-VERSION-FILE init.c - $(QUIET_CC)$(CC) -o init.o $(CFLAGS) $(CPPFLAGS) -c init.c +ifdef CONFIG_ARITHMETIC +lex.yy.c: exp/expression-parser.l +ifdef CONFIG_LEX_USE_O + $(QUIET_LEX)$(LEX) -o $@ $< +else + $(QUIET_LEX)$(LEX) $< +endif + +lex.yy.o: lex.yy.c y.tab.h + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< + +y.tab.o: y.tab.c y.tab.h + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< + +y.tab.c: exp/expression-parser.y + $(QUIET_YACC)$(YACC) -o $@ -l -d -b y $< + +y.tab.h: y.tab.c + +lexer.h: lex.yy.c + +exp/test-expression-parser.o: exp/test-expression-parser.c + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< +exp/test-expression-parser: exp/test-expression-parser.o + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) $< y.tab.o lex.yy.o -o $@ $(LIBS) + +parse.o: lex.yy.o y.tab.o +endif + +init.o: init.c FIO-VERSION-FILE + @mkdir -p $(dir $@) + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< + @$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d + @mv -f $*.d $*.d.tmp + @sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d +ifeq ($(CONFIG_TARGET_OS), NetBSD) + @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | tr -cs "[:graph:]" "\n" | \ + sed -e 's/^ *//' -e '/^$$/ d' -e 's/$$/:/' >> $*.d +else + @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \ + sed -e 's/^ *//' -e 's/$$/:/' >> $*.d +endif + @rm -f $*.d.tmp gcompat.o: gcompat.c gcompat.h - $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c gcompat.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< 
goptions.o: goptions.c goptions.h - $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c goptions.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< ghelpers.o: ghelpers.c ghelpers.h - $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c ghelpers.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< gerror.o: gerror.c gerror.h - $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c gerror.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< gclient.o: gclient.c gclient.h - $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c gclient.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< gfio.o: gfio.c ghelpers.c - $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c gfio.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< graph.o: graph.c graph.h - $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c graph.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< cairo_text_helpers.o: cairo_text_helpers.c cairo_text_helpers.h - $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c cairo_text_helpers.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< printing.o: printing.c printing.h - $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c printing.c + $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $< + +t/io_uring.o: os/linux/io_uring.h +t/io_uring: $(T_IOU_RING_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_IOU_RING_OBJS) $(LIBS) + +t/read-to-pipe-async: $(T_PIPE_ASYNC_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_PIPE_ASYNC_OBJS) $(LIBS) + +t/memlock: $(T_MEMLOCK_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_MEMLOCK_OBJS) $(LIBS) t/stest: $(T_SMALLOC_OBJS) $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_SMALLOC_OBJS) $(LIBS) @@ -252,12 +473,12 @@ $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_IEEE_OBJS) $(LIBS) fio: $(FIO_OBJS) - $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(FIO_OBJS) $(LIBS) + $(QUIET_LINK)$(CC) $(LDFLAGS) 
$(CFLAGS) -o $@ $(FIO_OBJS) $(LIBS) $(HDFSLIB) gfio: $(GFIO_OBJS) - $(QUIET_LINK)$(CC) $(LDFLAGS) -o gfio $(GFIO_OBJS) $(LIBS) $(GTK_LDFLAGS) + $(QUIET_LINK)$(CC) $(filter-out -static, $(LDFLAGS)) -o gfio $(GFIO_OBJS) $(LIBS) $(GFIO_LIBS) $(GTK_LDFLAGS) $(HDFSLIB) -t/genzipf: $(T_ZIPF_OBJS) +t/fio-genzipf: $(T_ZIPF_OBJS) $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_ZIPF_OBJS) $(LIBS) t/axmap: $(T_AXMAP_OBJS) @@ -266,11 +487,34 @@ t/lfsr-test: $(T_LFSR_TEST_OBJS) $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_LFSR_TEST_OBJS) $(LIBS) +t/gen-rand: $(T_GEN_RAND_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_GEN_RAND_OBJS) $(LIBS) + +ifeq ($(CONFIG_TARGET_OS), Linux) +t/fio-btrace2fio: $(T_BTRACE_FIO_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS) +endif + +t/fio-dedupe: $(T_DEDUPE_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS) + +t/fio-verify-state: $(T_VS_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_VS_OBJS) $(LIBS) + +t/time-test: $(T_TT_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_TT_OBJS) $(LIBS) + +ifdef CONFIG_HAVE_CUNIT +unittests/unittest: $(UT_OBJS) $(UT_TARGET_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(UT_OBJS) $(UT_TARGET_OBJS) -lcunit +endif + clean: FORCE - -rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) core.* core gfio FIO-VERSION-FILE *.d config-host.mak config-host.h + @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(UT_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio unittests/unittest FIO-VERSION-FILE *.[do] lib/*.d oslib/*.[do] crc/*.d engines/*.[do] profiles/*.[do] t/*.[do] unittests/*.[do] unittests/*/*.[do] config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h + @rm -rf doc/output distclean: clean FORCE - @rm -f cscope.out + @rm -f cscope.out fio.pdf fio_generate_plots.pdf fio2gnuplot.pdf fiologparser_hist.pdf cscope: @cscope -b -R @@ -278,12 +522,39 @@ 
tools/plot/fio2gnuplot.1: @cat tools/plot/fio2gnuplot.manpage | txt2man -t fio2gnuplot > tools/plot/fio2gnuplot.1 +doc: tools/plot/fio2gnuplot.1 + @man -t ./fio.1 | ps2pdf - fio.pdf + @man -t tools/fio_generate_plots.1 | ps2pdf - fio_generate_plots.pdf + @man -t tools/plot/fio2gnuplot.1 | ps2pdf - fio2gnuplot.pdf + @man -t tools/hist/fiologparser_hist.py.1 | ps2pdf - fiologparser_hist.pdf + +test: fio + ./fio --minimal --thread --exitall_on_error --runtime=1s --name=nulltest --ioengine=null --rw=randrw --iodepth=2 --norandommap --random_generator=tausworthe64 --size=16T --name=verifyfstest --filename=fiotestfile.tmp --unlink=1 --rw=write --verify=crc32c --verify_state_save=0 --size=16K + +fulltest: + sudo modprobe null_blk && \ + if [ ! -e /usr/include/libzbc/zbc.h ]; then \ + git clone https://github.com/hgst/libzbc && \ + (cd libzbc && \ + ./autogen.sh && \ + ./configure --prefix=/usr && \ + make -j && \ + sudo make install) \ + fi && \ + sudo t/zbd/run-tests-against-regular-nullb && \ + if [ -e /sys/module/null_blk/parameters/zoned ]; then \ + sudo t/zbd/run-tests-against-zoned-nullb; \ + fi + install: $(PROGS) $(SCRIPTS) tools/plot/fio2gnuplot.1 FORCE $(INSTALL) -m 755 -d $(DESTDIR)$(bindir) $(INSTALL) $(PROGS) $(SCRIPTS) $(DESTDIR)$(bindir) $(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man1 - $(INSTALL) -m 644 fio.1 $(DESTDIR)$(mandir)/man1 - $(INSTALL) -m 644 tools/fio_generate_plots.1 $(DESTDIR)$(mandir)/man1 - $(INSTALL) -m 644 tools/plot/fio2gnuplot.1 $(DESTDIR)$(mandir)/man1 + $(INSTALL) -m 644 $(SRCDIR)/fio.1 $(DESTDIR)$(mandir)/man1 + $(INSTALL) -m 644 $(SRCDIR)/tools/fio_generate_plots.1 $(DESTDIR)$(mandir)/man1 + $(INSTALL) -m 644 $(SRCDIR)/tools/plot/fio2gnuplot.1 $(DESTDIR)$(mandir)/man1 + $(INSTALL) -m 644 $(SRCDIR)/tools/hist/fiologparser_hist.py.1 $(DESTDIR)$(mandir)/man1 $(INSTALL) -m 755 -d $(DESTDIR)$(sharedir) - $(INSTALL) -m 644 tools/plot/*gpm $(DESTDIR)$(sharedir)/ + $(INSTALL) -m 644 $(SRCDIR)/tools/plot/*gpm $(DESTDIR)$(sharedir)/ + +.PHONY: 
test fulltest diff -Nru fio-2.1.3/memalign.c fio-3.16/memalign.c --- fio-2.1.3/memalign.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/memalign.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,36 +0,0 @@ -#include -#include -#include - -#include "memalign.h" - -struct align_footer { - unsigned int offset; -}; - -#define PTR_ALIGN(ptr, mask) \ - (char *) (((uintptr_t) ((ptr) + (mask)) & ~(mask))) - -void *fio_memalign(size_t alignment, size_t size) -{ - struct align_footer *f; - void *ptr, *ret = NULL; - - assert(!(alignment & (alignment - 1))); - - ptr = malloc(size + alignment + size + sizeof(*f) - 1); - if (ptr) { - ret = PTR_ALIGN(ptr, alignment); - f = ret + size; - f->offset = (uintptr_t) ret - (uintptr_t) ptr; - } - - return ret; -} - -void fio_memfree(void *ptr, size_t size) -{ - struct align_footer *f = ptr + size; - - free(ptr - f->offset); -} diff -Nru fio-2.1.3/memalign.h fio-3.16/memalign.h --- fio-2.1.3/memalign.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/memalign.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef FIO_MEMALIGN_H -#define FIO_MEMALIGN_H - -extern void *fio_memalign(size_t alignment, size_t size); -extern void fio_memfree(void *ptr, size_t size); - -#endif diff -Nru fio-2.1.3/memory.c fio-3.16/memory.c --- fio-2.1.3/memory.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/memory.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,11 +1,10 @@ /* * Memory helpers */ -#include -#include #include #include #include +#include #include "fio.h" #ifndef FIO_NO_HAVE_SHM_H @@ -33,13 +32,13 @@ dprint(FD_MEM, "pinning %llu bytes\n", td->o.lockmem); /* - * Don't allow mlock of more than real_mem-128MB + * Don't allow mlock of more than real_mem-128MiB */ phys_mem = os_phys_mem(); if (phys_mem) { if ((td->o.lockmem + 128 * 1024 * 1024) > phys_mem) { td->o.lockmem = phys_mem - 128 * 1024 * 1024; - log_info("fio: limiting mlocked memory to %lluMB\n", + log_info("fio: limiting mlocked memory to %lluMiB\n", td->o.lockmem >> 20); } } @@ -63,6 
+62,7 @@ static int alloc_mem_shm(struct thread_data *td, unsigned int total_mem) { +#ifndef CONFIG_NO_SHM int flags = IPC_CREAT | S_IRUSR | S_IWUSR; if (td->o.mem_type == MEM_SHMHUGE) { @@ -88,7 +88,7 @@ " support huge pages.\n"); } else if (errno == ENOMEM) { log_err("fio: no huge pages available, do you" - " need to alocate some? See HOWTO.\n"); + " need to allocate some? See HOWTO.\n"); } } @@ -104,22 +104,28 @@ } return 0; +#else + log_err("fio: shm not supported\n"); + return 1; +#endif } static void free_mem_shm(struct thread_data *td) { +#ifndef CONFIG_NO_SHM struct shmid_ds sbuf; dprint(FD_MEM, "shmdt/ctl %d %p\n", td->shm_id, td->orig_buffer); shmdt(td->orig_buffer); shmctl(td->shm_id, IPC_RMID, &sbuf); +#endif } static int alloc_mem_mmap(struct thread_data *td, size_t total_mem) { int flags = 0; - td->mmapfd = 1; + td->mmapfd = -1; if (td->o.mem_type == MEM_MMAPHUGE) { unsigned long mask = td->o.hugepage_size - 1; @@ -131,6 +137,9 @@ } if (td->o.mmapfile) { + if (access(td->o.mmapfile, F_OK) == 0) + td->flags |= TD_F_MMAP_KEEP; + td->mmapfd = open(td->o.mmapfile, O_RDWR|O_CREAT, 0644); if (td->mmapfd < 0) { @@ -139,12 +148,14 @@ return 1; } if (td->o.mem_type != MEM_MMAPHUGE && + td->o.mem_type != MEM_MMAPSHARED && ftruncate(td->mmapfd, total_mem) < 0) { td_verror(td, errno, "truncate mmap file"); td->orig_buffer = NULL; return 1; } - if (td->o.mem_type == MEM_MMAPHUGE) + if (td->o.mem_type == MEM_MMAPHUGE || + td->o.mem_type == MEM_MMAPSHARED) flags |= MAP_SHARED; else flags |= MAP_PRIVATE; @@ -158,9 +169,10 @@ if (td->orig_buffer == MAP_FAILED) { td_verror(td, errno, "mmap"); td->orig_buffer = NULL; - if (td->mmapfd) { + if (td->mmapfd != 1 && td->mmapfd != -1) { close(td->mmapfd); - unlink(td->o.mmapfile); + if (td->o.mmapfile && !(td->flags & TD_F_MMAP_KEEP)) + unlink(td->o.mmapfile); } return 1; @@ -175,8 +187,10 @@ td->orig_buffer); munmap(td->orig_buffer, td->orig_buffer_size); if (td->o.mmapfile) { - close(td->mmapfd); - unlink(td->o.mmapfile); + 
if (td->mmapfd != -1) + close(td->mmapfd); + if (!(td->flags & TD_F_MMAP_KEEP)) + unlink(td->o.mmapfile); free(td->o.mmapfile); } } @@ -196,6 +210,78 @@ free(td->orig_buffer); } +static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem) +{ +#ifdef CONFIG_CUDA + CUresult ret; + char name[128]; + + ret = cuInit(0); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed initialize cuda driver api\n"); + return 1; + } + + ret = cuDeviceGetCount(&td->gpu_dev_cnt); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get device count\n"); + return 1; + } + dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt); + + if (td->gpu_dev_cnt == 0) { + log_err("fio: no GPU device found. " + "Can not perform GPUDirect RDMA.\n"); + return 1; + } + + td->gpu_dev_id = td->o.gpu_dev_id; + ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get GPU device\n"); + return 1; + } + + ret = cuDeviceGetName(name, sizeof(name), td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get device name\n"); + return 1; + } + dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \ + td->gpu_dev_id, name); + + ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed to create cuda context: %d\n", ret); + return 1; + } + + ret = cuMemAlloc(&td->dev_mem_ptr, total_mem); + if (ret != CUDA_SUCCESS) { + log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem); + return 1; + } + td->orig_buffer = (void *) td->dev_mem_ptr; + + dprint(FD_MEM, "cudaMalloc %llu %p\n", \ + (unsigned long long) total_mem, td->orig_buffer); + return 0; +#else + return -EINVAL; +#endif +} + +static void free_mem_cudamalloc(struct thread_data *td) +{ +#ifdef CONFIG_CUDA + if (td->dev_mem_ptr != NULL) + cuMemFree(td->dev_mem_ptr); + + if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS) + log_err("fio: failed to destroy cuda context\n"); +#endif +} + /* * Set up the buffer area we need for io. 
*/ @@ -204,13 +290,13 @@ size_t total_mem; int ret = 0; - if (td->io_ops->flags & FIO_NOIO) + if (td_ioengine_flagged(td, FIO_NOIO)) return 0; total_mem = td->orig_buffer_size; - if (td->o.odirect || td->o.mem_align || - (td->io_ops->flags & FIO_MEMALIGN)) { + if (td->o.odirect || td->o.mem_align || td->o.oatomic || + td_ioengine_flagged(td, FIO_MEMALIGN)) { total_mem += page_mask; if (td->o.mem_align && td->o.mem_align > page_size) total_mem += td->o.mem_align - page_size; @@ -218,12 +304,25 @@ dprint(FD_MEM, "Alloc %llu for buffers\n", (unsigned long long) total_mem); - if (td->o.mem_type == MEM_MALLOC) + /* + * If the IO engine has hooks to allocate/free memory, use those. But + * error out if the user explicitly asked for something else. + */ + if (td->io_ops->iomem_alloc) { + if (fio_option_is_set(&td->o, mem_type)) { + log_err("fio: option 'mem/iomem' conflicts with specified IO engine\n"); + ret = 1; + } else + ret = td->io_ops->iomem_alloc(td, total_mem); + } else if (td->o.mem_type == MEM_MALLOC) ret = alloc_mem_malloc(td, total_mem); else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE) ret = alloc_mem_shm(td, total_mem); - else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE) + else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || + td->o.mem_type == MEM_MMAPSHARED) ret = alloc_mem_mmap(td, total_mem); + else if (td->o.mem_type == MEM_CUDA_MALLOC) + ret = alloc_mem_cudamalloc(td, total_mem); else { log_err("fio: bad mem type: %d\n", td->o.mem_type); ret = 1; @@ -240,15 +339,21 @@ unsigned int total_mem; total_mem = td->orig_buffer_size; - if (td->o.odirect) + if (td->o.odirect || td->o.oatomic) total_mem += page_mask; - if (td->o.mem_type == MEM_MALLOC) + if (td->io_ops->iomem_alloc) { + if (td->io_ops->iomem_free) + td->io_ops->iomem_free(td); + } else if (td->o.mem_type == MEM_MALLOC) free_mem_malloc(td); else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE) free_mem_shm(td); - else if 
(td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE) + else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || + td->o.mem_type == MEM_MMAPSHARED) free_mem_mmap(td, total_mem); + else if (td->o.mem_type == MEM_CUDA_MALLOC) + free_mem_cudamalloc(td); else log_err("Bad memory type %u\n", td->o.mem_type); diff -Nru fio-2.1.3/minmax.h fio-3.16/minmax.h --- fio-2.1.3/minmax.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/minmax.h 2019-09-20 01:01:52.000000000 +0000 @@ -2,10 +2,24 @@ #define FIO_MIN_MAX_H #ifndef min -#define min(a, b) ((a) < (b) ? (a) : (b)) +#define min(x,y) ({ \ + __typeof__(x) _x = (x); \ + __typeof__(y) _y = (y); \ + (void) (&_x == &_y); \ + _x < _y ? _x : _y; }) #endif + #ifndef max -#define max(a, b) ((a) > (b) ? (a) : (b)) +#define max(x,y) ({ \ + __typeof__(x) _x = (x); \ + __typeof__(y) _y = (y); \ + (void) (&_x == &_y); \ + _x > _y ? _x : _y; }) #endif +#define min_not_zero(x, y) ({ \ + __typeof__(x) __x = (x); \ + __typeof__(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + #endif diff -Nru fio-2.1.3/MORAL-LICENSE fio-3.16/MORAL-LICENSE --- fio-2.1.3/MORAL-LICENSE 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/MORAL-LICENSE 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,17 @@ +As specified by the COPYING file, fio is free software published under version +2 of the GPL license. That covers the copying part of the license. When using +fio, you are encouraged to uphold the following moral obligations: + +- If you publish results that are done using fio, it should be clearly stated + that fio was used. The specific version should also be listed. + +- If you develop features or bug fixes for fio, they should be sent upstream + for inclusion into the main repository. This isn't specific to fio, that + is a general rule for any open source project. It's just the Right Thing + to do. Plus it means that you don't have to maintain the feature or change + internally. 
In the long run, this is saving you a lot of time. + +I would consider the above to fall under "common courtesy", but since +people tend to have differing opinions of that, it doesn't hurt to spell out +my expectations clearly. + diff -Nru fio-2.1.3/mutex.c fio-3.16/mutex.c --- fio-2.1.3/mutex.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/mutex.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,228 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "fio.h" -#include "log.h" -#include "mutex.h" -#include "arch/arch.h" -#include "os/os.h" -#include "helpers.h" -#include "time.h" -#include "gettime.h" - -void fio_mutex_remove(struct fio_mutex *mutex) -{ - assert(mutex->magic == FIO_MUTEX_MAGIC); - pthread_cond_destroy(&mutex->cond); - munmap((void *) mutex, sizeof(*mutex)); -} - -struct fio_mutex *fio_mutex_init(int value) -{ - struct fio_mutex *mutex = NULL; - pthread_mutexattr_t attr; - pthread_condattr_t cond; - int ret; - - mutex = (void *) mmap(NULL, sizeof(struct fio_mutex), - PROT_READ | PROT_WRITE, - OS_MAP_ANON | MAP_SHARED, -1, 0); - if (mutex == MAP_FAILED) { - perror("mmap mutex"); - mutex = NULL; - goto err; - } - - mutex->value = value; - mutex->magic = FIO_MUTEX_MAGIC; - - ret = pthread_mutexattr_init(&attr); - if (ret) { - log_err("pthread_mutexattr_init: %s\n", strerror(ret)); - goto err; - } - - /* - * Not all platforms support process shared mutexes (FreeBSD) - */ -#ifdef FIO_HAVE_PSHARED_MUTEX - ret = pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); - if (ret) { - log_err("pthread_mutexattr_setpshared: %s\n", strerror(ret)); - goto err; - } -#endif - - pthread_condattr_init(&cond); -#ifdef FIO_HAVE_PSHARED_MUTEX - pthread_condattr_setpshared(&cond, PTHREAD_PROCESS_SHARED); -#endif - pthread_cond_init(&mutex->cond, &cond); - - ret = pthread_mutex_init(&mutex->lock, &attr); - if (ret) { - log_err("pthread_mutex_init: %s\n", strerror(ret)); - goto err; - } - - 
pthread_condattr_destroy(&cond); - pthread_mutexattr_destroy(&attr); - - return mutex; -err: - if (mutex) - fio_mutex_remove(mutex); - - return NULL; -} - -static int mutex_timed_out(struct timeval *t, unsigned int seconds) -{ - return mtime_since_now(t) >= seconds * 1000; -} - -int fio_mutex_down_timeout(struct fio_mutex *mutex, unsigned int seconds) -{ - struct timeval tv_s; - struct timespec t; - int ret = 0; - - assert(mutex->magic == FIO_MUTEX_MAGIC); - - gettimeofday(&tv_s, NULL); - t.tv_sec = tv_s.tv_sec + seconds; - t.tv_nsec = tv_s.tv_usec * 1000; - - pthread_mutex_lock(&mutex->lock); - - while (!mutex->value && !ret) { - mutex->waiters++; - - /* - * Some platforms (FreeBSD 9?) seems to return timed out - * way too early, double check. - */ - ret = pthread_cond_timedwait(&mutex->cond, &mutex->lock, &t); - if (ret == ETIMEDOUT && !mutex_timed_out(&tv_s, seconds)) - ret = 0; - - mutex->waiters--; - } - - if (!ret) { - mutex->value--; - pthread_mutex_unlock(&mutex->lock); - } - - return ret; -} - -void fio_mutex_down(struct fio_mutex *mutex) -{ - assert(mutex->magic == FIO_MUTEX_MAGIC); - - pthread_mutex_lock(&mutex->lock); - - while (!mutex->value) { - mutex->waiters++; - pthread_cond_wait(&mutex->cond, &mutex->lock); - mutex->waiters--; - } - - mutex->value--; - pthread_mutex_unlock(&mutex->lock); -} - -void fio_mutex_up(struct fio_mutex *mutex) -{ - assert(mutex->magic == FIO_MUTEX_MAGIC); - - pthread_mutex_lock(&mutex->lock); - read_barrier(); - if (!mutex->value && mutex->waiters) - pthread_cond_signal(&mutex->cond); - mutex->value++; - pthread_mutex_unlock(&mutex->lock); -} - -void fio_rwlock_write(struct fio_rwlock *lock) -{ - assert(lock->magic == FIO_RWLOCK_MAGIC); - pthread_rwlock_wrlock(&lock->lock); -} - -void fio_rwlock_read(struct fio_rwlock *lock) -{ - assert(lock->magic == FIO_RWLOCK_MAGIC); - pthread_rwlock_rdlock(&lock->lock); -} - -void fio_rwlock_unlock(struct fio_rwlock *lock) -{ - assert(lock->magic == FIO_RWLOCK_MAGIC); - 
pthread_rwlock_unlock(&lock->lock); -} - -void fio_rwlock_remove(struct fio_rwlock *lock) -{ - assert(lock->magic == FIO_RWLOCK_MAGIC); - munmap((void *) lock, sizeof(*lock)); -} - -struct fio_rwlock *fio_rwlock_init(void) -{ - struct fio_rwlock *lock; - pthread_rwlockattr_t attr; - int ret; - - lock = (void *) mmap(NULL, sizeof(struct fio_rwlock), - PROT_READ | PROT_WRITE, - OS_MAP_ANON | MAP_SHARED, -1, 0); - if (lock == MAP_FAILED) { - perror("mmap rwlock"); - lock = NULL; - goto err; - } - - lock->magic = FIO_RWLOCK_MAGIC; - - ret = pthread_rwlockattr_init(&attr); - if (ret) { - log_err("pthread_rwlockattr_init: %s\n", strerror(ret)); - goto err; - } -#ifdef FIO_HAVE_PSHARED_MUTEX - ret = pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); - if (ret) { - log_err("pthread_rwlockattr_setpshared: %s\n", strerror(ret)); - goto destroy_attr; - } - - ret = pthread_rwlock_init(&lock->lock, &attr); -#else - ret = pthread_rwlock_init(&lock->lock, NULL); -#endif - - if (ret) { - log_err("pthread_rwlock_init: %s\n", strerror(ret)); - goto destroy_attr; - } - - pthread_rwlockattr_destroy(&attr); - - return lock; -destroy_attr: - pthread_rwlockattr_destroy(&attr); -err: - if (lock) - fio_rwlock_remove(lock); - return NULL; -} diff -Nru fio-2.1.3/mutex.h fio-3.16/mutex.h --- fio-2.1.3/mutex.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/mutex.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,39 +0,0 @@ -#ifndef FIO_MUTEX_H -#define FIO_MUTEX_H - -#include - -#define FIO_MUTEX_MAGIC 0x4d555445U -#define FIO_RWLOCK_MAGIC 0x52574c4fU - -struct fio_mutex { - pthread_mutex_t lock; - pthread_cond_t cond; - int value; - int waiters; - int magic; -}; - -struct fio_rwlock { - pthread_rwlock_t lock; - int magic; -}; - -enum { - FIO_MUTEX_LOCKED = 0, - FIO_MUTEX_UNLOCKED = 1, -}; - -extern struct fio_mutex *fio_mutex_init(int); -extern void fio_mutex_remove(struct fio_mutex *); -extern void fio_mutex_up(struct fio_mutex *); -extern void fio_mutex_down(struct fio_mutex *); 
-extern int fio_mutex_down_timeout(struct fio_mutex *, unsigned int); - -extern void fio_rwlock_read(struct fio_rwlock *); -extern void fio_rwlock_write(struct fio_rwlock *); -extern void fio_rwlock_unlock(struct fio_rwlock *); -extern struct fio_rwlock *fio_rwlock_init(void); -extern void fio_rwlock_remove(struct fio_rwlock *); - -#endif diff -Nru fio-2.1.3/optgroup.c fio-3.16/optgroup.c --- fio-2.1.3/optgroup.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/optgroup.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,213 @@ +#include +#include +#include "optgroup.h" +#include "compiler/compiler.h" + +/* + * Option grouping + */ +static const struct opt_group fio_opt_groups[] = { + { + .name = "General", + .mask = FIO_OPT_C_GENERAL, + }, + { + .name = "I/O", + .mask = FIO_OPT_C_IO, + }, + { + .name = "File", + .mask = FIO_OPT_C_FILE, + }, + { + .name = "Statistics", + .mask = FIO_OPT_C_STAT, + }, + { + .name = "Logging", + .mask = FIO_OPT_C_LOG, + }, + { + .name = "Profiles", + .mask = FIO_OPT_C_PROFILE, + }, + { + .name = "I/O engines", + .mask = FIO_OPT_C_ENGINE, + }, + { + .name = NULL, + }, +}; + +static const struct opt_group fio_opt_cat_groups[] = { + { + .name = "Rate", + .mask = FIO_OPT_G_RATE, + }, + { + .name = "Zone", + .mask = FIO_OPT_G_ZONE, + }, + { + .name = "Read/write mix", + .mask = FIO_OPT_G_RWMIX, + }, + { + .name = "Verify", + .mask = FIO_OPT_G_VERIFY, + }, + { + .name = "Trim", + .mask = FIO_OPT_G_TRIM, + }, + { + .name = "I/O Logging", + .mask = FIO_OPT_G_IOLOG, + }, + { + .name = "I/O Depth", + .mask = FIO_OPT_G_IO_DEPTH, + }, + { + .name = "I/O Flow", + .mask = FIO_OPT_G_IO_FLOW, + }, + { + .name = "Description", + .mask = FIO_OPT_G_DESC, + }, + { + .name = "Filename", + .mask = FIO_OPT_G_FILENAME, + }, + { + .name = "General I/O", + .mask = FIO_OPT_G_IO_BASIC, + }, + { + .name = "Cgroups", + .mask = FIO_OPT_G_CGROUP, + }, + { + .name = "Runtime", + .mask = FIO_OPT_G_RUNTIME, + }, + { + .name = "Process", + .mask = FIO_OPT_G_PROCESS, + }, + 
{ + .name = "Job credentials / priority", + .mask = FIO_OPT_G_CRED, + }, + { + .name = "Clock settings", + .mask = FIO_OPT_G_CLOCK, + }, + { + .name = "I/O Type", + .mask = FIO_OPT_G_IO_TYPE, + }, + { + .name = "I/O Thinktime", + .mask = FIO_OPT_G_THINKTIME, + }, + { + .name = "Randomizations", + .mask = FIO_OPT_G_RANDOM, + }, + { + .name = "I/O buffers", + .mask = FIO_OPT_G_IO_BUF, + }, + { + .name = "Tiobench profile", + .mask = FIO_OPT_G_TIOBENCH, + }, + { + .name = "Error handling", + .mask = FIO_OPT_G_ERR, + }, + { + .name = "Ext4 defrag I/O engine", /* e4defrag */ + .mask = FIO_OPT_G_E4DEFRAG, + }, + { + .name = "Network I/O engine", /* net */ + .mask = FIO_OPT_G_NETIO, + }, + { + .name = "RDMA I/O engine", /* rdma */ + .mask = FIO_OPT_G_RDMA, + }, + { + .name = "libaio I/O engine", /* libaio */ + .mask = FIO_OPT_G_LIBAIO, + }, + { + .name = "ACT Aerospike like benchmark profile", + .mask = FIO_OPT_G_ACT, + }, + { + .name = "Latency profiling", + .mask = FIO_OPT_G_LATPROF, + }, + { + .name = "RBD I/O engine", /* rbd */ + .mask = FIO_OPT_G_RBD, + }, + { + .name = "GlusterFS I/O engine", /* gfapi,gfapi_async */ + .mask = FIO_OPT_G_GFAPI, + }, + { + .name = "MTD I/O engine", /* mtd */ + .mask = FIO_OPT_G_MTD, + }, + { + .name = "libhdfs I/O engine", /* libhdfs */ + .mask = FIO_OPT_G_HDFS, + }, + { + .name = "NBD I/O engine", /* NBD */ + .mask = FIO_OPT_G_NBD, + }, + { + .name = NULL, + }, +}; + +static const struct opt_group *group_from_mask(const struct opt_group *ogs, + uint64_t *mask, + uint64_t inv_mask) +{ + int i; + + if (*mask == inv_mask || !*mask) + return NULL; + + for (i = 0; ogs[i].name; i++) { + const struct opt_group *og = &ogs[i]; + + if (*mask & og->mask) { + *mask &= ~(og->mask); + return og; + } + } + + return NULL; +} + +const struct opt_group *opt_group_from_mask(uint64_t *mask) +{ + return group_from_mask(fio_opt_groups, mask, FIO_OPT_C_INVALID); +} + +const struct opt_group *opt_group_cat_from_mask(uint64_t *mask) +{ + 
compiletime_assert(__FIO_OPT_G_NR <= 8 * sizeof(uint64_t), + "__FIO_OPT_G_NR"); + + return group_from_mask(fio_opt_cat_groups, mask, FIO_OPT_G_INVALID); +} diff -Nru fio-2.1.3/optgroup.h fio-3.16/optgroup.h --- fio-2.1.3/optgroup.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/optgroup.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,114 @@ +#ifndef FIO_OPT_GROUP_H +#define FIO_OPT_GROUP_H + +struct opt_group { + const char *name; + uint64_t mask; +}; + +enum opt_category { + __FIO_OPT_C_GENERAL = 0, + __FIO_OPT_C_IO, + __FIO_OPT_C_FILE, + __FIO_OPT_C_STAT, + __FIO_OPT_C_LOG, + __FIO_OPT_C_PROFILE, + __FIO_OPT_C_ENGINE, + __FIO_OPT_C_NR, + + FIO_OPT_C_GENERAL = (1ULL << __FIO_OPT_C_GENERAL), + FIO_OPT_C_IO = (1ULL << __FIO_OPT_C_IO), + FIO_OPT_C_FILE = (1ULL << __FIO_OPT_C_FILE), + FIO_OPT_C_STAT = (1ULL << __FIO_OPT_C_STAT), + FIO_OPT_C_LOG = (1ULL << __FIO_OPT_C_LOG), + FIO_OPT_C_PROFILE = (1ULL << __FIO_OPT_C_PROFILE), + FIO_OPT_C_ENGINE = (1ULL << __FIO_OPT_C_ENGINE), + FIO_OPT_C_INVALID = (1ULL << __FIO_OPT_C_NR), +}; + +enum opt_category_group { + __FIO_OPT_G_RATE = 0, + __FIO_OPT_G_ZONE, + __FIO_OPT_G_RWMIX, + __FIO_OPT_G_VERIFY, + __FIO_OPT_G_TRIM, + __FIO_OPT_G_IOLOG, + __FIO_OPT_G_IO_DEPTH, + __FIO_OPT_G_IO_FLOW, + __FIO_OPT_G_DESC, + __FIO_OPT_G_FILENAME, + __FIO_OPT_G_IO_BASIC, + __FIO_OPT_G_CGROUP, + __FIO_OPT_G_RUNTIME, + __FIO_OPT_G_PROCESS, + __FIO_OPT_G_CRED, + __FIO_OPT_G_CLOCK, + __FIO_OPT_G_IO_TYPE, + __FIO_OPT_G_THINKTIME, + __FIO_OPT_G_RANDOM, + __FIO_OPT_G_IO_BUF, + __FIO_OPT_G_TIOBENCH, + __FIO_OPT_G_ERR, + __FIO_OPT_G_E4DEFRAG, + __FIO_OPT_G_NETIO, + __FIO_OPT_G_RDMA, + __FIO_OPT_G_LIBAIO, + __FIO_OPT_G_ACT, + __FIO_OPT_G_LATPROF, + __FIO_OPT_G_RBD, + __FIO_OPT_G_HTTP, + __FIO_OPT_G_GFAPI, + __FIO_OPT_G_MTD, + __FIO_OPT_G_HDFS, + __FIO_OPT_G_SG, + __FIO_OPT_G_MMAP, + __FIO_OPT_G_ISCSI, + __FIO_OPT_G_NBD, + __FIO_OPT_G_IOURING, + __FIO_OPT_G_NR, + + FIO_OPT_G_RATE = (1ULL << __FIO_OPT_G_RATE), + FIO_OPT_G_ZONE = (1ULL << 
__FIO_OPT_G_ZONE), + FIO_OPT_G_RWMIX = (1ULL << __FIO_OPT_G_RWMIX), + FIO_OPT_G_VERIFY = (1ULL << __FIO_OPT_G_VERIFY), + FIO_OPT_G_TRIM = (1ULL << __FIO_OPT_G_TRIM), + FIO_OPT_G_IOLOG = (1ULL << __FIO_OPT_G_IOLOG), + FIO_OPT_G_IO_DEPTH = (1ULL << __FIO_OPT_G_IO_DEPTH), + FIO_OPT_G_IO_FLOW = (1ULL << __FIO_OPT_G_IO_FLOW), + FIO_OPT_G_DESC = (1ULL << __FIO_OPT_G_DESC), + FIO_OPT_G_FILENAME = (1ULL << __FIO_OPT_G_FILENAME), + FIO_OPT_G_IO_BASIC = (1ULL << __FIO_OPT_G_IO_BASIC), + FIO_OPT_G_CGROUP = (1ULL << __FIO_OPT_G_CGROUP), + FIO_OPT_G_RUNTIME = (1ULL << __FIO_OPT_G_RUNTIME), + FIO_OPT_G_PROCESS = (1ULL << __FIO_OPT_G_PROCESS), + FIO_OPT_G_CRED = (1ULL << __FIO_OPT_G_CRED), + FIO_OPT_G_CLOCK = (1ULL << __FIO_OPT_G_CLOCK), + FIO_OPT_G_IO_TYPE = (1ULL << __FIO_OPT_G_IO_TYPE), + FIO_OPT_G_THINKTIME = (1ULL << __FIO_OPT_G_THINKTIME), + FIO_OPT_G_RANDOM = (1ULL << __FIO_OPT_G_RANDOM), + FIO_OPT_G_IO_BUF = (1ULL << __FIO_OPT_G_IO_BUF), + FIO_OPT_G_TIOBENCH = (1ULL << __FIO_OPT_G_TIOBENCH), + FIO_OPT_G_ERR = (1ULL << __FIO_OPT_G_ERR), + FIO_OPT_G_E4DEFRAG = (1ULL << __FIO_OPT_G_E4DEFRAG), + FIO_OPT_G_NETIO = (1ULL << __FIO_OPT_G_NETIO), + FIO_OPT_G_RDMA = (1ULL << __FIO_OPT_G_RDMA), + FIO_OPT_G_LIBAIO = (1ULL << __FIO_OPT_G_LIBAIO), + FIO_OPT_G_ACT = (1ULL << __FIO_OPT_G_ACT), + FIO_OPT_G_LATPROF = (1ULL << __FIO_OPT_G_LATPROF), + FIO_OPT_G_RBD = (1ULL << __FIO_OPT_G_RBD), + FIO_OPT_G_HTTP = (1ULL << __FIO_OPT_G_HTTP), + FIO_OPT_G_GFAPI = (1ULL << __FIO_OPT_G_GFAPI), + FIO_OPT_G_MTD = (1ULL << __FIO_OPT_G_MTD), + FIO_OPT_G_HDFS = (1ULL << __FIO_OPT_G_HDFS), + FIO_OPT_G_SG = (1ULL << __FIO_OPT_G_SG), + FIO_OPT_G_MMAP = (1ULL << __FIO_OPT_G_MMAP), + FIO_OPT_G_INVALID = (1ULL << __FIO_OPT_G_NR), + FIO_OPT_G_ISCSI = (1ULL << __FIO_OPT_G_ISCSI), + FIO_OPT_G_NBD = (1ULL << __FIO_OPT_G_NBD), + FIO_OPT_G_IOURING = (1ULL << __FIO_OPT_G_IOURING), +}; + +extern const struct opt_group *opt_group_from_mask(uint64_t *mask); +extern const struct opt_group 
*opt_group_cat_from_mask(uint64_t *mask); + +#endif diff -Nru fio-2.1.3/options.c fio-3.16/options.c --- fio-2.1.3/options.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/options.c 2019-09-20 01:01:52.000000000 +0000 @@ -4,18 +4,27 @@ #include #include #include -#include -#include -#include #include +#include #include "fio.h" #include "verify.h" #include "parse.h" -#include "lib/fls.h" +#include "lib/pattern.h" #include "options.h" +#include "optgroup.h" -#include "crc/crc32c.h" +char client_sockaddr_str[INET6_ADDRSTRLEN] = { 0 }; + +#define cb_data_to_td(data) container_of(data, struct thread_data, o) + +static struct pattern_fmt_desc fmt_desc[] = { + { + .fmt = "%o", + .len = FIELD_SIZE(struct io_u *, offset), + .paste = paste_blockoff + } +}; /* * Check if mmap/mmaphuge has a :/foo/bar/file at the end. If so, return that. @@ -33,120 +42,138 @@ return strdup(p); } -static int converthexchartoint(char a) -{ - int base; - - switch (a) { - case '0'...'9': - base = '0'; - break; - case 'A'...'F': - base = 'A' - 10; - break; - case 'a'...'f': - base = 'a' - 10; - break; - default: - base = 0; - } - return a - base; -} - static int bs_cmp(const void *p1, const void *p2) { const struct bssplit *bsp1 = p1; const struct bssplit *bsp2 = p2; - return bsp1->perc < bsp2->perc; + return (int) bsp1->perc - (int) bsp2->perc; } -static int bssplit_ddir(struct thread_options *o, int ddir, char *str) +struct split { + unsigned int nr; + unsigned long long val1[ZONESPLIT_MAX]; + unsigned long long val2[ZONESPLIT_MAX]; +}; + +static int split_parse_ddir(struct thread_options *o, struct split *split, + char *str, bool absolute, unsigned int max_splits) { - struct bssplit *bssplit; - unsigned int i, perc, perc_missing; - unsigned int max_bs, min_bs; + unsigned long long perc; + unsigned int i; long long val; char *fname; - o->bssplit_nr[ddir] = 4; - bssplit = malloc(4 * sizeof(struct bssplit)); + split->nr = 0; i = 0; - max_bs = 0; - min_bs = -1; while ((fname = strsep(&str, ":")) 
!= NULL) { char *perc_str; if (!strlen(fname)) break; - /* - * grow struct buffer, if needed - */ - if (i == o->bssplit_nr[ddir]) { - o->bssplit_nr[ddir] <<= 1; - bssplit = realloc(bssplit, o->bssplit_nr[ddir] - * sizeof(struct bssplit)); - } - perc_str = strstr(fname, "/"); if (perc_str) { *perc_str = '\0'; perc_str++; - perc = atoi(perc_str); - if (perc > 100) - perc = 100; - else if (!perc) - perc = -1; - } else - perc = -1; - - if (str_to_decimal(fname, &val, 1, o)) { - log_err("fio: bssplit conversion failed\n"); - free(o->bssplit); - return 1; + if (absolute) { + if (str_to_decimal(perc_str, &val, 1, o, 0, 0)) { + log_err("fio: split conversion failed\n"); + return 1; + } + perc = val; + } else { + perc = atoi(perc_str); + if (perc > 100) + perc = 100; + else if (!perc) + perc = -1U; + } + } else { + if (absolute) + perc = 0; + else + perc = -1U; } - if (val > max_bs) - max_bs = val; - if (val < min_bs) - min_bs = val; + if (str_to_decimal(fname, &val, 1, o, 0, 0)) { + log_err("fio: split conversion failed\n"); + return 1; + } - bssplit[i].bs = val; - bssplit[i].perc = perc; + split->val1[i] = val; + split->val2[i] = perc; i++; + if (i == max_splits) { + log_err("fio: hit max of %d split entries\n", i); + break; + } } - o->bssplit_nr[ddir] = i; + split->nr = i; + return 0; +} + +static int bssplit_ddir(struct thread_options *o, enum fio_ddir ddir, char *str, + bool data) +{ + unsigned int i, perc, perc_missing; + unsigned long long max_bs, min_bs; + struct split split; + + memset(&split, 0, sizeof(split)); + + if (split_parse_ddir(o, &split, str, data, BSSPLIT_MAX)) + return 1; + if (!split.nr) + return 0; + + max_bs = 0; + min_bs = -1; + o->bssplit[ddir] = malloc(split.nr * sizeof(struct bssplit)); + o->bssplit_nr[ddir] = split.nr; + for (i = 0; i < split.nr; i++) { + if (split.val1[i] > max_bs) + max_bs = split.val1[i]; + if (split.val1[i] < min_bs) + min_bs = split.val1[i]; + + o->bssplit[ddir][i].bs = split.val1[i]; + o->bssplit[ddir][i].perc 
=split.val2[i]; + } /* * Now check if the percentages add up, and how much is missing */ perc = perc_missing = 0; for (i = 0; i < o->bssplit_nr[ddir]; i++) { - struct bssplit *bsp = &bssplit[i]; + struct bssplit *bsp = &o->bssplit[ddir][i]; - if (bsp->perc == (unsigned char) -1) + if (bsp->perc == -1U) perc_missing++; else perc += bsp->perc; } - if (perc > 100) { + if (perc > 100 && perc_missing > 1) { log_err("fio: bssplit percentages add to more than 100%%\n"); - free(bssplit); + free(o->bssplit[ddir]); + o->bssplit[ddir] = NULL; return 1; } + /* * If values didn't have a percentage set, divide the remains between * them. */ if (perc_missing) { + if (perc_missing == 1 && o->bssplit_nr[ddir] == 1) + perc = 100; for (i = 0; i < o->bssplit_nr[ddir]; i++) { - struct bssplit *bsp = &bssplit[i]; + struct bssplit *bsp = &o->bssplit[ddir][i]; - if (bsp->perc == (unsigned char) -1) + if (bsp->perc == -1U) bsp->perc = (100 - perc) / perc_missing; } } @@ -157,59 +184,79 @@ /* * now sort based on percentages, for ease of lookup */ - qsort(bssplit, o->bssplit_nr[ddir], sizeof(struct bssplit), bs_cmp); - o->bssplit[ddir] = bssplit; + qsort(o->bssplit[ddir], o->bssplit_nr[ddir], sizeof(struct bssplit), bs_cmp); return 0; } -static int str_bssplit_cb(void *data, const char *input) +typedef int (split_parse_fn)(struct thread_options *, enum fio_ddir, char *, bool); + +static int str_split_parse(struct thread_data *td, char *str, + split_parse_fn *fn, bool data) { - struct thread_data *td = data; - char *str, *p, *odir, *ddir; + char *odir, *ddir; int ret = 0; - if (parse_dryrun()) - return 0; - - p = str = strdup(input); - - strip_blank_front(&str); - strip_blank_end(str); - odir = strchr(str, ','); if (odir) { ddir = strchr(odir + 1, ','); if (ddir) { - ret = bssplit_ddir(&td->o, DDIR_TRIM, ddir + 1); + ret = fn(&td->o, DDIR_TRIM, ddir + 1, data); if (!ret) *ddir = '\0'; } else { char *op; op = strdup(odir + 1); - ret = bssplit_ddir(&td->o, DDIR_TRIM, op); + ret = fn(&td->o, 
DDIR_TRIM, op, data); free(op); } if (!ret) - ret = bssplit_ddir(&td->o, DDIR_WRITE, odir + 1); + ret = fn(&td->o, DDIR_WRITE, odir + 1, data); if (!ret) { *odir = '\0'; - ret = bssplit_ddir(&td->o, DDIR_READ, str); + ret = fn(&td->o, DDIR_READ, str, data); } } else { char *op; op = strdup(str); - ret = bssplit_ddir(&td->o, DDIR_WRITE, op); + ret = fn(&td->o, DDIR_WRITE, op, data); free(op); if (!ret) { op = strdup(str); - ret = bssplit_ddir(&td->o, DDIR_TRIM, op); + ret = fn(&td->o, DDIR_TRIM, op, data); free(op); } - ret = bssplit_ddir(&td->o, DDIR_READ, str); + if (!ret) + ret = fn(&td->o, DDIR_READ, str, data); + } + + return ret; +} + +static int str_bssplit_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + char *str, *p; + int ret = 0; + + p = str = strdup(input); + + strip_blank_front(&str); + strip_blank_end(str); + + ret = str_split_parse(td, str, bssplit_ddir, false); + + if (parse_dryrun()) { + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + free(td->o.bssplit[i]); + td->o.bssplit[i] = NULL; + td->o.bssplit_nr[i] = 0; + } } free(p); @@ -226,7 +273,7 @@ "EINVAL", "ENFILE", "EMFILE", "ENOTTY", "ETXTBSY","EFBIG", "ENOSPC", "ESPIPE", "EROFS","EMLINK", "EPIPE", "EDOM", "ERANGE" }; - int i = 0, num = sizeof(err) / sizeof(void *); + int i = 0, num = sizeof(err) / sizeof(char *); while (i < num) { if (!strcmp(err[i], str)) @@ -236,7 +283,8 @@ return 0; } -static int ignore_error_type(struct thread_data *td, int etype, char *str) +static int ignore_error_type(struct thread_data *td, enum error_type_bit etype, + char *str) { unsigned int i; int *error; @@ -248,7 +296,7 @@ } td->o.ignore_error_nr[etype] = 4; - error = malloc(4 * sizeof(struct bssplit)); + error = calloc(4, sizeof(int)); i = 0; while ((fname = strsep(&str, ":")) != NULL) { @@ -269,11 +317,12 @@ } else { error[i] = atoi(fname); if (error[i] < 0) - error[i] = error[i]; + error[i] = -error[i]; } if (!error[i]) { - log_err("Unknown error %s, please use number 
value \n", + log_err("Unknown error %s, please use number value\n", fname); + td->o.ignore_error_nr[etype] = 0; free(error); return 1; } @@ -283,16 +332,58 @@ td->o.continue_on_error |= 1 << etype; td->o.ignore_error_nr[etype] = i; td->o.ignore_error[etype] = error; + } else { + td->o.ignore_error_nr[etype] = 0; + free(error); } + return 0; } +static int str_replay_skip_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + char *str, *p, *n; + int ret = 0; + + if (parse_dryrun()) + return 0; + + p = str = strdup(input); + + strip_blank_front(&str); + strip_blank_end(str); + + while (p) { + n = strchr(p, ','); + if (n) + *n++ = '\0'; + if (!strcmp(p, "read")) + td->o.replay_skip |= 1u << DDIR_READ; + else if (!strcmp(p, "write")) + td->o.replay_skip |= 1u << DDIR_WRITE; + else if (!strcmp(p, "trim")) + td->o.replay_skip |= 1u << DDIR_TRIM; + else if (!strcmp(p, "sync")) + td->o.replay_skip |= 1u << DDIR_SYNC; + else { + log_err("Unknown skip type: %s\n", p); + ret = 1; + break; + } + p = n; + } + free(str); + return ret; +} + static int str_ignore_error_cb(void *data, const char *input) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); char *str, *p, *n; - int type = 0, ret = 1; + int ret = 1; + enum error_type_bit type = 0; if (parse_dryrun()) return 0; @@ -318,9 +409,9 @@ static int str_rw_cb(void *data, const char *str) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); struct thread_options *o = &td->o; - char *nr = get_opt_postfix(str); + char *nr; if (parse_dryrun()) return 0; @@ -328,6 +419,7 @@ o->ddir_seq_nr = 1; o->ddir_seq_add = 0; + nr = get_opt_postfix(str); if (!nr) return 0; @@ -336,7 +428,7 @@ else { long long val; - if (str_to_decimal(nr, &val, 1, o)) { + if (str_to_decimal(nr, &val, 1, o, 0, 0)) { log_err("fio: rw postfix parsing failed\n"); free(nr); return 1; @@ -351,9 +443,10 @@ static int str_mem_cb(void *data, const char *mem) { - struct 
thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); - if (td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAP) + if (td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAP || + td->o.mem_type == MEM_MMAPSHARED) td->o.mmapfile = get_opt_postfix(mem); return 0; @@ -361,7 +454,7 @@ static int fio_clock_source_cb(void *data, const char *str) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); fio_clock_source = td->o.clocksource; fio_clock_source_set = 1; @@ -371,7 +464,7 @@ static int str_rwmix_read_cb(void *data, unsigned long long *val) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); td->o.rwmix[DDIR_READ] = *val; td->o.rwmix[DDIR_WRITE] = 100 - *val; @@ -380,7 +473,7 @@ static int str_rwmix_write_cb(void *data, unsigned long long *val) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); td->o.rwmix[DDIR_WRITE] = *val; td->o.rwmix[DDIR_READ] = 100 - *val; @@ -389,14 +482,39 @@ static int str_exitall_cb(void) { - exitall_on_terminate = 1; + exitall_on_terminate = true; return 0; } #ifdef FIO_HAVE_CPU_AFFINITY +int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu_index) +{ + unsigned int i, index, cpus_in_mask; + const long max_cpu = cpus_online(); + + cpus_in_mask = fio_cpu_count(mask); + if (!cpus_in_mask) + return 0; + + cpu_index = cpu_index % cpus_in_mask; + + index = 0; + for (i = 0; i < max_cpu; i++) { + if (!fio_cpu_isset(mask, i)) + continue; + + if (cpu_index != index) + fio_cpu_clear(mask, i); + + index++; + } + + return fio_cpu_count(mask); +} + static int str_cpumask_cb(void *data, unsigned long long *val) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); unsigned int i; long max_cpu; int ret; @@ -415,9 +533,9 @@ for (i = 0; i < sizeof(int) * 8; i++) { if ((1 << i) & *val) { - if (i > max_cpu) { + if (i >= max_cpu) { log_err("fio: CPU %d too large (max=%ld)\n", i, - max_cpu); + max_cpu - 
1); return 1; } dprint(FD_PARSE, "set cpu allowed %d\n", i); @@ -425,7 +543,6 @@ } } - td->o.cpumask_set = 1; return 0; } @@ -476,9 +593,9 @@ ret = 1; break; } - if (icpu > max_cpu) { + if (icpu >= max_cpu) { log_err("fio: CPU %d too large (max=%ld)\n", - icpu, max_cpu); + icpu, max_cpu - 1); ret = 1; break; } @@ -492,43 +609,48 @@ } free(p); - if (!ret) - td->o.cpumask_set = 1; return ret; } static int str_cpus_allowed_cb(void *data, const char *input) { - struct thread_data *td = data; - int ret; + struct thread_data *td = cb_data_to_td(data); if (parse_dryrun()) return 0; - ret = set_cpus_allowed(td, &td->o.cpumask, input); - if (!ret) - td->o.cpumask_set = 1; - - return ret; + return set_cpus_allowed(td, &td->o.cpumask, input); } static int str_verify_cpus_allowed_cb(void *data, const char *input) { - struct thread_data *td = data; - int ret; + struct thread_data *td = cb_data_to_td(data); - ret = set_cpus_allowed(td, &td->o.verify_cpumask, input); - if (!ret) - td->o.verify_cpumask_set = 1; + if (parse_dryrun()) + return 0; - return ret; + return set_cpus_allowed(td, &td->o.verify_cpumask, input); } -#endif + +#ifdef CONFIG_ZLIB +static int str_log_cpus_allowed_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + + if (parse_dryrun()) + return 0; + + return set_cpus_allowed(td, &td->o.log_gz_cpumask, input); +} +#endif /* CONFIG_ZLIB */ + +#endif /* FIO_HAVE_CPU_AFFINITY */ #ifdef CONFIG_LIBNUMA static int str_numa_cpunodes_cb(void *data, char *input) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); + struct bitmask *verify_bitmask; if (parse_dryrun()) return 0; @@ -538,24 +660,26 @@ * numa_allocate_nodemask(), so it should be freed by * numa_free_nodemask(). 
*/ - td->o.numa_cpunodesmask = numa_parse_nodestring(input); - if (td->o.numa_cpunodesmask == NULL) { + verify_bitmask = numa_parse_nodestring(input); + if (verify_bitmask == NULL) { log_err("fio: numa_parse_nodestring failed\n"); td_verror(td, 1, "str_numa_cpunodes_cb"); return 1; } + numa_free_nodemask(verify_bitmask); - td->o.numa_cpumask_set = 1; + td->o.numa_cpunodes = strdup(input); return 0; } static int str_numa_mpol_cb(void *data, char *input) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); const char * const policy_types[] = { "default", "prefer", "bind", "interleave", "local", NULL }; int i; char *nodelist; + struct bitmask *verify_bitmask; if (parse_dryrun()) return 0; @@ -635,12 +759,15 @@ break; case MPOL_INTERLEAVE: case MPOL_BIND: - td->o.numa_memnodesmask = numa_parse_nodestring(nodelist); - if (td->o.numa_memnodesmask == NULL) { + verify_bitmask = numa_parse_nodestring(nodelist); + if (verify_bitmask == NULL) { log_err("fio: numa_parse_nodestring failed\n"); td_verror(td, 1, "str_numa_memnodes_cb"); return 1; } + td->o.numa_memnodes = strdup(nodelist); + numa_free_nodemask(verify_bitmask); + break; case MPOL_LOCAL: case MPOL_DEFAULT: @@ -648,9 +775,7 @@ break; } - td->o.numa_memmask_set = 1; return 0; - out: return 1; } @@ -658,13 +783,78 @@ static int str_fst_cb(void *data, const char *str) { - struct thread_data *td = data; - char *nr = get_opt_postfix(str); + struct thread_data *td = cb_data_to_td(data); + double val; + bool done = false; + char *nr; td->file_service_nr = 1; - if (nr) { - td->file_service_nr = atoi(nr); + + switch (td->o.file_service_type) { + case FIO_FSERVICE_RANDOM: + case FIO_FSERVICE_RR: + case FIO_FSERVICE_SEQ: + nr = get_opt_postfix(str); + if (nr) { + td->file_service_nr = atoi(nr); + free(nr); + } + done = true; + break; + case FIO_FSERVICE_ZIPF: + val = FIO_DEF_ZIPF; + break; + case FIO_FSERVICE_PARETO: + val = FIO_DEF_PARETO; + break; + case FIO_FSERVICE_GAUSS: + val = 0.0; + break; 
+ default: + log_err("fio: bad file service type: %d\n", td->o.file_service_type); + return 1; + } + + if (done) + return 0; + + nr = get_opt_postfix(str); + if (nr && !str_to_float(nr, &val, 0)) { + log_err("fio: file service type random postfix parsing failed\n"); free(nr); + return 1; + } + + free(nr); + + switch (td->o.file_service_type) { + case FIO_FSERVICE_ZIPF: + if (val == 1.00) { + log_err("fio: zipf theta must be different than 1.0\n"); + return 1; + } + if (parse_dryrun()) + return 0; + td->zipf_theta = val; + break; + case FIO_FSERVICE_PARETO: + if (val <= 0.00 || val >= 1.00) { + log_err("fio: pareto input out of range (0 < input < 1.0)\n"); + return 1; + } + if (parse_dryrun()) + return 0; + td->pareto_h = val; + break; + case FIO_FSERVICE_GAUSS: + if (val < 0.00 || val >= 100.00) { + log_err("fio: normal deviation out of range (0 <= input < 100.0)\n"); + return 1; + } + if (parse_dryrun()) + return 0; + td->gauss_dev = val; + break; } return 0; @@ -673,7 +863,7 @@ #ifdef CONFIG_SYNC_FILE_RANGE static int str_sfr_cb(void *data, const char *str) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); char *nr = get_opt_postfix(str); td->sync_file_range_nr = 1; @@ -686,24 +876,175 @@ } #endif +static int zone_split_ddir(struct thread_options *o, enum fio_ddir ddir, + char *str, bool absolute) +{ + unsigned int i, perc, perc_missing, sperc, sperc_missing; + struct split split; + + memset(&split, 0, sizeof(split)); + + if (split_parse_ddir(o, &split, str, absolute, ZONESPLIT_MAX)) + return 1; + if (!split.nr) + return 0; + + o->zone_split[ddir] = malloc(split.nr * sizeof(struct zone_split)); + o->zone_split_nr[ddir] = split.nr; + for (i = 0; i < split.nr; i++) { + o->zone_split[ddir][i].access_perc = split.val1[i]; + if (absolute) + o->zone_split[ddir][i].size = split.val2[i]; + else + o->zone_split[ddir][i].size_perc = split.val2[i]; + } + + /* + * Now check if the percentages add up, and how much is missing + */ + perc = 
perc_missing = 0; + sperc = sperc_missing = 0; + for (i = 0; i < o->zone_split_nr[ddir]; i++) { + struct zone_split *zsp = &o->zone_split[ddir][i]; + + if (zsp->access_perc == (uint8_t) -1U) + perc_missing++; + else + perc += zsp->access_perc; + + if (!absolute) { + if (zsp->size_perc == (uint8_t) -1U) + sperc_missing++; + else + sperc += zsp->size_perc; + } + } + + if (perc > 100 || sperc > 100) { + log_err("fio: zone_split percentages add to more than 100%%\n"); + free(o->zone_split[ddir]); + o->zone_split[ddir] = NULL; + return 1; + } + if (perc < 100) { + log_err("fio: access percentage don't add up to 100 for zoned " + "random distribution (got=%u)\n", perc); + free(o->zone_split[ddir]); + o->zone_split[ddir] = NULL; + return 1; + } + + /* + * If values didn't have a percentage set, divide the remains between + * them. + */ + if (perc_missing) { + if (perc_missing == 1 && o->zone_split_nr[ddir] == 1) + perc = 100; + for (i = 0; i < o->zone_split_nr[ddir]; i++) { + struct zone_split *zsp = &o->zone_split[ddir][i]; + + if (zsp->access_perc == (uint8_t) -1U) + zsp->access_perc = (100 - perc) / perc_missing; + } + } + if (sperc_missing) { + if (sperc_missing == 1 && o->zone_split_nr[ddir] == 1) + sperc = 100; + for (i = 0; i < o->zone_split_nr[ddir]; i++) { + struct zone_split *zsp = &o->zone_split[ddir][i]; + + if (zsp->size_perc == (uint8_t) -1U) + zsp->size_perc = (100 - sperc) / sperc_missing; + } + } + + return 0; +} + +static int parse_zoned_distribution(struct thread_data *td, const char *input, + bool absolute) +{ + const char *pre = absolute ? 
"zoned_abs:" : "zoned:"; + char *str, *p; + int i, ret = 0; + + p = str = strdup(input); + + strip_blank_front(&str); + strip_blank_end(str); + + /* We expect it to start like that, bail if not */ + if (strncmp(str, pre, strlen(pre))) { + log_err("fio: mismatch in zoned input <%s>\n", str); + free(p); + return 1; + } + str += strlen(pre); + + ret = str_split_parse(td, str, zone_split_ddir, absolute); + + free(p); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + int j; + + dprint(FD_PARSE, "zone ddir %d (nr=%u): \n", i, td->o.zone_split_nr[i]); + + for (j = 0; j < td->o.zone_split_nr[i]; j++) { + struct zone_split *zsp = &td->o.zone_split[i][j]; + + if (absolute) { + dprint(FD_PARSE, "\t%d: %u/%llu\n", j, + zsp->access_perc, + (unsigned long long) zsp->size); + } else { + dprint(FD_PARSE, "\t%d: %u/%u\n", j, + zsp->access_perc, + zsp->size_perc); + } + } + } + + if (parse_dryrun()) { + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + free(td->o.zone_split[i]); + td->o.zone_split[i] = NULL; + td->o.zone_split_nr[i] = 0; + } + + return ret; + } + + if (ret) { + for (i = 0; i < DDIR_RWDIR_CNT; i++) + td->o.zone_split_nr[i] = 0; + } + + return ret; +} + static int str_random_distribution_cb(void *data, const char *str) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); double val; char *nr; - if (parse_dryrun()) - return 0; - if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) - val = 1.1; + val = FIO_DEF_ZIPF; else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) - val = 0.2; + val = FIO_DEF_PARETO; + else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS) + val = 0.0; + else if (td->o.random_distribution == FIO_RAND_DIST_ZONED) + return parse_zoned_distribution(td, str, false); + else if (td->o.random_distribution == FIO_RAND_DIST_ZONED_ABS) + return parse_zoned_distribution(td, str, true); else return 0; nr = get_opt_postfix(str); - if (nr && !str_to_float(nr, &val)) { + if (nr && !str_to_float(nr, &val, 0)) { log_err("fio: random 
postfix parsing failed\n"); free(nr); return 1; @@ -716,30 +1057,114 @@ log_err("fio: zipf theta must different than 1.0\n"); return 1; } + if (parse_dryrun()) + return 0; td->o.zipf_theta.u.f = val; - } else { + } else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) { if (val <= 0.00 || val >= 1.00) { log_err("fio: pareto input out of range (0 < input < 1.0)\n"); return 1; } + if (parse_dryrun()) + return 0; td->o.pareto_h.u.f = val; + } else { + if (val < 0.00 || val >= 100.0) { + log_err("fio: normal deviation out of range (0 <= input < 100.0)\n"); + return 1; + } + if (parse_dryrun()) + return 0; + td->o.gauss_dev.u.f = val; } return 0; } -/* - * Return next file in the string. Files are separated with ':'. If the ':' - * is escaped with a '\', then that ':' is part of the filename and does not - * indicate a new file. - */ -static char *get_next_file_name(char **ptr) +static int str_steadystate_cb(void *data, const char *str) { - char *str = *ptr; - char *p, *start; - - if (!str || !strlen(str)) - return NULL; + struct thread_data *td = cb_data_to_td(data); + double val; + char *nr; + char *pct; + long long ll; + + if (td->o.ss_state != FIO_SS_IOPS && td->o.ss_state != FIO_SS_IOPS_SLOPE && + td->o.ss_state != FIO_SS_BW && td->o.ss_state != FIO_SS_BW_SLOPE) { + /* should be impossible to get here */ + log_err("fio: unknown steady state criterion\n"); + return 1; + } + + nr = get_opt_postfix(str); + if (!nr) { + log_err("fio: steadystate threshold must be specified in addition to criterion\n"); + free(nr); + return 1; + } + + /* ENHANCEMENT Allow fio to understand size=10.2% and use here */ + pct = strstr(nr, "%"); + if (pct) { + *pct = '\0'; + strip_blank_end(nr); + if (!str_to_float(nr, &val, 0)) { + log_err("fio: could not parse steadystate threshold percentage\n"); + free(nr); + return 1; + } + + dprint(FD_PARSE, "set steady state threshold to %f%%\n", val); + free(nr); + if (parse_dryrun()) + return 0; + + td->o.ss_state |= FIO_SS_PCT; + 
td->o.ss_limit.u.f = val; + } else if (td->o.ss_state & FIO_SS_IOPS) { + if (!str_to_float(nr, &val, 0)) { + log_err("fio: steadystate IOPS threshold postfix parsing failed\n"); + free(nr); + return 1; + } + + dprint(FD_PARSE, "set steady state IOPS threshold to %f\n", val); + free(nr); + if (parse_dryrun()) + return 0; + + td->o.ss_limit.u.f = val; + } else { /* bandwidth criterion */ + if (str_to_decimal(nr, &ll, 1, td, 0, 0)) { + log_err("fio: steadystate BW threshold postfix parsing failed\n"); + free(nr); + return 1; + } + + dprint(FD_PARSE, "set steady state BW threshold to %lld\n", ll); + free(nr); + if (parse_dryrun()) + return 0; + + td->o.ss_limit.u.f = (double) ll; + } + + td->ss.state = td->o.ss_state; + return 0; +} + +/* + * Return next name in the string. Files are separated with ':'. If the ':' + * is escaped with a '\', then that ':' is part of the filename and does not + * indicate a new file. + */ +char *get_next_str(char **ptr) +{ + char *str = *ptr; + char *p, *start; + + if (!str || !strlen(str)) + return NULL; start = str; do { @@ -774,9 +1199,70 @@ return start; } + +int get_max_str_idx(char *input) +{ + unsigned int cur_idx; + char *str, *p; + + p = str = strdup(input); + for (cur_idx = 0; ; cur_idx++) + if (get_next_str(&str) == NULL) + break; + + free(p); + return cur_idx; +} + +/* + * Returns the directory at the index, indexes > entires will be + * assigned via modulo division of the index + */ +int set_name_idx(char *target, size_t tlen, char *input, int index, + bool unique_filename) +{ + unsigned int cur_idx; + int len; + char *fname, *str, *p; + + p = str = strdup(input); + + index %= get_max_str_idx(input); + for (cur_idx = 0; cur_idx <= index; cur_idx++) + fname = get_next_str(&str); + + if (client_sockaddr_str[0] && unique_filename) { + len = snprintf(target, tlen, "%s/%s.", fname, + client_sockaddr_str); + } else + len = snprintf(target, tlen, "%s/", fname); + + target[tlen - 1] = '\0'; + free(p); + + return len; +} + +char* 
get_name_by_idx(char *input, int index) +{ + unsigned int cur_idx; + char *fname, *str, *p; + + p = str = strdup(input); + + index %= get_max_str_idx(input); + for (cur_idx = 0; cur_idx <= index; cur_idx++) + fname = get_next_str(&str); + + fname = strdup(fname); + free(p); + + return fname; +} + static int str_filename_cb(void *data, const char *input) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); char *fname, *str, *p; p = str = strdup(input); @@ -784,46 +1270,56 @@ strip_blank_front(&str); strip_blank_end(str); + /* + * Ignore what we may already have from nrfiles option. + */ if (!td->files_index) td->o.nr_files = 0; - while ((fname = get_next_file_name(&str)) != NULL) { + while ((fname = get_next_str(&str)) != NULL) { if (!strlen(fname)) break; - add_file(td, fname); - td->o.nr_files++; + add_file(td, fname, 0, 1); } free(p); return 0; } -static int str_directory_cb(void *data, const char fio_unused *str) +static int str_directory_cb(void *data, const char fio_unused *unused) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); struct stat sb; + char *dirname, *str, *p; + int ret = 0; if (parse_dryrun()) return 0; - if (lstat(td->o.directory, &sb) < 0) { - int ret = errno; + p = str = strdup(td->o.directory); + while ((dirname = get_next_str(&str)) != NULL) { + if (lstat(dirname, &sb) < 0) { + ret = errno; - log_err("fio: %s is not a directory\n", td->o.directory); - td_verror(td, ret, "lstat"); - return 1; - } - if (!S_ISDIR(sb.st_mode)) { - log_err("fio: %s is not a directory\n", td->o.directory); - return 1; + log_err("fio: %s is not a directory\n", dirname); + td_verror(td, ret, "lstat"); + goto out; + } + if (!S_ISDIR(sb.st_mode)) { + log_err("fio: %s is not a directory\n", dirname); + ret = 1; + goto out; + } } - return 0; +out: + free(p); + return ret; } static int str_opendir_cb(void *data, const char fio_unused *str) { - struct thread_data *td = data; + struct thread_data *td = 
cb_data_to_td(data); if (parse_dryrun()) return 0; @@ -834,81 +1330,73 @@ return add_dir_files(td, td->o.opendir); } -static int str_verify_pattern_cb(void *data, const char *input) +static int str_buffer_pattern_cb(void *data, const char *input) { - struct thread_data *td = data; - long off; - int i = 0, j = 0, len, k, base = 10, pattern_length; - char *loc1, *loc2; - - loc1 = strstr(input, "0x"); - loc2 = strstr(input, "0X"); - if (loc1 || loc2) - base = 16; - off = strtol(input, NULL, base); - if (off != LONG_MAX || errno != ERANGE) { - while (off) { - td->o.verify_pattern[i] = off & 0xff; - off >>= 8; - i++; - } - } else { - len = strlen(input); - k = len - 1; - if (base == 16) { - if (loc1) - j = loc1 - input + 2; - else - j = loc2 - input + 2; - } else - return 1; - if (len - j < MAX_PATTERN_SIZE * 2) { - while (k >= j) { - off = converthexchartoint(input[k--]); - if (k >= j) - off += (converthexchartoint(input[k--]) - * 16); - td->o.verify_pattern[i++] = (char) off; - } - } - } + struct thread_data *td = cb_data_to_td(data); + int ret; - /* - * Fill the pattern all the way to the end. This greatly reduces - * the number of memcpy's we have to do when verifying the IO. - */ - pattern_length = i; - while (i > 1 && i * 2 <= MAX_PATTERN_SIZE) { - memcpy(&td->o.verify_pattern[i], &td->o.verify_pattern[0], i); - i *= 2; - } + /* FIXME: for now buffer pattern does not support formats */ + ret = parse_and_fill_pattern(input, strlen(input), td->o.buffer_pattern, + MAX_PATTERN_SIZE, NULL, 0, NULL, NULL); + if (ret < 0) + return 1; + + assert(ret != 0); + td->o.buffer_pattern_bytes = ret; /* - * Fill remainder, if the pattern multiple ends up not being - * MAX_PATTERN_SIZE. + * If this job is doing any reading or has compression set, + * ensure that we refill buffers for writes or we could be + * invalidating the pattern through reads. 
*/ - while (i > 1 && i < MAX_PATTERN_SIZE) { - unsigned int b = min(pattern_length, MAX_PATTERN_SIZE - i); + if (!td->o.compress_percentage && !td_read(td)) + td->o.refill_buffers = 0; + else + td->o.refill_buffers = 1; - memcpy(&td->o.verify_pattern[i], &td->o.verify_pattern[0], b); - i += b; - } + td->o.scramble_buffers = 0; + td->o.zero_buffers = 0; - if (i == 1) { - /* - * The code in verify_io_u_pattern assumes a single byte pattern - * fills the whole verify pattern buffer. - */ - memset(td->o.verify_pattern, td->o.verify_pattern[0], - MAX_PATTERN_SIZE); - } + return 0; +} + +static int str_buffer_compress_cb(void *data, unsigned long long *il) +{ + struct thread_data *td = cb_data_to_td(data); + + td->flags |= TD_F_COMPRESS; + td->o.compress_percentage = *il; + return 0; +} + +static int str_dedupe_cb(void *data, unsigned long long *il) +{ + struct thread_data *td = cb_data_to_td(data); + + td->flags |= TD_F_COMPRESS; + td->o.dedupe_percentage = *il; + td->o.refill_buffers = 1; + return 0; +} + +static int str_verify_pattern_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + int ret; - td->o.verify_pattern_bytes = i; + td->o.verify_fmt_sz = ARRAY_SIZE(td->o.verify_fmt); + ret = parse_and_fill_pattern(input, strlen(input), td->o.verify_pattern, + MAX_PATTERN_SIZE, fmt_desc, sizeof(fmt_desc), + td->o.verify_fmt, &td->o.verify_fmt_sz); + if (ret < 0) + return 1; + assert(ret != 0); + td->o.verify_pattern_bytes = ret; /* - * VERIFY_META could already be set + * VERIFY_* could already be set */ - if (td->o.verify == VERIFY_NONE) + if (!fio_option_is_set(&td->o, verify)) td->o.verify = VERIFY_PATTERN; return 0; @@ -916,7 +1404,7 @@ static int str_gtod_reduce_cb(void *data, int *il) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); int val = *il; td->o.disable_lat = !!val; @@ -925,216 +1413,162 @@ td->o.disable_bw = !!val; td->o.clat_percentiles = !val; if (val) - td->tv_cache_mask = 63; + 
td->ts_cache_mask = 63; return 0; } -static int str_gtod_cpu_cb(void *data, long long *il) +static int str_offset_cb(void *data, unsigned long long *__val) { - struct thread_data *td = data; - int val = *il; + struct thread_data *td = cb_data_to_td(data); + unsigned long long v = *__val; + + if (parse_is_percent(v)) { + td->o.start_offset = 0; + td->o.start_offset_percent = -1ULL - v; + dprint(FD_PARSE, "SET start_offset_percent %d\n", + td->o.start_offset_percent); + } else + td->o.start_offset = v; + + return 0; +} + +static int str_offset_increment_cb(void *data, unsigned long long *__val) +{ + struct thread_data *td = cb_data_to_td(data); + unsigned long long v = *__val; + + if (parse_is_percent(v)) { + td->o.offset_increment = 0; + td->o.offset_increment_percent = -1ULL - v; + dprint(FD_PARSE, "SET offset_increment_percent %d\n", + td->o.offset_increment_percent); + } else + td->o.offset_increment = v; - td->o.gtod_cpu = val; - td->o.gtod_offload = 1; return 0; } static int str_size_cb(void *data, unsigned long long *__val) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); unsigned long long v = *__val; if (parse_is_percent(v)) { td->o.size = 0; td->o.size_percent = -1ULL - v; + dprint(FD_PARSE, "SET size_percent %d\n", + td->o.size_percent); } else td->o.size = v; return 0; } -static int rw_verify(struct fio_option *o, void *data) +static int str_write_bw_log_cb(void *data, const char *str) { - struct thread_data *td = data; + struct thread_data *td = cb_data_to_td(data); - if (read_only && td_write(td)) { - log_err("fio: job <%s> has write bit set, but fio is in" - " read-only mode\n", td->o.name); - return 1; - } + if (str) + td->o.bw_log_file = strdup(str); + td->o.write_bw_log = 1; return 0; } -static int gtod_cpu_verify(struct fio_option *o, void *data) +static int str_write_lat_log_cb(void *data, const char *str) { -#ifndef FIO_HAVE_CPU_AFFINITY - struct thread_data *td = data; + struct thread_data *td = 
cb_data_to_td(data); - if (td->o.gtod_cpu) { - log_err("fio: platform must support CPU affinity for" - "gettimeofday() offloading\n"); - return 1; - } -#endif + if (str) + td->o.lat_log_file = strdup(str); + + td->o.write_lat_log = 1; + return 0; +} +static int str_write_iops_log_cb(void *data, const char *str) +{ + struct thread_data *td = cb_data_to_td(data); + + if (str) + td->o.iops_log_file = strdup(str); + + td->o.write_iops_log = 1; + return 0; +} + +static int str_write_hist_log_cb(void *data, const char *str) +{ + struct thread_data *td = cb_data_to_td(data); + + if (str) + td->o.hist_log_file = strdup(str); + + td->o.write_hist_log = 1; return 0; } /* - * Option grouping + * str is supposed to be a substring of the strdup'd original string, + * and is valid only if it's a regular file path. + * This function keeps the pointer to the path as needed later. + * + * "external:/path/to/so\0" <- original pointer updated with strdup'd + * "external\0" <- above pointer after parsed, i.e. ->ioengine + * "/path/to/so\0" <- str argument, i.e. 
->ioengine_so_path */ -static struct opt_group fio_opt_groups[] = { - { - .name = "General", - .mask = FIO_OPT_C_GENERAL, - }, - { - .name = "I/O", - .mask = FIO_OPT_C_IO, - }, - { - .name = "File", - .mask = FIO_OPT_C_FILE, - }, - { - .name = "Statistics", - .mask = FIO_OPT_C_STAT, - }, - { - .name = "Logging", - .mask = FIO_OPT_C_LOG, - }, - { - .name = "Profiles", - .mask = FIO_OPT_C_PROFILE, - }, - { - .name = NULL, - }, -}; - -static struct opt_group *__opt_group_from_mask(struct opt_group *ogs, unsigned int *mask, - unsigned int inv_mask) +static int str_ioengine_external_cb(void *data, const char *str) { - struct opt_group *og; - int i; + struct thread_data *td = cb_data_to_td(data); + struct stat sb; + char *p; - if (*mask == inv_mask || !*mask) - return NULL; + if (!str) { + log_err("fio: null external ioengine path\n"); + return 1; + } - for (i = 0; ogs[i].name; i++) { - og = &ogs[i]; + p = (char *)str; /* str is mutable */ + strip_blank_front(&p); + strip_blank_end(p); - if (*mask & og->mask) { - *mask &= ~(og->mask); - return og; - } + if (stat(p, &sb) || !S_ISREG(sb.st_mode)) { + log_err("fio: invalid external ioengine path \"%s\"\n", p); + return 1; } - return NULL; + td->o.ioengine_so_path = p; + return 0; } -struct opt_group *opt_group_from_mask(unsigned int *mask) +static int rw_verify(const struct fio_option *o, void *data) { - return __opt_group_from_mask(fio_opt_groups, mask, FIO_OPT_C_INVALID); + struct thread_data *td = cb_data_to_td(data); + + if (read_only && (td_write(td) || td_trim(td))) { + log_err("fio: job <%s> has write or trim bit set, but" + " fio is in read-only mode\n", td->o.name); + return 1; + } + + return 0; } -static struct opt_group fio_opt_cat_groups[] = { - { - .name = "Rate", - .mask = FIO_OPT_G_RATE, - }, - { - .name = "Zone", - .mask = FIO_OPT_G_ZONE, - }, - { - .name = "Read/write mix", - .mask = FIO_OPT_G_RWMIX, - }, - { - .name = "Verify", - .mask = FIO_OPT_G_VERIFY, - }, - { - .name = "Trim", - .mask = 
FIO_OPT_G_TRIM, - }, - { - .name = "I/O Logging", - .mask = FIO_OPT_G_IOLOG, - }, - { - .name = "I/O Depth", - .mask = FIO_OPT_G_IO_DEPTH, - }, - { - .name = "I/O Flow", - .mask = FIO_OPT_G_IO_FLOW, - }, - { - .name = "Description", - .mask = FIO_OPT_G_DESC, - }, - { - .name = "Filename", - .mask = FIO_OPT_G_FILENAME, - }, - { - .name = "General I/O", - .mask = FIO_OPT_G_IO_BASIC, - }, - { - .name = "Cgroups", - .mask = FIO_OPT_G_CGROUP, - }, - { - .name = "Runtime", - .mask = FIO_OPT_G_RUNTIME, - }, - { - .name = "Process", - .mask = FIO_OPT_G_PROCESS, - }, - { - .name = "Job credentials / priority", - .mask = FIO_OPT_G_CRED, - }, - { - .name = "Clock settings", - .mask = FIO_OPT_G_CLOCK, - }, - { - .name = "I/O Type", - .mask = FIO_OPT_G_IO_TYPE, - }, - { - .name = "I/O Thinktime", - .mask = FIO_OPT_G_THINKTIME, - }, - { - .name = "Randomizations", - .mask = FIO_OPT_G_RANDOM, - }, - { - .name = "I/O buffers", - .mask = FIO_OPT_G_IO_BUF, - }, - { - .name = "Tiobench profile", - .mask = FIO_OPT_G_TIOBENCH, - }, +static int gtod_cpu_verify(const struct fio_option *o, void *data) +{ +#ifndef FIO_HAVE_CPU_AFFINITY + struct thread_data *td = cb_data_to_td(data); - { - .name = NULL, + if (td->o.gtod_cpu) { + log_err("fio: platform must support CPU affinity for" + "gettimeofday() offloading\n"); + return 1; } -}; +#endif -struct opt_group *opt_group_cat_from_mask(unsigned int *mask) -{ - return __opt_group_from_mask(fio_opt_cat_groups, mask, FIO_OPT_G_INVALID); + return 0; } /* @@ -1145,7 +1579,7 @@ .name = "description", .lname = "Description of job", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(description), + .off1 = offsetof(struct thread_options, description), .help = "Text job description", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_DESC, @@ -1154,16 +1588,25 @@ .name = "name", .lname = "Job name", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(name), + .off1 = offsetof(struct thread_options, name), .help = "Name of this job", .category = 
FIO_OPT_C_GENERAL, .group = FIO_OPT_G_DESC, }, { + .name = "wait_for", + .lname = "Waitee name", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, wait_for), + .help = "Name of the job this one wants to wait for before starting", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_DESC, + }, + { .name = "filename", .lname = "Filename(s)", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(filename), + .off1 = offsetof(struct thread_options, filename), .cb = str_filename_cb, .prio = -1, /* must come after "directory" */ .help = "File(s) to use for the workload", @@ -1174,7 +1617,7 @@ .name = "directory", .lname = "Directory", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(directory), + .off1 = offsetof(struct thread_options, directory), .cb = str_directory_cb, .help = "Directory to store files in", .category = FIO_OPT_C_FILE, @@ -1182,8 +1625,9 @@ }, { .name = "filename_format", + .lname = "Filename Format", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(filename_format), + .off1 = offsetof(struct thread_options, filename_format), .prio = -1, /* must come after "directory" */ .help = "Override default $jobname.$jobnum.$filenum naming", .def = "$jobname.$jobnum.$filenum", @@ -1191,11 +1635,22 @@ .group = FIO_OPT_G_FILENAME, }, { + .name = "unique_filename", + .lname = "Unique Filename", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, unique_filename), + .help = "For network clients, prefix file with source IP", + .def = "1", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_FILENAME, + }, + { .name = "lockfile", .lname = "Lockfile", .type = FIO_OPT_STR, - .off1 = td_var_offset(file_lock_mode), + .off1 = offsetof(struct thread_options, file_lock_mode), .help = "Lock file when doing IO to it", + .prio = 1, .parent = "filename", .hide = 0, .def = "none", @@ -1221,7 +1676,7 @@ .name = "opendir", .lname = "Open directory", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(opendir), + .off1 = offsetof(struct 
thread_options, opendir), .cb = str_opendir_cb, .help = "Recursively add files from this directory and down", .category = FIO_OPT_C_FILE, @@ -1233,7 +1688,7 @@ .alias = "readwrite", .type = FIO_OPT_STR, .cb = str_rw_cb, - .off1 = td_var_offset(td_ddir), + .off1 = offsetof(struct thread_options, td_ddir), .help = "IO direction", .def = "read", .verify = rw_verify, @@ -1276,13 +1731,17 @@ .oval = TD_DDIR_RANDRW, .help = "Random read and write mix" }, + { .ival = "trimwrite", + .oval = TD_DDIR_TRIMWRITE, + .help = "Trim and write mix, trims preceding writes" + }, }, }, { .name = "rw_sequencer", .lname = "RW Sequencer", .type = FIO_OPT_STR, - .off1 = td_var_offset(rw_seq), + .off1 = offsetof(struct thread_options, rw_seq), .help = "IO offset generator modifier", .def = "sequential", .category = FIO_OPT_C_IO, @@ -1303,7 +1762,7 @@ .name = "ioengine", .lname = "IO Engine", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(ioengine), + .off1 = offsetof(struct thread_options, ioengine), .help = "IO engine to use", .def = FIO_PREFERRED_ENGINE, .category = FIO_OPT_C_IO, @@ -1323,11 +1782,21 @@ .help = "Use preadv/pwritev", }, #endif +#ifdef FIO_HAVE_PWRITEV2 + { .ival = "pvsync2", + .help = "Use preadv2/pwritev2", + }, +#endif #ifdef CONFIG_LIBAIO { .ival = "libaio", .help = "Linux native asynchronous IO", }, #endif +#ifdef ARCH_HAVE_IOURING + { .ival = "io_uring", + .help = "Fast Linux native aio", + }, +#endif #ifdef CONFIG_POSIXAIO { .ival = "posixaio", .help = "POSIX asynchronous IO", @@ -1343,6 +1812,11 @@ .help = "Windows native asynchronous IO" }, #endif +#ifdef CONFIG_RBD + { .ival = "rbd", + .help = "Rados Block Device asynchronous IO" + }, +#endif { .ival = "mmap", .help = "Memory mapped IO" }, @@ -1373,21 +1847,11 @@ .help = "GUASI IO engine", }, #endif -#ifdef FIO_HAVE_BINJECT - { .ival = "binject", - .help = "binject direct inject block engine", - }, -#endif #ifdef CONFIG_RDMA { .ival = "rdma", .help = "RDMA IO engine", }, #endif -#ifdef CONFIG_FUSION_AW - { 
.ival = "fusion-aw-sync", - .help = "Fusion-io atomic write engine", - }, -#endif #ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT { .ival = "e4defrag", .help = "ext4 defrag engine", @@ -1398,8 +1862,61 @@ .help = "fallocate() file based engine", }, #endif +#ifdef CONFIG_GFAPI + { .ival = "gfapi", + .help = "Glusterfs libgfapi(sync) based engine" + }, + { .ival = "gfapi_async", + .help = "Glusterfs libgfapi(async) based engine" + }, +#endif +#ifdef CONFIG_LIBHDFS + { .ival = "libhdfs", + .help = "Hadoop Distributed Filesystem (HDFS) engine" + }, +#endif +#ifdef CONFIG_PMEMBLK + { .ival = "pmemblk", + .help = "PMDK libpmemblk based IO engine", + }, + +#endif +#ifdef CONFIG_IME + { .ival = "ime_psync", + .help = "DDN's IME synchronous IO engine", + }, + { .ival = "ime_psyncv", + .help = "DDN's IME synchronous IO engine using iovecs", + }, + { .ival = "ime_aio", + .help = "DDN's IME asynchronous IO engine", + }, +#endif +#ifdef CONFIG_LINUX_DEVDAX + { .ival = "dev-dax", + .help = "DAX Device based IO engine", + }, +#endif + { + .ival = "filecreate", + .help = "File creation engine", + }, { .ival = "external", .help = "Load external engine (append name)", + .cb = str_ioengine_external_cb, + }, +#ifdef CONFIG_LIBPMEM + { .ival = "libpmem", + .help = "PMDK libpmem based IO engine", + }, +#endif +#ifdef CONFIG_HTTP + { .ival = "http", + .help = "HTTP (WebDAV/S3) IO engine", + }, +#endif + { .ival = "nbd", + .help = "Network Block Device (NBD) IO engine" }, }, }, @@ -1407,7 +1924,7 @@ .name = "iodepth", .lname = "IO Depth", .type = FIO_OPT_INT, - .off1 = td_var_offset(iodepth), + .off1 = offsetof(struct thread_options, iodepth), .help = "Number of IO buffers to keep in flight", .minval = 1, .interval = 1, @@ -1420,22 +1937,22 @@ .lname = "IO Depth batch", .alias = "iodepth_batch_submit", .type = FIO_OPT_INT, - .off1 = td_var_offset(iodepth_batch), + .off1 = offsetof(struct thread_options, iodepth_batch), .help = "Number of IO buffers to submit in one go", .parent = "iodepth", .hide = 
1, - .minval = 1, .interval = 1, .def = "1", .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_BASIC, }, { - .name = "iodepth_batch_complete", - .lname = "IO Depth batch complete", + .name = "iodepth_batch_complete_min", + .lname = "Min IO depth batch complete", + .alias = "iodepth_batch_complete", .type = FIO_OPT_INT, - .off1 = td_var_offset(iodepth_batch_complete), - .help = "Number of IO buffers to retrieve in one go", + .off1 = offsetof(struct thread_options, iodepth_batch_complete_min), + .help = "Min number of IO buffers to retrieve in one go", .parent = "iodepth", .hide = 1, .minval = 0, @@ -1445,10 +1962,23 @@ .group = FIO_OPT_G_IO_BASIC, }, { + .name = "iodepth_batch_complete_max", + .lname = "Max IO depth batch complete", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, iodepth_batch_complete_max), + .help = "Max number of IO buffers to retrieve in one go", + .parent = "iodepth", + .hide = 1, + .minval = 0, + .interval = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, + { .name = "iodepth_low", .lname = "IO Depth batch low", .type = FIO_OPT_INT, - .off1 = td_var_offset(iodepth_low), + .off1 = offsetof(struct thread_options, iodepth_low), .help = "Low water mark for queuing depth", .parent = "iodepth", .hide = 1, @@ -1457,21 +1987,64 @@ .group = FIO_OPT_G_IO_BASIC, }, { + .name = "serialize_overlap", + .lname = "Serialize overlap", + .off1 = offsetof(struct thread_options, serialize_overlap), + .type = FIO_OPT_BOOL, + .help = "Wait for in-flight IOs that collide to complete", + .parent = "iodepth", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + }, + { + .name = "io_submit_mode", + .lname = "IO submit mode", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, io_submit_mode), + .help = "How IO submissions and completions are done", + .def = "inline", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BASIC, + .posval = { + { .ival = "inline", + .oval = IO_MODE_INLINE, + .help = 
"Submit and complete IO inline", + }, + { .ival = "offload", + .oval = IO_MODE_OFFLOAD, + .help = "Offload submit and complete to threads", + }, + }, + }, + { .name = "size", .lname = "Size", .type = FIO_OPT_STR_VAL, .cb = str_size_cb, + .off1 = offsetof(struct thread_options, size), .help = "Total size of device or files", .interval = 1024 * 1024, .category = FIO_OPT_C_IO, .group = FIO_OPT_G_INVALID, }, { + .name = "io_size", + .alias = "io_limit", + .lname = "IO Size", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, io_size), + .help = "Total size of I/O to be performed", + .interval = 1024 * 1024, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { .name = "fill_device", .lname = "Fill device", .alias = "fill_fs", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(fill_device), + .off1 = offsetof(struct thread_options, fill_device), .help = "Write until an ENOSPC error occurs", .def = "0", .category = FIO_OPT_C_FILE, @@ -1481,8 +2054,8 @@ .name = "filesize", .lname = "File size", .type = FIO_OPT_STR_VAL, - .off1 = td_var_offset(file_size_low), - .off2 = td_var_offset(file_size_high), + .off1 = offsetof(struct thread_options, file_size_low), + .off2 = offsetof(struct thread_options, file_size_high), .minval = 1, .help = "Size of individual files", .interval = 1024 * 1024, @@ -1490,11 +2063,22 @@ .group = FIO_OPT_G_INVALID, }, { + .name = "file_append", + .lname = "File append", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, file_append), + .help = "IO will start at the end of the file(s)", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { .name = "offset", .lname = "IO offset", .alias = "fileoffset", .type = FIO_OPT_STR_VAL, - .off1 = td_var_offset(start_offset), + .cb = str_offset_cb, + .off1 = offsetof(struct thread_options, start_offset), .help = "Start IO from this offset", .def = "0", .interval = 1024 * 1024, @@ -1502,10 +2086,22 @@ .group = FIO_OPT_G_INVALID, }, { + .name = 
"offset_align", + .lname = "IO offset alignment", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, start_offset_align), + .help = "Start IO from this offset alignment", + .def = "0", + .interval = 512, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { .name = "offset_increment", .lname = "IO offset increment", .type = FIO_OPT_STR_VAL, - .off1 = td_var_offset(offset_increment), + .cb = str_offset_increment_cb, + .off1 = offsetof(struct thread_options, offset_increment), .help = "What is the increment from one offset to the next", .parent = "offset", .hide = 1, @@ -1518,8 +2114,8 @@ .name = "number_ios", .lname = "Number of IOs to perform", .type = FIO_OPT_STR_VAL, - .off1 = td_var_offset(number_ios), - .help = "Force job completion of this number of IOs", + .off1 = offsetof(struct thread_options, number_ios), + .help = "Force job completion after this number of IOs", .def = "0", .category = FIO_OPT_C_IO, .group = FIO_OPT_G_INVALID, @@ -1528,13 +2124,13 @@ .name = "bs", .lname = "Block size", .alias = "blocksize", - .type = FIO_OPT_INT, - .off1 = td_var_offset(bs[DDIR_READ]), - .off2 = td_var_offset(bs[DDIR_WRITE]), - .off3 = td_var_offset(bs[DDIR_TRIM]), + .type = FIO_OPT_ULL, + .off1 = offsetof(struct thread_options, bs[DDIR_READ]), + .off2 = offsetof(struct thread_options, bs[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, bs[DDIR_TRIM]), .minval = 1, .help = "Block size unit", - .def = "4k", + .def = "4096", .parent = "rw", .hide = 1, .interval = 512, @@ -1545,10 +2141,10 @@ .name = "ba", .lname = "Block size align", .alias = "blockalign", - .type = FIO_OPT_INT, - .off1 = td_var_offset(ba[DDIR_READ]), - .off2 = td_var_offset(ba[DDIR_WRITE]), - .off3 = td_var_offset(ba[DDIR_TRIM]), + .type = FIO_OPT_ULL, + .off1 = offsetof(struct thread_options, ba[DDIR_READ]), + .off2 = offsetof(struct thread_options, ba[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, ba[DDIR_TRIM]), .minval = 1, .help = "IO block offset 
alignment", .parent = "rw", @@ -1562,12 +2158,12 @@ .lname = "Block size range", .alias = "blocksize_range", .type = FIO_OPT_RANGE, - .off1 = td_var_offset(min_bs[DDIR_READ]), - .off2 = td_var_offset(max_bs[DDIR_READ]), - .off3 = td_var_offset(min_bs[DDIR_WRITE]), - .off4 = td_var_offset(max_bs[DDIR_WRITE]), - .off5 = td_var_offset(min_bs[DDIR_TRIM]), - .off6 = td_var_offset(max_bs[DDIR_TRIM]), + .off1 = offsetof(struct thread_options, min_bs[DDIR_READ]), + .off2 = offsetof(struct thread_options, max_bs[DDIR_READ]), + .off3 = offsetof(struct thread_options, min_bs[DDIR_WRITE]), + .off4 = offsetof(struct thread_options, max_bs[DDIR_WRITE]), + .off5 = offsetof(struct thread_options, min_bs[DDIR_TRIM]), + .off6 = offsetof(struct thread_options, max_bs[DDIR_TRIM]), .minval = 1, .help = "Set block size range (in more detail than bs)", .parent = "rw", @@ -1579,8 +2175,9 @@ { .name = "bssplit", .lname = "Block size split", - .type = FIO_OPT_STR, + .type = FIO_OPT_STR_ULL, .cb = str_bssplit_cb, + .off1 = offsetof(struct thread_options, bssplit), .help = "Set a specific mix of block sizes", .parent = "rw", .hide = 1, @@ -1592,7 +2189,7 @@ .lname = "Block size unaligned", .alias = "blocksize_unaligned", .type = FIO_OPT_STR_SET, - .off1 = td_var_offset(bs_unaligned), + .off1 = offsetof(struct thread_options, bs_unaligned), .help = "Don't sector align IO buffer sizes", .parent = "rw", .hide = 1, @@ -1603,8 +2200,8 @@ .name = "bs_is_seq_rand", .lname = "Block size division is seq/random (not read/write)", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(bs_is_seq_rand), - .help = "Consider any blocksize setting to be sequential,ramdom", + .off1 = offsetof(struct thread_options, bs_is_seq_rand), + .help = "Consider any blocksize setting to be sequential,random", .def = "0", .parent = "blocksize", .category = FIO_OPT_C_IO, @@ -1614,7 +2211,7 @@ .name = "randrepeat", .lname = "Random repeatable", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(rand_repeatable), + .off1 = 
offsetof(struct thread_options, rand_repeatable), .help = "Use repeatable random IO pattern", .def = "1", .parent = "rw", @@ -1623,14 +2220,13 @@ .group = FIO_OPT_G_RANDOM, }, { - .name = "use_os_rand", - .lname = "Use OS random", - .type = FIO_OPT_BOOL, - .off1 = td_var_offset(use_os_rand), - .help = "Set to use OS random generator", - .def = "0", + .name = "randseed", + .lname = "The random generator seed", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, rand_seed), + .help = "Set the random generator seed value", + .def = "0x89", .parent = "rw", - .hide = 1, .category = FIO_OPT_C_IO, .group = FIO_OPT_G_RANDOM, }, @@ -1638,7 +2234,7 @@ .name = "norandommap", .lname = "No randommap", .type = FIO_OPT_STR_SET, - .off1 = td_var_offset(norandommap), + .off1 = offsetof(struct thread_options, norandommap), .help = "Accept potential duplicate random blocks", .parent = "rw", .hide = 1, @@ -1650,7 +2246,7 @@ .name = "softrandommap", .lname = "Soft randommap", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(softrandommap), + .off1 = offsetof(struct thread_options, softrandommap), .help = "Set norandommap if randommap allocation fails", .parent = "norandommap", .hide = 1, @@ -1660,8 +2256,9 @@ }, { .name = "random_generator", + .lname = "Random Generator", .type = FIO_OPT_STR, - .off1 = td_var_offset(random_generator), + .off1 = offsetof(struct thread_options, random_generator), .help = "Type of random number generator to use", .def = "tausworthe", .posval = { @@ -1673,14 +2270,20 @@ .oval = FIO_RAND_GEN_LFSR, .help = "Variable length LFSR", }, + { + .ival = "tausworthe64", + .oval = FIO_RAND_GEN_TAUSWORTHE64, + .help = "64-bit Tausworthe variant", + }, }, .category = FIO_OPT_C_IO, .group = FIO_OPT_G_RANDOM, }, { .name = "random_distribution", + .lname = "Random Distribution", .type = FIO_OPT_STR, - .off1 = td_var_offset(random_distribution), + .off1 = offsetof(struct thread_options, random_distribution), .cb = str_random_distribution_cb, .help = "Random 
offset distribution generator", .def = "random", @@ -1697,6 +2300,18 @@ .oval = FIO_RAND_DIST_PARETO, .help = "Pareto distribution", }, + { .ival = "normal", + .oval = FIO_RAND_DIST_GAUSS, + .help = "Normal (Gaussian) distribution", + }, + { .ival = "zoned", + .oval = FIO_RAND_DIST_ZONED, + .help = "Zoned random distribution", + }, + { .ival = "zoned_abs", + .oval = FIO_RAND_DIST_ZONED_ABS, + .help = "Zoned absolute random distribution", + }, }, .category = FIO_OPT_C_IO, .group = FIO_OPT_G_RANDOM, @@ -1705,9 +2320,9 @@ .name = "percentage_random", .lname = "Percentage Random", .type = FIO_OPT_INT, - .off1 = td_var_offset(perc_rand[DDIR_READ]), - .off2 = td_var_offset(perc_rand[DDIR_WRITE]), - .off3 = td_var_offset(perc_rand[DDIR_TRIM]), + .off1 = offsetof(struct thread_options, perc_rand[DDIR_READ]), + .off2 = offsetof(struct thread_options, perc_rand[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, perc_rand[DDIR_TRIM]), .maxval = 100, .help = "Percentage of seq/random mix that should be random", .def = "100,100,100", @@ -1724,11 +2339,21 @@ .group = FIO_OPT_G_RANDOM, }, { + .name = "allrandrepeat", + .lname = "All Random Repeat", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, allrand_repeatable), + .help = "Use repeatable random numbers for everything", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { .name = "nrfiles", .lname = "Number of files", .alias = "nr_files", .type = FIO_OPT_INT, - .off1 = td_var_offset(nr_files), + .off1 = offsetof(struct thread_options, nr_files), .help = "Split job workload between this number of files", .def = "1", .interval = 1, @@ -1739,7 +2364,7 @@ .name = "openfiles", .lname = "Number of open files", .type = FIO_OPT_INT, - .off1 = td_var_offset(open_files), + .off1 = offsetof(struct thread_options, open_files), .help = "Number of files to keep open at the same time", .category = FIO_OPT_C_FILE, .group = FIO_OPT_G_INVALID, @@ -1749,7 +2374,7 @@ .lname = "File service type", 
.type = FIO_OPT_STR, .cb = str_fst_cb, - .off1 = td_var_offset(file_service_type), + .off1 = offsetof(struct thread_options, file_service_type), .help = "How to select which file to service next", .def = "roundrobin", .category = FIO_OPT_C_FILE, @@ -1757,7 +2382,23 @@ .posval = { { .ival = "random", .oval = FIO_FSERVICE_RANDOM, - .help = "Choose a file at random", + .help = "Choose a file at random (uniform)", + }, + { .ival = "zipf", + .oval = FIO_FSERVICE_ZIPF, + .help = "Zipf randomized", + }, + { .ival = "pareto", + .oval = FIO_FSERVICE_PARETO, + .help = "Pareto randomized", + }, + { .ival = "normal", + .oval = FIO_FSERVICE_GAUSS, + .help = "Normal (Gaussian) randomized", + }, + { .ival = "gauss", + .oval = FIO_FSERVICE_GAUSS, + .help = "Alias for normal", }, { .ival = "roundrobin", .oval = FIO_FSERVICE_RR, @@ -1771,14 +2412,14 @@ .parent = "nrfiles", .hide = 1, }, -#ifdef CONFIG_POSIX_FALLOCATE +#ifdef FIO_HAVE_ANY_FALLOCATE { .name = "fallocate", .lname = "Fallocate", .type = FIO_OPT_STR, - .off1 = td_var_offset(fallocate_mode), + .off1 = offsetof(struct thread_options, fallocate_mode), .help = "Whether pre-allocation is performed when laying out files", - .def = "posix", + .def = "native", .category = FIO_OPT_C_FILE, .group = FIO_OPT_G_INVALID, .posval = { @@ -1786,10 +2427,16 @@ .oval = FIO_FALLOCATE_NONE, .help = "Do not pre-allocate space", }, + { .ival = "native", + .oval = FIO_FALLOCATE_NATIVE, + .help = "Use native pre-allocation if possible", + }, +#ifdef CONFIG_POSIX_FALLOCATE { .ival = "posix", .oval = FIO_FALLOCATE_POSIX, .help = "Use posix_fallocate()", }, +#endif #ifdef CONFIG_LINUX_FALLOCATE { .ival = "keep", .oval = FIO_FALLOCATE_KEEP_SIZE, @@ -1801,18 +2448,45 @@ .oval = FIO_FALLOCATE_NONE, .help = "Alias for 'none'", }, +#ifdef CONFIG_POSIX_FALLOCATE { .ival = "1", .oval = FIO_FALLOCATE_POSIX, .help = "Alias for 'posix'", }, +#endif }, }, -#endif /* CONFIG_POSIX_FALLOCATE */ +#else /* FIO_HAVE_ANY_FALLOCATE */ + { + .name = "fallocate", + 
.lname = "Fallocate", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support fallocate", + }, +#endif /* FIO_HAVE_ANY_FALLOCATE */ { .name = "fadvise_hint", .lname = "Fadvise hint", - .type = FIO_OPT_BOOL, - .off1 = td_var_offset(fadvise_hint), + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, fadvise_hint), + .posval = { + { .ival = "0", + .oval = F_ADV_NONE, + .help = "Don't issue fadvise/madvise", + }, + { .ival = "1", + .oval = F_ADV_TYPE, + .help = "Advise using fio IO pattern", + }, + { .ival = "random", + .oval = F_ADV_RANDOM, + .help = "Advise using FADV_RANDOM", + }, + { .ival = "sequential", + .oval = F_ADV_SEQUENTIAL, + .help = "Advise using FADV_SEQUENTIAL", + }, + }, .help = "Use fadvise() to advise the kernel on IO pattern", .def = "1", .category = FIO_OPT_C_FILE, @@ -1822,7 +2496,7 @@ .name = "fsync", .lname = "Fsync", .type = FIO_OPT_INT, - .off1 = td_var_offset(fsync_blocks), + .off1 = offsetof(struct thread_options, fsync_blocks), .help = "Issue fsync for writes every given number of blocks", .def = "0", .interval = 1, @@ -1833,7 +2507,7 @@ .name = "fdatasync", .lname = "Fdatasync", .type = FIO_OPT_INT, - .off1 = td_var_offset(fdatasync_blocks), + .off1 = offsetof(struct thread_options, fdatasync_blocks), .help = "Issue fdatasync for writes every given number of blocks", .def = "0", .interval = 1, @@ -1844,7 +2518,7 @@ .name = "write_barrier", .lname = "Write barrier", .type = FIO_OPT_INT, - .off1 = td_var_offset(barrier_blocks), + .off1 = offsetof(struct thread_options, barrier_blocks), .help = "Make every Nth write a barrier write", .def = "0", .interval = 1, @@ -1859,33 +2533,40 @@ { .ival = "wait_before", .oval = SYNC_FILE_RANGE_WAIT_BEFORE, .help = "SYNC_FILE_RANGE_WAIT_BEFORE", - .or = 1, + .orval = 1, }, { .ival = "write", .oval = SYNC_FILE_RANGE_WRITE, .help = "SYNC_FILE_RANGE_WRITE", - .or = 1, + .orval = 1, }, { .ival = "wait_after", .oval = SYNC_FILE_RANGE_WAIT_AFTER, .help = 
"SYNC_FILE_RANGE_WAIT_AFTER", - .or = 1, + .orval = 1, }, }, .type = FIO_OPT_STR_MULTI, .cb = str_sfr_cb, - .off1 = td_var_offset(sync_file_range), + .off1 = offsetof(struct thread_options, sync_file_range), .help = "Use sync_file_range()", .category = FIO_OPT_C_FILE, .group = FIO_OPT_G_INVALID, }, +#else + { + .name = "sync_file_range", + .lname = "Sync file range", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support sync_file_range", + }, #endif { .name = "direct", .lname = "Direct I/O", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(odirect), + .off1 = offsetof(struct thread_options, odirect), .help = "Use O_DIRECT IO (negates buffered)", .def = "0", .inverse = "buffered", @@ -1893,10 +2574,20 @@ .group = FIO_OPT_G_IO_TYPE, }, { + .name = "atomic", + .lname = "Atomic I/O", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, oatomic), + .help = "Use Atomic IO with O_DIRECT (implies O_DIRECT)", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_TYPE, + }, + { .name = "buffered", .lname = "Buffered I/O", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(odirect), + .off1 = offsetof(struct thread_options, odirect), .neg = 1, .help = "Use buffered IO (negates direct)", .def = "1", @@ -1908,7 +2599,7 @@ .name = "overwrite", .lname = "Overwrite", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(overwrite), + .off1 = offsetof(struct thread_options, overwrite), .help = "When writing, set whether to overwrite current data", .def = "0", .category = FIO_OPT_C_FILE, @@ -1918,7 +2609,7 @@ .name = "loops", .lname = "Loops", .type = FIO_OPT_INT, - .off1 = td_var_offset(loops), + .off1 = offsetof(struct thread_options, loops), .help = "Number of times to run the job", .def = "1", .interval = 1, @@ -1929,7 +2620,7 @@ .name = "numjobs", .lname = "Number of jobs", .type = FIO_OPT_INT, - .off1 = td_var_offset(numjobs), + .off1 = offsetof(struct thread_options, numjobs), .help = "Duplicate this job this many times", .def = "1", .interval = 1, 
@@ -1940,9 +2631,12 @@ .name = "startdelay", .lname = "Start delay", .type = FIO_OPT_STR_VAL_TIME, - .off1 = td_var_offset(start_delay), + .off1 = offsetof(struct thread_options, start_delay), + .off2 = offsetof(struct thread_options, start_delay_high), .help = "Only start job when this period has passed", .def = "0", + .is_seconds = 1, + .is_time = 1, .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_RUNTIME, }, @@ -1951,9 +2645,11 @@ .lname = "Runtime", .alias = "timeout", .type = FIO_OPT_STR_VAL_TIME, - .off1 = td_var_offset(timeout), + .off1 = offsetof(struct thread_options, timeout), .help = "Stop workload when this amount of time has passed", .def = "0", + .is_seconds = 1, + .is_time = 1, .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_RUNTIME, }, @@ -1961,17 +2657,28 @@ .name = "time_based", .lname = "Time based", .type = FIO_OPT_STR_SET, - .off1 = td_var_offset(time_based), + .off1 = offsetof(struct thread_options, time_based), .help = "Keep running until runtime/timeout is met", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_RUNTIME, }, { + .name = "verify_only", + .lname = "Verify only", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, verify_only), + .help = "Verifies previously written data is still valid", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { .name = "ramp_time", .lname = "Ramp time", .type = FIO_OPT_STR_VAL_TIME, - .off1 = td_var_offset(ramp_time), + .off1 = offsetof(struct thread_options, ramp_time), .help = "Ramp up time before measuring performance", + .is_seconds = 1, + .is_time = 1, .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_RUNTIME, }, @@ -1980,7 +2687,7 @@ .lname = "Clock source", .type = FIO_OPT_STR, .cb = fio_clock_source_cb, - .off1 = td_var_offset(clocksource), + .off1 = offsetof(struct thread_options, clocksource), .help = "What type of timing source to use", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CLOCK, @@ -2011,7 +2718,7 @@ .lname = "I/O Memory", .type = 
FIO_OPT_STR, .cb = str_mem_cb, - .off1 = td_var_offset(mem_type), + .off1 = offsetof(struct thread_options, mem_type), .help = "Backing type for IO buffers", .def = "malloc", .category = FIO_OPT_C_IO, @@ -2021,6 +2728,7 @@ .oval = MEM_MALLOC, .help = "Use malloc(3) for IO buffers", }, +#ifndef CONFIG_NO_SHM { .ival = "shm", .oval = MEM_SHM, .help = "Use shared memory segments for IO buffers", @@ -2031,16 +2739,27 @@ .help = "Like shm, but use huge pages", }, #endif +#endif { .ival = "mmap", .oval = MEM_MMAP, .help = "Use mmap(2) (file or anon) for IO buffers", }, + { .ival = "mmapshared", + .oval = MEM_MMAPSHARED, + .help = "Like mmap, but use the shared flag", + }, #ifdef FIO_HAVE_HUGETLB { .ival = "mmaphuge", .oval = MEM_MMAPHUGE, .help = "Like mmap, but use huge pages", }, #endif +#ifdef CONFIG_CUDA + { .ival = "cudamalloc", + .oval = MEM_CUDA_MALLOC, + .help = "Allocate GPU device memory for GPUDirect RDMA", + }, +#endif }, }, { @@ -2048,7 +2767,7 @@ .alias = "mem_align", .lname = "I/O memory alignment", .type = FIO_OPT_INT, - .off1 = td_var_offset(mem_align), + .off1 = offsetof(struct thread_options, mem_align), .minval = 0, .help = "IO memory buffer offset alignment", .def = "0", @@ -2061,7 +2780,7 @@ .name = "verify", .lname = "Verify", .type = FIO_OPT_STR, - .off1 = td_var_offset(verify), + .off1 = offsetof(struct thread_options, verify), .help = "Verify data written", .def = "0", .category = FIO_OPT_C_IO, @@ -2111,9 +2830,37 @@ .oval = VERIFY_SHA512, .help = "Use sha512 checksums for verification", }, + { .ival = "sha3-224", + .oval = VERIFY_SHA3_224, + .help = "Use sha3-224 checksums for verification", + }, + { .ival = "sha3-256", + .oval = VERIFY_SHA3_256, + .help = "Use sha3-256 checksums for verification", + }, + { .ival = "sha3-384", + .oval = VERIFY_SHA3_384, + .help = "Use sha3-384 checksums for verification", + }, + { .ival = "sha3-512", + .oval = VERIFY_SHA3_512, + .help = "Use sha3-512 checksums for verification", + }, + { .ival = "xxhash", + 
.oval = VERIFY_XXHASH, + .help = "Use xxhash checksums for verification", + }, + /* Meta information was included into verify_header, + * 'meta' verification is implied by default. */ { .ival = "meta", - .oval = VERIFY_META, - .help = "Use io information", + .oval = VERIFY_HDR_ONLY, + .help = "Use io information for verification. " + "Now is implied by default, thus option is obsolete, " + "don't use it", + }, + { .ival = "pattern", + .oval = VERIFY_PATTERN_NO_HDR, + .help = "Verify strict pattern", }, { .ival = "null", @@ -2126,7 +2873,7 @@ .name = "do_verify", .lname = "Perform verify step", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(do_verify), + .off1 = offsetof(struct thread_options, do_verify), .help = "Run verification stage after write", .def = "1", .parent = "verify", @@ -2137,24 +2884,14 @@ { .name = "verifysort", .lname = "Verify sort", - .type = FIO_OPT_BOOL, - .off1 = td_var_offset(verifysort), - .help = "Sort written verify blocks for read back", - .def = "1", - .parent = "verify", - .hide = 1, + .type = FIO_OPT_SOFT_DEPRECATED, .category = FIO_OPT_C_IO, .group = FIO_OPT_G_VERIFY, }, { .name = "verifysort_nr", - .type = FIO_OPT_INT, - .off1 = td_var_offset(verifysort_nr), - .help = "Pre-load and sort verify blocks for a read workload", - .minval = 0, - .maxval = 131072, - .def = "1024", - .parent = "verify", + .lname = "Verify Sort Nr", + .type = FIO_OPT_SOFT_DEPRECATED, .category = FIO_OPT_C_IO, .group = FIO_OPT_G_VERIFY, }, @@ -2162,7 +2899,7 @@ .name = "verify_interval", .lname = "Verify interval", .type = FIO_OPT_INT, - .off1 = td_var_offset(verify_interval), + .off1 = offsetof(struct thread_options, verify_interval), .minval = 2 * sizeof(struct verify_header), .help = "Store verify buffer header every N bytes", .parent = "verify", @@ -2176,7 +2913,7 @@ .lname = "Verify offset", .type = FIO_OPT_INT, .help = "Offset verify header location by N bytes", - .off1 = td_var_offset(verify_offset), + .off1 = offsetof(struct thread_options, 
verify_offset), .minval = sizeof(struct verify_header), .parent = "verify", .hide = 1, @@ -2188,6 +2925,7 @@ .lname = "Verify pattern", .type = FIO_OPT_STR, .cb = str_verify_pattern_cb, + .off1 = offsetof(struct thread_options, verify_pattern), .help = "Fill pattern for IO buffers", .parent = "verify", .hide = 1, @@ -2198,7 +2936,7 @@ .name = "verify_fatal", .lname = "Verify fatal", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(verify_fatal), + .off1 = offsetof(struct thread_options, verify_fatal), .def = "0", .help = "Exit on a single verify failure, don't continue", .parent = "verify", @@ -2210,7 +2948,7 @@ .name = "verify_dump", .lname = "Verify dump", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(verify_dump), + .off1 = offsetof(struct thread_options, verify_dump), .def = "0", .help = "Dump contents of good and bad blocks on failure", .parent = "verify", @@ -2222,7 +2960,7 @@ .name = "verify_async", .lname = "Verify asynchronously", .type = FIO_OPT_INT, - .off1 = td_var_offset(verify_async), + .off1 = offsetof(struct thread_options, verify_async), .def = "0", .help = "Number of async verifier threads to use", .parent = "verify", @@ -2234,7 +2972,7 @@ .name = "verify_backlog", .lname = "Verify backlog", .type = FIO_OPT_STR_VAL, - .off1 = td_var_offset(verify_backlog), + .off1 = offsetof(struct thread_options, verify_backlog), .help = "Verify after this number of blocks are written", .parent = "verify", .hide = 1, @@ -2245,7 +2983,7 @@ .name = "verify_backlog_batch", .lname = "Verify backlog batch", .type = FIO_OPT_INT, - .off1 = td_var_offset(verify_batch), + .off1 = offsetof(struct thread_options, verify_batch), .help = "Verify this number of IO blocks", .parent = "verify", .hide = 1, @@ -2258,18 +2996,49 @@ .lname = "Async verify CPUs", .type = FIO_OPT_STR, .cb = str_verify_cpus_allowed_cb, + .off1 = offsetof(struct thread_options, verify_cpumask), .help = "Set CPUs allowed for async verify threads", .parent = "verify_async", .hide = 1, .category = 
FIO_OPT_C_IO, .group = FIO_OPT_G_VERIFY, }, +#else + { + .name = "verify_async_cpus", + .lname = "Async verify CPUs", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, #endif { .name = "experimental_verify", - .off1 = td_var_offset(experimental_verify), + .lname = "Experimental Verify", + .off1 = offsetof(struct thread_options, experimental_verify), .type = FIO_OPT_BOOL, .help = "Enable experimental verification", + .parent = "verify", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_state_load", + .lname = "Load verify state", + .off1 = offsetof(struct thread_options, verify_state), + .type = FIO_OPT_BOOL, + .help = "Load verify termination state", + .parent = "verify", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_state_save", + .lname = "Save verify state", + .off1 = offsetof(struct thread_options, verify_state_save), + .type = FIO_OPT_BOOL, + .def = "1", + .help = "Save verify state on termination", + .parent = "verify", .category = FIO_OPT_C_IO, .group = FIO_OPT_G_VERIFY, }, @@ -2278,10 +3047,10 @@ .name = "trim_percentage", .lname = "Trim percentage", .type = FIO_OPT_INT, - .off1 = td_var_offset(trim_percentage), + .off1 = offsetof(struct thread_options, trim_percentage), .minval = 0, .maxval = 100, - .help = "Number of verify blocks to discard/trim", + .help = "Number of verify blocks to trim (i.e., discard)", .parent = "verify", .def = "0", .interval = 1, @@ -2293,8 +3062,8 @@ .name = "trim_verify_zero", .lname = "Verify trim zero", .type = FIO_OPT_BOOL, - .help = "Verify that trim/discarded blocks are returned as zeroes", - .off1 = td_var_offset(trim_zero), + .help = "Verify that trimmed (i.e., discarded) blocks are returned as zeroes", + .off1 = offsetof(struct thread_options, trim_zero), .parent = "trim_percentage", .hide = 1, .def = "1", @@ -2305,7 +3074,7 @@ .name = "trim_backlog", .lname = "Trim backlog", .type = FIO_OPT_STR_VAL, - 
.off1 = td_var_offset(trim_backlog), + .off1 = offsetof(struct thread_options, trim_backlog), .help = "Trim after this number of blocks are written", .parent = "trim_percentage", .hide = 1, @@ -2317,7 +3086,7 @@ .name = "trim_backlog_batch", .lname = "Trim backlog batch", .type = FIO_OPT_INT, - .off1 = td_var_offset(trim_batch), + .off1 = offsetof(struct thread_options, trim_batch), .help = "Trim this number of IO blocks", .parent = "trim_percentage", .hide = 1, @@ -2325,12 +3094,37 @@ .category = FIO_OPT_C_IO, .group = FIO_OPT_G_TRIM, }, +#else + { + .name = "trim_percentage", + .lname = "Trim percentage", + .type = FIO_OPT_UNSUPPORTED, + .help = "Fio does not support TRIM on your platform", + }, + { + .name = "trim_verify_zero", + .lname = "Verify trim zero", + .type = FIO_OPT_UNSUPPORTED, + .help = "Fio does not support TRIM on your platform", + }, + { + .name = "trim_backlog", + .lname = "Trim backlog", + .type = FIO_OPT_UNSUPPORTED, + .help = "Fio does not support TRIM on your platform", + }, + { + .name = "trim_backlog_batch", + .lname = "Trim backlog batch", + .type = FIO_OPT_UNSUPPORTED, + .help = "Fio does not support TRIM on your platform", + }, #endif { .name = "write_iolog", .lname = "Write I/O log", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(write_iolog_file), + .off1 = offsetof(struct thread_options, write_iolog_file), .help = "Store IO pattern to file", .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IOLOG, @@ -2339,16 +3133,27 @@ .name = "read_iolog", .lname = "Read I/O log", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(read_iolog_file), + .off1 = offsetof(struct thread_options, read_iolog_file), .help = "Playback IO pattern from file", .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IOLOG, }, { + .name = "read_iolog_chunked", + .lname = "Read I/O log in parts", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, read_iolog_chunked), + .def = "0", + .parent = "read_iolog", + .help = "Parse IO pattern in chunks", + .category = 
FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { .name = "replay_no_stall", .lname = "Don't stall on replay", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(no_stall), + .off1 = offsetof(struct thread_options, no_stall), .def = "0", .parent = "read_iolog", .hide = 1, @@ -2360,7 +3165,7 @@ .name = "replay_redirect", .lname = "Redirect device for replay", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(replay_redirect), + .off1 = offsetof(struct thread_options, replay_redirect), .parent = "read_iolog", .hide = 1, .help = "Replay all I/O onto this device, regardless of trace device", @@ -2368,10 +3173,85 @@ .group = FIO_OPT_G_IOLOG, }, { + .name = "replay_scale", + .lname = "Replace offset scale factor", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, replay_scale), + .parent = "read_iolog", + .def = "1", + .help = "Align offsets to this blocksize", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "replay_align", + .lname = "Replace alignment", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, replay_align), + .parent = "read_iolog", + .help = "Scale offset down by this factor", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + .pow2 = 1, + }, + { + .name = "replay_time_scale", + .lname = "Replay Time Scale", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, replay_time_scale), + .def = "100", + .minval = 1, + .parent = "read_iolog", + .hide = 1, + .help = "Scale time for replay events", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "replay_skip", + .lname = "Replay Skip", + .type = FIO_OPT_STR, + .cb = str_replay_skip_cb, + .off1 = offsetof(struct thread_options, replay_skip), + .parent = "read_iolog", + .help = "Skip certain IO types (read,write,trim,flush)", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "merge_blktrace_file", + .lname = "Merged blktrace output filename", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct 
thread_options, merge_blktrace_file), + .help = "Merged blktrace output filename", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "merge_blktrace_scalars", + .lname = "Percentage to scale each trace", + .type = FIO_OPT_FLOAT_LIST, + .off1 = offsetof(struct thread_options, merge_blktrace_scalars), + .maxlen = FIO_IO_U_LIST_MAX_LEN, + .help = "Percentage to scale each trace", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { + .name = "merge_blktrace_iters", + .lname = "Number of iterations to run per trace", + .type = FIO_OPT_FLOAT_LIST, + .off1 = offsetof(struct thread_options, merge_blktrace_iters), + .maxlen = FIO_IO_U_LIST_MAX_LEN, + .help = "Number of iterations to run per trace", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IOLOG, + }, + { .name = "exec_prerun", .lname = "Pre-execute runnable", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(exec_prerun), + .off1 = offsetof(struct thread_options, exec_prerun), .help = "Execute this file prior to running job", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_INVALID, @@ -2380,7 +3260,7 @@ .name = "exec_postrun", .lname = "Post-execute runnable", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(exec_postrun), + .off1 = offsetof(struct thread_options, exec_postrun), .help = "Execute this file after running job", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_INVALID, @@ -2390,17 +3270,48 @@ .name = "ioscheduler", .lname = "I/O scheduler", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(ioscheduler), + .off1 = offsetof(struct thread_options, ioscheduler), .help = "Use this IO scheduler on the backing device", .category = FIO_OPT_C_FILE, .group = FIO_OPT_G_INVALID, }, +#else + { + .name = "ioscheduler", + .lname = "I/O scheduler", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support IO scheduler switching", + }, #endif { + .name = "zonemode", + .lname = "Zone mode", + .help = "Mode for the zonesize, zonerange and zoneskip parameters", + 
.type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, zone_mode), + .def = "none", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + .posval = { + { .ival = "none", + .oval = ZONE_MODE_NONE, + .help = "no zoning", + }, + { .ival = "strided", + .oval = ZONE_MODE_STRIDED, + .help = "strided mode - random I/O is restricted to a single zone", + }, + { .ival = "zbd", + .oval = ZONE_MODE_ZBD, + .help = "zoned block device mode - random I/O selects one of multiple zones randomly", + }, + }, + }, + { .name = "zonesize", .lname = "Zone size", .type = FIO_OPT_STR_VAL, - .off1 = td_var_offset(zone_size), + .off1 = offsetof(struct thread_options, zone_size), .help = "Amount of data to read per zone", .def = "0", .interval = 1024 * 1024, @@ -2411,7 +3322,7 @@ .name = "zonerange", .lname = "Zone range", .type = FIO_OPT_STR_VAL, - .off1 = td_var_offset(zone_range), + .off1 = offsetof(struct thread_options, zone_range), .help = "Give size of an IO zone", .def = "0", .interval = 1024 * 1024, @@ -2422,7 +3333,7 @@ .name = "zoneskip", .lname = "Zone skip", .type = FIO_OPT_STR_VAL, - .off1 = td_var_offset(zone_skip), + .off1 = offsetof(struct thread_options, zone_skip), .help = "Space between IO zones", .def = "0", .interval = 1024 * 1024, @@ -2430,10 +3341,56 @@ .group = FIO_OPT_G_ZONE, }, { + .name = "read_beyond_wp", + .lname = "Allow reads beyond the zone write pointer", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, read_beyond_wp), + .help = "Allow reads beyond the zone write pointer", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "max_open_zones", + .lname = "Maximum number of open zones", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, max_open_zones), + .maxval = FIO_MAX_OPEN_ZBD_ZONES, + .help = "Limit random writes to SMR drives to the specified" + " number of sequential zones", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = 
"zone_reset_threshold", + .lname = "Zone reset threshold", + .help = "Zoned block device reset threshold", + .type = FIO_OPT_FLOAT_LIST, + .maxlen = 1, + .off1 = offsetof(struct thread_options, zrt), + .minfp = 0, + .maxfp = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + }, + { + .name = "zone_reset_frequency", + .lname = "Zone reset frequency", + .help = "Zoned block device zone reset frequency in HZ", + .type = FIO_OPT_FLOAT_LIST, + .maxlen = 1, + .off1 = offsetof(struct thread_options, zrf), + .minfp = 0, + .maxfp = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + }, + { .name = "lockmem", .lname = "Lock memory", .type = FIO_OPT_STR_VAL, - .off1 = td_var_offset(lockmem), + .off1 = offsetof(struct thread_options, lockmem), .help = "Lock down this amount of memory (per worker)", .def = "0", .interval = 1024 * 1024, @@ -2445,6 +3402,7 @@ .lname = "Read/write mix read", .type = FIO_OPT_INT, .cb = str_rwmix_read_cb, + .off1 = offsetof(struct thread_options, rwmix[DDIR_READ]), .maxval = 100, .help = "Percentage of mixed workload that is reads", .def = "50", @@ -2458,6 +3416,7 @@ .lname = "Read/write mix write", .type = FIO_OPT_INT, .cb = str_rwmix_write_cb, + .off1 = offsetof(struct thread_options, rwmix[DDIR_WRITE]), .maxval = 100, .help = "Percentage of mixed workload that is writes", .def = "50", @@ -2477,10 +3436,10 @@ .name = "nice", .lname = "Nice", .type = FIO_OPT_INT, - .off1 = td_var_offset(nice), + .off1 = offsetof(struct thread_options, nice), .help = "Set job CPU nice value", - .minval = -19, - .maxval = 20, + .minval = -20, + .maxval = 19, .def = "0", .interval = 1, .category = FIO_OPT_C_GENERAL, @@ -2491,34 +3450,54 @@ .name = "prio", .lname = "I/O nice priority", .type = FIO_OPT_INT, - .off1 = td_var_offset(ioprio), + .off1 = offsetof(struct thread_options, ioprio), .help = "Set job IO priority value", - .minval = 0, - .maxval = 7, + .minval = IOPRIO_MIN_PRIO, + .maxval = IOPRIO_MAX_PRIO, .interval = 1, .category = 
FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CRED, }, +#else + { + .name = "prio", + .lname = "I/O nice priority", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support IO priorities", + }, +#endif +#ifdef FIO_HAVE_IOPRIO_CLASS +#ifndef FIO_HAVE_IOPRIO +#error "FIO_HAVE_IOPRIO_CLASS requires FIO_HAVE_IOPRIO" +#endif { .name = "prioclass", .lname = "I/O nice priority class", .type = FIO_OPT_INT, - .off1 = td_var_offset(ioprio_class), + .off1 = offsetof(struct thread_options, ioprio_class), .help = "Set job IO priority class", - .minval = 0, - .maxval = 3, + .minval = IOPRIO_MIN_PRIO_CLASS, + .maxval = IOPRIO_MAX_PRIO_CLASS, .interval = 1, .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CRED, }, +#else + { + .name = "prioclass", + .lname = "I/O nice priority class", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support IO priority classes", + }, #endif { .name = "thinktime", .lname = "Thinktime", .type = FIO_OPT_INT, - .off1 = td_var_offset(thinktime), + .off1 = offsetof(struct thread_options, thinktime), .help = "Idle time between IO buffers (usec)", .def = "0", + .is_time = 1, .category = FIO_OPT_C_IO, .group = FIO_OPT_G_THINKTIME, }, @@ -2526,9 +3505,10 @@ .name = "thinktime_spin", .lname = "Thinktime spin", .type = FIO_OPT_INT, - .off1 = td_var_offset(thinktime_spin), + .off1 = offsetof(struct thread_options, thinktime_spin), .help = "Start think time by spinning this amount (usec)", .def = "0", + .is_time = 1, .parent = "thinktime", .hide = 1, .category = FIO_OPT_C_IO, @@ -2538,7 +3518,7 @@ .name = "thinktime_blocks", .lname = "Thinktime blocks", .type = FIO_OPT_INT, - .off1 = td_var_offset(thinktime_blocks), + .off1 = offsetof(struct thread_options, thinktime_blocks), .help = "IO buffer period between 'thinktime'", .def = "1", .parent = "thinktime", @@ -2550,20 +3530,21 @@ .name = "rate", .lname = "I/O rate", .type = FIO_OPT_INT, - .off1 = td_var_offset(rate[DDIR_READ]), - .off2 = td_var_offset(rate[DDIR_WRITE]), - .off3 = 
td_var_offset(rate[DDIR_TRIM]), + .off1 = offsetof(struct thread_options, rate[DDIR_READ]), + .off2 = offsetof(struct thread_options, rate[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, rate[DDIR_TRIM]), .help = "Set bandwidth rate", .category = FIO_OPT_C_IO, .group = FIO_OPT_G_RATE, }, { - .name = "ratemin", + .name = "rate_min", + .alias = "ratemin", .lname = "I/O min rate", .type = FIO_OPT_INT, - .off1 = td_var_offset(ratemin[DDIR_READ]), - .off2 = td_var_offset(ratemin[DDIR_WRITE]), - .off3 = td_var_offset(ratemin[DDIR_TRIM]), + .off1 = offsetof(struct thread_options, ratemin[DDIR_READ]), + .off2 = offsetof(struct thread_options, ratemin[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, ratemin[DDIR_TRIM]), .help = "Job must meet this rate or it will be shutdown", .parent = "rate", .hide = 1, @@ -2574,9 +3555,9 @@ .name = "rate_iops", .lname = "I/O rate IOPS", .type = FIO_OPT_INT, - .off1 = td_var_offset(rate_iops[DDIR_READ]), - .off2 = td_var_offset(rate_iops[DDIR_WRITE]), - .off3 = td_var_offset(rate_iops[DDIR_TRIM]), + .off1 = offsetof(struct thread_options, rate_iops[DDIR_READ]), + .off2 = offsetof(struct thread_options, rate_iops[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, rate_iops[DDIR_TRIM]), .help = "Limit IO used to this number of IO operations/sec", .hide = 1, .category = FIO_OPT_C_IO, @@ -2586,9 +3567,9 @@ .name = "rate_iops_min", .lname = "I/O min rate IOPS", .type = FIO_OPT_INT, - .off1 = td_var_offset(rate_iops_min[DDIR_READ]), - .off2 = td_var_offset(rate_iops_min[DDIR_WRITE]), - .off3 = td_var_offset(rate_iops_min[DDIR_TRIM]), + .off1 = offsetof(struct thread_options, rate_iops_min[DDIR_READ]), + .off2 = offsetof(struct thread_options, rate_iops_min[DDIR_WRITE]), + .off3 = offsetof(struct thread_options, rate_iops_min[DDIR_TRIM]), .help = "Job must meet this rate or it will be shut down", .parent = "rate_iops", .hide = 1, @@ -2596,10 +3577,33 @@ .group = FIO_OPT_G_RATE, }, { - .name = "ratecycle", + .name = 
"rate_process", + .lname = "Rate Process", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, rate_process), + .help = "What process controls how rated IO is managed", + .def = "linear", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RATE, + .posval = { + { .ival = "linear", + .oval = RATE_PROCESS_LINEAR, + .help = "Linear rate of IO", + }, + { + .ival = "poisson", + .oval = RATE_PROCESS_POISSON, + .help = "Rate follows Poisson process", + }, + }, + .parent = "rate", + }, + { + .name = "rate_cycle", + .alias = "ratecycle", .lname = "I/O rate cycle", .type = FIO_OPT_INT, - .off1 = td_var_offset(ratecycle), + .off1 = offsetof(struct thread_options, ratecycle), .help = "Window average for rate limits (msec)", .def = "1000", .parent = "rate", @@ -2608,18 +3612,63 @@ .group = FIO_OPT_G_RATE, }, { + .name = "rate_ignore_thinktime", + .lname = "Rate ignore thinktime", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, rate_ign_think), + .help = "Rated IO ignores thinktime settings", + .parent = "rate", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RATE, + }, + { .name = "max_latency", - .type = FIO_OPT_INT, - .off1 = td_var_offset(max_latency), + .lname = "Max Latency (usec)", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, max_latency), .help = "Maximum tolerated IO latency (usec)", + .is_time = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_LATPROF, + }, + { + .name = "latency_target", + .lname = "Latency Target (usec)", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, latency_target), + .help = "Ramp to max queue depth supporting this latency", + .is_time = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_LATPROF, + }, + { + .name = "latency_window", + .lname = "Latency Window (usec)", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, latency_window), + .help = "Time to sustain latency_target", + .is_time = 1, + .category = FIO_OPT_C_IO, + .group = 
FIO_OPT_G_LATPROF, + }, + { + .name = "latency_percentile", + .lname = "Latency Percentile", + .type = FIO_OPT_FLOAT_LIST, + .off1 = offsetof(struct thread_options, latency_percentile), + .help = "Percentile of IOs must be below latency_target", + .def = "100", + .maxlen = 1, + .minfp = 0.0, + .maxfp = 100.0, .category = FIO_OPT_C_IO, - .group = FIO_OPT_G_RATE, + .group = FIO_OPT_G_LATPROF, }, { .name = "invalidate", .lname = "Cache invalidate", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(invalidate_cache), + .off1 = offsetof(struct thread_options, invalidate_cache), .help = "Invalidate buffer/page cache prior to running job", .def = "1", .category = FIO_OPT_C_IO, @@ -2629,7 +3678,7 @@ .name = "sync", .lname = "Synchronous I/O", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(sync_io), + .off1 = offsetof(struct thread_options, sync_io), .help = "Use O_SYNC for buffered writes", .def = "0", .parent = "buffered", @@ -2637,12 +3686,40 @@ .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_TYPE, }, +#ifdef FIO_HAVE_WRITE_HINT + { + .name = "write_hint", + .lname = "Write hint", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, write_hint), + .help = "Set expected write life time", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + .posval = { + { .ival = "none", + .oval = RWH_WRITE_LIFE_NONE, + }, + { .ival = "short", + .oval = RWH_WRITE_LIFE_SHORT, + }, + { .ival = "medium", + .oval = RWH_WRITE_LIFE_MEDIUM, + }, + { .ival = "long", + .oval = RWH_WRITE_LIFE_LONG, + }, + { .ival = "extreme", + .oval = RWH_WRITE_LIFE_EXTREME, + }, + }, + }, +#endif { .name = "create_serialize", .lname = "Create serialize", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(create_serialize), - .help = "Serialize creating of job files", + .off1 = offsetof(struct thread_options, create_serialize), + .help = "Serialize creation of job files", .def = "1", .category = FIO_OPT_C_FILE, .group = FIO_OPT_G_INVALID, @@ -2651,7 +3728,7 @@ .name = "create_fsync", .lname = "Create 
fsync", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(create_fsync), + .off1 = offsetof(struct thread_options, create_fsync), .help = "fsync file after creation", .def = "1", .category = FIO_OPT_C_FILE, @@ -2661,7 +3738,7 @@ .name = "create_on_open", .lname = "Create on open", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(create_on_open), + .off1 = offsetof(struct thread_options, create_on_open), .help = "Create files when they are opened for IO", .def = "0", .category = FIO_OPT_C_FILE, @@ -2669,17 +3746,38 @@ }, { .name = "create_only", + .lname = "Create Only", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(create_only), + .off1 = offsetof(struct thread_options, create_only), .help = "Only perform file creation phase", .category = FIO_OPT_C_FILE, .def = "0", }, { + .name = "allow_file_create", + .lname = "Allow file create", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, allow_create), + .help = "Permit fio to create files, if they don't exist", + .def = "1", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_FILENAME, + }, + { + .name = "allow_mounted_write", + .lname = "Allow mounted write", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, allow_mounted_write), + .help = "Allow writes to a mounted partition", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_FILENAME, + }, + { .name = "pre_read", .lname = "Pre-read files", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(pre_read), + .off1 = offsetof(struct thread_options, pre_read), .help = "Pre-read files before starting official testing", .def = "0", .category = FIO_OPT_C_FILE, @@ -2691,6 +3789,7 @@ .lname = "CPU mask", .type = FIO_OPT_INT, .cb = str_cpumask_cb, + .off1 = offsetof(struct thread_options, cpumask), .help = "CPU affinity mask", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CRED, @@ -2700,34 +3799,104 @@ .lname = "CPUs allowed", .type = FIO_OPT_STR, .cb = str_cpus_allowed_cb, + .off1 = offsetof(struct thread_options, cpumask), .help = "Set CPUs 
allowed", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CRED, }, + { + .name = "cpus_allowed_policy", + .lname = "CPUs allowed distribution policy", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, cpus_allowed_policy), + .help = "Distribution policy for cpus_allowed", + .parent = "cpus_allowed", + .prio = 1, + .posval = { + { .ival = "shared", + .oval = FIO_CPUS_SHARED, + .help = "Mask shared between threads", + }, + { .ival = "split", + .oval = FIO_CPUS_SPLIT, + .help = "Mask split between threads", + }, + }, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + }, +#else + { + .name = "cpumask", + .lname = "CPU mask", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, + { + .name = "cpus_allowed", + .lname = "CPUs allowed", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, + { + .name = "cpus_allowed_policy", + .lname = "CPUs allowed distribution policy", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, #endif #ifdef CONFIG_LIBNUMA { .name = "numa_cpu_nodes", + .lname = "NUMA CPU Nodes", .type = FIO_OPT_STR, .cb = str_numa_cpunodes_cb, + .off1 = offsetof(struct thread_options, numa_cpunodes), .help = "NUMA CPU nodes bind", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_INVALID, }, { .name = "numa_mem_policy", + .lname = "NUMA Memory Policy", .type = FIO_OPT_STR, .cb = str_numa_mpol_cb, + .off1 = offsetof(struct thread_options, numa_memnodes), .help = "NUMA memory policy setup", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_INVALID, }, +#else + { + .name = "numa_cpu_nodes", + .lname = "NUMA CPU Nodes", + .type = FIO_OPT_UNSUPPORTED, + .help = "Build fio with libnuma-dev(el) to enable this option", + }, + { + .name = "numa_mem_policy", + .lname = "NUMA Memory Policy", + .type = FIO_OPT_UNSUPPORTED, + .help = "Build fio with libnuma-dev(el) to enable this option", + }, +#endif +#ifdef CONFIG_CUDA + 
{ + .name = "gpu_dev_id", + .lname = "GPU device ID", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, gpu_dev_id), + .help = "Set GPU device ID for GPUDirect RDMA", + .def = "0", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, #endif { .name = "end_fsync", .lname = "End fsync", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(end_fsync), + .off1 = offsetof(struct thread_options, end_fsync), .help = "Include fsync at the end of job", .def = "0", .category = FIO_OPT_C_FILE, @@ -2737,7 +3906,7 @@ .name = "fsync_on_close", .lname = "Fsync on close", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(fsync_on_close), + .off1 = offsetof(struct thread_options, fsync_on_close), .help = "fsync files on close", .def = "0", .category = FIO_OPT_C_FILE, @@ -2747,13 +3916,23 @@ .name = "unlink", .lname = "Unlink file", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(unlink), + .off1 = offsetof(struct thread_options, unlink), .help = "Unlink created files after job has completed", .def = "0", .category = FIO_OPT_C_FILE, .group = FIO_OPT_G_INVALID, }, { + .name = "unlink_each_loop", + .lname = "Unlink file after each loop of a job", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, unlink_each_loop), + .help = "Unlink created files after each loop in a job has completed", + .def = "0", + .category = FIO_OPT_C_FILE, + .group = FIO_OPT_G_INVALID, + }, + { .name = "exitall", .lname = "Exit-all on terminate", .type = FIO_OPT_STR_SET, @@ -2763,11 +3942,20 @@ .group = FIO_OPT_G_PROCESS, }, { + .name = "exitall_on_error", + .lname = "Exit-all on terminate in error", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct thread_options, exitall_error), + .help = "Terminate all jobs when one exits in error", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_PROCESS, + }, + { .name = "stonewall", .lname = "Wait for previous", .alias = "wait_for_previous", .type = FIO_OPT_STR_SET, - .off1 = td_var_offset(stonewall), + .off1 = offsetof(struct 
thread_options, stonewall), .help = "Insert a hard barrier between this job and previous", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_PROCESS, @@ -2776,7 +3964,7 @@ .name = "new_group", .lname = "New group", .type = FIO_OPT_STR_SET, - .off1 = td_var_offset(new_group), + .off1 = offsetof(struct thread_options, new_group), .help = "Mark the start of a new group (for reporting)", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_PROCESS, @@ -2785,16 +3973,31 @@ .name = "thread", .lname = "Thread", .type = FIO_OPT_STR_SET, - .off1 = td_var_offset(use_thread), + .off1 = offsetof(struct thread_options, use_thread), .help = "Use threads instead of processes", +#ifdef CONFIG_NO_SHM + .def = "1", + .no_warn_def = 1, +#endif .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_PROCESS, }, { + .name = "per_job_logs", + .lname = "Per Job Logs", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, per_job_logs), + .help = "Include job number in generated log files or not", + .def = "1", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { .name = "write_bw_log", .lname = "Write bandwidth log", - .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(bw_log_file), + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, bw_log_file), + .cb = str_write_bw_log_cb, .help = "Write log of bandwidth during run", .category = FIO_OPT_C_LOG, .group = FIO_OPT_G_INVALID, @@ -2802,8 +4005,9 @@ { .name = "write_lat_log", .lname = "Write latency log", - .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(lat_log_file), + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, lat_log_file), + .cb = str_write_lat_log_cb, .help = "Write log of latency during run", .category = FIO_OPT_C_LOG, .group = FIO_OPT_G_INVALID, @@ -2811,8 +4015,9 @@ { .name = "write_iops_log", .lname = "Write IOPS log", - .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(iops_log_file), + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, iops_log_file), + .cb = 
str_write_iops_log_cb, .help = "Write log of IOPS during run", .category = FIO_OPT_C_LOG, .group = FIO_OPT_G_INVALID, @@ -2821,17 +4026,143 @@ .name = "log_avg_msec", .lname = "Log averaging (msec)", .type = FIO_OPT_INT, - .off1 = td_var_offset(log_avg_msec), + .off1 = offsetof(struct thread_options, log_avg_msec), .help = "Average bw/iops/lat logs over this period of time", .def = "0", .category = FIO_OPT_C_LOG, .group = FIO_OPT_G_INVALID, }, { + .name = "log_hist_msec", + .lname = "Log histograms (msec)", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, log_hist_msec), + .help = "Dump completion latency histograms at frequency of this time value", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "log_hist_coarseness", + .lname = "Histogram logs coarseness", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, log_hist_coarseness), + .help = "Integer in range [0,6]. Higher coarseness outputs" + " fewer histogram bins per sample. 
The number of bins for" + " these are [1216, 608, 304, 152, 76, 38, 19] respectively.", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "write_hist_log", + .lname = "Write latency histogram logs", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, hist_log_file), + .cb = str_write_hist_log_cb, + .help = "Write log of latency histograms during run", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "log_max_value", + .lname = "Log maximum instead of average", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, log_max), + .help = "Log max sample in a window instead of average", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "log_offset", + .lname = "Log offset of IO", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, log_offset), + .help = "Include offset of IO for each log entry", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, +#ifdef CONFIG_ZLIB + { + .name = "log_compression", + .lname = "Log compression", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, log_gz), + .help = "Log in compressed chunks of this size", + .minval = 1024ULL, + .maxval = 512 * 1024 * 1024ULL, + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, +#ifdef FIO_HAVE_CPU_AFFINITY + { + .name = "log_compression_cpus", + .lname = "Log Compression CPUs", + .type = FIO_OPT_STR, + .cb = str_log_cpus_allowed_cb, + .off1 = offsetof(struct thread_options, log_gz_cpumask), + .parent = "log_compression", + .help = "Limit log compression to these CPUs", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, +#else + { + .name = "log_compression_cpus", + .lname = "Log Compression CPUs", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support CPU affinities", + }, +#endif + { + .name = "log_store_compressed", + .lname = "Log store compressed", + .type = FIO_OPT_BOOL, 
+ .off1 = offsetof(struct thread_options, log_gz_store), + .help = "Store logs in a compressed format", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, +#else + { + .name = "log_compression", + .lname = "Log compression", + .type = FIO_OPT_UNSUPPORTED, + .help = "Install libz-dev(el) to get compression support", + }, + { + .name = "log_store_compressed", + .lname = "Log store compressed", + .type = FIO_OPT_UNSUPPORTED, + .help = "Install libz-dev(el) to get compression support", + }, +#endif + { + .name = "log_unix_epoch", + .lname = "Log epoch unix", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, log_unix_epoch), + .help = "Use Unix time in log files", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "block_error_percentiles", + .lname = "Block error percentiles", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, block_error_hist), + .help = "Record trim block errors and make a histogram", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { .name = "bwavgtime", .lname = "Bandwidth average time", .type = FIO_OPT_INT, - .off1 = td_var_offset(bw_avg_time), + .off1 = offsetof(struct thread_options, bw_avg_time), .help = "Time window over which to calculate bandwidth" " (msec)", .def = "500", @@ -2845,7 +4176,7 @@ .name = "iopsavgtime", .lname = "IOPS average time", .type = FIO_OPT_INT, - .off1 = td_var_offset(iops_avg_time), + .off1 = offsetof(struct thread_options, iops_avg_time), .help = "Time window over which to calculate IOPS (msec)", .def = "500", .parent = "write_iops_log", @@ -2858,16 +4189,26 @@ .name = "group_reporting", .lname = "Group reporting", .type = FIO_OPT_STR_SET, - .off1 = td_var_offset(group_reporting), + .off1 = offsetof(struct thread_options, group_reporting), .help = "Do reporting on a per-group basis", .category = FIO_OPT_C_STAT, .group = FIO_OPT_G_INVALID, }, { + .name = "stats", + .lname = "Stats", + .type = FIO_OPT_BOOL, + .off1 = 
offsetof(struct thread_options, stats), + .help = "Enable collection of stats", + .def = "1", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { .name = "zero_buffers", .lname = "Zero I/O buffers", .type = FIO_OPT_STR_SET, - .off1 = td_var_offset(zero_buffers), + .off1 = offsetof(struct thread_options, zero_buffers), .help = "Init IO buffers to all zeroes", .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_BUF, @@ -2876,7 +4217,7 @@ .name = "refill_buffers", .lname = "Refill I/O buffers", .type = FIO_OPT_STR_SET, - .off1 = td_var_offset(refill_buffers), + .off1 = offsetof(struct thread_options, refill_buffers), .help = "Refill IO buffers on every IO submit", .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_BUF, @@ -2885,19 +4226,30 @@ .name = "scramble_buffers", .lname = "Scramble I/O buffers", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(scramble_buffers), + .off1 = offsetof(struct thread_options, scramble_buffers), .help = "Slightly scramble buffers on every IO submit", .def = "1", .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_BUF, }, { + .name = "buffer_pattern", + .lname = "Buffer pattern", + .type = FIO_OPT_STR, + .cb = str_buffer_pattern_cb, + .off1 = offsetof(struct thread_options, buffer_pattern), + .help = "Fill pattern for IO buffers", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, + { .name = "buffer_compress_percentage", .lname = "Buffer compression percentage", .type = FIO_OPT_INT, - .off1 = td_var_offset(compress_percentage), + .cb = str_buffer_compress_cb, + .off1 = offsetof(struct thread_options, compress_percentage), .maxval = 100, - .minval = 1, + .minval = 0, .help = "How compressible the buffer is (approximately)", .interval = 5, .category = FIO_OPT_C_IO, @@ -2907,31 +4259,58 @@ .name = "buffer_compress_chunk", .lname = "Buffer compression chunk size", .type = FIO_OPT_INT, - .off1 = td_var_offset(compress_chunk), + .off1 = offsetof(struct thread_options, compress_chunk), .parent = "buffer_compress_percentage", 
.hide = 1, .help = "Size of compressible region in buffer", + .def = "512", .interval = 256, .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_BUF, }, { + .name = "dedupe_percentage", + .lname = "Dedupe percentage", + .type = FIO_OPT_INT, + .cb = str_dedupe_cb, + .off1 = offsetof(struct thread_options, dedupe_percentage), + .maxval = 100, + .minval = 0, + .help = "Percentage of buffers that are dedupable", + .interval = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, + { .name = "clat_percentiles", .lname = "Completion latency percentiles", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(clat_percentiles), + .off1 = offsetof(struct thread_options, clat_percentiles), .help = "Enable the reporting of completion latency percentiles", .def = "1", + .inverse = "lat_percentiles", + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "lat_percentiles", + .lname = "IO latency percentiles", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, lat_percentiles), + .help = "Enable the reporting of IO latency percentiles", + .def = "0", + .inverse = "clat_percentiles", .category = FIO_OPT_C_STAT, .group = FIO_OPT_G_INVALID, }, { .name = "percentile_list", - .lname = "Completion latency percentile list", + .lname = "Percentile list", .type = FIO_OPT_FLOAT_LIST, - .off1 = td_var_offset(percentile_list), - .off2 = td_var_offset(percentile_precision), - .help = "Specify a custom list of percentiles to report", + .off1 = offsetof(struct thread_options, percentile_list), + .off2 = offsetof(struct thread_options, percentile_precision), + .help = "Specify a custom list of percentiles to report for " + "completion latency and block errors", .def = "1:5:10:20:30:40:50:60:70:80:90:95:99:99.5:99.9:99.95:99.99", .maxlen = FIO_IO_U_LIST_MAX_LEN, .minfp = 0.0, @@ -2939,18 +4318,38 @@ .category = FIO_OPT_C_STAT, .group = FIO_OPT_G_INVALID, }, + { + .name = "significant_figures", + .lname = "Significant figures", + .type = FIO_OPT_INT, + .off1 
= offsetof(struct thread_options, sig_figs), + .maxval = 10, + .minval = 1, + .help = "Significant figures for output-format set to normal", + .def = "4", + .interval = 1, + .category = FIO_OPT_C_STAT, + .group = FIO_OPT_G_INVALID, + }, #ifdef FIO_HAVE_DISK_UTIL { .name = "disk_util", .lname = "Disk utilization", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(do_disk_util), + .off1 = offsetof(struct thread_options, do_disk_util), .help = "Log disk utilization statistics", .def = "1", .category = FIO_OPT_C_STAT, .group = FIO_OPT_G_INVALID, }, +#else + { + .name = "disk_util", + .lname = "Disk utilization", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support disk utilization", + }, #endif { .name = "gtod_reduce", @@ -2967,7 +4366,7 @@ .name = "disable_lat", .lname = "Disable all latency stats", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(disable_lat), + .off1 = offsetof(struct thread_options, disable_lat), .help = "Disable latency numbers", .parent = "gtod_reduce", .hide = 1, @@ -2979,7 +4378,7 @@ .name = "disable_clat", .lname = "Disable completion latency stats", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(disable_clat), + .off1 = offsetof(struct thread_options, disable_clat), .help = "Disable completion latency numbers", .parent = "gtod_reduce", .hide = 1, @@ -2991,7 +4390,7 @@ .name = "disable_slat", .lname = "Disable submission latency stats", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(disable_slat), + .off1 = offsetof(struct thread_options, disable_slat), .help = "Disable submission latency numbers", .parent = "gtod_reduce", .hide = 1, @@ -3001,9 +4400,10 @@ }, { .name = "disable_bw_measurement", + .alias = "disable_bw", .lname = "Disable bandwidth stats", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(disable_bw), + .off1 = offsetof(struct thread_options, disable_bw), .help = "Disable bandwidth logging", .parent = "gtod_reduce", .hide = 1, @@ -3015,7 +4415,7 @@ .name = "gtod_cpu", .lname = "Dedicated gettimeofday() CPU", .type = 
FIO_OPT_INT, - .cb = str_gtod_cpu_cb, + .off1 = offsetof(struct thread_options, gtod_cpu), .help = "Set up dedicated gettimeofday() thread on this CPU", .verify = gtod_cpu_verify, .category = FIO_OPT_C_GENERAL, @@ -3023,8 +4423,9 @@ }, { .name = "unified_rw_reporting", + .lname = "Unified RW Reporting", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(unified_rw_rep), + .off1 = offsetof(struct thread_options, unified_rw_rep), .help = "Unify reporting across data direction", .def = "0", .category = FIO_OPT_C_GENERAL, @@ -3034,7 +4435,7 @@ .name = "continue_on_error", .lname = "Continue on error", .type = FIO_OPT_STR, - .off1 = td_var_offset(continue_on_error), + .off1 = offsetof(struct thread_options, continue_on_error), .help = "Continue on non-fatal errors during IO", .def = "none", .category = FIO_OPT_C_GENERAL, @@ -3076,8 +4477,10 @@ }, { .name = "ignore_error", + .lname = "Ignore Error", .type = FIO_OPT_STR, .cb = str_ignore_error_cb, + .off1 = offsetof(struct thread_options, ignore_error_nr), .help = "Set a specific list of errors to ignore", .parent = "rw", .category = FIO_OPT_C_GENERAL, @@ -3085,8 +4488,9 @@ }, { .name = "error_dump", + .lname = "Error Dump", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(error_dump), + .off1 = offsetof(struct thread_options, error_dump), .def = "0", .help = "Dump info on each error", .category = FIO_OPT_C_GENERAL, @@ -3096,7 +4500,7 @@ .name = "profile", .lname = "Profile", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(profile), + .off1 = offsetof(struct thread_options, profile), .help = "Select a specific builtin performance test", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_INVALID, @@ -3105,7 +4509,7 @@ .name = "cgroup", .lname = "Cgroup", .type = FIO_OPT_STR_STORE, - .off1 = td_var_offset(cgroup), + .off1 = offsetof(struct thread_options, cgroup), .help = "Add job to cgroup of this name", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CGROUP, @@ -3114,7 +4518,7 @@ .name = "cgroup_nodelete", .lname = "Cgroup 
no-delete", .type = FIO_OPT_BOOL, - .off1 = td_var_offset(cgroup_nodelete), + .off1 = offsetof(struct thread_options, cgroup_nodelete), .help = "Do not delete cgroups after job completion", .def = "0", .parent = "cgroup", @@ -3125,7 +4529,7 @@ .name = "cgroup_weight", .lname = "Cgroup weight", .type = FIO_OPT_INT, - .off1 = td_var_offset(cgroup_weight), + .off1 = offsetof(struct thread_options, cgroup_weight), .help = "Use given weight for cgroup", .minval = 100, .maxval = 1000, @@ -3137,7 +4541,7 @@ .name = "uid", .lname = "User ID", .type = FIO_OPT_INT, - .off1 = td_var_offset(uid), + .off1 = offsetof(struct thread_options, uid), .help = "Run job with this user ID", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CRED, @@ -3146,7 +4550,7 @@ .name = "gid", .lname = "Group ID", .type = FIO_OPT_INT, - .off1 = td_var_offset(gid), + .off1 = offsetof(struct thread_options, gid), .help = "Run job with this group ID", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CRED, @@ -3154,41 +4558,41 @@ { .name = "kb_base", .lname = "KB Base", - .type = FIO_OPT_INT, - .off1 = td_var_offset(kb_base), + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, kb_base), .prio = 1, .def = "1024", .posval = { { .ival = "1024", .oval = 1024, - .help = "Use 1024 as the K base", + .help = "Inputs invert IEC and SI prefixes (for compatibility); outputs prefer binary", }, { .ival = "1000", .oval = 1000, - .help = "Use 1000 as the K base", + .help = "Inputs use IEC and SI prefixes; outputs prefer SI", }, }, - .help = "How many bytes per KB for reporting (1000 or 1024)", + .help = "Unit prefix interpretation for quantities of data (IEC and SI)", .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_INVALID, }, { .name = "unit_base", - .lname = "Base unit for reporting (Bits or Bytes)", - .type = FIO_OPT_INT, - .off1 = td_var_offset(unit_base), + .lname = "Unit for quantities of data (Bits or Bytes)", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, unit_base), .prio = 
1, .posval = { { .ival = "0", - .oval = 0, + .oval = N2S_NONE, .help = "Auto-detect", }, { .ival = "8", - .oval = 8, + .oval = N2S_BYTEPERSEC, .help = "Normal (byte based)", }, { .ival = "1", - .oval = 1, + .oval = N2S_BITPERSEC, .help = "Bit based", }, }, @@ -3200,7 +4604,7 @@ .name = "hugepage-size", .lname = "Hugepage size", .type = FIO_OPT_INT, - .off1 = td_var_offset(hugepage_size), + .off1 = offsetof(struct thread_options, hugepage_size), .help = "When using hugepages, specify size of each page", .def = __fio_stringify(FIO_HUGE_PAGE), .interval = 1024 * 1024, @@ -3211,7 +4615,7 @@ .name = "flow_id", .lname = "I/O flow ID", .type = FIO_OPT_INT, - .off1 = td_var_offset(flow_id), + .off1 = offsetof(struct thread_options, flow_id), .help = "The flow index ID to use", .def = "0", .category = FIO_OPT_C_IO, @@ -3221,7 +4625,7 @@ .name = "flow", .lname = "I/O flow weight", .type = FIO_OPT_INT, - .off1 = td_var_offset(flow), + .off1 = offsetof(struct thread_options, flow), .help = "Weight for flow control of this job", .parent = "flow_id", .hide = 1, @@ -3233,7 +4637,7 @@ .name = "flow_watermark", .lname = "I/O flow watermark", .type = FIO_OPT_INT, - .off1 = td_var_offset(flow_watermark), + .off1 = offsetof(struct thread_options, flow_watermark), .help = "High watermark for flow control. 
This option" " should be set to the same value for all threads" " with non-zero flow.", @@ -3247,7 +4651,7 @@ .name = "flow_sleep", .lname = "I/O flow sleep", .type = FIO_OPT_INT, - .off1 = td_var_offset(flow_sleep), + .off1 = offsetof(struct thread_options, flow_sleep), .help = "How many microseconds to sleep after being held" " back by the flow control mechanism", .parent = "flow_id", @@ -3257,6 +4661,65 @@ .group = FIO_OPT_G_IO_FLOW, }, { + .name = "steadystate", + .lname = "Steady state threshold", + .alias = "ss", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, ss_state), + .cb = str_steadystate_cb, + .help = "Define the criterion and limit to judge when a job has reached steady state", + .def = "iops_slope:0.01%", + .posval = { + { .ival = "iops", + .oval = FIO_SS_IOPS, + .help = "maximum mean deviation of IOPS measurements", + }, + { .ival = "iops_slope", + .oval = FIO_SS_IOPS_SLOPE, + .help = "slope calculated from IOPS measurements", + }, + { .ival = "bw", + .oval = FIO_SS_BW, + .help = "maximum mean deviation of bandwidth measurements", + }, + { + .ival = "bw_slope", + .oval = FIO_SS_BW_SLOPE, + .help = "slope calculated from bandwidth measurements", + }, + }, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "steadystate_duration", + .lname = "Steady state duration", + .alias = "ss_dur", + .parent = "steadystate", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, ss_dur), + .help = "Stop workload upon attaining steady state for specified duration", + .def = "0", + .is_seconds = 1, + .is_time = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { + .name = "steadystate_ramp_time", + .lname = "Steady state ramp time", + .alias = "ss_ramp", + .parent = "steadystate", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, ss_ramp_time), + .help = "Delay before initiation of data collection for steady state job termination testing", + .def = "0", 
+ .is_seconds = 1, + .is_time = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, + { .name = NULL, }, }; @@ -3267,7 +4730,7 @@ lopt->name = (char *) name; lopt->val = val; if (o->type == FIO_OPT_STR_SET) - lopt->has_arg = no_argument; + lopt->has_arg = optional_argument; else lopt->has_arg = required_argument; } @@ -3351,6 +4814,18 @@ }, }; +void fio_keywords_exit(void) +{ + struct fio_keyword *kw; + + kw = &fio_keywords[0]; + while (kw->word) { + free(kw->replace); + kw->replace = NULL; + kw++; + } +} + void fio_keywords_init(void) { unsigned long long mb_memory; @@ -3411,8 +4886,10 @@ return NULL; ret = fread(&buf[tmp - str], 1, 128 - (tmp - str), f); - if (ret <= 0) + if (ret <= 0) { + pclose(f); return NULL; + } pclose(f); buf[(tmp - str) + ret - 1] = '\0'; @@ -3427,7 +4904,7 @@ * substitution always occurs, even if VARNAME is empty or the corresponding * environment variable undefined. */ -static char *option_dup_subs(const char *opt) +char *fio_option_dup_subs(const char *opt) { char out[OPT_LEN_MAX+1]; char in[OPT_LEN_MAX+1]; @@ -3442,8 +4919,7 @@ return NULL; } - in[OPT_LEN_MAX] = '\0'; - strncpy(in, opt, OPT_LEN_MAX); + snprintf(in, sizeof(in), "%s", opt); while (*inptr && nchr > 0) { if (inptr[0] == '$' && inptr[1] == '{') { @@ -3532,7 +5008,7 @@ int i; char **opts_copy = malloc(num_opts * sizeof(*opts)); for (i = 0; i < num_opts; i++) { - opts_copy[i] = option_dup_subs(opts[i]); + opts_copy[i] = fio_option_dup_subs(opts[i]); if (!opts_copy[i]) continue; opts_copy[i] = fio_keyword_replace(opts_copy[i]); @@ -3540,6 +5016,40 @@ return opts_copy; } +static void show_closest_option(const char *opt) +{ + int best_option, best_distance; + int i, distance; + char *name; + + if (!strlen(opt)) + return; + + name = strdup(opt); + i = 0; + while (name[i] != '\0' && name[i] != '=') + i++; + name[i] = '\0'; + + best_option = -1; + best_distance = INT_MAX; + i = 0; + while (fio_options[i].name) { + distance = string_distance(name, 
fio_options[i].name); + if (distance < best_distance) { + best_distance = distance; + best_option = i; + } + i++; + } + + if (best_option != -1 && string_distance_ok(name, best_distance) && + fio_options[best_option].type != FIO_OPT_UNSUPPORTED) + log_err("Did you mean %s?\n", fio_options[best_option].name); + + free(name); +} + int fio_options_parse(struct thread_data *td, char **opts, int num_opts) { int i, ret, unknown; @@ -3549,9 +5059,12 @@ opts_copy = dup_and_sub_options(opts, num_opts); for (ret = 0, i = 0, unknown = 0; i < num_opts; i++) { - struct fio_option *o; + const struct fio_option *o; int newret = parse_option(opts_copy[i], opts[i], fio_options, - &o, td); + &o, &td->o, &td->opt_list); + + if (!newret && o) + fio_option_mark_set(&td->o, o); if (opts_copy[i]) { if (newret && !o) { @@ -3572,20 +5085,22 @@ opts = opts_copy; } for (i = 0; i < num_opts; i++) { - struct fio_option *o = NULL; + const struct fio_option *o = NULL; int newret = 1; + if (!opts_copy[i]) continue; if (td->eo) newret = parse_option(opts_copy[i], opts[i], td->io_ops->options, &o, - td->eo); + td->eo, &td->opt_list); ret |= newret; - if (!o) + if (!o) { log_err("Bad option <%s>\n", opts[i]); - + show_closest_option(opts[i]); + } free(opts_copy[i]); opts_copy[i] = NULL; } @@ -3597,18 +5112,31 @@ int fio_cmd_option_parse(struct thread_data *td, const char *opt, char *val) { - return parse_cmd_option(opt, val, fio_options, td); + int ret; + + ret = parse_cmd_option(opt, val, fio_options, &td->o, &td->opt_list); + if (!ret) { + const struct fio_option *o; + + o = find_option_c(fio_options, opt); + if (o) + fio_option_mark_set(&td->o, o); + } + + return ret; } int fio_cmd_ioengine_option_parse(struct thread_data *td, const char *opt, char *val) { - return parse_cmd_option(opt, val, td->io_ops->options, td); + return parse_cmd_option(opt, val, td->io_ops->options, td->eo, + &td->opt_list); } void fio_fill_default_options(struct thread_data *td) { - fill_default_options(td, fio_options); 
+ td->o.magic = OPT_MAGIC; + fill_default_options(&td->o, fio_options); } int fio_show_option_help(const char *opt) @@ -3616,43 +5144,38 @@ return show_cmd_help(fio_options, opt); } -void options_mem_dupe(void *data, struct fio_option *options) -{ - struct fio_option *o; - char **ptr; - - for (o = &options[0]; o->name; o++) { - if (o->type != FIO_OPT_STR_STORE) - continue; - - ptr = td_var(data, o->off1); - if (*ptr) - *ptr = strdup(*ptr); - } -} - /* * dupe FIO_OPT_STR_STORE options */ void fio_options_mem_dupe(struct thread_data *td) { - options_mem_dupe(&td->o, fio_options); + options_mem_dupe(fio_options, &td->o); if (td->eo && td->io_ops) { void *oldeo = td->eo; td->eo = malloc(td->io_ops->option_struct_size); memcpy(td->eo, oldeo, td->io_ops->option_struct_size); - options_mem_dupe(td->eo, td->io_ops->options); + options_mem_dupe(td->io_ops->options, td->eo); } } unsigned int fio_get_kb_base(void *data) { - struct thread_options *o = data; + struct thread_data *td = cb_data_to_td(data); + struct thread_options *o = &td->o; unsigned int kb_base = 0; - if (o) + /* + * This is a hack... For private options, *data is not holding + * a pointer to the thread_options, but to private data. This means + * we can't safely dereference it, but magic is first so mem wise + * it is valid. But this also means that if the job first sets + * kb_base and expects that to be honored by private options, + * it will be disappointed. We will return the global default + * for this. 
+ */ + if (o && o->magic == OPT_MAGIC) kb_base = o->kb_base; if (!kb_base) kb_base = 1024; @@ -3660,7 +5183,7 @@ return kb_base; } -int add_option(struct fio_option *o) +int add_option(const struct fio_option *o) { struct fio_option *__o; int opt_index = 0; @@ -3671,7 +5194,13 @@ __o++; } + if (opt_index + 1 == FIO_MAX_OPTS) { + log_err("fio: FIO_MAX_OPTS is too small\n"); + return 1; + } + memcpy(&fio_options[opt_index], o, sizeof(*o)); + fio_options[opt_index + 1].name = NULL; return 0; } @@ -3730,7 +5259,7 @@ void fio_options_free(struct thread_data *td) { - options_free(fio_options, td); + options_free(fio_options, &td->o); if (td->eo && td->io_ops && td->io_ops->options) { options_free(td->io_ops->options, td->eo); free(td->eo); @@ -3743,3 +5272,59 @@ return find_option(fio_options, name); } +static struct fio_option *find_next_opt(struct fio_option *from, + unsigned int off1) +{ + struct fio_option *opt; + + if (!from) + from = &fio_options[0]; + else + from++; + + opt = NULL; + do { + if (off1 == from->off1) { + opt = from; + break; + } + from++; + } while (from->name); + + return opt; +} + +static int opt_is_set(struct thread_options *o, struct fio_option *opt) +{ + unsigned int opt_off, index, offset; + + opt_off = opt - &fio_options[0]; + index = opt_off / (8 * sizeof(uint64_t)); + offset = opt_off & ((8 * sizeof(uint64_t)) - 1); + return (o->set_options[index] & ((uint64_t)1 << offset)) != 0; +} + +bool __fio_option_is_set(struct thread_options *o, unsigned int off1) +{ + struct fio_option *opt, *next; + + next = NULL; + while ((opt = find_next_opt(next, off1)) != NULL) { + if (opt_is_set(o, opt)) + return true; + + next = opt; + } + + return false; +} + +void fio_option_mark_set(struct thread_options *o, const struct fio_option *opt) +{ + unsigned int opt_off, index, offset; + + opt_off = opt - &fio_options[0]; + index = opt_off / (8 * sizeof(uint64_t)); + offset = opt_off & ((8 * sizeof(uint64_t)) - 1); + o->set_options[index] |= (uint64_t)1 << offset; 
+} diff -Nru fio-2.1.3/options.h fio-3.16/options.h --- fio-2.1.3/options.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/options.h 2019-09-20 01:01:52.000000000 +0000 @@ -4,12 +4,11 @@ #define FIO_MAX_OPTS 512 #include +#include #include "parse.h" -#include "flist.h" +#include "lib/types.h" -#define td_var_offset(var) ((size_t) &((struct thread_options *)0)->var) - -int add_option(struct fio_option *); +int add_option(const struct fio_option *); void invalidate_profile_options(const char *); extern char *exec_profile; @@ -17,117 +16,41 @@ void del_opt_posval(const char *, const char *); struct thread_data; void fio_options_free(struct thread_data *); +char *get_next_str(char **ptr); +int get_max_str_idx(char *input); +char* get_name_by_idx(char *input, int index); +int set_name_idx(char *, size_t, char *, int, bool); + +extern char client_sockaddr_str[]; /* used with --client option */ extern struct fio_option fio_options[FIO_MAX_OPTS]; -static inline int o_match(struct fio_option *o, const char *opt) -{ - if (!strcmp(o->name, opt)) - return 1; - else if (o->alias && !strcmp(o->alias, opt)) - return 1; +extern bool __fio_option_is_set(struct thread_options *, unsigned int off); - return 0; -} +#define fio_option_is_set(__td, name) \ +({ \ + const unsigned int off = offsetof(struct thread_options, name); \ + bool __r = __fio_option_is_set((__td), off); \ + __r; \ +}) -static inline struct fio_option *find_option(struct fio_option *options, - const char *opt) -{ - struct fio_option *o; +extern void fio_option_mark_set(struct thread_options *, + const struct fio_option *); - for (o = &options[0]; o->name; o++) - if (o_match(o, opt)) - return o; +static inline bool o_match(const struct fio_option *o, const char *opt) +{ + if (!strcmp(o->name, opt)) + return true; + else if (o->alias && !strcmp(o->alias, opt)) + return true; - return NULL; + return false; } -struct opt_group { - const char *name; - unsigned int mask; -}; - -enum opt_category { - __FIO_OPT_C_GENERAL 
= 0, - __FIO_OPT_C_IO, - __FIO_OPT_C_FILE, - __FIO_OPT_C_STAT, - __FIO_OPT_C_LOG, - __FIO_OPT_C_PROFILE, - __FIO_OPT_C_ENGINE, - __FIO_OPT_C_NR, - - FIO_OPT_C_GENERAL = (1U << __FIO_OPT_C_GENERAL), - FIO_OPT_C_IO = (1U << __FIO_OPT_C_IO), - FIO_OPT_C_FILE = (1U << __FIO_OPT_C_FILE), - FIO_OPT_C_STAT = (1U << __FIO_OPT_C_STAT), - FIO_OPT_C_LOG = (1U << __FIO_OPT_C_LOG), - FIO_OPT_C_PROFILE = (1U << __FIO_OPT_C_PROFILE), - FIO_OPT_C_ENGINE = (1U << __FIO_OPT_C_ENGINE), - FIO_OPT_C_INVALID = (1U << __FIO_OPT_C_NR), -}; - -enum opt_category_group { - __FIO_OPT_G_RATE = 0, - __FIO_OPT_G_ZONE, - __FIO_OPT_G_RWMIX, - __FIO_OPT_G_VERIFY, - __FIO_OPT_G_TRIM, - __FIO_OPT_G_IOLOG, - __FIO_OPT_G_IO_DEPTH, - __FIO_OPT_G_IO_FLOW, - __FIO_OPT_G_DESC, - __FIO_OPT_G_FILENAME, - __FIO_OPT_G_IO_BASIC, - __FIO_OPT_G_CGROUP, - __FIO_OPT_G_RUNTIME, - __FIO_OPT_G_PROCESS, - __FIO_OPT_G_CRED, - __FIO_OPT_G_CLOCK, - __FIO_OPT_G_IO_TYPE, - __FIO_OPT_G_THINKTIME, - __FIO_OPT_G_RANDOM, - __FIO_OPT_G_IO_BUF, - __FIO_OPT_G_TIOBENCH, - __FIO_OPT_G_ERR, - __FIO_OPT_G_E4DEFRAG, - __FIO_OPT_G_NETIO, - __FIO_OPT_G_LIBAIO, - __FIO_OPT_G_ACT, - __FIO_OPT_G_NR, - - FIO_OPT_G_RATE = (1U << __FIO_OPT_G_RATE), - FIO_OPT_G_ZONE = (1U << __FIO_OPT_G_ZONE), - FIO_OPT_G_RWMIX = (1U << __FIO_OPT_G_RWMIX), - FIO_OPT_G_VERIFY = (1U << __FIO_OPT_G_VERIFY), - FIO_OPT_G_TRIM = (1U << __FIO_OPT_G_TRIM), - FIO_OPT_G_IOLOG = (1U << __FIO_OPT_G_IOLOG), - FIO_OPT_G_IO_DEPTH = (1U << __FIO_OPT_G_IO_DEPTH), - FIO_OPT_G_IO_FLOW = (1U << __FIO_OPT_G_IO_FLOW), - FIO_OPT_G_DESC = (1U << __FIO_OPT_G_DESC), - FIO_OPT_G_FILENAME = (1U << __FIO_OPT_G_FILENAME), - FIO_OPT_G_IO_BASIC = (1U << __FIO_OPT_G_IO_BASIC), - FIO_OPT_G_CGROUP = (1U << __FIO_OPT_G_CGROUP), - FIO_OPT_G_RUNTIME = (1U << __FIO_OPT_G_RUNTIME), - FIO_OPT_G_PROCESS = (1U << __FIO_OPT_G_PROCESS), - FIO_OPT_G_CRED = (1U << __FIO_OPT_G_CRED), - FIO_OPT_G_CLOCK = (1U << __FIO_OPT_G_CLOCK), - FIO_OPT_G_IO_TYPE = (1U << __FIO_OPT_G_IO_TYPE), - FIO_OPT_G_THINKTIME = (1U 
<< __FIO_OPT_G_THINKTIME), - FIO_OPT_G_RANDOM = (1U << __FIO_OPT_G_RANDOM), - FIO_OPT_G_IO_BUF = (1U << __FIO_OPT_G_IO_BUF), - FIO_OPT_G_TIOBENCH = (1U << __FIO_OPT_G_TIOBENCH), - FIO_OPT_G_ERR = (1U << __FIO_OPT_G_ERR), - FIO_OPT_G_E4DEFRAG = (1U << __FIO_OPT_G_E4DEFRAG), - FIO_OPT_G_NETIO = (1U << __FIO_OPT_G_NETIO), - FIO_OPT_G_LIBAIO = (1U << __FIO_OPT_G_LIBAIO), - FIO_OPT_G_ACT = (1U << __FIO_OPT_G_ACT), - FIO_OPT_G_INVALID = (1U << __FIO_OPT_G_NR), -}; - -extern struct opt_group *opt_group_from_mask(unsigned int *mask); -extern struct opt_group *opt_group_cat_from_mask(unsigned int *mask); -extern struct fio_option *fio_option_find(const char *name); +extern struct fio_option *find_option(struct fio_option *, const char *); +extern const struct fio_option * +find_option_c(const struct fio_option *, const char *); +extern struct fio_option *fio_option_find(const char *); extern unsigned int fio_get_kb_base(void *); #endif diff -Nru fio-2.1.3/os/binject.h fio-3.16/os/binject.h --- fio-2.1.3/os/binject.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/binject.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,71 +0,0 @@ -#ifndef BINJECT_H -#define BINJECT_H - -#include - -#define BINJECT_MAGIC 0x89 -#define BINJECT_VER 0x01 -#define BINJECT_MAGIC_SHIFT 8 -#define BINJECT_VER_MASK ((1 << BINJECT_MAGIC_SHIFT) - 1) - -struct b_user_cmd { - __u16 magic; /* INPUT */ - __u16 type; /* INPUT */ - __u32 error; /* OUTPUT */ - __u32 flags; /* INPUT */ - __u32 len; /* INPUT */ - __u64 offset; /* INPUT */ - __u64 buf; /* INPUT */ - __u64 usr_ptr; /* PASSED THROUGH */ - __u64 nsec; /* OUTPUT */ -}; - -struct b_ioctl_cmd { - int fd; - int minor; -}; - -#define BINJECT_IOCTL_CHR 'J' -#define B_IOCTL_ADD _IOWR(BINJECT_IOCTL_CHR, 1, struct b_ioctl_cmd) -#define B_IOCTL_DEL _IOWR(BINJECT_IOCTL_CHR, 2, struct b_ioctl_cmd) - -enum { - B_TYPE_READ = 0, - B_TYPE_WRITE, - B_TYPE_DISCARD, - B_TYPE_READVOID, - B_TYPE_WRITEZERO, - B_TYPE_READBARRIER, - B_TYPE_WRITEBARRIER, - B_TYPE_NR -}; - 
-enum { - __B_FLAG_SYNC = 0, - __B_FLAG_UNPLUG, - __B_FLAG_NOIDLE, - __B_FLAG_BARRIER, - __B_FLAG_META, - __B_FLAG_RAHEAD, - __B_FLAG_FAILFAST_DEV, - __B_FLAG_FAILFAST_TRANSPORT, - __B_FLAG_FAILFAST_DRIVER, - __B_FLAG_NR, - - B_FLAG_SYNC = 1 << __B_FLAG_SYNC, - B_FLAG_UNPLUG = 1 << __B_FLAG_UNPLUG, - B_FLAG_NOIDLE = 1 << __B_FLAG_NOIDLE, - B_FLAG_BARRIER = 1 << __B_FLAG_BARRIER, - B_FLAG_META = 1 << __B_FLAG_META, - B_FLAG_RAHEAD = 1 << __B_FLAG_RAHEAD, - B_FLAG_FAILFAST_DEV = 1 << __B_FLAG_FAILFAST_DEV, - B_FLAG_FAILFAST_TRANSPORT = 1 << __B_FLAG_FAILFAST_TRANSPORT, - B_FLAG_FAILFAST_DRIVER = 1 << __B_FLAG_FAILFAST_DRIVER, -}; - -static inline void binject_buc_set_magic(struct b_user_cmd *buc) -{ - buc->magic = (BINJECT_MAGIC << BINJECT_MAGIC_SHIFT) | BINJECT_VER; -} - -#endif diff -Nru fio-2.1.3/os/linux/io_uring.h fio-3.16/os/linux/io_uring.h --- fio-2.1.3/os/linux/io_uring.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/os/linux/io_uring.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Header file for the io_uring interface. 
+ * + * Copyright (C) 2019 Jens Axboe + * Copyright (C) 2019 Christoph Hellwig + */ +#ifndef LINUX_IO_URING_H +#define LINUX_IO_URING_H + +#include +#include + +/* + * IO submission data structure (Submission Queue Entry) + */ +struct io_uring_sqe { + __u8 opcode; /* type of operation for this sqe */ + __u8 flags; /* IOSQE_ flags */ + __u16 ioprio; /* ioprio for the request */ + __s32 fd; /* file descriptor to do IO on */ + __u64 off; /* offset into file */ + __u64 addr; /* pointer to buffer or iovecs */ + __u32 len; /* buffer size or number of iovecs */ + union { + __kernel_rwf_t rw_flags; + __u32 fsync_flags; + __u16 poll_events; + __u32 sync_range_flags; + }; + __u64 user_data; /* data to be passed back at completion time */ + union { + __u16 buf_index; /* index into fixed buffers, if used */ + __u64 __pad2[3]; + }; +}; + +/* + * sqe->flags + */ +#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ +#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ +#define IOSQE_IO_LINK (1U << 2) /* next IO depends on this one */ + +/* + * io_uring_setup() flags + */ +#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ +#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ +#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ + +#define IORING_OP_NOP 0 +#define IORING_OP_READV 1 +#define IORING_OP_WRITEV 2 +#define IORING_OP_FSYNC 3 +#define IORING_OP_READ_FIXED 4 +#define IORING_OP_WRITE_FIXED 5 +#define IORING_OP_POLL_ADD 6 +#define IORING_OP_POLL_REMOVE 7 +#define IORING_OP_SYNC_FILE_RANGE 8 + +/* + * sqe->fsync_flags + */ +#define IORING_FSYNC_DATASYNC (1U << 0) + +/* + * IO completion data structure (Completion Queue Entry) + */ +struct io_uring_cqe { + __u64 user_data; /* sqe->data submission passed back */ + __s32 res; /* result code for this event */ + __u32 flags; +}; + +/* + * Magic offsets for the application to mmap the data it needs + */ +#define IORING_OFF_SQ_RING 0ULL +#define IORING_OFF_CQ_RING 0x8000000ULL 
+#define IORING_OFF_SQES 0x10000000ULL + +/* + * Filled with the offset for mmap(2) + */ +struct io_sqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 flags; + __u32 dropped; + __u32 array; + __u32 resv1; + __u64 resv2; +}; + +/* + * sq_ring->flags + */ +#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ + +struct io_cqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 overflow; + __u32 cqes; + __u64 resv[2]; +}; + +/* + * io_uring_enter(2) flags + */ +#define IORING_ENTER_GETEVENTS (1U << 0) +#define IORING_ENTER_SQ_WAKEUP (1U << 1) + +/* + * Passed in for io_uring_setup(2). Copied back with updated info on success + */ +struct io_uring_params { + __u32 sq_entries; + __u32 cq_entries; + __u32 flags; + __u32 sq_thread_cpu; + __u32 sq_thread_idle; + __u32 resv[5]; + struct io_sqring_offsets sq_off; + struct io_cqring_offsets cq_off; +}; + +/* + * io_uring_register(2) opcodes and arguments + */ +#define IORING_REGISTER_BUFFERS 0 +#define IORING_UNREGISTER_BUFFERS 1 +#define IORING_REGISTER_FILES 2 +#define IORING_UNREGISTER_FILES 3 +#define IORING_REGISTER_EVENTFD 4 +#define IORING_UNREGISTER_EVENTFD 5 + +#endif diff -Nru fio-2.1.3/os/os-aix.h fio-3.16/os/os-aix.h --- fio-2.1.3/os/os-aix.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os-aix.h 2019-09-20 01:01:52.000000000 +0000 @@ -11,11 +11,8 @@ #include "../file.h" #define FIO_HAVE_ODIRECT -#define FIO_USE_GENERIC_RAND #define FIO_USE_GENERIC_INIT_RANDOM_STATE -#define FIO_HAVE_PSHARED_MUTEX - #define OS_MAP_ANON MAP_ANON #define OS_MSG_DONTWAIT 0 @@ -23,7 +20,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) diff -Nru fio-2.1.3/os/os-android.h fio-3.16/os/os-android.h --- fio-2.1.3/os/os-android.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os-android.h 
2019-09-20 01:01:52.000000000 +0000 @@ -4,8 +4,10 @@ #define FIO_OS os_android #include +#include #include #include +#include #include #include #include @@ -15,17 +17,22 @@ #include #include -#include "binject.h" +#include "./os-linux-syscall.h" #include "../file.h" +#ifndef __has_builtin // Optional of course. + #define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif + #define FIO_HAVE_DISK_UTIL #define FIO_HAVE_IOSCHED_SWITCH #define FIO_HAVE_IOPRIO +#define FIO_HAVE_IOPRIO_CLASS #define FIO_HAVE_ODIRECT #define FIO_HAVE_HUGETLB #define FIO_HAVE_BLKTRACE -#define FIO_HAVE_PSHARED_MUTEX #define FIO_HAVE_CL_SIZE +#define FIO_HAVE_CGROUPS #define FIO_HAVE_FS_STAT #define FIO_HAVE_TRIM #define FIO_HAVE_GETTID @@ -37,10 +44,13 @@ #define OS_MAP_ANON MAP_ANONYMOUS +#ifndef POSIX_MADV_DONTNEED #define posix_madvise madvise #define POSIX_MADV_DONTNEED MADV_DONTNEED #define POSIX_MADV_SEQUENTIAL MADV_SEQUENTIAL #define POSIX_MADV_RANDOM MADV_RANDOM +#endif + #ifdef MADV_REMOVE #define FIO_MADV_FREE MADV_REMOVE #endif @@ -48,22 +58,19 @@ #define MAP_HUGETLB 0x40000 /* arch specific */ #endif - +#ifndef CONFIG_NO_SHM /* - * The Android NDK doesn't currently export , so define the - * necessary stuff here. 
+ * Bionic doesn't support SysV shared memeory, so implement it using ashmem */ - -#include -#define SHM_HUGETLB 04000 - #include #include -#include +#include +#define shmid_ds shmid64_ds +#define SHM_HUGETLB 04000 #define ASHMEM_DEVICE "/dev/ashmem" -static inline int shmctl (int __shmid, int __cmd, struct shmid_ds *__buf) +static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf) { int ret=0; if (__cmd == IPC_RMID) @@ -76,47 +83,50 @@ return ret; } -static inline int shmget (key_t __key, size_t __size, int __shmflg) +static inline int shmget(key_t __key, size_t __size, int __shmflg) { int fd,ret; - char key[11]; - + char keybuf[11]; + fd = open(ASHMEM_DEVICE, O_RDWR); if (fd < 0) return fd; - sprintf(key,"%d",__key); - ret = ioctl(fd, ASHMEM_SET_NAME, key); + sprintf(keybuf,"%d",__key); + ret = ioctl(fd, ASHMEM_SET_NAME, keybuf); if (ret < 0) goto error; - ret = ioctl(fd, ASHMEM_SET_SIZE, __size); + /* Stores size in first 8 bytes, allocate extra space */ + ret = ioctl(fd, ASHMEM_SET_SIZE, __size + sizeof(uint64_t)); if (ret < 0) goto error; return fd; - + error: close(fd); return ret; } -static inline void *shmat (int __shmid, const void *__shmaddr, int __shmflg) +static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg) { - size_t *ptr, size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); - ptr = mmap(NULL, size + sizeof(size_t), PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0); - *ptr = size; //save size at beginning of buffer, for use with munmap - return &ptr[1]; + size_t size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); + /* Needs to be 8-byte aligned to prevent SIGBUS on 32-bit ARM */ + uint64_t *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0); + /* Save size at beginning of buffer, for use with munmap */ + *ptr = size; + return ptr + 1; } static inline int shmdt (const void *__shmaddr) { - size_t *ptr, size; - ptr = (size_t *)__shmaddr; - ptr--; - size = *ptr; //find mmap size which we stored at the beginning of 
the buffer - return munmap((void *)ptr, size + sizeof(size_t)); + /* Find mmap size which we stored at the beginning of the buffer */ + uint64_t *ptr = (uint64_t *)__shmaddr - 1; + size_t size = *ptr; + return munmap(ptr, size); } +#endif #define SPLICE_DEF_SIZE (64*1024) @@ -136,6 +146,12 @@ #define IOPRIO_BITS 16 #define IOPRIO_CLASS_SHIFT 13 +#define IOPRIO_MIN_PRIO 0 /* highest priority */ +#define IOPRIO_MAX_PRIO 7 /* lowest priority */ + +#define IOPRIO_MIN_PRIO_CLASS 0 +#define IOPRIO_MAX_PRIO_CLASS 3 + static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio) { /* @@ -185,32 +201,25 @@ return (unsigned long long) pages * (unsigned long long) pagesize; } -typedef struct { unsigned short r[3]; } os_random_state_t; - -static inline void os_random_seed(unsigned long seed, os_random_state_t *rs) -{ - rs->r[0] = seed & 0xffff; - seed >>= 16; - rs->r[1] = seed & 0xffff; - seed >>= 16; - rs->r[2] = seed & 0xffff; - seed48(rs->r); -} - -static inline long os_random_long(os_random_state_t *rs) -{ - return nrand48(rs->r); -} - #ifdef O_NOATIME #define FIO_O_NOATIME O_NOATIME #else #define FIO_O_NOATIME 0 #endif -#define fio_swap16(x) __bswap_16(x) -#define fio_swap32(x) __bswap_32(x) -#define fio_swap64(x) __bswap_64(x) +/* Check for GCC or Clang byte swap intrinsics */ +#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \ + && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \ + || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */ +#define fio_swap16(x) __builtin_bswap16(x) +#define fio_swap32(x) __builtin_bswap32(x) +#define fio_swap64(x) __builtin_bswap64(x) +#else +#include +#define fio_swap16(x) bswap_16(x) +#define fio_swap32(x) bswap_32(x) +#define fio_swap64(x) bswap_64(x) +#endif /* fio_swapN */ #define CACHE_LINE_FILE \ "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" @@ -234,7 +243,7 @@ return atoi(size); } -static inline unsigned long long get_fs_size(const char *path) +static inline 
unsigned long long get_fs_free_size(const char *path) { unsigned long long ret; struct statfs s; @@ -247,7 +256,7 @@ return ret; } -static inline int os_trim(int fd, unsigned long long start, +static inline int os_trim(struct fio_file *f, unsigned long long start, unsigned long long len) { uint64_t range[2]; @@ -255,10 +264,18 @@ range[0] = start; range[1] = len; - if (!ioctl(fd, BLKDISCARD, range)) + if (!ioctl(f->fd, BLKDISCARD, range)) return 0; return errno; } +#ifdef CONFIG_SCHED_IDLE +static inline int fio_set_sched_idle(void) +{ + struct sched_param p = { .sched_priority = 0, }; + return sched_setscheduler(gettid(), SCHED_IDLE, &p); +} +#endif + #endif diff -Nru fio-2.1.3/os/os-dragonfly.h fio-3.16/os/os-dragonfly.h --- fio-2.1.3/os/os-dragonfly.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/os/os-dragonfly.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,254 @@ +#ifndef FIO_OS_DRAGONFLY_H +#define FIO_OS_DRAGONFLY_H + +#define FIO_OS os_dragonfly + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* API changed during "5.3 development" */ +#if __DragonFly_version < 500302 +#include +#define DAIOCTRIM IOCTLTRIM +#else +#include +#endif + +#include "../file.h" +#include "../lib/types.h" + +#define FIO_HAVE_ODIRECT +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_TRIM +#define FIO_HAVE_CHARDEV_SIZE +#define FIO_HAVE_GETTID +#define FIO_HAVE_CPU_AFFINITY +#define FIO_HAVE_IOPRIO +#define FIO_HAVE_SHM_ATTACH_REMOVED + +#define OS_MAP_ANON MAP_ANON + +#ifndef PTHREAD_STACK_MIN +#define PTHREAD_STACK_MIN 4096 +#endif + +#define fio_swap16(x) bswap16(x) +#define fio_swap32(x) bswap32(x) +#define fio_swap64(x) bswap64(x) + +/* This is supposed to equal (sizeof(cpumask_t)*8) */ +#define FIO_MAX_CPUS SMP_MAXCPU + +typedef off_t off64_t; +typedef cpumask_t os_cpu_mask_t; + +/* + * These macros are copied from sys/cpu/x86_64/include/types.h. 
+ * It's okay to copy from arch dependent header because x86_64 is the only + * supported arch, and no other arch is going to be supported any time soon. + * + * These are supposed to be able to be included from userspace by defining + * _KERNEL_STRUCTURES, however this scheme is badly broken that enabling it + * causes compile-time conflicts with other headers. Although the current + * upstream code no longer requires _KERNEL_STRUCTURES, they should be kept + * here for compatibility with older versions. + */ +#ifndef CPUMASK_SIMPLE +#define CPUMASK_SIMPLE(cpu) ((uint64_t)1 << (cpu)) +#define CPUMASK_TESTBIT(val, i) ((val).ary[((i) >> 6) & 3] & \ + CPUMASK_SIMPLE((i) & 63)) +#define CPUMASK_ORBIT(mask, i) ((mask).ary[((i) >> 6) & 3] |= \ + CPUMASK_SIMPLE((i) & 63)) +#define CPUMASK_NANDBIT(mask, i) ((mask).ary[((i) >> 6) & 3] &= \ + ~CPUMASK_SIMPLE((i) & 63)) +#define CPUMASK_ASSZERO(mask) do { \ + (mask).ary[0] = 0; \ + (mask).ary[1] = 0; \ + (mask).ary[2] = 0; \ + (mask).ary[3] = 0; \ + } while(0) +#endif + +/* + * Define USCHED_GET_CPUMASK as the macro didn't exist until release 4.5. + * usched_set(2) returns EINVAL if the kernel doesn't support it. + * + * Also note usched_set(2) works only for the current thread regardless of + * the command type. It doesn't work against another thread regardless of + * a caller's privilege. A caller would generally specify 0 for pid for the + * current thread though that's the only choice. See BUGS in usched_set(2). 
+ */ +#ifndef USCHED_GET_CPUMASK +#define USCHED_GET_CPUMASK 5 +#endif + +/* No CPU_COUNT(), but use the default function defined in os/os.h */ +#define fio_cpu_count(mask) CPU_COUNT((mask)) + +static inline int fio_cpuset_init(os_cpu_mask_t *mask) +{ + CPUMASK_ASSZERO(*mask); + return 0; +} + +static inline int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return 0; +} + +static inline void fio_cpu_clear(os_cpu_mask_t *mask, int cpu) +{ + CPUMASK_NANDBIT(*mask, cpu); +} + +static inline void fio_cpu_set(os_cpu_mask_t *mask, int cpu) +{ + CPUMASK_ORBIT(*mask, cpu); +} + +static inline bool fio_cpu_isset(os_cpu_mask_t *mask, int cpu) +{ + return CPUMASK_TESTBIT(*mask, cpu) != 0; +} + +static inline int fio_setaffinity(int pid, os_cpu_mask_t mask) +{ + int i, firstcall = 1; + + /* 0 for the current thread, see BUGS in usched_set(2) */ + pid = 0; + + for (i = 0; i < FIO_MAX_CPUS; i++) { + if (!CPUMASK_TESTBIT(mask, i)) + continue; + if (firstcall) { + if (usched_set(pid, USCHED_SET_CPU, &i, sizeof(int))) + return -1; + firstcall = 0; + } else { + if (usched_set(pid, USCHED_ADD_CPU, &i, sizeof(int))) + return -1; + } + } + + return 0; +} + +static inline int fio_getaffinity(int pid, os_cpu_mask_t *mask) +{ + /* 0 for the current thread, see BUGS in usched_set(2) */ + pid = 0; + + if (usched_set(pid, USCHED_GET_CPUMASK, mask, sizeof(*mask))) + return -1; + + return 0; +} + +/* fio code is Linux based, so rename macros to Linux style */ +#define IOPRIO_WHO_PROCESS PRIO_PROCESS +#define IOPRIO_WHO_PGRP PRIO_PGRP +#define IOPRIO_WHO_USER PRIO_USER + +#define IOPRIO_MIN_PRIO 1 /* lowest priority */ +#define IOPRIO_MAX_PRIO 10 /* highest priority */ + +/* + * Prototypes declared in sys/sys/resource.h are preventing from defining + * ioprio_set() with 4 arguments, so define fio's ioprio_set() as a macro. + * Note that there is no idea of class within ioprio_set(2) unlike Linux. 
+ */ +#define ioprio_set(which, who, ioprio_class, ioprio) \ + ioprio_set(which, who, ioprio) + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + struct partinfo pi; + + if (!ioctl(f->fd, DIOCGPART, &pi)) { + *bytes = (unsigned long long) pi.media_size; + return 0; + } + + *bytes = 0; + return errno; +} + +static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) +{ + return blockdev_size(f, bytes); +} + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + int mib[2] = { CTL_HW, HW_PHYSMEM }; + uint64_t mem; + size_t len = sizeof(mem); + + sysctl(mib, 2, &mem, &len, NULL, 0); + return mem; +} + +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return (int) lwp_gettid(); +} +#endif + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +static inline int os_trim(struct fio_file *f, unsigned long long start, + unsigned long long len) +{ + off_t range[2]; + + range[0] = start; + range[1] = len; + + if (!ioctl(f->fd, DAIOCTRIM, range)) + return 0; + + return errno; +} + +#ifdef MADV_FREE +#define FIO_MADV_FREE MADV_FREE +#endif + +static inline int shm_attach_to_open_removed(void) +{ + int x; + size_t len = sizeof(x); + + if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0) + return 0; + + return x > 0 ? 
1 : 0; +} + +#endif diff -Nru fio-2.1.3/os/os-freebsd.h fio-3.16/os/os-freebsd.h --- fio-2.1.3/os/os-freebsd.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os-freebsd.h 2019-09-20 01:01:52.000000000 +0000 @@ -6,19 +6,23 @@ #include #include #include +#include #include #include #include #include +#include #include "../file.h" #define FIO_HAVE_ODIRECT -#define FIO_USE_GENERIC_RAND #define FIO_USE_GENERIC_INIT_RANDOM_STATE #define FIO_HAVE_CHARDEV_SIZE +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_TRIM #define FIO_HAVE_GETTID #define FIO_HAVE_CPU_AFFINITY +#define FIO_HAVE_SHM_ATTACH_REMOVED #define OS_MAP_ANON MAP_ANON @@ -32,6 +36,8 @@ #define fio_cpu_clear(mask, cpu) (void) CPU_CLR((cpu), (mask)) #define fio_cpu_set(mask, cpu) (void) CPU_SET((cpu), (mask)) +#define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0) +#define fio_cpu_count(mask) CPU_COUNT((mask)) static inline int fio_cpuset_init(os_cpu_mask_t *mask) { @@ -76,7 +82,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) @@ -97,8 +103,46 @@ return (int) lwpid; } +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +static inline int os_trim(struct fio_file *f, unsigned long long start, + unsigned long long len) +{ + off_t range[2]; + + range[0] = start; + range[1] = len; + + if (!ioctl(f->fd, DIOCGDELETE, range)) + return 0; + + return errno; +} + #ifdef MADV_FREE #define FIO_MADV_FREE MADV_FREE #endif +static inline int shm_attach_to_open_removed(void) +{ + int x; + size_t len = sizeof(x); + + if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0) + return 0; + + return x > 0 ? 
1 : 0; +} + #endif diff -Nru fio-2.1.3/os/os.h fio-3.16/os/os.h --- fio-2.1.3/os/os.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os.h 2019-09-20 01:01:52.000000000 +0000 @@ -8,7 +8,8 @@ #include #include -#include "../arch/arch.h" +#include "../arch/arch.h" /* IWYU pragma: export */ +#include "../lib/types.h" enum { os_linux = 1, @@ -17,19 +18,28 @@ os_hpux, os_mac, os_netbsd, + os_openbsd, os_solaris, os_windows, os_android, + os_dragonfly, os_nr, }; +typedef enum { + CPU_ARM64_CRC32C, +} cpu_features; + +/* IWYU pragma: begin_exports */ #if defined(__ANDROID__) #include "os-android.h" #elif defined(__linux__) #include "os-linux.h" #elif defined(__FreeBSD__) #include "os-freebsd.h" +#elif defined(__OpenBSD__) +#include "os-openbsd.h" #elif defined(__NetBSD__) #include "os-netbsd.h" #elif defined(__sun__) @@ -42,6 +52,8 @@ #include "os-hpux.h" #elif defined(WIN32) #include "os-windows.h" +#elif defined (__DragonFly__) +#include "os-dragonfly.h" #else #error "unsupported os" #endif @@ -53,14 +65,14 @@ #endif #endif -#ifdef FIO_HAVE_SGIO -#include -#include +#ifndef CONFIG_STRSEP +#include "../oslib/strsep.h" #endif -#ifndef CONFIG_STRSEP -#include "../lib/strsep.h" +#ifndef CONFIG_STRLCAT +#include "../oslib/strlcat.h" #endif +/* IWYU pragma: end_exports */ #ifdef MSG_DONTWAIT #define OS_MSG_DONTWAIT MSG_DONTWAIT @@ -70,14 +82,34 @@ #define POSIX_FADV_DONTNEED (0) #define POSIX_FADV_SEQUENTIAL (0) #define POSIX_FADV_RANDOM (0) +#define POSIX_FADV_NORMAL (0) #endif #ifndef FIO_HAVE_CPU_AFFINITY -#define fio_setaffinity(pid, mask) (0) -#define fio_getaffinity(pid, mask) do { } while (0) #define fio_cpu_clear(mask, cpu) do { } while (0) -#define fio_cpuset_exit(mask) (-1) typedef unsigned long os_cpu_mask_t; + +static inline int fio_setaffinity(int pid, os_cpu_mask_t cpumask) +{ + return 0; +} + +static inline int fio_getaffinity(int pid, os_cpu_mask_t *cpumask) +{ + return -1; +} + +static inline int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return -1; +} + 
+static inline int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu_index) +{ + return 0; +} +#else +extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu); #endif #ifndef FIO_HAVE_IOPRIO @@ -90,6 +122,12 @@ #define OS_O_DIRECT O_DIRECT #endif +#ifdef OS_O_ATOMIC +#define FIO_O_ATOMIC OS_O_ATOMIC +#else +#define FIO_O_ATOMIC 0 +#endif + #ifndef FIO_HAVE_HUGETLB #define SHM_HUGETLB 0 #define MAP_HUGETLB 0 @@ -119,11 +157,11 @@ #endif #ifndef FIO_PREFERRED_ENGINE -#define FIO_PREFERRED_ENGINE "sync" +#define FIO_PREFERRED_ENGINE "psync" #endif #ifndef FIO_OS_PATH_SEPARATOR -#define FIO_OS_PATH_SEPARATOR "/" +#define FIO_OS_PATH_SEPARATOR '/' #endif #ifndef FIO_PREFERRED_CLOCK_SOURCE @@ -135,7 +173,7 @@ #endif #ifndef FIO_MAX_JOBS -#define FIO_MAX_JOBS 2048 +#define FIO_MAX_JOBS 4096 #endif #ifndef CONFIG_SOCKLEN_T @@ -172,59 +210,86 @@ #ifndef FIO_HAVE_BYTEORDER_FUNCS #ifdef CONFIG_LITTLE_ENDIAN +#define __be16_to_cpu(x) fio_swap16(x) +#define __be32_to_cpu(x) fio_swap32(x) +#define __be64_to_cpu(x) fio_swap64(x) #define __le16_to_cpu(x) (x) #define __le32_to_cpu(x) (x) #define __le64_to_cpu(x) (x) +#define __cpu_to_be16(x) fio_swap16(x) +#define __cpu_to_be32(x) fio_swap32(x) +#define __cpu_to_be64(x) fio_swap64(x) #define __cpu_to_le16(x) (x) #define __cpu_to_le32(x) (x) #define __cpu_to_le64(x) (x) #else +#define __be16_to_cpu(x) (x) +#define __be32_to_cpu(x) (x) +#define __be64_to_cpu(x) (x) #define __le16_to_cpu(x) fio_swap16(x) #define __le32_to_cpu(x) fio_swap32(x) #define __le64_to_cpu(x) fio_swap64(x) +#define __cpu_to_be16(x) (x) +#define __cpu_to_be32(x) (x) +#define __cpu_to_be64(x) (x) #define __cpu_to_le16(x) fio_swap16(x) #define __cpu_to_le32(x) fio_swap32(x) #define __cpu_to_le64(x) fio_swap64(x) #endif #endif /* FIO_HAVE_BYTEORDER_FUNCS */ +#ifdef FIO_INTERNAL +#define be16_to_cpu(val) ({ \ + typecheck(uint16_t, val); \ + __be16_to_cpu(val); \ +}) +#define be32_to_cpu(val) ({ \ + typecheck(uint32_t, val); \ + __be32_to_cpu(val); \ +}) 
+#define be64_to_cpu(val) ({ \ + typecheck(uint64_t, val); \ + __be64_to_cpu(val); \ +}) #define le16_to_cpu(val) ({ \ - uint16_t *__val = &(val); \ - __le16_to_cpu(*__val); \ + typecheck(uint16_t, val); \ + __le16_to_cpu(val); \ }) #define le32_to_cpu(val) ({ \ - uint32_t *__val = &(val); \ - __le32_to_cpu(*__val); \ + typecheck(uint32_t, val); \ + __le32_to_cpu(val); \ }) #define le64_to_cpu(val) ({ \ - uint64_t *__val = &(val); \ - __le64_to_cpu(*__val); \ + typecheck(uint64_t, val); \ + __le64_to_cpu(val); \ +}) +#endif + +#define cpu_to_be16(val) ({ \ + typecheck(uint16_t, val); \ + __cpu_to_be16(val); \ +}) +#define cpu_to_be32(val) ({ \ + typecheck(uint32_t, val); \ + __cpu_to_be32(val); \ +}) +#define cpu_to_be64(val) ({ \ + typecheck(uint64_t, val); \ + __cpu_to_be64(val); \ }) #define cpu_to_le16(val) ({ \ - uint16_t *__val = &(val); \ - __cpu_to_le16(*__val); \ + typecheck(uint16_t, val); \ + __cpu_to_le16(val); \ }) #define cpu_to_le32(val) ({ \ - uint32_t *__val = &(val); \ - __cpu_to_le32(*__val); \ + typecheck(uint32_t, val); \ + __cpu_to_le32(val); \ }) #define cpu_to_le64(val) ({ \ - uint64_t *__val = &(val); \ - __cpu_to_le64(*__val); \ + typecheck(uint64_t, val); \ + __cpu_to_le64(val); \ }) -#ifndef FIO_HAVE_BLKTRACE -static inline int is_blktrace(const char *fname) -{ - return 0; -} -struct thread_data; -static inline int load_blktrace(struct thread_data *td, const char *fname) -{ - return 1; -} -#endif - #define FIO_DEF_CL_SIZE 128 static inline int os_cache_line_size(void) @@ -257,30 +322,8 @@ } #endif -#ifdef FIO_USE_GENERIC_RAND -typedef unsigned int os_random_state_t; - -static inline void os_random_seed(unsigned long seed, os_random_state_t *rs) -{ - srand(seed); -} - -static inline long os_random_long(os_random_state_t *rs) -{ - long val; - - val = rand_r(rs); - return val; -} -#endif - #ifdef FIO_USE_GENERIC_INIT_RANDOM_STATE -extern void td_fill_rand_seeds(struct thread_data *td); -/* - * Initialize the various random states we need 
(random io, block size ranges, - * read/write mix, etc). - */ -static inline int init_random_state(struct thread_data *td, unsigned long *rand_seeds, int size) +static inline int init_random_seeds(uint64_t *rand_seeds, int size) { int fd; @@ -295,13 +338,12 @@ } close(fd); - td_fill_rand_seeds(td); return 0; } #endif #ifndef FIO_HAVE_FS_STAT -static inline unsigned long long get_fs_size(const char *path) +static inline unsigned long long get_fs_free_size(const char *path) { return 0; } @@ -314,11 +356,55 @@ } #endif +#ifndef CPU_COUNT +#ifdef FIO_HAVE_CPU_AFFINITY +static inline int CPU_COUNT(os_cpu_mask_t *mask) +{ + int max_cpus = cpus_online(); + int nr_cpus, i; + + for (i = 0, nr_cpus = 0; i < max_cpus; i++) + if (fio_cpu_isset(mask, i)) + nr_cpus++; + + return nr_cpus; +} +#endif +#endif + #ifndef FIO_HAVE_GETTID +#ifndef CONFIG_HAVE_GETTID static inline int gettid(void) { return getpid(); } #endif +#endif + +#ifndef FIO_HAVE_SHM_ATTACH_REMOVED +static inline int shm_attach_to_open_removed(void) +{ + return 0; +} +#endif + +#ifndef FIO_HAVE_NATIVE_FALLOCATE +static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len) +{ + errno = ENOSYS; + return false; +} +#endif + +#if defined(CONFIG_POSIX_FALLOCATE) || defined(FIO_HAVE_NATIVE_FALLOCATE) +# define FIO_HAVE_ANY_FALLOCATE +#endif + +#ifndef FIO_HAVE_CPU_HAS +static inline bool os_cpu_has(cpu_features feature) +{ + return false; +} +#endif #endif diff -Nru fio-2.1.3/os/os-hpux.h fio-3.16/os/os-hpux.h --- fio-2.1.3/os/os-hpux.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os-hpux.h 2019-09-20 01:01:52.000000000 +0000 @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -20,9 +20,7 @@ #include "../file.h" #define FIO_HAVE_ODIRECT -#define FIO_USE_GENERIC_RAND #define FIO_USE_GENERIC_INIT_RANDOM_STATE -#define FIO_HAVE_PSHARED_MUTEX #define FIO_HAVE_CHARDEV_SIZE #define OS_MAP_ANON MAP_ANONYMOUS @@ -44,7 +42,7 @@ static inline int 
blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) diff -Nru fio-2.1.3/os/os-linux.h fio-3.16/os/os-linux.h --- fio-2.1.3/os/os-linux.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os-linux.h 2019-09-20 01:01:52.000000000 +0000 @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -15,27 +16,41 @@ #include #include #include +#include +#include -#include "binject.h" +#ifdef ARCH_HAVE_CRC_CRYPTO +#include +#ifndef HWCAP_CRC32 +#define HWCAP_CRC32 (1 << 7) +#endif /* HWCAP_CRC32 */ +#endif /* ARCH_HAVE_CRC_CRYPTO */ + +#include "./os-linux-syscall.h" #include "../file.h" +#ifndef __has_builtin // Optional of course. + #define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif + #define FIO_HAVE_CPU_AFFINITY #define FIO_HAVE_DISK_UTIL #define FIO_HAVE_SGIO #define FIO_HAVE_IOPRIO +#define FIO_HAVE_IOPRIO_CLASS #define FIO_HAVE_IOSCHED_SWITCH #define FIO_HAVE_ODIRECT #define FIO_HAVE_HUGETLB #define FIO_HAVE_RAWBIND #define FIO_HAVE_BLKTRACE -#define FIO_HAVE_PSHARED_MUTEX #define FIO_HAVE_CL_SIZE #define FIO_HAVE_CGROUPS #define FIO_HAVE_FS_STAT #define FIO_HAVE_TRIM -#define FIO_HAVE_BINJECT #define FIO_HAVE_GETTID #define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_PWRITEV2 +#define FIO_HAVE_SHM_ATTACH_REMOVED #ifdef MAP_HUGETLB #define FIO_HAVE_MMAP_HUGE @@ -45,8 +60,6 @@ typedef cpu_set_t os_cpu_mask_t; -typedef struct drand48_data os_random_state_t; - #ifdef CONFIG_3ARG_AFFINITY #define fio_setaffinity(pid, cpumask) \ sched_setaffinity((pid), sizeof(cpumask), &(cpumask)) @@ -61,6 +74,8 @@ #define fio_cpu_clear(mask, cpu) (void) CPU_CLR((cpu), (mask)) #define fio_cpu_set(mask, cpu) (void) CPU_SET((cpu), (mask)) +#define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0) +#define fio_cpu_count(mask) CPU_COUNT((mask)) static inline int fio_cpuset_init(os_cpu_mask_t *mask) { @@ -91,6 
+106,12 @@ #define IOPRIO_BITS 16 #define IOPRIO_CLASS_SHIFT 13 +#define IOPRIO_MIN_PRIO 0 /* highest priority */ +#define IOPRIO_MAX_PRIO 7 /* lowest priority */ + +#define IOPRIO_MIN_PRIO_CLASS 0 +#define IOPRIO_MAX_PRIO_CLASS 3 + static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio) { /* @@ -103,10 +124,12 @@ return syscall(__NR_ioprio_set, which, who, ioprio); } +#ifndef CONFIG_HAVE_GETTID static inline int gettid(void) { return syscall(__NR_gettid); } +#endif #define SPLICE_DEF_SIZE (64*1024) @@ -147,19 +170,6 @@ return (unsigned long long) pages * (unsigned long long) pagesize; } -static inline void os_random_seed(unsigned long seed, os_random_state_t *rs) -{ - srand48_r(seed, rs); -} - -static inline long os_random_long(os_random_state_t *rs) -{ - long val; - - lrand48_r(rs, &val); - return val; -} - static inline int fio_lookup_raw(dev_t dev, int *majdev, int *mindev) { struct raw_config_request rq; @@ -196,13 +206,29 @@ #define FIO_O_NOATIME 0 #endif +#ifdef O_ATOMIC +#define OS_O_ATOMIC O_ATOMIC +#else +#define OS_O_ATOMIC 040000000 +#endif + #ifdef MADV_REMOVE #define FIO_MADV_FREE MADV_REMOVE #endif -#define fio_swap16(x) __bswap_16(x) -#define fio_swap32(x) __bswap_32(x) -#define fio_swap64(x) __bswap_64(x) +/* Check for GCC or Clang byte swap intrinsics */ +#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \ + && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \ + || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */ +#define fio_swap16(x) __builtin_bswap16(x) +#define fio_swap32(x) __builtin_bswap32(x) +#define fio_swap64(x) __builtin_bswap64(x) +#else +#include +#define fio_swap16(x) bswap_16(x) +#define fio_swap32(x) bswap_32(x) +#define fio_swap64(x) bswap_64(x) +#endif /* fio_swapN */ #define CACHE_LINE_FILE \ "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" @@ -226,7 +252,15 @@ return atoi(size); } -static inline unsigned long long get_fs_size(const char *path) +#ifdef 
__powerpc64__ +#define FIO_HAVE_CPU_ONLINE_SYSCONF +static inline unsigned int cpus_online(void) +{ + return sysconf(_SC_NPROCESSORS_CONF); +} +#endif + +static inline unsigned long long get_fs_free_size(const char *path) { unsigned long long ret; struct statfs s; @@ -239,7 +273,7 @@ return ret; } -static inline int os_trim(int fd, unsigned long long start, +static inline int os_trim(struct fio_file *f, unsigned long long start, unsigned long long len) { uint64_t range[2]; @@ -247,7 +281,7 @@ range[0] = start; range[1] = len; - if (!ioctl(fd, BLKDISCARD, range)) + if (!ioctl(f->fd, BLKDISCARD, range)) return 0; return errno; @@ -261,4 +295,131 @@ } #endif +#ifndef F_GET_RW_HINT +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) +#endif + +#ifndef RWH_WRITE_LIFE_NONE +#define RWH_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 +#endif + +#define FIO_HAVE_WRITE_HINT + +#ifndef RWF_HIPRI +#define RWF_HIPRI 0x00000001 +#endif +#ifndef RWF_DSYNC +#define RWF_DSYNC 0x00000002 +#endif +#ifndef RWF_SYNC +#define RWF_SYNC 0x00000004 +#endif + +#ifndef RWF_WRITE_LIFE_SHIFT +#define RWF_WRITE_LIFE_SHIFT 4 +#define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT) +#define RWF_WRITE_LIFE_MEDIUM (2 << RWF_WRITE_LIFE_SHIFT) +#define RWF_WRITE_LIFE_LONG (3 << RWF_WRITE_LIFE_SHIFT) +#define RWF_WRITE_LIFE_EXTREME (4 << RWF_WRITE_LIFE_SHIFT) +#endif + +#ifndef CONFIG_PWRITEV2 +#ifdef __NR_preadv2 +static inline void make_pos_h_l(unsigned long *pos_h, unsigned long *pos_l, + off_t offset) +{ +#if BITS_PER_LONG == 64 + *pos_l = offset; + *pos_h = 0; +#else + *pos_l = offset & 0xffffffff; + *pos_h = ((uint64_t) 
offset) >> 32; +#endif +} +static inline ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, + off_t offset, unsigned int flags) +{ + unsigned long pos_l, pos_h; + + make_pos_h_l(&pos_h, &pos_l, offset); + return syscall(__NR_preadv2, fd, iov, iovcnt, pos_l, pos_h, flags); +} +static inline ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, + off_t offset, unsigned int flags) +{ + unsigned long pos_l, pos_h; + + make_pos_h_l(&pos_h, &pos_l, offset); + return syscall(__NR_pwritev2, fd, iov, iovcnt, pos_l, pos_h, flags); +} +#else +static inline ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, + off_t offset, unsigned int flags) +{ + errno = ENOSYS; + return -1; +} +static inline ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, + off_t offset, unsigned int flags) +{ + errno = ENOSYS; + return -1; +} +#endif /* __NR_preadv2 */ +#endif /* CONFIG_PWRITEV2 */ + +static inline int shm_attach_to_open_removed(void) +{ + return 1; +} + +#ifdef CONFIG_LINUX_FALLOCATE +#define FIO_HAVE_NATIVE_FALLOCATE +static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, + uint64_t len) +{ + int ret; + ret = fallocate(f->fd, 0, offset, len); + if (ret == 0) + return true; + + /* Work around buggy old glibc versions... 
*/ + if (ret > 0) + errno = ret; + + return false; +} +#endif + +#define FIO_HAVE_CPU_HAS +static inline bool os_cpu_has(cpu_features feature) +{ + bool have_feature; + unsigned long fio_unused hwcap; + + switch (feature) { +#ifdef ARCH_HAVE_CRC_CRYPTO + case CPU_ARM64_CRC32C: + hwcap = getauxval(AT_HWCAP); + have_feature = (hwcap & HWCAP_CRC32) != 0; + break; +#endif + default: + have_feature = false; + } + + return have_feature; +} + #endif diff -Nru fio-2.1.3/os/os-linux-syscall.h fio-3.16/os/os-linux-syscall.h --- fio-2.1.3/os/os-linux-syscall.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/os/os-linux-syscall.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,277 @@ +#ifndef FIO_OS_LINUX_SYSCALL_H +#define FIO_OS_LINUX_SYSCALL_H + +#include "../arch/arch.h" + +/* Linux syscalls for x86 */ +#if defined(ARCH_X86_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 289 +#define __NR_ioprio_get 290 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 250 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 313 +#define __NR_sys_tee 315 +#define __NR_sys_vmsplice 316 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 378 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 379 +#endif + +/* Linux syscalls for x86_64 */ +#elif defined(ARCH_X86_64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 251 +#define __NR_ioprio_get 252 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 221 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 275 +#define __NR_sys_tee 276 +#define __NR_sys_vmsplice 278 +#endif + +#ifndef __NR_shmget +#define __NR_shmget 29 +#define __NR_shmat 30 +#define __NR_shmctl 31 +#define __NR_shmdt 67 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 327 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 328 +#endif + +/* Linux syscalls for ppc */ +#elif defined(ARCH_PPC_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 273 +#define __NR_ioprio_get 274 +#endif + +#ifndef __NR_fadvise64 +#define 
__NR_fadvise64 233 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 283 +#define __NR_sys_tee 284 +#define __NR_sys_vmsplice 285 +#endif + +/* Linux syscalls for ia64 */ +#elif defined(ARCH_IA64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 1274 +#define __NR_ioprio_get 1275 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 1234 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 1297 +#define __NR_sys_tee 1301 +#define __NR_sys_vmsplice 1302 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 1348 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 1349 +#endif + +/* Linux syscalls for alpha */ +#elif defined(ARCH_ALPHA_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 442 +#define __NR_ioprio_get 443 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 413 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 468 +#define __NR_sys_tee 470 +#define __NR_sys_vmsplice 471 +#endif + +/* Linux syscalls for s390 */ +#elif defined(ARCH_S390_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 282 +#define __NR_ioprio_get 283 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 253 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 306 +#define __NR_sys_tee 308 +#define __NR_sys_vmsplice 309 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 376 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 377 +#endif + +/* Linux syscalls for sparc */ +#elif defined(ARCH_SPARC_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 196 +#define __NR_ioprio_get 218 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 209 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 232 +#define __NR_sys_tee 280 +#define __NR_sys_vmsplice 25 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 358 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 359 +#endif + +/* Linux syscalls for sparc64 */ +#elif defined(ARCH_SPARC64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 196 +#define __NR_ioprio_get 218 
+#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 209 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 232 +#define __NR_sys_tee 280 +#define __NR_sys_vmsplice 25 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 358 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 359 +#endif + +/* Linux syscalls for arm */ +#elif defined(ARCH_ARM_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 314 +#define __NR_ioprio_get 315 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 270 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 340 +#define __NR_sys_tee 342 +#define __NR_sys_vmsplice 343 +#endif + +#ifndef __NR_preadv2 +#define __NR_preadv2 392 +#endif +#ifndef __NR_pwritev2 +#define __NR_pwritev2 393 +#endif + +/* Linux syscalls for mips */ +#elif defined(ARCH_MIPS64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 314 +#define __NR_ioprio_get 315 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 215 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 263 +#define __NR_sys_tee 265 +#define __NR_sys_vmsplice 266 +#endif + +/* Linux syscalls for sh */ +#elif defined(ARCH_SH_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 288 +#define __NR_ioprio_get 289 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 250 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 313 +#define __NR_sys_tee 315 +#define __NR_sys_vmsplice 316 +#endif + +/* Linux syscalls for hppa */ +#elif defined(ARCH_HPPA_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 267 +#define __NR_ioprio_get 268 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 236 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 291 +#define __NR_sys_tee 293 +#define __NR_sys_vmsplice 294 +#endif + +/* Linux syscalls for aarch64 */ +#elif defined(ARCH_AARCH64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 30 +#define __NR_ioprio_get 31 +#endif + +#else +#warning "Unknown architecture" +#endif + +#endif /* FIO_OS_LINUX_SYSCALL_H 
*/ diff -Nru fio-2.1.3/os/os-mac.h fio-3.16/os/os-mac.h --- fio-2.1.3/os/os-mac.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os-mac.h 2019-09-20 01:01:52.000000000 +0000 @@ -16,10 +16,10 @@ #include "../file.h" -#define FIO_USE_GENERIC_RAND #define FIO_USE_GENERIC_INIT_RANDOM_STATE #define FIO_HAVE_GETTID #define FIO_HAVE_CHARDEV_SIZE +#define FIO_HAVE_NATIVE_FALLOCATE #define OS_MAP_ANON MAP_ANON @@ -35,81 +35,14 @@ typedef off_t off64_t; -/* OS X as of 10.6 doesn't have the timer_* functions. - * Emulate the functionality using setitimer and sigaction here - */ - -#define MAX_TIMERS 64 - +#ifndef CONFIG_CLOCKID_T typedef unsigned int clockid_t; -typedef unsigned int timer_t; - -struct itimerspec { - struct timespec it_value; - struct timespec it_interval; -}; - -static struct sigevent fio_timers[MAX_TIMERS]; -static unsigned int num_timers = 0; - -static void sig_alrm(int signum) -{ - union sigval sv; - - for (int i = 0; i < num_timers; i++) { - if (fio_timers[i].sigev_notify_function == NULL) - continue; - - if (fio_timers[i].sigev_notify == SIGEV_THREAD) - fio_timers[i].sigev_notify_function(sv); - else if (fio_timers[i].sigev_notify == SIGEV_SIGNAL) - kill(getpid(), fio_timers[i].sigev_signo); - } -} - -static inline int timer_settime(timer_t timerid, int flags, - const struct itimerspec *value, - struct itimerspec *ovalue) -{ - struct sigaction sa; - struct itimerval tv; - struct itimerval tv_out; - int rc; - - tv.it_interval.tv_sec = value->it_interval.tv_sec; - tv.it_interval.tv_usec = value->it_interval.tv_nsec / 1000; - - tv.it_value.tv_sec = value->it_value.tv_sec; - tv.it_value.tv_usec = value->it_value.tv_nsec / 1000; - - sa.sa_handler = sig_alrm; - sigemptyset(&sa.sa_mask); - sa.sa_flags = 0; - - rc = sigaction(SIGALRM, &sa, NULL); - - if (!rc) - rc = setitimer(ITIMER_REAL, &tv, &tv_out); - - if (!rc && ovalue != NULL) { - ovalue->it_interval.tv_sec = tv_out.it_interval.tv_sec; - ovalue->it_interval.tv_nsec = tv_out.it_interval.tv_usec * 
1000; - ovalue->it_value.tv_sec = tv_out.it_value.tv_sec; - ovalue->it_value.tv_nsec = tv_out.it_value.tv_usec * 1000; - } - - return rc; -} - -static inline int timer_delete(timer_t timer) -{ - return 0; -} +#endif #define FIO_OS_DIRECTIO -static inline int fio_set_odirect(int fd) +static inline int fio_set_odirect(struct fio_file *f) { - if (fcntl(fd, F_NOCACHE, 1) == -1) + if (fcntl(f->fd, F_NOCACHE, 1) == -1) return errno; return 0; } @@ -144,7 +77,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) @@ -157,10 +90,12 @@ return mem; } +#ifndef CONFIG_HAVE_GETTID static inline int gettid(void) { return mach_thread_self(); } +#endif /* * For some reason, there's no header definition for fdatasync(), even @@ -168,4 +103,15 @@ */ extern int fdatasync(int fd); +static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len) +{ + fstore_t store = {F_ALLOCATEALL, F_PEOFPOSMODE, offset, len}; + if (fcntl(f->fd, F_PREALLOCATE, &store) != -1) { + if (ftruncate(f->fd, len) == 0) + return true; + } + + return false; +} + #endif diff -Nru fio-2.1.3/os/os-netbsd.h fio-3.16/os/os-netbsd.h --- fio-2.1.3/os/os-netbsd.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os-netbsd.h 2019-09-20 01:01:52.000000000 +0000 @@ -6,9 +6,14 @@ #include #include #include -/* XXX hack to avoid confilcts between rbtree.h and */ -#define rb_node _rb_node +#include +#include +#include +#include +#include #include + +/* XXX hack to avoid confilcts between rbtree.h and */ #undef rb_node #undef rb_left #undef rb_right @@ -16,13 +21,10 @@ #include "../file.h" #define FIO_HAVE_ODIRECT -#define FIO_USE_GENERIC_BDEV_SIZE -#define FIO_USE_GENERIC_RAND #define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_FS_STAT #define FIO_HAVE_GETTID -#undef FIO_HAVE_CPU_AFFINITY /* XXX notyet */ - #define OS_MAP_ANON MAP_ANON #ifndef PTHREAD_STACK_MIN @@ -35,9 +37,22 @@ typedef off_t 
off64_t; +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + struct disklabel dl; + + if (!ioctl(f->fd, DIOCGDINFO, &dl)) { + *bytes = ((unsigned long long)dl.d_secperunit) * dl.d_secsize; + return 0; + } + + *bytes = 0; + return errno; +} + static inline int blockdev_invalidate_cache(struct fio_file *f) { - return EINVAL; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) @@ -50,16 +65,28 @@ return mem; } +#ifndef CONFIG_HAVE_GETTID static inline int gettid(void) { return (int) _lwp_self(); } +#endif + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} #ifdef MADV_FREE #define FIO_MADV_FREE MADV_FREE #endif -/* XXX NetBSD doesn't have getopt_long_only */ -#define getopt_long_only getopt_long - #endif diff -Nru fio-2.1.3/os/os-openbsd.h fio-3.16/os/os-openbsd.h --- fio-2.1.3/os/os-openbsd.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/os/os-openbsd.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,122 @@ +#ifndef FIO_OS_OPENBSD_H +#define FIO_OS_OPENBSD_H + +#define FIO_OS os_openbsd + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX hack to avoid conflicts between rbtree.h and */ +#undef RB_BLACK +#undef RB_RED +#undef RB_ROOT + +#include "../file.h" + +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_GETTID +#define FIO_HAVE_SHM_ATTACH_REMOVED + +#define OS_MAP_ANON MAP_ANON + +#ifndef PTHREAD_STACK_MIN +#define PTHREAD_STACK_MIN 4096 +#endif + +#define fio_swap16(x) bswap16(x) +#define fio_swap32(x) bswap32(x) +#define fio_swap64(x) bswap64(x) + +typedef off_t off64_t; + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + struct disklabel dl; + + if (!ioctl(f->fd, DIOCGDINFO, &dl)) { + *bytes = 
((unsigned long long)dl.d_secperunit) * dl.d_secsize; + return 0; + } + + *bytes = 0; + return errno; +} + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + int mib[2] = { CTL_HW, HW_PHYSMEM64 }; + uint64_t mem; + size_t len = sizeof(mem); + + sysctl(mib, 2, &mem, &len, NULL, 0); + return mem; +} + +#ifndef CONFIG_HAVE_GETTID +static inline int gettid(void) +{ + return (int)(intptr_t) pthread_self(); +} +#endif + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +#ifdef MADV_FREE +#define FIO_MADV_FREE MADV_FREE +#endif + +static inline int shm_attach_to_open_removed(void) +{ + struct utsname uts; + int major, minor; + + if (uname(&uts) == -1) + return 0; + + /* + * Return 1 if >= OpenBSD 5.1 according to 97900ebf, + * assuming both major/minor versions are < 10. 
+ */ + if (uts.release[0] > '9' || uts.release[0] < '0') + return 0; + if (uts.release[1] != '.') + return 0; + if (uts.release[2] > '9' || uts.release[2] < '0') + return 0; + + major = uts.release[0] - '0'; + minor = uts.release[2] - '0'; + + if (major > 5) + return 1; + if (major == 5 && minor >= 1) + return 1; + + return 0; +} + +#endif diff -Nru fio-2.1.3/os/os-solaris.h fio-3.16/os/os-solaris.h --- fio-2.1.3/os/os-solaris.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os-solaris.h 2019-09-20 01:01:52.000000000 +0000 @@ -5,19 +5,23 @@ #include #include +#include #include -#include +#include #include #include #include #include +#include +#include #include "../file.h" +#include "../lib/types.h" #define FIO_HAVE_CPU_AFFINITY -#define FIO_HAVE_PSHARED_MUTEX #define FIO_HAVE_CHARDEV_SIZE #define FIO_USE_GENERIC_BDEV_SIZE +#define FIO_HAVE_FS_STAT #define FIO_USE_GENERIC_INIT_RANDOM_STATE #define FIO_HAVE_GETTID @@ -43,7 +47,6 @@ #define FIO_OS_HAS_CTIME_R typedef psetid_t os_cpu_mask_t; -typedef struct solaris_rand_seed os_random_state_t; static inline int chardev_size(struct fio_file *f, unsigned long long *bytes) { @@ -60,34 +63,39 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - return 0; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) { - return 0; -} + long pagesize, pages; -static inline void os_random_seed(unsigned long seed, os_random_state_t *rs) -{ - rs->r[0] = seed & 0xffff; - seed >>= 16; - rs->r[1] = seed & 0xffff; - seed >>= 16; - rs->r[2] = seed & 0xffff; - seed48(rs->r); + pagesize = sysconf(_SC_PAGESIZE); + pages = sysconf(_SC_PHYS_PAGES); + if (pages == -1 || pagesize == -1) + return 0; + + return (unsigned long long) pages * (unsigned long long) pagesize; } -static inline long os_random_long(os_random_state_t *rs) +static inline unsigned long long get_fs_free_size(const char *path) { - return nrand48(rs->r); + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return 
-1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; } #define FIO_OS_DIRECTIO extern int directio(int, int); -static inline int fio_set_odirect(int fd) +static inline int fio_set_odirect(struct fio_file *f) { - if (directio(fd, DIRECTIO_ON) < 0) + if (directio(f->fd, DIRECTIO_ON) < 0) return errno; return 0; @@ -97,12 +105,49 @@ * pset binding hooks for fio */ #define fio_setaffinity(pid, cpumask) \ - pset_bind((cpumask), P_PID, (pid), NULL) + pset_bind((cpumask), P_LWPID, (pid), NULL) #define fio_getaffinity(pid, ptr) ({ 0; }) #define fio_cpu_clear(mask, cpu) pset_assign(PS_NONE, (cpu), NULL) #define fio_cpu_set(mask, cpu) pset_assign(*(mask), (cpu), NULL) +static inline bool fio_cpu_isset(os_cpu_mask_t *mask, int cpu) +{ + const unsigned int max_cpus = sysconf(_SC_NPROCESSORS_ONLN); + unsigned int num_cpus; + processorid_t *cpus; + bool ret; + int i; + + cpus = malloc(sizeof(*cpus) * max_cpus); + + if (pset_info(*mask, NULL, &num_cpus, cpus) < 0) { + free(cpus); + return false; + } + + ret = false; + for (i = 0; i < num_cpus; i++) { + if (cpus[i] == cpu) { + ret = true; + break; + } + } + + free(cpus); + return ret; +} + +static inline int fio_cpu_count(os_cpu_mask_t *mask) +{ + unsigned int num_cpus; + + if (pset_info(*mask, NULL, &num_cpus, NULL) < 0) + return 0; + + return num_cpus; +} + static inline int fio_cpuset_init(os_cpu_mask_t *mask) { if (pset_create(mask) < 0) @@ -119,10 +164,12 @@ return 0; } +#ifndef CONFIG_HAVE_GETTID static inline int gettid(void) { return pthread_self(); } +#endif /* * Should be enough, not aware of what (if any) restrictions Solaris has diff -Nru fio-2.1.3/os/os-windows-7.h fio-3.16/os/os-windows-7.h --- fio-2.1.3/os/os-windows-7.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/os/os-windows-7.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,367 @@ +#define FIO_MAX_CPUS 512 /* From Hyper-V 2016's max logical processors */ +#define FIO_CPU_MASK_STRIDE 64 +#define FIO_CPU_MASK_ROWS (FIO_MAX_CPUS 
/ FIO_CPU_MASK_STRIDE) + +typedef struct { + uint64_t row[FIO_CPU_MASK_ROWS]; +} os_cpu_mask_t; + +#define FIO_HAVE_CPU_ONLINE_SYSCONF +/* Return all processors regardless of processor group */ +static inline unsigned int cpus_online(void) +{ + return GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); +} + +static inline void print_mask(os_cpu_mask_t *cpumask) +{ + for (int i = 0; i < FIO_CPU_MASK_ROWS; i++) + dprint(FD_PROCESS, "cpumask[%d]=%lu\n", i, cpumask->row[i]); +} + +/* Return the index of the least significant set CPU in cpumask or -1 if no + * CPUs are set */ +static inline int first_set_cpu(os_cpu_mask_t *cpumask) +{ + int cpus_offset, mask_first_cpu, row; + + cpus_offset = 0; + row = 0; + mask_first_cpu = -1; + while (mask_first_cpu < 0 && row < FIO_CPU_MASK_ROWS) { + int row_first_cpu; + + row_first_cpu = __builtin_ffsll(cpumask->row[row]) - 1; + dprint(FD_PROCESS, "row_first_cpu=%d cpumask->row[%d]=%lu\n", + row_first_cpu, row, cpumask->row[row]); + if (row_first_cpu > -1) { + mask_first_cpu = cpus_offset + row_first_cpu; + dprint(FD_PROCESS, "first set cpu in mask is at index %d\n", + mask_first_cpu); + } else { + cpus_offset += FIO_CPU_MASK_STRIDE; + row++; + } + } + + return mask_first_cpu; +} + +/* Return the index of the most significant set CPU in cpumask or -1 if no + * CPUs are set */ +static inline int last_set_cpu(os_cpu_mask_t *cpumask) +{ + int cpus_offset, mask_last_cpu, row; + + cpus_offset = (FIO_CPU_MASK_ROWS - 1) * FIO_CPU_MASK_STRIDE; + row = FIO_CPU_MASK_ROWS - 1; + mask_last_cpu = -1; + while (mask_last_cpu < 0 && row >= 0) { + int row_last_cpu; + + if (cpumask->row[row] == 0) + row_last_cpu = -1; + else { + uint64_t tmp = cpumask->row[row]; + + row_last_cpu = 0; + while (tmp >>= 1) + row_last_cpu++; + } + + dprint(FD_PROCESS, "row_last_cpu=%d cpumask->row[%d]=%lu\n", + row_last_cpu, row, cpumask->row[row]); + if (row_last_cpu > -1) { + mask_last_cpu = cpus_offset + row_last_cpu; + dprint(FD_PROCESS, "last set cpu in mask is at index 
%d\n", + mask_last_cpu); + } else { + cpus_offset -= FIO_CPU_MASK_STRIDE; + row--; + } + } + + return mask_last_cpu; +} + +static inline int mask_to_group_mask(os_cpu_mask_t *cpumask, int *processor_group, uint64_t *affinity_mask) +{ + WORD online_groups, group, group_size; + bool found; + int cpus_offset, search_cpu, last_cpu, bit_offset, row, end; + uint64_t group_cpumask; + + search_cpu = first_set_cpu(cpumask); + if (search_cpu < 0) { + log_info("CPU mask doesn't set any CPUs\n"); + return 1; + } + + /* Find processor group first set CPU applies to */ + online_groups = GetActiveProcessorGroupCount(); + group = 0; + found = false; + cpus_offset = 0; + group_size = 0; + while (!found && group < online_groups) { + group_size = GetActiveProcessorCount(group); + dprint(FD_PROCESS, "group=%d group_start=%d group_size=%u search_cpu=%d\n", + group, cpus_offset, group_size, search_cpu); + if (cpus_offset + group_size > search_cpu) + found = true; + else { + cpus_offset += group_size; + group++; + } + } + + if (!found) { + log_err("CPU mask contains processor beyond last active processor index (%d)\n", + cpus_offset - 1); + print_mask(cpumask); + return 1; + } + + /* Check all the CPUs in the mask apply to ONLY that processor group */ + last_cpu = last_set_cpu(cpumask); + if (last_cpu > (cpus_offset + group_size - 1)) { + log_info("CPU mask cannot bind CPUs (e.g. 
%d, %d) that are " + "in different processor groups\n", search_cpu, + last_cpu); + print_mask(cpumask); + return 1; + } + + /* Extract the current processor group mask from the cpumask */ + row = cpus_offset / FIO_CPU_MASK_STRIDE; + bit_offset = cpus_offset % FIO_CPU_MASK_STRIDE; + group_cpumask = cpumask->row[row] >> bit_offset; + end = bit_offset + group_size; + if (end > FIO_CPU_MASK_STRIDE && (row + 1 < FIO_CPU_MASK_ROWS)) { + /* Some of the next row needs to be part of the mask */ + int needed, needed_shift, needed_mask_shift; + uint64_t needed_mask; + + needed = end - FIO_CPU_MASK_STRIDE; + needed_shift = FIO_CPU_MASK_STRIDE - bit_offset; + needed_mask_shift = FIO_CPU_MASK_STRIDE - needed; + needed_mask = (uint64_t)-1 >> needed_mask_shift; + dprint(FD_PROCESS, "bit_offset=%d end=%d needed=%d needed_shift=%d needed_mask=%ld needed_mask_shift=%d\n", bit_offset, end, needed, needed_shift, needed_mask, needed_mask_shift); + group_cpumask |= (cpumask->row[row + 1] & needed_mask) << needed_shift; + } + group_cpumask &= (uint64_t)-1 >> (FIO_CPU_MASK_STRIDE - group_size); + + /* Return group and mask */ + dprint(FD_PROCESS, "Returning group=%d group_mask=%lu\n", group, group_cpumask); + *processor_group = group; + *affinity_mask = group_cpumask; + + return 0; +} + +static inline int fio_setaffinity(int pid, os_cpu_mask_t cpumask) +{ + HANDLE handle = NULL; + int group, ret; + uint64_t group_mask = 0; + GROUP_AFFINITY new_group_affinity; + + ret = -1; + + if (mask_to_group_mask(&cpumask, &group, &group_mask) != 0) + goto err; + + handle = OpenThread(THREAD_QUERY_INFORMATION | THREAD_SET_INFORMATION, + TRUE, pid); + if (handle == NULL) { + log_err("fio_setaffinity: failed to get handle for pid %d\n", pid); + goto err; + } + + /* Set group and mask. 
+ * Note: if the GROUP_AFFINITY struct's Reserved members are not + * initialised to 0 then SetThreadGroupAffinity will fail with + * GetLastError() set to ERROR_INVALID_PARAMETER */ + new_group_affinity.Mask = (KAFFINITY) group_mask; + new_group_affinity.Group = group; + new_group_affinity.Reserved[0] = 0; + new_group_affinity.Reserved[1] = 0; + new_group_affinity.Reserved[2] = 0; + if (SetThreadGroupAffinity(handle, &new_group_affinity, NULL) != 0) + ret = 0; + else { + log_err("fio_setaffinity: failed to set thread affinity " + "(pid %d, group %d, mask %" PRIx64 ", " + "GetLastError=%d)\n", pid, group, group_mask, + GetLastError()); + goto err; + } + +err: + if (handle) + CloseHandle(handle); + return ret; +} + +static inline void cpu_to_row_offset(int cpu, int *row, int *offset) +{ + *row = cpu / FIO_CPU_MASK_STRIDE; + *offset = cpu << FIO_CPU_MASK_STRIDE * *row; +} + +static inline int fio_cpuset_init(os_cpu_mask_t *mask) +{ + for (int i = 0; i < FIO_CPU_MASK_ROWS; i++) + mask->row[i] = 0; + return 0; +} + +/* + * fio_getaffinity() should not be called once a fio_setaffinity() call has + * been made because fio_setaffinity() may put the process into multiple + * processor groups + */ +static inline int fio_getaffinity(int pid, os_cpu_mask_t *mask) +{ + int ret; + int row, offset, end, group, group_size, group_start_cpu; + DWORD_PTR process_mask, system_mask; + HANDLE handle; + PUSHORT current_groups; + USHORT group_count; + WORD online_groups; + + ret = -1; + current_groups = NULL; + handle = OpenProcess(PROCESS_QUERY_INFORMATION, TRUE, pid); + if (handle == NULL) { + log_err("fio_getaffinity: failed to get handle for pid %d\n", + pid); + goto err; + } + + group_count = 1; + /* + * GetProcessGroupAffinity() seems to expect more than the natural + * alignment for a USHORT from the area pointed to by current_groups so + * arrange for maximum alignment by allocating via malloc() + */ + current_groups = malloc(sizeof(USHORT)); + if (!current_groups) { + 
log_err("fio_getaffinity: malloc failed\n"); + goto err; + } + if (GetProcessGroupAffinity(handle, &group_count, current_groups) == 0) { + /* NB: we also fail here if we are a multi-group process */ + log_err("fio_getaffinity: failed to get single group affinity for pid %d\n", pid); + goto err; + } + GetProcessAffinityMask(handle, &process_mask, &system_mask); + + /* Convert group and group relative mask to full CPU mask */ + online_groups = GetActiveProcessorGroupCount(); + if (online_groups == 0) { + log_err("fio_getaffinity: error retrieving total processor groups\n"); + goto err; + } + + group = 0; + group_start_cpu = 0; + group_size = 0; + dprint(FD_PROCESS, "current_groups=%d group_count=%d\n", + current_groups[0], group_count); + while (true) { + group_size = GetActiveProcessorCount(group); + if (group_size == 0) { + log_err("fio_getaffinity: error retrieving size of " + "processor group %d\n", group); + goto err; + } else if (group >= current_groups[0] || group >= online_groups) + break; + else { + group_start_cpu += group_size; + group++; + } + } + + if (group != current_groups[0]) { + log_err("fio_getaffinity: could not find processor group %d\n", + current_groups[0]); + goto err; + } + + dprint(FD_PROCESS, "group_start_cpu=%d, group size=%u\n", + group_start_cpu, group_size); + if ((group_start_cpu + group_size) >= FIO_MAX_CPUS) { + log_err("fio_getaffinity failed: current CPU affinity (group " + "%d, group_start_cpu %d, group_size %d) extends " + "beyond mask's highest CPU (%d)\n", group, + group_start_cpu, group_size, FIO_MAX_CPUS); + goto err; + } + + fio_cpuset_init(mask); + cpu_to_row_offset(group_start_cpu, &row, &offset); + mask->row[row] = process_mask; + mask->row[row] <<= offset; + end = offset + group_size; + if (end > FIO_CPU_MASK_STRIDE) { + int needed; + uint64_t needed_mask; + + needed = FIO_CPU_MASK_STRIDE - end; + needed_mask = (uint64_t)-1 >> (FIO_CPU_MASK_STRIDE - needed); + row++; + mask->row[row] = process_mask; + mask->row[row] >>= 
needed; + mask->row[row] &= needed_mask; + } + ret = 0; + +err: + if (handle) + CloseHandle(handle); + if (current_groups) + free(current_groups); + + return ret; +} + +static inline void fio_cpu_clear(os_cpu_mask_t *mask, int cpu) +{ + int row, offset; + cpu_to_row_offset(cpu, &row, &offset); + + mask->row[row] &= ~(1ULL << offset); +} + +static inline void fio_cpu_set(os_cpu_mask_t *mask, int cpu) +{ + int row, offset; + cpu_to_row_offset(cpu, &row, &offset); + + mask->row[row] |= 1ULL << offset; +} + +static inline int fio_cpu_isset(os_cpu_mask_t *mask, int cpu) +{ + int row, offset; + cpu_to_row_offset(cpu, &row, &offset); + + return (mask->row[row] & (1ULL << offset)) != 0; +} + +static inline int fio_cpu_count(os_cpu_mask_t *mask) +{ + int count = 0; + + for (int i = 0; i < FIO_CPU_MASK_ROWS; i++) + count += hweight64(mask->row[i]); + + return count; +} + +static inline int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return 0; +} diff -Nru fio-2.1.3/os/os-windows.h fio-3.16/os/os-windows.h --- fio-2.1.3/os/os-windows.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/os-windows.h 2019-09-20 01:01:52.000000000 +0000 @@ -13,22 +13,32 @@ #include #include "../smalloc.h" +#include "../debug.h" #include "../file.h" #include "../log.h" +#include "../lib/hweight.h" +#include "../oslib/strcasestr.h" +#include "../lib/types.h" #include "windows/posix.h" +/* MinGW won't declare rand_r unless _POSIX is defined */ +#if defined(WIN32) && !defined(rand_r) +int rand_r(unsigned *); +#endif + +#ifndef PTHREAD_STACK_MIN +#define PTHREAD_STACK_MIN 65535 +#endif + #define FIO_HAVE_ODIRECT #define FIO_HAVE_CPU_AFFINITY #define FIO_HAVE_CHARDEV_SIZE #define FIO_HAVE_GETTID -#define FIO_USE_GENERIC_RAND #define FIO_PREFERRED_ENGINE "windowsaio" #define FIO_PREFERRED_CLOCK_SOURCE CS_CGETTIME -#define FIO_OS_PATH_SEPARATOR "\\" - -#define FIO_MAX_CPUS MAXIMUM_PROCESSORS +#define FIO_OS_PATH_SEPARATOR '\\' #define OS_MAP_ANON MAP_ANON @@ -36,11 +46,6 @@ #define fio_swap32(x) 
_byteswap_ulong(x) #define fio_swap64(x) _byteswap_uint64(x) -typedef DWORD_PTR os_cpu_mask_t; - -#define CLOCK_REALTIME 1 -#define CLOCK_MONOTONIC 2 - #define _SC_PAGESIZE 0x1 #define _SC_NPROCESSORS_ONLN 0x2 #define _SC_PHYS_PAGES 0x4 @@ -68,10 +73,9 @@ /* Winsock doesn't support MSG_WAIT */ #define OS_MSG_DONTWAIT 0 -#define POLLOUT 1 -#define POLLIN 2 -#define POLLERR 0 -#define POLLHUP 1 +#ifndef S_ISSOCK +#define S_ISSOCK(x) 0 +#endif #define SIGCONT 0 #define SIGUSR1 1 @@ -103,11 +107,11 @@ int fdatasync(int fildes); int lstat(const char * path, struct stat * buf); uid_t geteuid(void); +char* ctime_r(const time_t *t, char *buf); int nanosleep(const struct timespec *rqtp, struct timespec *rmtp); ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset); ssize_t pwrite(int fildes, const void *buf, size_t nbyte, off_t offset); -extern void td_fill_rand_seeds(struct thread_data *); static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) { @@ -115,7 +119,6 @@ HANDLE hFile; GET_LENGTH_INFORMATION info; DWORD outBytes; - LARGE_INTEGER size; if (f->hFile == NULL) { hFile = CreateFile(f->file_name, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, @@ -124,7 +127,6 @@ hFile = f->hFile; } - size.QuadPart = 0; if (DeviceIoControl(hFile, IOCTL_DISK_GET_LENGTH_INFO, NULL, 0, &info, sizeof(info), &outBytes, NULL)) *bytes = info.Length.QuadPart; else @@ -145,9 +147,7 @@ static inline int blockdev_invalidate_cache(struct fio_file *f) { - /* There's no way to invalidate the cache in Windows - * so just pretend to succeed */ - return 0; + return ENOTSUP; } static inline unsigned long long os_phys_mem(void) @@ -162,71 +162,14 @@ return (unsigned long long) pages * (unsigned long long) pagesize; } -static inline void os_get_tmpdir(char *path, int len) -{ - GetTempPath(len, path); -} - +#ifndef CONFIG_HAVE_GETTID static inline int gettid(void) { return GetCurrentThreadId(); } +#endif -static inline int fio_setaffinity(int pid, os_cpu_mask_t cpumask) 
-{ - HANDLE h; - BOOL bSuccess = FALSE; - - h = OpenThread(THREAD_QUERY_INFORMATION | THREAD_SET_INFORMATION, TRUE, pid); - if (h != NULL) { - bSuccess = SetThreadAffinityMask(h, cpumask); - if (!bSuccess) - log_err("fio_setaffinity failed: failed to set thread affinity (pid %d, mask %.16llx)\n", pid, cpumask); - - CloseHandle(h); - } else { - log_err("fio_setaffinity failed: failed to get handle for pid %d\n", pid); - } - - return (bSuccess)? 0 : -1; -} - -static inline void fio_getaffinity(int pid, os_cpu_mask_t *mask) -{ - os_cpu_mask_t systemMask; - - HANDLE h = OpenProcess(PROCESS_QUERY_INFORMATION, TRUE, pid); - - if (h != NULL) { - GetProcessAffinityMask(h, mask, &systemMask); - CloseHandle(h); - } else { - log_err("fio_getaffinity failed: failed to get handle for pid %d\n", pid); - } -} - -static inline void fio_cpu_clear(os_cpu_mask_t *mask, int cpu) -{ - *mask ^= 1 << (cpu-1); -} - -static inline void fio_cpu_set(os_cpu_mask_t *mask, int cpu) -{ - *mask |= 1 << cpu; -} - -static inline int fio_cpuset_init(os_cpu_mask_t *mask) -{ - *mask = 0; - return 0; -} - -static inline int fio_cpuset_exit(os_cpu_mask_t *mask) -{ - return 0; -} - -static inline int init_random_state(struct thread_data *td, unsigned long *rand_seeds, int size) +static inline int init_random_seeds(uint64_t *rand_seeds, int size) { HCRYPTPROV hCryptProv; @@ -245,16 +188,19 @@ } CryptReleaseContext(hCryptProv, 0); - td_fill_rand_seeds(td); return 0; } - static inline int fio_set_sched_idle(void) { /* SetThreadPriority returns nonzero for success */ return (SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE))? 
0 : -1; } +#ifdef CONFIG_WINDOWS_XP +#include "os-windows-xp.h" +#else +#include "os-windows-7.h" +#endif #endif /* FIO_OS_WINDOWS_H */ diff -Nru fio-2.1.3/os/os-windows-xp.h fio-3.16/os/os-windows-xp.h --- fio-2.1.3/os/os-windows-xp.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/os/os-windows-xp.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,70 @@ +#define FIO_MAX_CPUS MAXIMUM_PROCESSORS + +typedef DWORD_PTR os_cpu_mask_t; + +static inline int fio_setaffinity(int pid, os_cpu_mask_t cpumask) +{ + HANDLE h; + BOOL bSuccess = FALSE; + + h = OpenThread(THREAD_QUERY_INFORMATION | THREAD_SET_INFORMATION, TRUE, pid); + if (h != NULL) { + bSuccess = SetThreadAffinityMask(h, cpumask); + if (!bSuccess) + log_err("fio_setaffinity failed: failed to set thread affinity (pid %d, mask %.16llx)\n", pid, cpumask); + + CloseHandle(h); + } else { + log_err("fio_setaffinity failed: failed to get handle for pid %d\n", pid); + } + + return (bSuccess)? 0 : -1; +} + +static inline int fio_getaffinity(int pid, os_cpu_mask_t *mask) +{ + os_cpu_mask_t systemMask; + + HANDLE h = OpenProcess(PROCESS_QUERY_INFORMATION, TRUE, pid); + + if (h != NULL) { + GetProcessAffinityMask(h, mask, &systemMask); + CloseHandle(h); + } else { + log_err("fio_getaffinity failed: failed to get handle for pid %d\n", pid); + return -1; + } + + return 0; +} + +static inline void fio_cpu_clear(os_cpu_mask_t *mask, int cpu) +{ + *mask &= ~(1ULL << cpu); +} + +static inline void fio_cpu_set(os_cpu_mask_t *mask, int cpu) +{ + *mask |= 1ULL << cpu; +} + +static inline int fio_cpu_isset(os_cpu_mask_t *mask, int cpu) +{ + return (*mask & (1ULL << cpu)) != 0; +} + +static inline int fio_cpu_count(os_cpu_mask_t *mask) +{ + return hweight64(*mask); +} + +static inline int fio_cpuset_init(os_cpu_mask_t *mask) +{ + *mask = 0; + return 0; +} + +static inline int fio_cpuset_exit(os_cpu_mask_t *mask) +{ + return 0; +} diff -Nru fio-2.1.3/os/windows/dobuild.cmd fio-3.16/os/windows/dobuild.cmd --- 
fio-2.1.3/os/windows/dobuild.cmd 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/dobuild.cmd 2019-09-20 01:01:52.000000000 +0000 @@ -6,6 +6,16 @@ set /a counter+=1 ) +for /f "tokens=2 delims=-" %%i in ("%FIO_VERSION%") do ( + set FIO_VERSION_NUMBERS=%%i +) + +if not defined FIO_VERSION_NUMBERS ( + echo Could not find version numbers in the string '%FIO_VERSION%' + echo Expected version to follow format 'fio-^([0-9]+.[0-9.]+^)' + goto end +) + if "%1"=="x86" set FIO_ARCH=x86 if "%1"=="x64" set FIO_ARCH=x64 @@ -16,7 +26,7 @@ goto end ) -"%WIX%bin\candle" -nologo -arch %FIO_ARCH% install.wxs +"%WIX%bin\candle" -nologo -arch %FIO_ARCH% -dFioVersionNumbers="%FIO_VERSION_NUMBERS%" install.wxs @if ERRORLEVEL 1 goto end "%WIX%bin\candle" -nologo -arch %FIO_ARCH% examples.wxs @if ERRORLEVEL 1 goto end Binary files /tmp/tmpaiUoyN/55oBVA8RWn/fio-2.1.3/os/windows/eula.rtf and /tmp/tmpaiUoyN/623n9XWbO1/fio-3.16/os/windows/eula.rtf differ diff -Nru fio-2.1.3/os/windows/examples.wxs fio-3.16/os/windows/examples.wxs --- fio-2.1.3/os/windows/examples.wxs 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/examples.wxs 2019-09-20 01:01:52.000000000 +0000 @@ -3,51 +3,168 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -56,19 +173,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - + diff -Nru fio-2.1.3/os/windows/install.wxs fio-3.16/os/windows/install.wxs --- fio-2.1.3/os/windows/install.wxs 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/install.wxs 2019-09-20 01:01:52.000000000 +0000 @@ -1,7 +1,7 @@ - + @@ -10,9 +10,9 @@ + UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="$(var.FioVersionNumbers)"> @@ -25,9 +25,7 @@ - - - + @@ -42,29 +40,62 @@ - + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + - + - - + + + + + + + + + - + fio@vger.kernel.org http://www.spinics.net/lists/fio/ - http://bluestop.org/fio/ + https://bluestop.org/fio/ diff -Nru fio-2.1.3/os/windows/posix/include/arpa/inet.h fio-3.16/os/windows/posix/include/arpa/inet.h --- fio-2.1.3/os/windows/posix/include/arpa/inet.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/posix/include/arpa/inet.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,13 +1,21 @@ #ifndef ARPA_INET_H #define ARPA_INET_H -#include +#include #include typedef int socklen_t; +typedef int in_addr_t; +/* EAI_SYSTEM isn't used on Windows, so map it to EAI_FAIL */ +#define EAI_SYSTEM EAI_FAIL + +in_addr_t inet_network(const char *cp); + +#ifdef CONFIG_WINDOWS_XP const char *inet_ntop(int af, const void *restrict src, char *restrict dst, socklen_t size); int inet_pton(int af, const char *restrict src, void *restrict dst); +#endif #endif /* ARPA_INET_H */ diff -Nru fio-2.1.3/os/windows/posix/include/netinet/in.h fio-3.16/os/windows/posix/include/netinet/in.h --- fio-2.1.3/os/windows/posix/include/netinet/in.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/posix/include/netinet/in.h 2019-09-20 01:01:52.000000000 +0000 @@ -4,18 +4,5 @@ #include #include -struct in6_addr -{ - uint8_t s6_addr[16]; -}; - -struct sockaddr_in6 -{ - sa_family_t sin6_family; /* AF_INET6 */ - in_port_t sin6_port; /* Port number */ - uint32_t sin6_flowinfo; /* IPv6 traffic class and flow information */ - struct in6_addr sin6_addr; /* IPv6 address */ - uint32_t sin6_scope_id; /* Set of interfaces for a scope */ -}; #endif /* NETINET_IN_H */ diff -Nru fio-2.1.3/os/windows/posix/include/poll.h fio-3.16/os/windows/posix/include/poll.h --- fio-2.1.3/os/windows/posix/include/poll.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/posix/include/poll.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,4 +1,24 @@ #ifndef POLL_H #define POLL_H +#include + +typedef int nfds_t; + +#ifdef 
CONFIG_WINDOWS_XP +struct pollfd +{ + int fd; + short events; + short revents; +}; + +#define POLLOUT 1 +#define POLLIN 2 +#define POLLERR 0 +#define POLLHUP 1 +#endif /* CONFIG_WINDOWS_XP */ + +int poll(struct pollfd fds[], nfds_t nfds, int timeout); + #endif /* POLL_H */ diff -Nru fio-2.1.3/os/windows/posix/include/sys/ioctl.h fio-3.16/os/windows/posix/include/sys/ioctl.h --- fio-2.1.3/os/windows/posix/include/sys/ioctl.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/os/windows/posix/include/sys/ioctl.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,7 @@ +#ifndef IOCTL_H +#define IOCTL_H + +/* This file is empty since it only needs to exist on Windows + but isn't otherwise used */ + +#endif /* IOCTL_H */ \ No newline at end of file diff -Nru fio-2.1.3/os/windows/posix/include/sys/poll.h fio-3.16/os/windows/posix/include/sys/poll.h --- fio-2.1.3/os/windows/posix/include/sys/poll.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/posix/include/sys/poll.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -#ifndef SYS_POLL_H -#define SYS_POLL_H - -typedef int nfds_t; - -struct pollfd -{ - int fd; - short events; - short revents; -}; - -int poll(struct pollfd fds[], nfds_t nfds, int timeout); - -#endif /* SYS_POLL_H */ diff -Nru fio-2.1.3/os/windows/posix/include/sys/uio.h fio-3.16/os/windows/posix/include/sys/uio.h --- fio-2.1.3/os/windows/posix/include/sys/uio.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/posix/include/sys/uio.h 2019-09-20 01:01:52.000000000 +0000 @@ -4,8 +4,8 @@ #include #include - struct iovec - { +struct iovec +{ void *iov_base; /* Base address of a memory region for input or output */ size_t iov_len; /* The size of the memory pointed to by iov_base */ }; diff -Nru fio-2.1.3/os/windows/posix/include/sys/un.h fio-3.16/os/windows/posix/include/sys/un.h --- fio-2.1.3/os/windows/posix/include/sys/un.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/posix/include/sys/un.h 2019-09-20 01:01:52.000000000 +0000 
@@ -7,7 +7,7 @@ struct sockaddr_un { sa_family_t sun_family; /* Address family */ - char sun_path[]; /* Socket pathname */ + char sun_path[260]; /* Socket pathname */ }; #endif /* SYS_UN_H */ diff -Nru fio-2.1.3/os/windows/posix/include/sys/wait.h fio-3.16/os/windows/posix/include/sys/wait.h --- fio-2.1.3/os/windows/posix/include/sys/wait.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/posix/include/sys/wait.h 2019-09-20 01:01:52.000000000 +0000 @@ -5,7 +5,7 @@ #define WIFEXITED(a) 0 #define WTERMSIG(a) 0 #define WEXITSTATUS(a) 0 -#define WNOHANG 0 +#define WNOHANG 1 pid_t waitpid(pid_t, int *stat_loc, int options); diff -Nru fio-2.1.3/os/windows/posix.c fio-3.16/os/windows/posix.c --- fio-2.1.3/os/windows/posix.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/os/windows/posix.c 2019-09-20 01:01:52.000000000 +0000 @@ -12,107 +12,168 @@ #include #include #include +#include #include #include #include #include #include -#include +#include +#include +#include #include "../os-windows.h" #include "../../lib/hweight.h" -extern unsigned long mtime_since_now(struct timeval *); -extern void fio_gettime(struct timeval *, void *); +extern unsigned long mtime_since_now(struct timespec *); +extern void fio_gettime(struct timespec *, void *); /* These aren't defined in the MinGW headers */ -HRESULT WINAPI StringCchCopyA( - char *pszDest, - size_t cchDest, - const char *pszSrc); - -HRESULT WINAPI StringCchPrintfA( - char *pszDest, - size_t cchDest, - const char *pszFormat, - ...); - -int vsprintf_s( - char *buffer, - size_t numberOfElements, - const char *format, - va_list argptr); +HRESULT WINAPI StringCchCopyA(char *pszDest, size_t cchDest, const char *pszSrc); +HRESULT WINAPI StringCchPrintfA(char *pszDest, size_t cchDest, const char *pszFormat, ...); int win_to_posix_error(DWORD winerr) { - switch (winerr) - { - case ERROR_FILE_NOT_FOUND: return ENOENT; - case ERROR_PATH_NOT_FOUND: return ENOENT; - case ERROR_ACCESS_DENIED: return EACCES; - case 
ERROR_INVALID_HANDLE: return EBADF; - case ERROR_NOT_ENOUGH_MEMORY: return ENOMEM; - case ERROR_INVALID_DATA: return EINVAL; - case ERROR_OUTOFMEMORY: return ENOMEM; - case ERROR_INVALID_DRIVE: return ENODEV; - case ERROR_NOT_SAME_DEVICE: return EXDEV; - case ERROR_WRITE_PROTECT: return EROFS; - case ERROR_BAD_UNIT: return ENODEV; - case ERROR_SHARING_VIOLATION: return EACCES; - case ERROR_LOCK_VIOLATION: return EACCES; - case ERROR_SHARING_BUFFER_EXCEEDED: return ENOLCK; - case ERROR_HANDLE_DISK_FULL: return ENOSPC; - case ERROR_NOT_SUPPORTED: return ENOSYS; - case ERROR_FILE_EXISTS: return EEXIST; - case ERROR_CANNOT_MAKE: return EPERM; - case ERROR_INVALID_PARAMETER: return EINVAL; - case ERROR_NO_PROC_SLOTS: return EAGAIN; - case ERROR_BROKEN_PIPE: return EPIPE; - case ERROR_OPEN_FAILED: return EIO; - case ERROR_NO_MORE_SEARCH_HANDLES: return ENFILE; - case ERROR_CALL_NOT_IMPLEMENTED: return ENOSYS; - case ERROR_INVALID_NAME: return ENOENT; - case ERROR_WAIT_NO_CHILDREN: return ECHILD; - case ERROR_CHILD_NOT_COMPLETE: return EBUSY; - case ERROR_DIR_NOT_EMPTY: return ENOTEMPTY; - case ERROR_SIGNAL_REFUSED: return EIO; - case ERROR_BAD_PATHNAME: return ENOENT; - case ERROR_SIGNAL_PENDING: return EBUSY; - case ERROR_MAX_THRDS_REACHED: return EAGAIN; - case ERROR_BUSY: return EBUSY; - case ERROR_ALREADY_EXISTS: return EEXIST; - case ERROR_NO_SIGNAL_SENT: return EIO; - case ERROR_FILENAME_EXCED_RANGE: return EINVAL; - case ERROR_META_EXPANSION_TOO_LONG: return EINVAL; - case ERROR_INVALID_SIGNAL_NUMBER: return EINVAL; - case ERROR_THREAD_1_INACTIVE: return EINVAL; - case ERROR_BAD_PIPE: return EINVAL; - case ERROR_PIPE_BUSY: return EBUSY; - case ERROR_NO_DATA: return EPIPE; - case ERROR_MORE_DATA: return EAGAIN; - case ERROR_DIRECTORY: return ENOTDIR; - case ERROR_PIPE_CONNECTED: return EBUSY; - case ERROR_NO_TOKEN: return EINVAL; - case ERROR_PROCESS_ABORTED: return EFAULT; - case ERROR_BAD_DEVICE: return ENODEV; - case ERROR_BAD_USERNAME: return EINVAL; - case 
ERROR_OPEN_FILES: return EAGAIN; - case ERROR_ACTIVE_CONNECTIONS: return EAGAIN; - case ERROR_DEVICE_IN_USE: return EAGAIN; - case ERROR_INVALID_AT_INTERRUPT_TIME: return EINTR; - case ERROR_IO_DEVICE: return EIO; - case ERROR_NOT_OWNER: return EPERM; - case ERROR_END_OF_MEDIA: return ENOSPC; - case ERROR_EOM_OVERFLOW: return ENOSPC; - case ERROR_BEGINNING_OF_MEDIA: return ESPIPE; - case ERROR_SETMARK_DETECTED: return ESPIPE; - case ERROR_NO_DATA_DETECTED: return ENOSPC; - case ERROR_POSSIBLE_DEADLOCK: return EDEADLOCK; - case ERROR_CRC: return EIO; - case ERROR_NEGATIVE_SEEK: return EINVAL; - case ERROR_DISK_FULL: return ENOSPC; - case ERROR_NOACCESS: return EFAULT; - case ERROR_FILE_INVALID: return ENXIO; + switch (winerr) { + case ERROR_SUCCESS: + return 0; + case ERROR_FILE_NOT_FOUND: + return ENOENT; + case ERROR_PATH_NOT_FOUND: + return ENOENT; + case ERROR_ACCESS_DENIED: + return EACCES; + case ERROR_INVALID_HANDLE: + return EBADF; + case ERROR_NOT_ENOUGH_MEMORY: + return ENOMEM; + case ERROR_INVALID_DATA: + return EINVAL; + case ERROR_OUTOFMEMORY: + return ENOMEM; + case ERROR_INVALID_DRIVE: + return ENODEV; + case ERROR_NOT_SAME_DEVICE: + return EXDEV; + case ERROR_WRITE_PROTECT: + return EROFS; + case ERROR_BAD_UNIT: + return ENODEV; + case ERROR_NOT_READY: + return EAGAIN; + case ERROR_SHARING_VIOLATION: + return EACCES; + case ERROR_LOCK_VIOLATION: + return EACCES; + case ERROR_SHARING_BUFFER_EXCEEDED: + return ENOLCK; + case ERROR_HANDLE_DISK_FULL: + return ENOSPC; + case ERROR_NOT_SUPPORTED: + return ENOSYS; + case ERROR_FILE_EXISTS: + return EEXIST; + case ERROR_CANNOT_MAKE: + return EPERM; + case ERROR_INVALID_PARAMETER: + return EINVAL; + case ERROR_NO_PROC_SLOTS: + return EAGAIN; + case ERROR_BROKEN_PIPE: + return EPIPE; + case ERROR_OPEN_FAILED: + return EIO; + case ERROR_NO_MORE_SEARCH_HANDLES: + return ENFILE; + case ERROR_CALL_NOT_IMPLEMENTED: + return ENOSYS; + case ERROR_INVALID_NAME: + return ENOENT; + case ERROR_WAIT_NO_CHILDREN: + return 
ECHILD; + case ERROR_CHILD_NOT_COMPLETE: + return EBUSY; + case ERROR_DIR_NOT_EMPTY: + return ENOTEMPTY; + case ERROR_SIGNAL_REFUSED: + return EIO; + case ERROR_BAD_PATHNAME: + return ENOENT; + case ERROR_SIGNAL_PENDING: + return EBUSY; + case ERROR_MAX_THRDS_REACHED: + return EAGAIN; + case ERROR_BUSY: + return EBUSY; + case ERROR_ALREADY_EXISTS: + return EEXIST; + case ERROR_NO_SIGNAL_SENT: + return EIO; + case ERROR_FILENAME_EXCED_RANGE: + return EINVAL; + case ERROR_META_EXPANSION_TOO_LONG: + return EINVAL; + case ERROR_INVALID_SIGNAL_NUMBER: + return EINVAL; + case ERROR_THREAD_1_INACTIVE: + return EINVAL; + case ERROR_BAD_PIPE: + return EINVAL; + case ERROR_PIPE_BUSY: + return EBUSY; + case ERROR_NO_DATA: + return EPIPE; + case ERROR_MORE_DATA: + return EAGAIN; + case ERROR_DIRECTORY: + return ENOTDIR; + case ERROR_PIPE_CONNECTED: + return EBUSY; + case ERROR_NO_TOKEN: + return EINVAL; + case ERROR_PROCESS_ABORTED: + return EFAULT; + case ERROR_BAD_DEVICE: + return ENODEV; + case ERROR_BAD_USERNAME: + return EINVAL; + case ERROR_OPEN_FILES: + return EAGAIN; + case ERROR_ACTIVE_CONNECTIONS: + return EAGAIN; + case ERROR_DEVICE_IN_USE: + return EBUSY; + case ERROR_INVALID_AT_INTERRUPT_TIME: + return EINTR; + case ERROR_IO_DEVICE: + return EIO; + case ERROR_NOT_OWNER: + return EPERM; + case ERROR_END_OF_MEDIA: + return ENOSPC; + case ERROR_EOM_OVERFLOW: + return ENOSPC; + case ERROR_BEGINNING_OF_MEDIA: + return ESPIPE; + case ERROR_SETMARK_DETECTED: + return ESPIPE; + case ERROR_NO_DATA_DETECTED: + return ENOSPC; + case ERROR_POSSIBLE_DEADLOCK: + return EDEADLOCK; + case ERROR_CRC: + return EIO; + case ERROR_NEGATIVE_SEEK: + return EINVAL; + case ERROR_DISK_FULL: + return ENOSPC; + case ERROR_NOACCESS: + return EFAULT; + case ERROR_FILE_INVALID: + return ENXIO; + default: + log_err("fio: windows error %d not handled\n", winerr); + return EIO; } return winerr; @@ -141,8 +202,7 @@ } } - for (i = 0; i < len / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); i++) - { + 
for (i = 0; i < len / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); i++) { if (processor_info[i].Relationship == RelationProcessorCore) num_processors += hweight64(processor_info[i].ProcessorMask); } @@ -158,8 +218,7 @@ SYSTEM_INFO sysInfo; MEMORYSTATUSEX status; - switch (name) - { + switch (name) { case _SC_NPROCESSORS_ONLN: val = GetNumLogicalProcessors(); if (val == -1) @@ -226,6 +285,41 @@ return dl_error; } +/* Copied from http://blogs.msdn.com/b/joshpoley/archive/2007/12/19/date-time-formats-and-conversions.aspx */ +void Time_tToSystemTime(time_t dosTime, SYSTEMTIME *systemTime) +{ + FILETIME utcFT; + LONGLONG jan1970; + SYSTEMTIME tempSystemTime; + + jan1970 = Int32x32To64(dosTime, 10000000) + 116444736000000000; + utcFT.dwLowDateTime = (DWORD)jan1970; + utcFT.dwHighDateTime = jan1970 >> 32; + + FileTimeToSystemTime((FILETIME*)&utcFT, &tempSystemTime); + SystemTimeToTzSpecificLocalTime(NULL, &tempSystemTime, systemTime); +} + +char *ctime_r(const time_t *t, char *buf) +{ + SYSTEMTIME systime; + const char * const dayOfWeek[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; + const char * const monthOfYear[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; + + Time_tToSystemTime(*t, &systime); + + /* + * We don't know how long `buf` is, but assume it's rounded up from + * the minimum of 25 to 32 + */ + StringCchPrintfA(buf, 31, "%s %s %d %02d:%02d:%02d %04d\n", + dayOfWeek[systime.wDayOfWeek % 7], + monthOfYear[(systime.wMonth - 1) % 12], + systime.wDay, systime.wHour, systime.wMinute, + systime.wSecond, systime.wYear); + return buf; +} + int gettimeofday(struct timeval *restrict tp, void *restrict tzp) { FILETIME fileTime; @@ -250,8 +344,7 @@ return 0; } -int sigaction(int sig, const struct sigaction *act, - struct sigaction *oact) +int sigaction(int sig, const struct sigaction *act, struct sigaction *oact) { int rc = 0; void (*prev_handler)(int); @@ -266,31 +359,54 @@ return rc; } -int lstat(const char * path, 
struct stat * buf) +int lstat(const char *path, struct stat *buf) { return stat(path, buf); } -void *mmap(void *addr, size_t len, int prot, int flags, - int fildes, off_t off) +void *mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off) { DWORD vaProt = 0; + DWORD mapAccess = 0; + DWORD lenlow; + DWORD lenhigh; + HANDLE hMap; void* allocAddr = NULL; if (prot & PROT_NONE) vaProt |= PAGE_NOACCESS; - if ((prot & PROT_READ) && !(prot & PROT_WRITE)) + if ((prot & PROT_READ) && !(prot & PROT_WRITE)) { vaProt |= PAGE_READONLY; + mapAccess = FILE_MAP_READ; + } - if (prot & PROT_WRITE) + if (prot & PROT_WRITE) { vaProt |= PAGE_READWRITE; + mapAccess |= FILE_MAP_WRITE; + } + + lenlow = len & 0xFFFF; + lenhigh = len >> 16; + /* If the low DWORD is zero and the high DWORD is non-zero, `CreateFileMapping` + will return ERROR_INVALID_PARAMETER. To avoid this, set both to zero. */ + if (lenlow == 0) + lenhigh = 0; - if ((flags & MAP_ANON) | (flags & MAP_ANONYMOUS)) - { + if (flags & MAP_ANON || flags & MAP_ANONYMOUS) { allocAddr = VirtualAlloc(addr, len, MEM_COMMIT, vaProt); if (allocAddr == NULL) errno = win_to_posix_error(GetLastError()); + } else { + hMap = CreateFileMapping((HANDLE)_get_osfhandle(fildes), NULL, + vaProt, lenhigh, lenlow, NULL); + + if (hMap != NULL) + allocAddr = MapViewOfFile(hMap, mapAccess, off >> 16, + off & 0xFFFF, len); + if (hMap == NULL || allocAddr == NULL) + errno = win_to_posix_error(GetLastError()); + } return allocAddr; @@ -298,12 +414,22 @@ int munmap(void *addr, size_t len) { - if (!VirtualFree(addr, 0, MEM_RELEASE)) { - errno = win_to_posix_error(GetLastError()); - return -1; - } + BOOL success; - return 0; + /* We may have allocated the memory with either MapViewOfFile or + VirtualAlloc. Therefore, try calling UnmapViewOfFile first, and if that + fails, call VirtualFree. 
*/ + success = UnmapViewOfFile(addr); + + if (!success) + success = VirtualFree(addr, 0, MEM_RELEASE); + + return !success; +} + +int msync(void *addr, size_t len, int flags) +{ + return !FlushViewOfFile(addr, len); } int fork(void) @@ -324,8 +450,12 @@ void openlog(const char *ident, int logopt, int facility) { - if (log_file == INVALID_HANDLE_VALUE) - log_file = CreateFileA("syslog.txt", GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, 0, NULL); + if (log_file != INVALID_HANDLE_VALUE) + return; + + log_file = CreateFileA("syslog.txt", GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_ALWAYS, 0, NULL); } void closelog(void) @@ -342,7 +472,9 @@ DWORD bytes_written; if (log_file == INVALID_HANDLE_VALUE) { - log_file = CreateFileA("syslog.txt", GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, 0, NULL); + log_file = CreateFileA("syslog.txt", GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_ALWAYS, 0, NULL); } if (log_file == INVALID_HANDLE_VALUE) { @@ -417,8 +549,7 @@ { int rc = 0; - if (clock_id == CLOCK_MONOTONIC) - { + if (clock_id == CLOCK_MONOTONIC) { static LARGE_INTEGER freq = {{0,0}}; LARGE_INTEGER counts; uint64_t t; @@ -437,9 +568,7 @@ * and then divide by the frequency. */ t *= 1000000000; tp->tv_nsec = t / freq.QuadPart; - } - else if (clock_id == CLOCK_REALTIME) - { + } else if (clock_id == CLOCK_REALTIME) { /* clock_gettime(CLOCK_REALTIME,...) is just an alias for gettimeofday with a * higher-precision field. 
*/ struct timeval tv; @@ -486,6 +615,7 @@ int munlock(const void * addr, size_t len) { BOOL success = VirtualUnlock((LPVOID)addr, len); + if (!success) { errno = win_to_posix_error(GetLastError()); return -1; @@ -520,7 +650,8 @@ while (path[i] != '\\' && path[i] != '/' && i >= 0) i--; - strncpy(name, path + i + 1, MAX_PATH); + name[MAX_PATH - 1] = '\0'; + strncpy(name, path + i + 1, MAX_PATH - 1); return name; } @@ -544,22 +675,26 @@ int mapid = -1; uint32_t size_low = size & 0xFFFFFFFF; uint32_t size_high = ((uint64_t)size) >> 32; - HANDLE hMapping = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, (PAGE_EXECUTE_READWRITE | SEC_RESERVE), size_high, size_low, NULL); + HANDLE hMapping; + + hMapping = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, + PAGE_EXECUTE_READWRITE | SEC_RESERVE, + size_high, size_low, NULL); if (hMapping != NULL) { fileMappings[nFileMappings] = hMapping; mapid = nFileMappings; nFileMappings++; - } else { + } else errno = ENOSYS; - } return mapid; } void *shmat(int shmid, const void *shmaddr, int shmflg) { - void* mapAddr; + void *mapAddr; MEMORY_BASIC_INFORMATION memInfo; + mapAddr = MapViewOfFile(fileMappings[shmid], FILE_MAP_ALL_ACCESS, 0, 0, 0); if (mapAddr == NULL) { errno = win_to_posix_error(GetLastError()); @@ -595,9 +730,9 @@ if (cmd == IPC_RMID) { fileMappings[shmid] = INVALID_HANDLE_VALUE; return 0; - } else { - log_err("%s is not implemented\n", __func__); } + + log_err("%s is not implemented\n", __func__); errno = ENOSYS; return -1; } @@ -618,10 +753,19 @@ int nice(int incr) { - if (incr != 0) { - errno = EINVAL; - return -1; - } + DWORD prioclass = NORMAL_PRIORITY_CLASS; + + if (incr < -15) + prioclass = HIGH_PRIORITY_CLASS; + else if (incr < 0) + prioclass = ABOVE_NORMAL_PRIORITY_CLASS; + else if (incr > 15) + prioclass = IDLE_PRIORITY_CLASS; + else if (incr > 0) + prioclass = BELOW_NORMAL_PRIORITY_CLASS; + + if (!SetPriorityClass(GetCurrentProcess(), prioclass)) + log_err("fio: SetPriorityClass failed\n"); return 0; } @@ -664,17 
+808,9 @@ int posix_madvise(void *addr, size_t len, int advice) { - log_err("%s is not implemented\n", __func__); return ENOSYS; } -/* Windows doesn't support advice for memory pages. Just ignore it. */ -int msync(void *addr, size_t len, int flags) -{ - errno = ENOSYS; - return -1; -} - int fdatasync(int fildes) { return fsync(fildes); @@ -685,6 +821,7 @@ { int64_t pos = _telli64(fildes); ssize_t len = _write(fildes, buf, nbyte); + _lseeki64(fildes, pos, SEEK_SET); return len; } @@ -693,6 +830,7 @@ { int64_t pos = _telli64(fildes); ssize_t len = read(fildes, buf, nbyte); + _lseeki64(fildes, pos, SEEK_SET); return len; } @@ -706,13 +844,26 @@ ssize_t writev(int fildes, const struct iovec *iov, int iovcnt) { - log_err("%s is not implemented\n", __func__); - errno = ENOSYS; - return -1; + int i; + DWORD bytes_written = 0; + + for (i = 0; i < iovcnt; i++) { + int len; + + len = send((SOCKET)fildes, iov[i].iov_base, iov[i].iov_len, 0); + if (len == SOCKET_ERROR) { + DWORD err = GetLastError(); + errno = win_to_posix_error(err); + bytes_written = -1; + break; + } + bytes_written += len; + } + + return bytes_written; } -long long strtoll(const char *restrict str, char **restrict endptr, - int base) +long long strtoll(const char *restrict str, char **restrict endptr, int base) { return _strtoi64(str, endptr, base); } @@ -735,8 +886,7 @@ FD_ZERO(&writefds); FD_ZERO(&exceptfds); - for (i = 0; i < nfds; i++) - { + for (i = 0; i < nfds; i++) { if (fds[i].fd < 0) { fds[i].revents = 0; continue; @@ -750,15 +900,12 @@ FD_SET(fds[i].fd, &exceptfds); } - rc = select(nfds, &readfds, &writefds, &exceptfds, to); if (rc != SOCKET_ERROR) { - for (i = 0; i < nfds; i++) - { - if (fds[i].fd < 0) { + for (i = 0; i < nfds; i++) { + if (fds[i].fd < 0) continue; - } if ((fds[i].events & POLLIN) && FD_ISSET(fds[i].fd, &readfds)) fds[i].revents |= POLLIN; @@ -770,13 +917,12 @@ fds[i].revents |= POLLHUP; } } - return rc; } int nanosleep(const struct timespec *rqtp, struct timespec *rmtp) { - 
struct timeval tv; + struct timespec tv; DWORD ms_remaining; DWORD ms_total = (rqtp->tv_sec * 1000) + (rqtp->tv_nsec / 1000000.0); @@ -805,9 +951,11 @@ DIR *opendir(const char *dirname) { struct dirent_ctx *dc = NULL; + HANDLE file; /* See if we can open it. If not, we'll return an error here */ - HANDLE file = CreateFileA(dirname, 0, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); + file = CreateFileA(dirname, 0, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); if (file != INVALID_HANDLE_VALUE) { CloseHandle(file); dc = (struct dirent_ctx*)malloc(sizeof(struct dirent_ctx)); @@ -850,7 +998,8 @@ if (dirp->find_handle == INVALID_HANDLE_VALUE) { char search_pattern[MAX_PATH]; - StringCchPrintfA(search_pattern, MAX_PATH, "%s\\*", dirp->dirname); + + StringCchPrintfA(search_pattern, MAX_PATH-1, "%s\\*", dirp->dirname); dirp->find_handle = FindFirstFileA(search_pattern, &find_data); if (dirp->find_handle == INVALID_HANDLE_VALUE) return NULL; @@ -872,8 +1021,17 @@ return -1; } -const char* inet_ntop(int af, const void *restrict src, - char *restrict dst, socklen_t size) +in_addr_t inet_network(const char *cp) +{ + in_addr_t hbo; + in_addr_t nbo = inet_addr(cp); + hbo = ((nbo & 0xFF) << 24) + ((nbo & 0xFF00) << 8) + ((nbo & 0xFF0000) >> 8) + ((nbo & 0xFF000000) >> 24); + return hbo; +} + +#ifdef CONFIG_WINDOWS_XP +const char *inet_ntop(int af, const void *restrict src, char *restrict dst, + socklen_t size) { INT status = SOCKET_ERROR; WSADATA wsd; @@ -889,6 +1047,7 @@ if (af == AF_INET) { struct sockaddr_in si; DWORD len = size; + memset(&si, 0, sizeof(si)); si.sin_family = af; memcpy(&si.sin_addr, src, sizeof(si.sin_addr)); @@ -896,6 +1055,7 @@ } else if (af == AF_INET6) { struct sockaddr_in6 si6; DWORD len = size; + memset(&si6, 0, sizeof(si6)); si6.sin6_family = af; memcpy(&si6.sin6_addr, src, sizeof(si6.sin6_addr)); @@ -928,6 +1088,7 @@ if (af == AF_INET) { struct sockaddr_in si; 
INT len = sizeof(si); + memset(&si, 0, sizeof(si)); si.sin_family = af; status = WSAStringToAddressA((char*)src, af, NULL, (struct sockaddr*)&si, &len); @@ -936,6 +1097,7 @@ } else if (af == AF_INET6) { struct sockaddr_in6 si6; INT len = sizeof(si6); + memset(&si6, 0, sizeof(si6)); si6.sin6_family = af; status = WSAStringToAddressA((char*)src, af, NULL, (struct sockaddr*)&si6, &len); @@ -952,3 +1114,4 @@ return ret; } +#endif /* CONFIG_WINDOWS_XP */ diff -Nru fio-2.1.3/oslib/asprintf.c fio-3.16/oslib/asprintf.c --- fio-2.1.3/oslib/asprintf.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/asprintf.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,43 @@ +#include +#include +#include +#include "oslib/asprintf.h" + +#ifndef CONFIG_HAVE_VASPRINTF +int vasprintf(char **strp, const char *fmt, va_list ap) +{ + va_list ap_copy; + char *str; + int len; + +#ifdef va_copy + va_copy(ap_copy, ap); +#else + __va_copy(ap_copy, ap); +#endif + len = vsnprintf(NULL, 0, fmt, ap_copy); + va_end(ap_copy); + + if (len < 0) + return len; + + len++; + str = malloc(len); + *strp = str; + return str ? vsnprintf(str, len, fmt, ap) : -1; +} +#endif + +#ifndef CONFIG_HAVE_ASPRINTF +int asprintf(char **strp, const char *fmt, ...) 
+{ + va_list arg; + int done; + + va_start(arg, fmt); + done = vasprintf(strp, fmt, arg); + va_end(arg); + + return done; +} +#endif diff -Nru fio-2.1.3/oslib/asprintf.h fio-3.16/oslib/asprintf.h --- fio-2.1.3/oslib/asprintf.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/asprintf.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,11 @@ +#ifndef FIO_ASPRINTF_H +#define FIO_ASPRINTF_H + +#ifndef CONFIG_HAVE_VASPRINTF +int vasprintf(char **strp, const char *fmt, va_list ap); +#endif +#ifndef CONFIG_HAVE_ASPRINTF +int asprintf(char **strp, const char *fmt, ...); +#endif + +#endif /* FIO_ASPRINTF_H */ diff -Nru fio-2.1.3/oslib/getopt.h fio-3.16/oslib/getopt.h --- fio-2.1.3/oslib/getopt.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/getopt.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,26 @@ +#ifdef CONFIG_GETOPT_LONG_ONLY + +#include + +#else + +#ifndef _GETOPT_H +#define _GETOPT_H + +struct option { + const char *name; + int has_arg; + int *flag; + int val; +}; + +enum { + no_argument = 0, + required_argument = 1, + optional_argument = 2, +}; + +int getopt_long_only(int, char *const *, const char *, const struct option *, int *); + +#endif +#endif diff -Nru fio-2.1.3/oslib/getopt_long.c fio-3.16/oslib/getopt_long.c --- fio-2.1.3/oslib/getopt_long.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/getopt_long.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,193 @@ +/* + * getopt.c + * + * getopt_long(), or at least a common subset thereof: + * + * - Option reordering is not supported + * - -W foo is not supported + * - First optstring character "-" not supported. 
+ * + * This file was imported from the klibc library from hpa + */ + +#include +#include +#include + +#include "getopt.h" + +char *optarg = NULL; +int optind = 0, opterr = 0, optopt = 0; + +static struct getopt_private_state { + const char *optptr; + const char *last_optstring; + char *const *last_argv; +} pvt; + +static inline const char *option_matches(const char *arg_str, + const char *opt_name, int smatch) +{ + while (*arg_str != '\0' && *arg_str != '=') { + if (*arg_str++ != *opt_name++) + return NULL; + } + + if (*opt_name && !smatch) + return NULL; + + return arg_str; +} + +int getopt_long_only(int argc, char *const *argv, const char *optstring, + const struct option *longopts, int *longindex) +{ + const char *carg; + const char *osptr; + int opt; + + optarg = NULL; + + /* getopt() relies on a number of different global state + variables, which can make this really confusing if there is + more than one use of getopt() in the same program. This + attempts to detect that situation by detecting if the + "optstring" or "argv" argument have changed since last time + we were called; if so, reinitialize the query state. */ + + if (optstring != pvt.last_optstring || argv != pvt.last_argv || + optind < 1 || optind > argc) { + /* optind doesn't match the current query */ + pvt.last_optstring = optstring; + pvt.last_argv = argv; + optind = 1; + pvt.optptr = NULL; + } + + carg = argv[optind]; + + /* First, eliminate all non-option cases */ + + if (!carg || carg[0] != '-' || !carg[1]) + return -1; + + if (carg[1] == '-') { + const struct option *lo; + const char *opt_end = NULL; + + optind++; + + /* Either it's a long option, or it's -- */ + if (!carg[2]) { + /* It's -- */ + return -1; + } + + for (lo = longopts; lo->name; lo++) { + opt_end = option_matches(carg+2, lo->name, 0); + if (opt_end) + break; + } + /* + * The GNU getopt_long_only() apparently allows a short match, + * if it's unique and if we don't have a full match. 
Let's + * do the same here, search and see if there is one (and only + * one) short match. + */ + if (!opt_end) { + const struct option *lo_match = NULL; + + for (lo = longopts; lo->name; lo++) { + const char *ret; + + ret = option_matches(carg+2, lo->name, 1); + if (!ret) + continue; + if (!opt_end) { + opt_end = ret; + lo_match = lo; + } else { + opt_end = NULL; + break; + } + } + if (!opt_end) + return '?'; + lo = lo_match; + } + + if (longindex) + *longindex = lo-longopts; + + if (*opt_end == '=') { + if (lo->has_arg) + optarg = (char *)opt_end+1; + else + return '?'; + } else if (lo->has_arg == 1) { + if (!(optarg = argv[optind])) + return '?'; + optind++; + } + + if (lo->flag) { + *lo->flag = lo->val; + return 0; + } else { + return lo->val; + } + } + + if ((uintptr_t) (pvt.optptr - carg) > (uintptr_t) strlen(carg)) { + /* Someone frobbed optind, change to new opt. */ + pvt.optptr = carg + 1; + } + + opt = *pvt.optptr++; + + if (opt != ':' && (osptr = strchr(optstring, opt))) { + if (osptr[1] == ':') { + if (*pvt.optptr) { + /* Argument-taking option with attached + argument */ + optarg = (char *)pvt.optptr; + optind++; + } else { + /* Argument-taking option with non-attached + argument */ + if (osptr[2] == ':') { + if (argv[optind + 1]) { + optarg = (char *)argv[optind+1]; + optind += 2; + } else { + optarg = NULL; + optind++; + } + return opt; + } else if (argv[optind + 1]) { + optarg = (char *)argv[optind+1]; + optind += 2; + } else { + /* Missing argument */ + optind++; + return (optstring[0] == ':') + ? 
':' : '?'; + } + } + return opt; + } else { + /* Non-argument-taking option */ + /* pvt.optptr will remember the exact position to + resume at */ + if (!*pvt.optptr) + optind++; + return opt; + } + } else { + /* Unknown option */ + optopt = opt; + if (!*pvt.optptr) + optind++; + return '?'; + } +} diff -Nru fio-2.1.3/oslib/inet_aton.c fio-3.16/oslib/inet_aton.c --- fio-2.1.3/oslib/inet_aton.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/inet_aton.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,6 @@ +#include "inet_aton.h" + +int inet_aton(const char *cp, struct in_addr *inp) +{ + return inet_pton(AF_INET, cp, inp); +} diff -Nru fio-2.1.3/oslib/inet_aton.h fio-3.16/oslib/inet_aton.h --- fio-2.1.3/oslib/inet_aton.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/inet_aton.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,8 @@ +#ifndef FIO_INET_ATON_LIB_H +#define FIO_INET_ATON_LIB_H + +#include + +int inet_aton(const char *cp, struct in_addr *inp); + +#endif diff -Nru fio-2.1.3/oslib/libmtd.c fio-3.16/oslib/libmtd.c --- fio-2.1.3/oslib/libmtd.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/libmtd.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,1424 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * Copyright (C) 2009 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Author: Artem Bityutskiy + * + * MTD library. + */ + +/* Imported from mtd-utils by dehrenberg */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "libmtd.h" + +#include "libmtd_int.h" +#include "libmtd_common.h" + +/** + * mkpath - compose full path from 2 given components. + * @path: the first component + * @name: the second component + * + * This function returns the resulting path in case of success and %NULL in + * case of failure. + */ +static char *mkpath(const char *path, const char *name) +{ + char *n; + size_t len1 = strlen(path); + size_t len2 = strlen(name); + + n = xmalloc(len1 + len2 + 6); + + memcpy(n, path, len1); + if (n[len1 - 1] != '/') + n[len1++] = '/'; + + memcpy(n + len1, name, len2 + 1); + return n; +} + +/** + * read_data - read data from a file. + * @file: the file to read from + * @buf: the buffer to read to + * @buf_len: buffer length + * + * This function returns number of read bytes in case of success and %-1 in + * case of failure. Note, if the file contains more then @buf_len bytes of + * date, this function fails with %EINVAL error code. 
+ */ +static int read_data(const char *file, void *buf, int buf_len) +{ + int fd, rd, tmp, tmp1; + + fd = open(file, O_RDONLY | O_CLOEXEC); + if (fd == -1) + return -1; + + rd = read(fd, buf, buf_len); + if (rd == -1) { + sys_errmsg("cannot read \"%s\"", file); + goto out_error; + } + + if (rd == buf_len) { + errmsg("contents of \"%s\" is too long", file); + errno = EINVAL; + goto out_error; + } + + ((char *)buf)[rd] = '\0'; + + /* Make sure all data is read */ + tmp1 = read(fd, &tmp, 1); + if (tmp1 == 1) { + sys_errmsg("cannot read \"%s\"", file); + goto out_error; + } + if (tmp1) { + errmsg("file \"%s\" contains too much data (> %d bytes)", + file, buf_len); + errno = EINVAL; + goto out_error; + } + + if (close(fd)) { + sys_errmsg("close failed on \"%s\"", file); + return -1; + } + + return rd; + +out_error: + close(fd); + return -1; +} + +/** + * read_major - read major and minor numbers from a file. + * @file: name of the file to read from + * @major: major number is returned here + * @minor: minor number is returned here + * + * This function returns % in case of success, and %-1 in case of failure. + */ +static int read_major(const char *file, int *major, int *minor) +{ + int ret; + char buf[50]; + + ret = read_data(file, buf, 50); + if (ret < 0) + return ret; + + ret = sscanf(buf, "%d:%d\n", major, minor); + if (ret != 2) { + errno = EINVAL; + return errmsg("\"%s\" does not have major:minor format", file); + } + + if (*major < 0 || *minor < 0) { + errno = EINVAL; + return errmsg("bad major:minor %d:%d in \"%s\"", + *major, *minor, file); + } + + return 0; +} + +/** + * dev_get_major - get major and minor numbers of an MTD device. + * @lib: libmtd descriptor + * @mtd_num: MTD device number + * @major: major number is returned here + * @minor: minor number is returned here + * + * This function returns zero in case of success and %-1 in case of failure. 
+ */ +static int dev_get_major(struct libmtd *lib, int mtd_num, int *major, int *minor) +{ + char file[strlen(lib->mtd_dev) + 50]; + + sprintf(file, lib->mtd_dev, mtd_num); + return read_major(file, major, minor); +} + +/** + * dev_read_data - read data from an MTD device's sysfs file. + * @patt: file pattern to read from + * @mtd_num: MTD device number + * @buf: buffer to read to + * @buf_len: buffer length + * + * This function returns number of read bytes in case of success and %-1 in + * case of failure. + */ +static int dev_read_data(const char *patt, int mtd_num, void *buf, int buf_len) +{ + char file[strlen(patt) + 100]; + + sprintf(file, patt, mtd_num); + return read_data(file, buf, buf_len); +} + +/** + * read_hex_ll - read a hex 'long long' value from a file. + * @file: the file to read from + * @value: the result is stored here + * + * This function reads file @file and interprets its contents as hexadecimal + * 'long long' integer. If this is not true, it fails with %EINVAL error code. + * Returns %0 in case of success and %-1 in case of failure. + */ +static int read_hex_ll(const char *file, long long *value) +{ + int fd, rd; + char buf[50]; + + fd = open(file, O_RDONLY | O_CLOEXEC); + if (fd == -1) + return -1; + + rd = read(fd, buf, sizeof(buf)); + if (rd == -1) { + sys_errmsg("cannot read \"%s\"", file); + goto out_error; + } + if (rd == sizeof(buf)) { + errmsg("contents of \"%s\" is too long", file); + errno = EINVAL; + goto out_error; + } + buf[rd] = '\0'; + + if (sscanf(buf, "%llx\n", value) != 1) { + errmsg("cannot read integer from \"%s\"\n", file); + errno = EINVAL; + goto out_error; + } + + if (*value < 0) { + errmsg("negative value %lld in \"%s\"", *value, file); + errno = EINVAL; + goto out_error; + } + + if (close(fd)) + return sys_errmsg("close failed on \"%s\"", file); + + return 0; + +out_error: + close(fd); + return -1; +} + +/** + * read_pos_ll - read a positive 'long long' value from a file. 
+ * @file: the file to read from + * @value: the result is stored here + * + * This function reads file @file and interprets its contents as a positive + * 'long long' integer. If this is not true, it fails with %EINVAL error code. + * Returns %0 in case of success and %-1 in case of failure. + */ +static int read_pos_ll(const char *file, long long *value) +{ + int fd, rd; + char buf[50]; + + fd = open(file, O_RDONLY | O_CLOEXEC); + if (fd == -1) + return -1; + + rd = read(fd, buf, 50); + if (rd == -1) { + sys_errmsg("cannot read \"%s\"", file); + goto out_error; + } + if (rd == 50) { + errmsg("contents of \"%s\" is too long", file); + errno = EINVAL; + goto out_error; + } + + if (sscanf(buf, "%lld\n", value) != 1) { + errmsg("cannot read integer from \"%s\"\n", file); + errno = EINVAL; + goto out_error; + } + + if (*value < 0) { + errmsg("negative value %lld in \"%s\"", *value, file); + errno = EINVAL; + goto out_error; + } + + if (close(fd)) + return sys_errmsg("close failed on \"%s\"", file); + + return 0; + +out_error: + close(fd); + return -1; +} + +/** + * read_hex_int - read an 'int' value from a file. + * @file: the file to read from + * @value: the result is stored here + * + * This function is the same as 'read_pos_ll()', but it reads an 'int' + * value, not 'long long'. + */ +static int read_hex_int(const char *file, int *value) +{ + long long res; + + if (read_hex_ll(file, &res)) + return -1; + + /* Make sure the value has correct range */ + if (res > INT_MAX || res < INT_MIN) { + errmsg("value %lld read from file \"%s\" is out of range", + res, file); + errno = EINVAL; + return -1; + } + + *value = res; + return 0; +} + +/** + * read_pos_int - read a positive 'int' value from a file. + * @file: the file to read from + * @value: the result is stored here + * + * This function is the same as 'read_pos_ll()', but it reads an 'int' + * value, not 'long long'. 
+ */ +static int read_pos_int(const char *file, int *value) +{ + long long res; + + if (read_pos_ll(file, &res)) + return -1; + + /* Make sure the value is not too big */ + if (res > INT_MAX) { + errmsg("value %lld read from file \"%s\" is out of range", + res, file); + errno = EINVAL; + return -1; + } + + *value = res; + return 0; +} + +/** + * dev_read_hex_int - read an hex 'int' value from an MTD device sysfs file. + * @patt: file pattern to read from + * @mtd_num: MTD device number + * @value: the result is stored here + * + * This function returns %0 in case of success and %-1 in case of failure. + */ +static int dev_read_hex_int(const char *patt, int mtd_num, int *value) +{ + char file[strlen(patt) + 50]; + + sprintf(file, patt, mtd_num); + return read_hex_int(file, value); +} + +/** + * dev_read_pos_int - read a positive 'int' value from an MTD device sysfs file. + * @patt: file pattern to read from + * @mtd_num: MTD device number + * @value: the result is stored here + * + * This function returns %0 in case of success and %-1 in case of failure. + */ +static int dev_read_pos_int(const char *patt, int mtd_num, int *value) +{ + char file[strlen(patt) + 50]; + + sprintf(file, patt, mtd_num); + return read_pos_int(file, value); +} + +/** + * dev_read_pos_ll - read a positive 'long long' value from an MTD device sysfs file. + * @patt: file pattern to read from + * @mtd_num: MTD device number + * @value: the result is stored here + * + * This function returns %0 in case of success and %-1 in case of failure. + */ +static int dev_read_pos_ll(const char *patt, int mtd_num, long long *value) +{ + char file[strlen(patt) + 50]; + + sprintf(file, patt, mtd_num); + return read_pos_ll(file, value); +} + +/** + * type_str2int - convert MTD device type to integer. + * @str: MTD device type string to convert + * + * This function converts MTD device type string @str, read from sysfs, into an + * integer. 
+ */ +static int type_str2int(const char *str) +{ + if (!strcmp(str, "nand")) + return MTD_NANDFLASH; + if (!strcmp(str, "mlc-nand")) + return MTD_MLCNANDFLASH; + if (!strcmp(str, "nor")) + return MTD_NORFLASH; + if (!strcmp(str, "rom")) + return MTD_ROM; + if (!strcmp(str, "absent")) + return MTD_ABSENT; + if (!strcmp(str, "dataflash")) + return MTD_DATAFLASH; + if (!strcmp(str, "ram")) + return MTD_RAM; + if (!strcmp(str, "ubi")) + return MTD_UBIVOLUME; + return -1; +} + +/** + * dev_node2num - find UBI device number by its character device node. + * @lib: MTD library descriptor + * @node: name of the MTD device node + * @mtd_num: MTD device number is returned here + * + * This function returns %0 in case of success and %-1 in case of failure. + */ +static int dev_node2num(struct libmtd *lib, const char *node, int *mtd_num) +{ + struct stat st; + int i, mjr, mnr; + struct mtd_info info; + + if (stat(node, &st)) + return sys_errmsg("cannot get information about \"%s\"", node); + + if (!S_ISCHR(st.st_mode)) { + errmsg("\"%s\" is not a character device", node); + errno = EINVAL; + return -1; + } + + mjr = major(st.st_rdev); + mnr = minor(st.st_rdev); + + if (mtd_get_info((libmtd_t *)lib, &info)) + return -1; + + for (i = info.lowest_mtd_num; i <= info.highest_mtd_num; i++) { + int mjr1, mnr1, ret; + + ret = dev_get_major(lib, i, &mjr1, &mnr1); + if (ret) { + if (errno == ENOENT) + continue; + if (!errno) + break; + return -1; + } + + if (mjr1 == mjr && mnr1 == mnr) { + errno = 0; + *mtd_num = i; + return 0; + } + } + + errno = ENODEV; + return -1; +} + +/** + * sysfs_is_supported - check whether the MTD sub-system supports MTD. + * @lib: MTD library descriptor + * + * The Linux kernel MTD subsystem gained MTD support starting from kernel + * 2.6.30 and libmtd tries to use sysfs interface if possible, because the NAND + * sub-page size is available there (and not available at all in pre-sysfs + * kernels). 
+ * + * Very old kernels did not have "/sys/class/mtd" directory. Not very old + * kernels (e.g., 2.6.29) did have "/sys/class/mtd/mtdX" directories, by there + * were no files there, e.g., the "name" file was not present. So all we can do + * is to check for a "/sys/class/mtd/mtdX/name" file. But this is not a + * reliable check, because if this is a new system with no MTD devices - we'll + * treat it as a pre-sysfs system. + */ +static int sysfs_is_supported(struct libmtd *lib) +{ + int fd, num = -1; + DIR *sysfs_mtd; + char file[strlen(lib->mtd_name) + 10]; + + sysfs_mtd = opendir(lib->sysfs_mtd); + if (!sysfs_mtd) { + if (errno == ENOENT) { + errno = 0; + return 0; + } + return sys_errmsg("cannot open \"%s\"", lib->sysfs_mtd); + } + + /* + * First of all find an "mtdX" directory. This is needed because there + * may be, for example, mtd1 but no mtd0. + */ + while (1) { + int ret, mtd_num; + char tmp_buf[256]; + struct dirent *dirent; + + dirent = readdir(sysfs_mtd); + if (!dirent) + break; + + if (strlen(dirent->d_name) >= 255) { + errmsg("invalid entry in %s: \"%s\"", + lib->sysfs_mtd, dirent->d_name); + errno = EINVAL; + closedir(sysfs_mtd); + return -1; + } + + ret = sscanf(dirent->d_name, MTD_NAME_PATT"%s", + &mtd_num, tmp_buf); + if (ret == 1) { + num = mtd_num; + break; + } + } + + if (closedir(sysfs_mtd)) + return sys_errmsg("closedir failed on \"%s\"", lib->sysfs_mtd); + + if (num == -1) + /* No mtd device, treat this as pre-sysfs system */ + return 0; + + sprintf(file, lib->mtd_name, num); + fd = open(file, O_RDONLY | O_CLOEXEC); + if (fd == -1) + return 0; + + if (close(fd)) { + sys_errmsg("close failed on \"%s\"", file); + return -1; + } + + return 1; +} + +libmtd_t libmtd_open(void) +{ + struct libmtd *lib; + + lib = xzalloc(sizeof(*lib)); + + lib->offs64_ioctls = OFFS64_IOCTLS_UNKNOWN; + + lib->sysfs_mtd = mkpath("/sys", SYSFS_MTD); + if (!lib->sysfs_mtd) + goto out_error; + + lib->mtd = mkpath(lib->sysfs_mtd, MTD_NAME_PATT); + if (!lib->mtd) + 
goto out_error; + + lib->mtd_name = mkpath(lib->mtd, MTD_NAME); + if (!lib->mtd_name) + goto out_error; + + if (!sysfs_is_supported(lib)) { + free(lib->mtd); + free(lib->sysfs_mtd); + free(lib->mtd_name); + lib->mtd_name = lib->mtd = lib->sysfs_mtd = NULL; + return lib; + } + + lib->mtd_dev = mkpath(lib->mtd, MTD_DEV); + if (!lib->mtd_dev) + goto out_error; + + lib->mtd_type = mkpath(lib->mtd, MTD_TYPE); + if (!lib->mtd_type) + goto out_error; + + lib->mtd_eb_size = mkpath(lib->mtd, MTD_EB_SIZE); + if (!lib->mtd_eb_size) + goto out_error; + + lib->mtd_size = mkpath(lib->mtd, MTD_SIZE); + if (!lib->mtd_size) + goto out_error; + + lib->mtd_min_io_size = mkpath(lib->mtd, MTD_MIN_IO_SIZE); + if (!lib->mtd_min_io_size) + goto out_error; + + lib->mtd_subpage_size = mkpath(lib->mtd, MTD_SUBPAGE_SIZE); + if (!lib->mtd_subpage_size) + goto out_error; + + lib->mtd_oob_size = mkpath(lib->mtd, MTD_OOB_SIZE); + if (!lib->mtd_oob_size) + goto out_error; + + lib->mtd_region_cnt = mkpath(lib->mtd, MTD_REGION_CNT); + if (!lib->mtd_region_cnt) + goto out_error; + + lib->mtd_flags = mkpath(lib->mtd, MTD_FLAGS); + if (!lib->mtd_flags) + goto out_error; + + lib->sysfs_supported = 1; + return lib; + +out_error: + libmtd_close((libmtd_t)lib); + return NULL; +} + +void libmtd_close(libmtd_t desc) +{ + struct libmtd *lib = (struct libmtd *)desc; + + free(lib->mtd_flags); + free(lib->mtd_region_cnt); + free(lib->mtd_oob_size); + free(lib->mtd_subpage_size); + free(lib->mtd_min_io_size); + free(lib->mtd_size); + free(lib->mtd_eb_size); + free(lib->mtd_type); + free(lib->mtd_dev); + free(lib->mtd_name); + free(lib->mtd); + free(lib->sysfs_mtd); + free(lib); +} + +int mtd_dev_present(libmtd_t desc, int mtd_num) { + struct stat st; + struct libmtd *lib = (struct libmtd *)desc; + + if (!lib->sysfs_supported) { + return legacy_dev_present(mtd_num) == 1; + } else { + char file[strlen(lib->mtd) + 10]; + + sprintf(file, lib->mtd, mtd_num); + return !stat(file, &st); + } +} + +int 
mtd_get_info(libmtd_t desc, struct mtd_info *info) +{ + DIR *sysfs_mtd; + struct dirent *dirent; + struct libmtd *lib = (struct libmtd *)desc; + + memset(info, 0, sizeof(struct mtd_info)); + + if (!lib->sysfs_supported) + return legacy_mtd_get_info(info); + + info->sysfs_supported = 1; + + /* + * We have to scan the MTD sysfs directory to identify how many MTD + * devices are present. + */ + sysfs_mtd = opendir(lib->sysfs_mtd); + if (!sysfs_mtd) { + if (errno == ENOENT) { + errno = ENODEV; + return -1; + } + return sys_errmsg("cannot open \"%s\"", lib->sysfs_mtd); + } + + info->lowest_mtd_num = INT_MAX; + while (1) { + int mtd_num, ret; + char tmp_buf[256]; + + errno = 0; + dirent = readdir(sysfs_mtd); + if (!dirent) + break; + + if (strlen(dirent->d_name) >= 255) { + errmsg("invalid entry in %s: \"%s\"", + lib->sysfs_mtd, dirent->d_name); + errno = EINVAL; + goto out_close; + } + + ret = sscanf(dirent->d_name, MTD_NAME_PATT"%s", + &mtd_num, tmp_buf); + if (ret == 1) { + info->mtd_dev_cnt += 1; + if (mtd_num > info->highest_mtd_num) + info->highest_mtd_num = mtd_num; + if (mtd_num < info->lowest_mtd_num) + info->lowest_mtd_num = mtd_num; + } + } + + if (!dirent && errno) { + sys_errmsg("readdir failed on \"%s\"", lib->sysfs_mtd); + goto out_close; + } + + if (closedir(sysfs_mtd)) + return sys_errmsg("closedir failed on \"%s\"", lib->sysfs_mtd); + + if (info->lowest_mtd_num == INT_MAX) + info->lowest_mtd_num = 0; + + return 0; + +out_close: + closedir(sysfs_mtd); + return -1; +} + +int mtd_get_dev_info1(libmtd_t desc, int mtd_num, struct mtd_dev_info *mtd) +{ + int ret; + struct libmtd *lib = (struct libmtd *)desc; + + memset(mtd, 0, sizeof(struct mtd_dev_info)); + mtd->mtd_num = mtd_num; + + if (!mtd_dev_present(desc, mtd_num)) { + errno = ENODEV; + return -1; + } else if (!lib->sysfs_supported) + return legacy_get_dev_info1(mtd_num, mtd); + + if (dev_get_major(lib, mtd_num, &mtd->major, &mtd->minor)) + return -1; + + ret = dev_read_data(lib->mtd_name, mtd_num, 
&mtd->name, + MTD_NAME_MAX + 1); + if (ret < 0) + return -1; + ((char *)mtd->name)[ret - 1] = '\0'; + + ret = dev_read_data(lib->mtd_type, mtd_num, &mtd->type_str, + MTD_TYPE_MAX + 1); + if (ret < 0) + return -1; + ((char *)mtd->type_str)[ret - 1] = '\0'; + + if (dev_read_pos_int(lib->mtd_eb_size, mtd_num, &mtd->eb_size)) + return -1; + if (dev_read_pos_ll(lib->mtd_size, mtd_num, &mtd->size)) + return -1; + if (dev_read_pos_int(lib->mtd_min_io_size, mtd_num, &mtd->min_io_size)) + return -1; + if (dev_read_pos_int(lib->mtd_subpage_size, mtd_num, &mtd->subpage_size)) + return -1; + if (dev_read_pos_int(lib->mtd_oob_size, mtd_num, &mtd->oob_size)) + return -1; + if (dev_read_pos_int(lib->mtd_region_cnt, mtd_num, &mtd->region_cnt)) + return -1; + if (dev_read_hex_int(lib->mtd_flags, mtd_num, &ret)) + return -1; + mtd->writable = !!(ret & MTD_WRITEABLE); + + mtd->eb_cnt = mtd->size / mtd->eb_size; + mtd->type = type_str2int(mtd->type_str); + mtd->bb_allowed = !!(mtd->type == MTD_NANDFLASH || + mtd->type == MTD_MLCNANDFLASH); + + return 0; +} + +int mtd_get_dev_info(libmtd_t desc, const char *node, struct mtd_dev_info *mtd) +{ + int mtd_num; + struct libmtd *lib = (struct libmtd *)desc; + + if (!lib->sysfs_supported) + return legacy_get_dev_info(node, mtd); + + if (dev_node2num(lib, node, &mtd_num)) + return -1; + + return mtd_get_dev_info1(desc, mtd_num, mtd); +} + +static inline int mtd_ioctl_error(const struct mtd_dev_info *mtd, int eb, + const char *sreq) +{ + return sys_errmsg("%s ioctl failed for eraseblock %d (mtd%d)", + sreq, eb, mtd->mtd_num); +} + +static int mtd_valid_erase_block(const struct mtd_dev_info *mtd, int eb) +{ + if (eb < 0 || eb >= mtd->eb_cnt) { + errmsg("bad eraseblock number %d, mtd%d has %d eraseblocks", + eb, mtd->mtd_num, mtd->eb_cnt); + errno = EINVAL; + return -1; + } + return 0; +} + +static int mtd_xlock(const struct mtd_dev_info *mtd, int fd, int eb, int req, + const char *sreq) +{ + int ret; + struct erase_info_user ei; + + ret = 
mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + ei.start = eb * mtd->eb_size; + ei.length = mtd->eb_size; + + ret = ioctl(fd, req, &ei); + if (ret < 0) + return mtd_ioctl_error(mtd, eb, sreq); + + return 0; +} +#define mtd_xlock(mtd, fd, eb, req) mtd_xlock(mtd, fd, eb, req, #req) + +int mtd_lock(const struct mtd_dev_info *mtd, int fd, int eb) +{ + return mtd_xlock(mtd, fd, eb, MEMLOCK); +} + +int mtd_unlock(const struct mtd_dev_info *mtd, int fd, int eb) +{ + return mtd_xlock(mtd, fd, eb, MEMUNLOCK); +} + +int mtd_erase(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb) +{ + int ret; + struct libmtd *lib = (struct libmtd *)desc; + struct erase_info_user64 ei64; + struct erase_info_user ei; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + ei64.start = (__u64)eb * mtd->eb_size; + ei64.length = mtd->eb_size; + + if (lib->offs64_ioctls == OFFS64_IOCTLS_SUPPORTED || + lib->offs64_ioctls == OFFS64_IOCTLS_UNKNOWN) { + ret = ioctl(fd, MEMERASE64, &ei64); + if (ret == 0) + return ret; + + if (errno != ENOTTY || + lib->offs64_ioctls != OFFS64_IOCTLS_UNKNOWN) + return mtd_ioctl_error(mtd, eb, "MEMERASE64"); + + /* + * MEMERASE64 support was added in kernel version 2.6.31, so + * probably we are working with older kernel and this ioctl is + * not supported. 
+ */ + lib->offs64_ioctls = OFFS64_IOCTLS_NOT_SUPPORTED; + } + + if (ei64.start + ei64.length > 0xFFFFFFFF) { + errmsg("this system can address only %u eraseblocks", + 0xFFFFFFFFU / mtd->eb_size); + errno = EINVAL; + return -1; + } + + ei.start = ei64.start; + ei.length = ei64.length; + ret = ioctl(fd, MEMERASE, &ei); + if (ret < 0) + return mtd_ioctl_error(mtd, eb, "MEMERASE"); + return 0; +} + +int mtd_regioninfo(int fd, int regidx, struct region_info_user *reginfo) +{ + int ret; + + if (regidx < 0) { + errno = ENODEV; + return -1; + } + + reginfo->regionindex = regidx; + + ret = ioctl(fd, MEMGETREGIONINFO, reginfo); + if (ret < 0) + return sys_errmsg("%s ioctl failed for erase region %d", + "MEMGETREGIONINFO", regidx); + + return 0; +} + +int mtd_is_locked(const struct mtd_dev_info *mtd, int fd, int eb) +{ + int ret; + erase_info_t ei; + + ei.start = eb * mtd->eb_size; + ei.length = mtd->eb_size; + + ret = ioctl(fd, MEMISLOCKED, &ei); + if (ret < 0) { + if (errno != ENOTTY && errno != EOPNOTSUPP) + return mtd_ioctl_error(mtd, eb, "MEMISLOCKED"); + else + errno = EOPNOTSUPP; + } + + return ret; +} + +/* Patterns to write to a physical eraseblock when torturing it */ +static uint8_t patterns[] = {0xa5, 0x5a, 0x0}; + +/** + * check_pattern - check if buffer contains only a certain byte pattern. + * @buf: buffer to check + * @patt: the pattern to check + * @size: buffer size in bytes + * + * This function returns %1 in there are only @patt bytes in @buf, and %0 if + * something else was also found. 
+ */ +static int check_pattern(const void *buf, uint8_t patt, int size) +{ + int i; + + for (i = 0; i < size; i++) + if (((const uint8_t *)buf)[i] != patt) + return 0; + return 1; +} + +int mtd_torture(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb) +{ + int err, i, patt_count; + void *buf; + + normsg("run torture test for PEB %d", eb); + patt_count = ARRAY_SIZE(patterns); + + buf = xmalloc(mtd->eb_size); + + for (i = 0; i < patt_count; i++) { + err = mtd_erase(desc, mtd, fd, eb); + if (err) + goto out; + + /* Make sure the PEB contains only 0xFF bytes */ + err = mtd_read(mtd, fd, eb, 0, buf, mtd->eb_size); + if (err) + goto out; + + err = check_pattern(buf, 0xFF, mtd->eb_size); + if (err == 0) { + errmsg("erased PEB %d, but a non-0xFF byte found", eb); + errno = EIO; + goto out; + } + + /* Write a pattern and check it */ + memset(buf, patterns[i], mtd->eb_size); + err = mtd_write(desc, mtd, fd, eb, 0, buf, mtd->eb_size, NULL, + 0, 0); + if (err) + goto out; + + memset(buf, ~patterns[i], mtd->eb_size); + err = mtd_read(mtd, fd, eb, 0, buf, mtd->eb_size); + if (err) + goto out; + + err = check_pattern(buf, patterns[i], mtd->eb_size); + if (err == 0) { + errmsg("pattern %x checking failed for PEB %d", + patterns[i], eb); + errno = EIO; + goto out; + } + } + + normsg("PEB %d passed torture test, do not mark it a bad", eb); + +out: + free(buf); + return -1; +} + +int mtd_is_bad(const struct mtd_dev_info *mtd, int fd, int eb) +{ + int ret; + loff_t seek; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + if (!mtd->bb_allowed) + return 0; + + seek = (loff_t)eb * mtd->eb_size; + ret = ioctl(fd, MEMGETBADBLOCK, &seek); + if (ret == -1) + return mtd_ioctl_error(mtd, eb, "MEMGETBADBLOCK"); + return ret; +} + +int mtd_mark_bad(const struct mtd_dev_info *mtd, int fd, int eb) +{ + int ret; + loff_t seek; + + if (!mtd->bb_allowed) { + errno = EINVAL; + return -1; + } + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + seek = 
(loff_t)eb * mtd->eb_size; + ret = ioctl(fd, MEMSETBADBLOCK, &seek); + if (ret == -1) + return mtd_ioctl_error(mtd, eb, "MEMSETBADBLOCK"); + return 0; +} + +int mtd_read(const struct mtd_dev_info *mtd, int fd, int eb, int offs, + void *buf, int len) +{ + int ret, rd = 0; + off_t seek; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + if (offs < 0 || offs + len > mtd->eb_size) { + errmsg("bad offset %d or length %d, mtd%d eraseblock size is %d", + offs, len, mtd->mtd_num, mtd->eb_size); + errno = EINVAL; + return -1; + } + + /* Seek to the beginning of the eraseblock */ + seek = (off_t)eb * mtd->eb_size + offs; + if (lseek(fd, seek, SEEK_SET) != seek) + return sys_errmsg("cannot seek mtd%d to offset %"PRIdoff_t, + mtd->mtd_num, seek); + + while (rd < len) { + ret = read(fd, buf, len); + if (ret < 0) + return sys_errmsg("cannot read %d bytes from mtd%d (eraseblock %d, offset %d)", + len, mtd->mtd_num, eb, offs); + rd += ret; + } + + return 0; +} + +static int legacy_auto_oob_layout(const struct mtd_dev_info *mtd, int fd, + int ooblen, void *oob) { + struct nand_oobinfo old_oobinfo; + int start, len; + uint8_t *tmp_buf; + + /* Read the current oob info */ + if (ioctl(fd, MEMGETOOBSEL, &old_oobinfo)) + return sys_errmsg("MEMGETOOBSEL failed"); + + tmp_buf = malloc(ooblen); + memcpy(tmp_buf, oob, ooblen); + + /* + * We use autoplacement and have the oobinfo with the autoplacement + * information from the kernel available + */ + if (old_oobinfo.useecc == MTD_NANDECC_AUTOPLACE) { + int i, tags_pos = 0; + for (i = 0; old_oobinfo.oobfree[i][1]; i++) { + /* Set the reserved bytes to 0xff */ + start = old_oobinfo.oobfree[i][0]; + len = old_oobinfo.oobfree[i][1]; + memcpy(oob + start, tmp_buf + tags_pos, len); + tags_pos += len; + } + } else { + /* Set at least the ecc byte positions to 0xff */ + start = old_oobinfo.eccbytes; + len = mtd->oob_size - start; + memcpy(oob + start, tmp_buf + start, len); + } + free(tmp_buf); + + return 0; +} + +int 
mtd_write(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb, + int offs, void *data, int len, void *oob, int ooblen, + uint8_t mode) +{ + int ret; + off_t seek; + struct mtd_write_req ops; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + if (offs < 0 || offs + len > mtd->eb_size) { + errmsg("bad offset %d or length %d, mtd%d eraseblock size is %d", + offs, len, mtd->mtd_num, mtd->eb_size); + errno = EINVAL; + return -1; + } + if (offs % mtd->subpage_size) { + errmsg("write offset %d is not aligned to mtd%d min. I/O size %d", + offs, mtd->mtd_num, mtd->subpage_size); + errno = EINVAL; + return -1; + } + if (len % mtd->subpage_size) { + errmsg("write length %d is not aligned to mtd%d min. I/O size %d", + len, mtd->mtd_num, mtd->subpage_size); + errno = EINVAL; + return -1; + } + + /* Calculate seek address */ + seek = (off_t)eb * mtd->eb_size + offs; + + if (oob) { + ops.start = seek; + ops.len = len; + ops.ooblen = ooblen; + ops.usr_data = (uint64_t)(unsigned long)data; + ops.usr_oob = (uint64_t)(unsigned long)oob; + ops.mode = mode; + + ret = ioctl(fd, MEMWRITE, &ops); + if (ret == 0) + return 0; + else if (errno != ENOTTY && errno != EOPNOTSUPP) + return mtd_ioctl_error(mtd, eb, "MEMWRITE"); + + /* Fall back to old OOB ioctl() if necessary */ + if (mode == MTD_OPS_AUTO_OOB) + if (legacy_auto_oob_layout(mtd, fd, ooblen, oob)) + return -1; + if (mtd_write_oob(desc, mtd, fd, seek, ooblen, oob) < 0) + return sys_errmsg("cannot write to OOB"); + } + if (data) { + /* Seek to the beginning of the eraseblock */ + if (lseek(fd, seek, SEEK_SET) != seek) + return sys_errmsg("cannot seek mtd%d to offset %"PRIdoff_t, + mtd->mtd_num, seek); + ret = write(fd, data, len); + if (ret != len) + return sys_errmsg("cannot write %d bytes to mtd%d " + "(eraseblock %d, offset %d)", + len, mtd->mtd_num, eb, offs); + } + + return 0; +} + +static int do_oob_op(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, + uint64_t start, uint64_t length, void 
*data, unsigned int cmd64, + unsigned int cmd) +{ + int ret, oob_offs; + struct mtd_oob_buf64 oob64; + struct mtd_oob_buf oob; + unsigned long long max_offs; + const char *cmd64_str, *cmd_str; + struct libmtd *lib = (struct libmtd *)desc; + + if (cmd64 == MEMREADOOB64) { + cmd64_str = "MEMREADOOB64"; + cmd_str = "MEMREADOOB"; + } else { + cmd64_str = "MEMWRITEOOB64"; + cmd_str = "MEMWRITEOOB"; + } + + max_offs = (unsigned long long)mtd->eb_cnt * mtd->eb_size; + if (start >= max_offs) { + errmsg("bad page address %" PRIu64 ", mtd%d has %d eraseblocks (%llu bytes)", + start, mtd->mtd_num, mtd->eb_cnt, max_offs); + errno = EINVAL; + return -1; + } + + oob_offs = start & (mtd->min_io_size - 1); + if (oob_offs + length > mtd->oob_size || length == 0) { + errmsg("Cannot write %" PRIu64 " OOB bytes to address %" PRIu64 " (OOB offset %u) - mtd%d OOB size is only %d bytes", + length, start, oob_offs, mtd->mtd_num, mtd->oob_size); + errno = EINVAL; + return -1; + } + + oob64.start = start; + oob64.length = length; + oob64.usr_ptr = (uint64_t)(unsigned long)data; + + if (lib->offs64_ioctls == OFFS64_IOCTLS_SUPPORTED || + lib->offs64_ioctls == OFFS64_IOCTLS_UNKNOWN) { + ret = ioctl(fd, cmd64, &oob64); + if (ret == 0) + return ret; + + if (errno != ENOTTY || + lib->offs64_ioctls != OFFS64_IOCTLS_UNKNOWN) { + sys_errmsg("%s ioctl failed for mtd%d, offset %" PRIu64 " (eraseblock %" PRIu64 ")", + cmd64_str, mtd->mtd_num, start, start / mtd->eb_size); + } + + /* + * MEMREADOOB64/MEMWRITEOOB64 support was added in kernel + * version 2.6.31, so probably we are working with older kernel + * and these ioctls are not supported. 
+ */ + lib->offs64_ioctls = OFFS64_IOCTLS_NOT_SUPPORTED; + } + + if (oob64.start > 0xFFFFFFFFULL) { + errmsg("this system can address only up to address %lu", + 0xFFFFFFFFUL); + errno = EINVAL; + return -1; + } + + oob.start = oob64.start; + oob.length = oob64.length; + oob.ptr = data; + + ret = ioctl(fd, cmd, &oob); + if (ret < 0) + sys_errmsg("%s ioctl failed for mtd%d, offset %" PRIu64 " (eraseblock %" PRIu64 ")", + cmd_str, mtd->mtd_num, start, start / mtd->eb_size); + return ret; +} + +int mtd_read_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, + uint64_t start, uint64_t length, void *data) +{ + return do_oob_op(desc, mtd, fd, start, length, data, + MEMREADOOB64, MEMREADOOB); +} + +int mtd_write_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, + uint64_t start, uint64_t length, void *data) +{ + return do_oob_op(desc, mtd, fd, start, length, data, + MEMWRITEOOB64, MEMWRITEOOB); +} + +int mtd_write_img(const struct mtd_dev_info *mtd, int fd, int eb, int offs, + const char *img_name) +{ + int tmp, ret, in_fd, len, written = 0; + off_t seek; + struct stat st; + char *buf; + + ret = mtd_valid_erase_block(mtd, eb); + if (ret) + return ret; + + if (offs < 0 || offs >= mtd->eb_size) { + errmsg("bad offset %d, mtd%d eraseblock size is %d", + offs, mtd->mtd_num, mtd->eb_size); + errno = EINVAL; + return -1; + } + if (offs % mtd->subpage_size) { + errmsg("write offset %d is not aligned to mtd%d min. I/O size %d", + offs, mtd->mtd_num, mtd->subpage_size); + errno = EINVAL; + return -1; + } + + in_fd = open(img_name, O_RDONLY | O_CLOEXEC); + if (in_fd == -1) + return sys_errmsg("cannot open \"%s\"", img_name); + + if (fstat(in_fd, &st)) { + sys_errmsg("cannot stat %s", img_name); + goto out_close; + } + + len = st.st_size; + if (len % mtd->subpage_size) { + errmsg("size of \"%s\" is %d byte, which is not aligned to " + "mtd%d min. 
I/O size %d", img_name, len, mtd->mtd_num, + mtd->subpage_size); + errno = EINVAL; + goto out_close; + } + tmp = (offs + len + mtd->eb_size - 1) / mtd->eb_size; + if (eb + tmp > mtd->eb_cnt) { + errmsg("\"%s\" image size is %d bytes, mtd%d size is %d " + "eraseblocks, the image does not fit if we write it " + "starting from eraseblock %d, offset %d", + img_name, len, mtd->mtd_num, mtd->eb_cnt, eb, offs); + errno = EINVAL; + goto out_close; + } + + /* Seek to the beginning of the eraseblock */ + seek = (off_t)eb * mtd->eb_size + offs; + if (lseek(fd, seek, SEEK_SET) != seek) { + sys_errmsg("cannot seek mtd%d to offset %"PRIdoff_t, + mtd->mtd_num, seek); + goto out_close; + } + + buf = xmalloc(mtd->eb_size); + + while (written < len) { + int rd = 0; + + do { + ret = read(in_fd, buf, mtd->eb_size - offs - rd); + if (ret == -1) { + sys_errmsg("cannot read \"%s\"", img_name); + goto out_free; + } + rd += ret; + } while (ret && rd < mtd->eb_size - offs); + + ret = write(fd, buf, rd); + if (ret != rd) { + sys_errmsg("cannot write %d bytes to mtd%d (eraseblock %d, offset %d)", + len, mtd->mtd_num, eb, offs); + goto out_free; + } + + offs = 0; + eb += 1; + written += rd; + } + + free(buf); + close(in_fd); + return 0; + +out_free: + free(buf); +out_close: + close(in_fd); + return -1; +} + +int mtd_probe_node(libmtd_t desc, const char *node) +{ + struct stat st; + struct mtd_info info; + int i, mjr, mnr; + struct libmtd *lib = (struct libmtd *)desc; + + if (stat(node, &st)) + return sys_errmsg("cannot get information about \"%s\"", node); + + if (!S_ISCHR(st.st_mode)) { + errmsg("\"%s\" is not a character device", node); + errno = EINVAL; + return -1; + } + + mjr = major(st.st_rdev); + mnr = minor(st.st_rdev); + + if (mtd_get_info((libmtd_t *)lib, &info)) + return -1; + + if (!lib->sysfs_supported) + return 0; + + for (i = info.lowest_mtd_num; i <= info.highest_mtd_num; i++) { + int mjr1, mnr1, ret; + + ret = dev_get_major(lib, i, &mjr1, &mnr1); + if (ret) { + if (errno == 
ENOENT) + continue; + if (!errno) + break; + return -1; + } + + if (mjr1 == mjr && mnr1 == mnr) + return 1; + } + + errno = 0; + return -1; +} diff -Nru fio-2.1.3/oslib/libmtd_common.h fio-3.16/oslib/libmtd_common.h --- fio-2.1.3/oslib/libmtd_common.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/libmtd_common.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,173 @@ +/* + * Copyright (c) Artem Bityutskiy, 2007, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +/* Imported from mtd-utils by dehrenberg */ + +#ifndef __MTD_UTILS_COMMON_H__ +#define __MTD_UTILS_COMMON_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef PROGRAM_NAME +# error "You must define PROGRAM_NAME before including this header" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef MIN /* some C lib headers define this for us */ +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#endif +#define min(a, b) MIN(a, b) /* glue for linux kernel source */ +#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) + +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) + +#define min_t(t,x,y) ({ \ + __typeof__((x)) _x = (x); \ + __typeof__((y)) _y = (y); \ + (_x < _y) ? _x : _y; \ +}) + +#define max_t(t,x,y) ({ \ + __typeof__((x)) _x = (x); \ + __typeof__((y)) _y = (y); \ + (_x > _y) ? _x : _y; \ +}) + +#ifndef O_CLOEXEC +#define O_CLOEXEC 0 +#endif + +/* define a print format specifier for off_t */ +#ifdef __USE_FILE_OFFSET64 +#define PRIxoff_t PRIx64 +#define PRIdoff_t PRId64 +#else +#define PRIxoff_t "l"PRIx32 +#define PRIdoff_t "l"PRId32 +#endif + +/* Verbose messages */ +#define bareverbose(verbose, fmt, ...) do { \ + if (verbose) \ + printf(fmt, ##__VA_ARGS__); \ +} while(0) +#define verbose(verbose, fmt, ...) \ + bareverbose(verbose, "%s: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__) + +/* Normal messages */ +#define normsg_cont(fmt, ...) do { \ + printf("%s: " fmt, PROGRAM_NAME, ##__VA_ARGS__); \ +} while(0) +#define normsg(fmt, ...) do { \ + normsg_cont(fmt "\n", ##__VA_ARGS__); \ +} while(0) + +/* Error messages */ +#define errmsg(fmt, ...) ({ \ + fprintf(stderr, "%s: error!: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__); \ + -1; \ +}) +#define errmsg_die(fmt, ...) do { \ + exit(errmsg(fmt, ##__VA_ARGS__)); \ +} while(0) + +/* System error messages */ +#define sys_errmsg(fmt, ...) ({ \ + int _err = errno; \ + errmsg(fmt, ##__VA_ARGS__); \ + fprintf(stderr, "%*serror %d (%s)\n", (int)sizeof(PROGRAM_NAME) + 1,\ + "", _err, strerror(_err)); \ + -1; \ +}) +#define sys_errmsg_die(fmt, ...) do { \ + exit(sys_errmsg(fmt, ##__VA_ARGS__)); \ +} while(0) + +/* Warnings */ +#define warnmsg(fmt, ...) 
do { \ + fprintf(stderr, "%s: warning!: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__); \ +} while(0) + +static inline int is_power_of_2(unsigned long long n) +{ + return (n != 0 && ((n & (n - 1)) == 0)); +} + +/** + * simple_strtoX - convert a hex/dec/oct string into a number + * @snum: buffer to convert + * @error: set to 1 when buffer isn't fully consumed + * + * These functions are similar to the standard strtoX() functions, but they are + * a little bit easier to use if you want to convert full string of digits into + * the binary form. The typical usage: + * + * int error = 0; + * unsigned long num; + * + * num = simple_strtoul(str, &error); + * if (error || ... if needed, your check that num is not out of range ...) + * error_happened(); + */ +#define simple_strtoX(func, type) \ +static inline type simple_##func(const char *snum, int *error) \ +{ \ + char *endptr; \ + type ret = func(snum, &endptr, 0); \ + \ + if (error && (!*snum || *endptr)) { \ + errmsg("%s: unable to parse the number '%s'", #func, snum); \ + *error = 1; \ + } \ + \ + return ret; \ +} +simple_strtoX(strtol, long int) +simple_strtoX(strtoll, long long int) +simple_strtoX(strtoul, unsigned long int) +simple_strtoX(strtoull, unsigned long long int) + +/* Simple version-printing for utils */ +#define common_print_version() \ +do { \ + printf("%s %s\n", PROGRAM_NAME, VERSION); \ +} while (0) + +#include "libmtd_xalloc.h" + +#ifdef __cplusplus +} +#endif + +#endif /* !__MTD_UTILS_COMMON_H__ */ diff -Nru fio-2.1.3/oslib/libmtd.h fio-3.16/oslib/libmtd.h --- fio-2.1.3/oslib/libmtd.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/libmtd.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,357 @@ +/* + * Copyright (C) 2008, 2009 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Author: Artem Bityutskiy + * + * MTD library. + */ + +/* Imported from mtd-utils by dehrenberg */ + +#ifndef __LIBMTD_H__ +#define __LIBMTD_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +// Needed for uint8_t, uint64_t +#include + +/* Maximum MTD device name length */ +#define MTD_NAME_MAX 127 +/* Maximum MTD device type string length */ +#define MTD_TYPE_MAX 64 + +/* MTD library descriptor */ +typedef void * libmtd_t; + +/* Forward decls */ +struct region_info_user; + +/** + * @mtd_dev_cnt: count of MTD devices in system + * @lowest_mtd_num: lowest MTD device number in system + * @highest_mtd_num: highest MTD device number in system + * @sysfs_supported: non-zero if sysfs is supported by MTD + */ +struct mtd_info +{ + int mtd_dev_cnt; + int lowest_mtd_num; + int highest_mtd_num; + unsigned int sysfs_supported:1; +}; + +/** + * struct mtd_dev_info - information about an MTD device. 
+ * @mtd_num: MTD device number + * @major: major number of corresponding character device + * @minor: minor number of corresponding character device + * @type: flash type (constants like %MTD_NANDFLASH defined in mtd-abi.h) + * @type_str: static R/O flash type string + * @name: device name + * @size: device size in bytes + * @eb_cnt: count of eraseblocks + * @eb_size: eraseblock size + * @min_io_size: minimum input/output unit size + * @subpage_size: sub-page size + * @oob_size: OOB size (zero if the device does not have OOB area) + * @region_cnt: count of additional erase regions + * @writable: zero if the device is read-only + * @bb_allowed: non-zero if the MTD device may have bad eraseblocks + */ +struct mtd_dev_info +{ + int mtd_num; + int major; + int minor; + int type; + char type_str[MTD_TYPE_MAX + 1]; + char name[MTD_NAME_MAX + 1]; + long long size; + int eb_cnt; + int eb_size; + int min_io_size; + int subpage_size; + int oob_size; + int region_cnt; + unsigned int writable:1; + unsigned int bb_allowed:1; +}; + +/** + * libmtd_open - open MTD library. + * + * This function initializes and opens the MTD library and returns MTD library + * descriptor in case of success and %NULL in case of failure. In case of + * failure, errno contains zero if MTD is not present in the system, or + * contains the error code if a real error happened. + */ +libmtd_t libmtd_open(void); + +/** + * libmtd_close - close MTD library. + * @desc: MTD library descriptor + */ +void libmtd_close(libmtd_t desc); + +/** + * mtd_dev_present - check whether a MTD device is present. + * @desc: MTD library descriptor + * @mtd_num: MTD device number to check + * + * This function returns %1 if MTD device is present and %0 if not. + */ +int mtd_dev_present(libmtd_t desc, int mtd_num); + +/** + * mtd_get_info - get general MTD information. 
+ * @desc: MTD library descriptor + * @info: the MTD device information is returned here + * + * This function fills the passed @info object with general MTD information and + * returns %0 in case of success and %-1 in case of failure. If MTD subsystem is + * not present in the system, errno is set to @ENODEV. + */ +int mtd_get_info(libmtd_t desc, struct mtd_info *info); + +/** + * mtd_get_dev_info - get information about an MTD device. + * @desc: MTD library descriptor + * @node: name of the MTD device node + * @mtd: the MTD device information is returned here + * + * This function gets information about MTD device defined by the @node device + * node file and saves this information in the @mtd object. Returns %0 in case + * of success and %-1 in case of failure. If MTD subsystem is not present in the + * system, or the MTD device does not exist, errno is set to @ENODEV. + */ +int mtd_get_dev_info(libmtd_t desc, const char *node, struct mtd_dev_info *mtd); + +/** + * mtd_get_dev_info1 - get information about an MTD device. + * @desc: MTD library descriptor + * @mtd_num: MTD device number to fetch information about + * @mtd: the MTD device information is returned here + * + * This function is identical to 'mtd_get_dev_info()' except that it accepts + * MTD device number, not MTD character device. + */ +int mtd_get_dev_info1(libmtd_t desc, int mtd_num, struct mtd_dev_info *mtd); + +/** + * mtd_lock - lock eraseblocks. + * @desc: MTD library descriptor + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to lock + * + * This function locks eraseblock @eb. Returns %0 in case of success and %-1 + * in case of failure. + */ +int mtd_lock(const struct mtd_dev_info *mtd, int fd, int eb); + +/** + * mtd_unlock - unlock eraseblocks. + * @desc: MTD library descriptor + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to lock + * + * This function unlocks eraseblock @eb. 
Returns %0 in case of success and %-1 + * in case of failure. + */ +int mtd_unlock(const struct mtd_dev_info *mtd, int fd, int eb); + +/** + * mtd_erase - erase an eraseblock. + * @desc: MTD library descriptor + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to erase + * + * This function erases eraseblock @eb of MTD device described by @fd. Returns + * %0 in case of success and %-1 in case of failure. + */ +int mtd_erase(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb); + +/** + * mtd_regioninfo - get information about an erase region. + * @fd: MTD device node file descriptor + * @regidx: index of region to look up + * @reginfo: the region information is returned here + * + * This function gets information about an erase region defined by the + * @regidx index and saves this information in the @reginfo object. + * Returns %0 in case of success and %-1 in case of failure. If the + * @regidx is not valid or unavailable, errno is set to @ENODEV. + */ +int mtd_regioninfo(int fd, int regidx, struct region_info_user *reginfo); + +/** + * mtd_is_locked - see if the specified eraseblock is locked. + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to check + * + * This function checks to see if eraseblock @eb of MTD device described + * by @fd is locked. Returns %0 if it is unlocked, %1 if it is locked, and + * %-1 in case of failure. If the ioctl is not supported (support was added in + * Linux kernel 2.6.36) or this particular device does not support it, errno is + * set to @ENOTSUPP. + */ +int mtd_is_locked(const struct mtd_dev_info *mtd, int fd, int eb); + +/** + * mtd_torture - torture an eraseblock. + * @desc: MTD library descriptor + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to torture + * + * This function tortures eraseblock @eb. Returns %0 in case of success and %-1 + * in case of failure. 
+ */ +int mtd_torture(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb); + +/** + * mtd_is_bad - check if eraseblock is bad. + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to check + * + * This function checks if eraseblock @eb is bad. Returns %0 if not, %1 if yes, + * and %-1 in case of failure. + */ +int mtd_is_bad(const struct mtd_dev_info *mtd, int fd, int eb); + +/** + * mtd_mark_bad - mark an eraseblock as bad. + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to mark as bad + * + * This function marks eraseblock @eb as bad. Returns %0 in case of success and + * %-1 in case of failure. + */ +int mtd_mark_bad(const struct mtd_dev_info *mtd, int fd, int eb); + +/** + * mtd_read - read data from an MTD device. + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to read from + * @offs: offset within the eraseblock to read from + * @buf: buffer to read data to + * @len: how many bytes to read + * + * This function reads @len bytes of data from eraseblock @eb and offset @offs + * of the MTD device defined by @mtd and stores the read data at buffer @buf. + * Returns %0 in case of success and %-1 in case of failure. + */ +int mtd_read(const struct mtd_dev_info *mtd, int fd, int eb, int offs, + void *buf, int len); + +/** + * mtd_write - write data to an MTD device. + * @desc: MTD library descriptor + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to write to + * @offs: offset within the eraseblock to write to + * @data: data buffer to write + * @len: how many data bytes to write + * @oob: OOB buffer to write + * @ooblen: how many OOB bytes to write + * @mode: write mode (e.g., %MTD_OOB_PLACE, %MTD_OOB_RAW) + * + * This function writes @len bytes of data to eraseblock @eb and offset @offs + * of the MTD device defined by @mtd. 
Returns %0 in case of success and %-1 in + * case of failure. + * + * Can only write to a single page at a time if writing to OOB. + */ +int mtd_write(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb, + int offs, void *data, int len, void *oob, int ooblen, + uint8_t mode); + +/** + * mtd_read_oob - read out-of-band area. + * @desc: MTD library descriptor + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @start: page-aligned start address + * @length: number of OOB bytes to read + * @data: read buffer + * + * This function reads @length OOB bytes starting from address @start on + * MTD device described by @fd. The address is specified as page byte offset + * from the beginning of the MTD device. This function returns %0 in case of + * success and %-1 in case of failure. + */ +int mtd_read_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, + uint64_t start, uint64_t length, void *data); + +/** + * mtd_write_oob - write out-of-band area. + * @desc: MTD library descriptor + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @start: page-aligned start address + * @length: number of OOB bytes to write + * @data: write buffer + * + * This function writes @length OOB bytes starting from address @start on + * MTD device described by @fd. The address is specified as page byte offset + * from the beginning of the MTD device. Returns %0 in case of success and %-1 + * in case of failure. + */ +int mtd_write_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, + uint64_t start, uint64_t length, void *data); + +/** + * mtd_write_img - write a file to MTD device. + * @mtd: MTD device description object + * @fd: MTD device node file descriptor + * @eb: eraseblock to write to + * @offs: offset within the eraseblock to write to + * @img_name: the file to write + * + * This function writes an image @img_name to the MTD device defined by @mtd. 
@eb + * and @offs are the starting eraseblock and offset on the MTD device. Returns + * %0 in case of success and %-1 in case of failure. + */ +int mtd_write_img(const struct mtd_dev_info *mtd, int fd, int eb, int offs, + const char *img_name); + +/** + * mtd_probe_node - test MTD node. + * @desc: MTD library descriptor + * @node: the node to test + * + * This function tests whether @node is an MTD device node and returns %1 if it + * is, and %-1 if it is not (errno is %ENODEV in this case) or if an error + * occurred. + */ +int mtd_probe_node(libmtd_t desc, const char *node); + +#ifdef __cplusplus +} +#endif + +#endif /* __LIBMTD_H__ */ diff -Nru fio-2.1.3/oslib/libmtd_int.h fio-3.16/oslib/libmtd_int.h --- fio-2.1.3/oslib/libmtd_int.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/libmtd_int.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,109 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * Copyright (C) 2009 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Author: Artem Bityutskiy + * + * MTD library. 
+ */ + +/* Imported from mtd-utils by dehrenberg */ + +#ifndef __LIBMTD_INT_H__ +#define __LIBMTD_INT_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#define PROGRAM_NAME "libmtd" + +#define SYSFS_MTD "class/mtd" +#define MTD_NAME_PATT "mtd%d" +#define MTD_DEV "dev" +#define MTD_NAME "name" +#define MTD_TYPE "type" +#define MTD_EB_SIZE "erasesize" +#define MTD_SIZE "size" +#define MTD_MIN_IO_SIZE "writesize" +#define MTD_SUBPAGE_SIZE "subpagesize" +#define MTD_OOB_SIZE "oobsize" +#define MTD_REGION_CNT "numeraseregions" +#define MTD_FLAGS "flags" + +#define OFFS64_IOCTLS_UNKNOWN 0 +#define OFFS64_IOCTLS_NOT_SUPPORTED 1 +#define OFFS64_IOCTLS_SUPPORTED 2 + +/** + * libmtd - MTD library description data structure. + * @sysfs_mtd: MTD directory in sysfs + * @mtd: MTD device sysfs directory pattern + * @mtd_dev: MTD device major/minor numbers file pattern + * @mtd_name: MTD device name file pattern + * @mtd_type: MTD device type file pattern + * @mtd_eb_size: MTD device eraseblock size file pattern + * @mtd_size: MTD device size file pattern + * @mtd_min_io_size: minimum I/O unit size file pattern + * @mtd_subpage_size: sub-page size file pattern + * @mtd_oob_size: MTD device OOB size file pattern + * @mtd_region_cnt: count of additional erase regions file pattern + * @mtd_flags: MTD device flags file pattern + * @sysfs_supported: non-zero if sysfs is supported by MTD + * @offs64_ioctls: %OFFS64_IOCTLS_SUPPORTED if 64-bit %MEMERASE64, + * %MEMREADOOB64, %MEMWRITEOOB64 MTD device ioctls are + * supported, %OFFS64_IOCTLS_NOT_SUPPORTED if not, and + * %OFFS64_IOCTLS_UNKNOWN if it is not known yet; + * + * Note, we cannot find out whether 64-bit ioctls are supported by MTD when we + * are initializing the library, because this requires an MTD device node. + * Indeed, we have to actually call the ioctl and check for %ENOTTY to find + * out whether it is supported or not. 
+ * + * Thus, we leave %offs64_ioctls uninitialized in 'libmtd_open()', and + * initialize it later, when corresponding libmtd function is used, and when + * we actually have a device node and can invoke an ioctl command on it. + */ +struct libmtd +{ + char *sysfs_mtd; + char *mtd; + char *mtd_dev; + char *mtd_name; + char *mtd_type; + char *mtd_eb_size; + char *mtd_size; + char *mtd_min_io_size; + char *mtd_subpage_size; + char *mtd_oob_size; + char *mtd_region_cnt; + char *mtd_flags; + unsigned int sysfs_supported:1; + unsigned int offs64_ioctls:2; +}; + +int legacy_libmtd_open(void); +int legacy_dev_present(int mtd_num); +int legacy_mtd_get_info(struct mtd_info *info); +int legacy_get_dev_info(const char *node, struct mtd_dev_info *mtd); +int legacy_get_dev_info1(int dev_num, struct mtd_dev_info *mtd); + +#ifdef __cplusplus +} +#endif + +#endif /* !__LIBMTD_INT_H__ */ diff -Nru fio-2.1.3/oslib/libmtd_legacy.c fio-3.16/oslib/libmtd_legacy.c --- fio-2.1.3/oslib/libmtd_legacy.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/libmtd_legacy.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,384 @@ +/* + * Copyright (C) 2009 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Author: Artem Bityutskiy + * + * This file is part of the MTD library. 
Implements pre-2.6.30 kernels support, + * where MTD did not have sysfs interface. The main limitation of the old + * kernels was that the sub-page size was not exported to user-space, so it was + * not possible to get sub-page size. + */ + +/* Imported from mtd-utils by dehrenberg */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libmtd.h" +#include "libmtd_int.h" +#include "libmtd_common.h" + +#define MTD_PROC_FILE "/proc/mtd" +#define MTD_DEV_PATT "/dev/mtd%d" +#define MTD_DEV_MAJOR 90 + +#define PROC_MTD_FIRST "dev: size erasesize name\n" +#define PROC_MTD_FIRST_LEN (sizeof(PROC_MTD_FIRST) - 1) +#define PROC_MTD_MAX_LEN 4096 +#define PROC_MTD_PATT "mtd%d: %llx %x" + +/** + * struct proc_parse_info - /proc/mtd parsing information. + * @mtd_num: MTD device number + * @size: device size + * @eb_size: eraseblock size + * @name: device name + * @buf: contents of /proc/mtd + * @data_size: how much data was read into @buf + * @pos: next string in @buf to parse + */ +struct proc_parse_info +{ + int mtd_num; + long long size; + char name[MTD_NAME_MAX + 1]; + int eb_size; + char *buf; + int data_size; + char *next; +}; + +static int proc_parse_start(struct proc_parse_info *pi) +{ + int fd, ret; + + fd = open(MTD_PROC_FILE, O_RDONLY); + if (fd == -1) + return -1; + + pi->buf = xmalloc(PROC_MTD_MAX_LEN); + + ret = read(fd, pi->buf, PROC_MTD_MAX_LEN); + if (ret == -1) { + sys_errmsg("cannot read \"%s\"", MTD_PROC_FILE); + goto out_free; + } + + if (ret < PROC_MTD_FIRST_LEN || + memcmp(pi->buf, PROC_MTD_FIRST, PROC_MTD_FIRST_LEN)) { + errmsg("\"%s\" does not start with \"%s\"", MTD_PROC_FILE, + PROC_MTD_FIRST); + goto out_free; + } + + pi->data_size = ret; + pi->next = pi->buf + PROC_MTD_FIRST_LEN; + + close(fd); + return 0; + +out_free: + free(pi->buf); + close(fd); + return -1; +} + +static int proc_parse_next(struct proc_parse_info *pi) +{ + int ret, len, pos = pi->next - pi->buf; + char *p, *p1; + + if (pos >= 
pi->data_size) { + free(pi->buf); + return 0; + } + + ret = sscanf(pi->next, PROC_MTD_PATT, &pi->mtd_num, &pi->size, + &pi->eb_size); + if (ret != 3) + return errmsg("\"%s\" pattern not found", PROC_MTD_PATT); + + p = memchr(pi->next, '\"', pi->data_size - pos); + if (!p) + return errmsg("opening \" not found"); + p += 1; + pos = p - pi->buf; + if (pos >= pi->data_size) + return errmsg("opening \" not found"); + + p1 = memchr(p, '\"', pi->data_size - pos); + if (!p1) + return errmsg("closing \" not found"); + pos = p1 - pi->buf; + if (pos >= pi->data_size) + return errmsg("closing \" not found"); + + len = p1 - p; + if (len > MTD_NAME_MAX) + return errmsg("too long mtd%d device name", pi->mtd_num); + + memcpy(pi->name, p, len); + pi->name[len] = '\0'; + + if (p1[1] != '\n') + return errmsg("opening \"\n\" not found"); + pi->next = p1 + 2; + return 1; +} + +/** + * legacy_libmtd_open - legacy version of 'libmtd_open()'. + * + * This function just checks that MTD is present in the system. Returns + * zero in case of success and %-1 in case of failure. In case of failure, + * errno contains zero if MTD is not present in the system, or contains the + * error code if a real error happened. This is similar to the 'libmtd_open()' + * return conventions. + */ +int legacy_libmtd_open(void) +{ + int fd; + + fd = open(MTD_PROC_FILE, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) + errno = 0; + return -1; + } + + close(fd); + return 0; +} + +/** + * legacy_dev_present - legacy version of 'mtd_dev_present()'. + * @mtd_num: MTD device number to check + * + * When the kernel does not provide sysfs files for the MTD subsystem, + * fall-back to parsing the /proc/mtd file to determine whether an mtd device + * number @mtd_num is present. 
+ */ +int legacy_dev_present(int mtd_num) +{ + int ret; + struct proc_parse_info pi; + + ret = proc_parse_start(&pi); + if (ret) + return -1; + + while (proc_parse_next(&pi)) { + if (pi.mtd_num == mtd_num) + return 1; + } + + return 0; +} + +/** + * legacy_mtd_get_info - legacy version of 'mtd_get_info()'. + * @info: the MTD device information is returned here + * + * This function is similar to 'mtd_get_info()' and has the same conventions. + */ +int legacy_mtd_get_info(struct mtd_info *info) +{ + int ret; + struct proc_parse_info pi; + + ret = proc_parse_start(&pi); + if (ret) + return -1; + + info->lowest_mtd_num = INT_MAX; + while (proc_parse_next(&pi)) { + info->mtd_dev_cnt += 1; + if (pi.mtd_num > info->highest_mtd_num) + info->highest_mtd_num = pi.mtd_num; + if (pi.mtd_num < info->lowest_mtd_num) + info->lowest_mtd_num = pi.mtd_num; + } + + return 0; +} + +/** + * legacy_get_dev_info - legacy version of 'mtd_get_dev_info()'. + * @node: name of the MTD device node + * @mtd: the MTD device information is returned here + * + * This function is similar to 'mtd_get_dev_info()' and has the same + * conventions. 
+ */ +int legacy_get_dev_info(const char *node, struct mtd_dev_info *mtd) +{ + struct stat st; + struct mtd_info_user ui; + int fd, ret; + loff_t offs = 0; + struct proc_parse_info pi; + + if (stat(node, &st)) { + sys_errmsg("cannot open \"%s\"", node); + if (errno == ENOENT) + normsg("MTD subsystem is old and does not support " + "sysfs, so MTD character device nodes have " + "to exist"); + } + + if (!S_ISCHR(st.st_mode)) { + errno = EINVAL; + return errmsg("\"%s\" is not a character device", node); + } + + memset(mtd, '\0', sizeof(struct mtd_dev_info)); + mtd->major = major(st.st_rdev); + mtd->minor = minor(st.st_rdev); + + if (mtd->major != MTD_DEV_MAJOR) { + errno = EINVAL; + return errmsg("\"%s\" has major number %d, MTD devices have " + "major %d", node, mtd->major, MTD_DEV_MAJOR); + } + + mtd->mtd_num = mtd->minor / 2; + + fd = open(node, O_RDONLY); + if (fd == -1) + return sys_errmsg("cannot open \"%s\"", node); + + if (ioctl(fd, MEMGETINFO, &ui)) { + sys_errmsg("MEMGETINFO ioctl request failed"); + goto out_close; + } + + ret = ioctl(fd, MEMGETBADBLOCK, &offs); + if (ret == -1) { + if (errno != EOPNOTSUPP) { + sys_errmsg("MEMGETBADBLOCK ioctl failed"); + goto out_close; + } + errno = 0; + mtd->bb_allowed = 0; + } else + mtd->bb_allowed = 1; + + mtd->type = ui.type; + mtd->size = ui.size; + mtd->eb_size = ui.erasesize; + mtd->min_io_size = ui.writesize; + mtd->oob_size = ui.oobsize; + + if (mtd->min_io_size <= 0) { + errmsg("mtd%d (%s) has insane min. 
I/O unit size %d", + mtd->mtd_num, node, mtd->min_io_size); + goto out_close; + } + if (mtd->eb_size <= 0 || mtd->eb_size < mtd->min_io_size) { + errmsg("mtd%d (%s) has insane eraseblock size %d", + mtd->mtd_num, node, mtd->eb_size); + goto out_close; + } + if (mtd->size <= 0 || mtd->size < mtd->eb_size) { + errmsg("mtd%d (%s) has insane size %lld", + mtd->mtd_num, node, mtd->size); + goto out_close; + } + mtd->eb_cnt = mtd->size / mtd->eb_size; + + switch(mtd->type) { + case MTD_ABSENT: + errmsg("mtd%d (%s) is removable and is not present", + mtd->mtd_num, node); + goto out_close; + case MTD_RAM: + strcpy((char *)mtd->type_str, "ram"); + break; + case MTD_ROM: + strcpy((char *)mtd->type_str, "rom"); + break; + case MTD_NORFLASH: + strcpy((char *)mtd->type_str, "nor"); + break; + case MTD_NANDFLASH: + strcpy((char *)mtd->type_str, "nand"); + break; + case MTD_MLCNANDFLASH: + strcpy((char *)mtd->type_str, "mlc-nand"); + break; + case MTD_DATAFLASH: + strcpy((char *)mtd->type_str, "dataflash"); + break; + case MTD_UBIVOLUME: + strcpy((char *)mtd->type_str, "ubi"); + break; + default: + goto out_close; + } + + if (ui.flags & MTD_WRITEABLE) + mtd->writable = 1; + mtd->subpage_size = mtd->min_io_size; + + close(fd); + + /* + * Unfortunately, the device name is not available via ioctl, and + * we have to parse /proc/mtd to get it. + */ + ret = proc_parse_start(&pi); + if (ret) + return -1; + + while (proc_parse_next(&pi)) { + if (pi.mtd_num == mtd->mtd_num) { + strcpy((char *)mtd->name, pi.name); + return 0; + } + } + + errmsg("mtd%d not found in \"%s\"", mtd->mtd_num, MTD_PROC_FILE); + errno = ENOENT; + return -1; + +out_close: + close(fd); + return -1; +} + +/** + * legacy_get_dev_info1 - legacy version of 'mtd_get_dev_info1()'. + * @node: name of the MTD device node + * @mtd: the MTD device information is returned here + * + * This function is similar to 'mtd_get_dev_info1()' and has the same + * conventions. 
+ */ +int legacy_get_dev_info1(int mtd_num, struct mtd_dev_info *mtd) +{ + char node[sizeof(MTD_DEV_PATT) + 20]; + + sprintf(node, MTD_DEV_PATT, mtd_num); + return legacy_get_dev_info(node, mtd); +} diff -Nru fio-2.1.3/oslib/libmtd_xalloc.h fio-3.16/oslib/libmtd_xalloc.h --- fio-2.1.3/oslib/libmtd_xalloc.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/libmtd_xalloc.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,106 @@ +/* + * memory wrappers + * + * Copyright (c) Artem Bityutskiy, 2007, 2008 + * Copyright 2001, 2002 Red Hat, Inc. + * 2001 David A. Schleef + * 2002 Axis Communications AB + * 2001, 2002 Erik Andersen + * 2004 University of Szeged, Hungary + * 2006 KaiGai Kohei + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __MTD_UTILS_XALLOC_H__ +#define __MTD_UTILS_XALLOC_H__ + +#include +#include +#include + +/* + * Mark these functions as unused so that gcc does not emit warnings + * when people include this header but don't use every function. 
+ */ + +__attribute__((unused)) +static void *xmalloc(size_t size) +{ + void *ptr = malloc(size); + + if (ptr == NULL && size != 0) + sys_errmsg_die("out of memory"); + return ptr; +} + +__attribute__((unused)) +static void *xcalloc(size_t nmemb, size_t size) +{ + void *ptr = calloc(nmemb, size); + + if (ptr == NULL && nmemb != 0 && size != 0) + sys_errmsg_die("out of memory"); + return ptr; +} + +__attribute__((unused)) +static void *xzalloc(size_t size) +{ + return xcalloc(1, size); +} + +__attribute__((unused)) +static void *xrealloc(void *ptr, size_t size) +{ + ptr = realloc(ptr, size); + if (ptr == NULL && size != 0) + sys_errmsg_die("out of memory"); + return ptr; +} + +__attribute__((unused)) +static char *xstrdup(const char *s) +{ + char *t; + + if (s == NULL) + return NULL; + t = strdup(s); + if (t == NULL) + sys_errmsg_die("out of memory"); + return t; +} + +#ifdef _GNU_SOURCE + +__attribute__((unused)) +static int xasprintf(char **strp, const char *fmt, ...) +{ + int cnt; + va_list ap; + + va_start(ap, fmt); + cnt = vasprintf(strp, fmt, ap); + va_end(ap); + + if (cnt == -1) + sys_errmsg_die("out of memory"); + + return cnt; +} +#endif + +#endif /* !__MTD_UTILS_XALLOC_H__ */ diff -Nru fio-2.1.3/oslib/linux-dev-lookup.c fio-3.16/oslib/linux-dev-lookup.c --- fio-2.1.3/oslib/linux-dev-lookup.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/linux-dev-lookup.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,67 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "linux-dev-lookup.h" + +int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj, + unsigned int min) +{ + struct dirent *dir; + struct stat st; + int found = 0; + DIR *D; + + D = opendir(path); + if (!D) + return 0; + + while ((dir = readdir(D)) != NULL) { + char full_path[257]; + + if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) + continue; + + sprintf(full_path, "%s/%s", path, dir->d_name); + if (lstat(full_path, &st) == -1) { 
+ perror("lstat"); + break; + } + + if (S_ISDIR(st.st_mode)) { + found = blktrace_lookup_device(redirect, full_path, + maj, min); + if (found) { + strcpy(path, full_path); + break; + } + } + + if (!S_ISBLK(st.st_mode)) + continue; + + /* + * If replay_redirect is set then always return this device + * upon lookup which overrides the device lookup based on + * major minor in the actual blktrace + */ + if (redirect) { + strcpy(path, redirect); + found = 1; + break; + } + + if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) { + strcpy(path, full_path); + found = 1; + break; + } + } + + closedir(D); + return found; +} diff -Nru fio-2.1.3/oslib/linux-dev-lookup.h fio-3.16/oslib/linux-dev-lookup.h --- fio-2.1.3/oslib/linux-dev-lookup.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/linux-dev-lookup.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,7 @@ +#ifndef LINUX_DEV_LOOKUP +#define LINUX_DEV_LOOKUP + +int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj, + unsigned int min); + +#endif diff -Nru fio-2.1.3/oslib/strcasestr.c fio-3.16/oslib/strcasestr.c --- fio-2.1.3/oslib/strcasestr.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/strcasestr.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,30 @@ +#ifndef CONFIG_STRCASESTR + +#include +#include +#include "strcasestr.h" + +char *strcasestr(const char *s1, const char *s2) +{ + const char *s = s1; + const char *p = s2; + + do { + if (!*p) + return (char *) s1; + if ((*p == *s) || + (tolower(*p) == tolower(*s))) { + ++p; + ++s; + } else { + p = s2; + if (!*s) + return NULL; + s = ++s1; + } + } while (1); + + return *p ? 
NULL : (char *) s1; +} + +#endif diff -Nru fio-2.1.3/oslib/strcasestr.h fio-3.16/oslib/strcasestr.h --- fio-2.1.3/oslib/strcasestr.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/strcasestr.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,10 @@ +#ifndef CONFIG_STRCASESTR + +#ifndef FIO_STRCASESTR_H +#define FIO_STRCASESTR_H + +char *strcasestr(const char *haystack, const char *needle); + +#endif + +#endif diff -Nru fio-2.1.3/oslib/strlcat.c fio-3.16/oslib/strlcat.c --- fio-2.1.3/oslib/strlcat.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/strlcat.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,57 @@ +#ifndef CONFIG_STRLCAT +/* + * Copyright (c) 1998, 2015 Todd C. Miller + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include "strlcat.h" + +/* + * Appends src to string dst of size dsize (unlike strncat, dsize is the + * full size of dst, not space left). At most dsize-1 characters + * will be copied. Always NUL terminates (unless dsize <= strlen(dst)). + * Returns strlen(src) + MIN(dsize, strlen(initial dst)). + * If retval >= dsize, truncation occurred. 
+ */ +size_t +strlcat(char *dst, const char *src, size_t dsize) +{ + const char *odst = dst; + const char *osrc = src; + size_t n = dsize; + size_t dlen; + + /* Find the end of dst and adjust bytes left but don't go past end. */ + while (n-- != 0 && *dst != '\0') + dst++; + dlen = dst - odst; + n = dsize - dlen; + + if (n-- == 0) + return(dlen + strlen(src)); + while (*src != '\0') { + if (n != 0) { + *dst++ = *src; + n--; + } + src++; + } + *dst = '\0'; + + return(dlen + (src - osrc)); /* count does not include NUL */ +} + +#endif diff -Nru fio-2.1.3/oslib/strlcat.h fio-3.16/oslib/strlcat.h --- fio-2.1.3/oslib/strlcat.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/strlcat.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,12 @@ +#ifndef CONFIG_STRLCAT + +#ifndef FIO_STRLCAT_H +#define FIO_STRLCAT_H + +#include + +size_t strlcat(char *dst, const char *src, size_t dsize); + +#endif + +#endif diff -Nru fio-2.1.3/oslib/strndup.c fio-3.16/oslib/strndup.c --- fio-2.1.3/oslib/strndup.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/strndup.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,19 @@ +#ifndef CONFIG_HAVE_STRNDUP + +#include +#include +#include "strndup.h" + +char *strndup(const char *s, size_t n) +{ + char *str = malloc(n + 1); + + if (str) { + strncpy(str, s, n); + str[n] = '\0'; + } + + return str; +} + +#endif diff -Nru fio-2.1.3/oslib/strndup.h fio-3.16/oslib/strndup.h --- fio-2.1.3/oslib/strndup.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/strndup.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,12 @@ +#ifndef CONFIG_HAVE_STRNDUP + +#ifndef FIO_STRNDUP_LIB_H +#define FIO_STRNDUP_LIB_H + +#include + +char *strndup(const char *s, size_t n); + +#endif + +#endif diff -Nru fio-2.1.3/oslib/strsep.c fio-3.16/oslib/strsep.c --- fio-2.1.3/oslib/strsep.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/strsep.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,34 @@ +#ifndef CONFIG_STRSEP + +#include +#include "strsep.h" + +char 
*strsep(char **stringp, const char *delim) +{ + char *s, *tok; + const char *spanp; + int c, sc; + + s = *stringp; + if (!s) + return NULL; + + tok = s; + do { + c = *s++; + spanp = delim; + do { + sc = *spanp++; + if (sc == c) { + if (c == 0) + s = NULL; + else + s[-1] = 0; + *stringp = s; + return tok; + } + } while (sc != 0); + } while (1); +} + +#endif diff -Nru fio-2.1.3/oslib/strsep.h fio-3.16/oslib/strsep.h --- fio-2.1.3/oslib/strsep.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/oslib/strsep.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,10 @@ +#ifndef CONFIG_STRSEP + +#ifndef FIO_STRSEP_LIB_H +#define FIO_STRSEP_LIB_H + +char *strsep(char **, const char *); + +#endif + +#endif diff -Nru fio-2.1.3/parse.c fio-3.16/parse.c --- fio-2.1.3/parse.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/parse.c 2019-09-20 01:01:52.000000000 +0000 @@ -3,22 +3,46 @@ */ #include #include -#include #include #include #include #include -#include -#include #include +#include "compiler/compiler.h" #include "parse.h" #include "debug.h" +#include "log.h" #include "options.h" +#include "optgroup.h" #include "minmax.h" #include "lib/ieee754.h" +#include "lib/pow2.h" + +#ifdef CONFIG_ARITHMETIC +#include "y.tab.h" +#endif + +static const char *opt_type_names[] = { + "OPT_INVALID", + "OPT_STR", + "OPT_STR_ULL", + "OPT_STR_MULTI", + "OPT_STR_VAL", + "OPT_STR_VAL_TIME", + "OPT_STR_STORE", + "OPT_RANGE", + "OPT_INT", + "OPT_ULL", + "OPT_BOOL", + "OPT_FLOAT_LIST", + "OPT_STR_SET", + "OPT_DEPRECATED", + "OPT_SOFT_DEPRECATED", + "OPT_UNSUPPORTED", +}; -static struct fio_option *__fio_options; +static const struct fio_option *__fio_options; static int vp_cmp(const void *p1, const void *p2) { @@ -28,7 +52,7 @@ return strlen(vp2->ival) - strlen(vp1->ival); } -static void posval_sort(struct fio_option *o, struct value_pair *vpmap) +static void posval_sort(const struct fio_option *o, struct value_pair *vpmap) { const struct value_pair *vp; int entries; @@ -46,16 +70,21 @@ qsort(vpmap, 
entries, sizeof(struct value_pair), vp_cmp); } -static void show_option_range(struct fio_option *o, - int (*logger)(const char *format, ...)) +static void show_option_range(const struct fio_option *o, + ssize_t (*logger)(const char *format, ...)) { if (o->type == FIO_OPT_FLOAT_LIST) { - if (o->minfp == DBL_MIN && o->maxfp == DBL_MAX) + const char *sep = ""; + if (!o->minfp && !o->maxfp) return; - logger("%20s: min=%f", "range", o->minfp); + logger("%20s: ", "range"); + if (o->minfp != DBL_MIN) { + logger("min=%f", o->minfp); + sep = ", "; + } if (o->maxfp != DBL_MAX) - logger(", max=%f", o->maxfp); + logger("%smax=%f", sep, o->maxfp); logger("\n"); } else if (!o->posval[0].ival) { if (!o->minval && !o->maxval) @@ -68,7 +97,7 @@ } } -static void show_option_values(struct fio_option *o) +static void show_option_values(const struct fio_option *o) { int i; @@ -88,23 +117,27 @@ log_info("\n"); } -static void show_option_help(struct fio_option *o, int is_err) +static void show_option_help(const struct fio_option *o, int is_err) { const char *typehelp[] = { - "invalid", - "string (opt=bla)", - "string (opt=bla)", - "string with possible k/m/g postfix (opt=4k)", - "string with time postfix (opt=10s)", - "string (opt=bla)", - "string with dual range (opt=1k-4k,4k-8k)", - "integer value (opt=100)", - "boolean value (opt=1)", - "list of floating point values separated by ':' (opt=5.9:7.8)", - "no argument (opt)", - "deprecated", + [FIO_OPT_INVALID] = "invalid", + [FIO_OPT_STR] = "string (opt=bla)", + [FIO_OPT_STR_ULL] = "string (opt=bla)", + [FIO_OPT_STR_MULTI] = "string with possible k/m/g postfix (opt=4k)", + [FIO_OPT_STR_VAL] = "string (opt=bla)", + [FIO_OPT_STR_VAL_TIME] = "string with time postfix (opt=10s)", + [FIO_OPT_STR_STORE] = "string (opt=bla)", + [FIO_OPT_RANGE] = "one to three ranges (opt=1k-4k[,4k-8k[,1k-8k]])", + [FIO_OPT_INT] = "integer value (opt=100)", + [FIO_OPT_ULL] = "integer value (opt=100)", + [FIO_OPT_BOOL] = "boolean value (opt=1)", + 
[FIO_OPT_FLOAT_LIST] = "list of floating point values separated by ':' (opt=5.9:7.8)", + [FIO_OPT_STR_SET] = "empty or boolean value ([0|1])", + [FIO_OPT_DEPRECATED] = "deprecated", + [FIO_OPT_SOFT_DEPRECATED] = "deprecated", + [FIO_OPT_UNSUPPORTED] = "unsupported", }; - int (*logger)(const char *format, ...); + ssize_t (*logger)(const char *format, ...); if (is_err) logger = log_err; @@ -122,21 +155,49 @@ show_option_values(o); } -static unsigned long get_mult_time(char c) +static unsigned long long get_mult_time(const char *str, int len, + int is_seconds) { - switch (c) { - case 'm': - case 'M': - return 60; - case 'h': - case 'H': - return 60 * 60; - case 'd': - case 'D': - return 24 * 60 * 60; - default: - return 1; + const char *p = str; + char *c; + unsigned long long mult = 1; + int i; + + /* + * Go forward until we hit a non-digit, or +/- sign + */ + while ((p - str) <= len) { + if (!isdigit((int) *p) && (*p != '+') && (*p != '-')) + break; + p++; } + + if (!isalpha((int) *p)) { + if (is_seconds) + return 1000000UL; + else + return 1; + } + + c = strdup(p); + for (i = 0; i < strlen(c); i++) + c[i] = tolower((unsigned char)c[i]); + + if (!strncmp("us", c, 2) || !strncmp("usec", c, 4)) + mult = 1; + else if (!strncmp("ms", c, 2) || !strncmp("msec", c, 4)) + mult = 1000; + else if (!strcmp("s", c)) + mult = 1000000; + else if (!strcmp("m", c)) + mult = 60 * 1000000UL; + else if (!strcmp("h", c)) + mult = 60 * 60 * 1000000UL; + else if (!strcmp("d", c)) + mult = 24 * 60 * 60 * 1000000ULL; + + free(c); + return mult; } static int is_separator(char c) @@ -166,39 +227,57 @@ c = strdup(p); for (i = 0; i < strlen(c); i++) { - c[i] = tolower(c[i]); + c[i] = tolower((unsigned char)c[i]); if (is_separator(c[i])) { c[i] = '\0'; break; } } + /* If kb_base is 1000, use true units. + * If kb_base is 1024, use opposite units. 
+ */ if (!strncmp("pib", c, 3)) { pow = 5; - mult = 1000; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; } else if (!strncmp("tib", c, 3)) { pow = 4; - mult = 1000; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; } else if (!strncmp("gib", c, 3)) { pow = 3; - mult = 1000; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; } else if (!strncmp("mib", c, 3)) { pow = 2; - mult = 1000; + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; } else if (!strncmp("kib", c, 3)) { pow = 1; - mult = 1000; - } else if (!strncmp("p", c, 1) || !strncmp("pb", c, 2)) + if (kb_base == 1000) + mult = 1024; + else if (kb_base == 1024) + mult = 1000; + } else if (!strncmp("p", c, 1) || !strncmp("pb", c, 2)) { pow = 5; - else if (!strncmp("t", c, 1) || !strncmp("tb", c, 2)) + } else if (!strncmp("t", c, 1) || !strncmp("tb", c, 2)) { pow = 4; - else if (!strncmp("g", c, 1) || !strncmp("gb", c, 2)) + } else if (!strncmp("g", c, 1) || !strncmp("gb", c, 2)) { pow = 3; - else if (!strncmp("m", c, 1) || !strncmp("mb", c, 2)) + } else if (!strncmp("m", c, 1) || !strncmp("mb", c, 2)) { pow = 2; - else if (!strncmp("k", c, 1) || !strncmp("kb", c, 2)) + } else if (!strncmp("k", c, 1) || !strncmp("kb", c, 2)) { pow = 1; - else if (!strncmp("%", c, 1)) { + } else if (!strncmp("%", c, 1)) { *percent = 1; free(c); return ret; @@ -237,33 +316,72 @@ return __get_mult_bytes(p, data, percent); } +extern int evaluate_arithmetic_expression(const char *buffer, long long *ival, + double *dval, double implied_units, + int is_time); + /* * Convert string into a floating number. Return 1 for success and 0 otherwise. 
*/ -int str_to_float(const char *str, double *val) +int str_to_float(const char *str, double *val, int is_time) { - return (1 == sscanf(str, "%lf", val)); +#ifdef CONFIG_ARITHMETIC + int rc; + long long ival; + double dval; + + if (str[0] == '(') { + rc = evaluate_arithmetic_expression(str, &ival, &dval, 1.0, is_time); + if (!rc) { + *val = dval; + return 1; + } + } +#endif + return 1 == sscanf(str, "%lf", val); } /* * convert string into decimal value, noting any size suffix */ -int str_to_decimal(const char *str, long long *val, int kilo, void *data) +int str_to_decimal(const char *str, long long *val, int kilo, void *data, + int is_seconds, int is_time) { int len, base; + int rc = 1; +#ifdef CONFIG_ARITHMETIC + long long ival; + double dval; + double implied_units = 1.0; +#endif len = strlen(str); if (!len) return 1; - if (strstr(str, "0x") || strstr(str, "0X")) - base = 16; - else - base = 10; +#ifdef CONFIG_ARITHMETIC + if (is_seconds) + implied_units = 1000000.0; + if (str[0] == '(') + rc = evaluate_arithmetic_expression(str, &ival, &dval, implied_units, is_time); + if (str[0] == '(' && !rc) { + if (!kilo && is_seconds) + *val = ival / 1000000LL; + else + *val = ival; + } +#endif - *val = strtoll(str, NULL, base); - if (*val == LONG_MAX && errno == ERANGE) - return 1; + if (rc == 1) { + if (strstr(str, "0x") || strstr(str, "0X")) + base = 16; + else + base = 10; + + *val = strtoll(str, NULL, base); + if (*val == LONG_MAX && errno == ERANGE) + return 1; + } if (kilo) { unsigned long long mult; @@ -275,19 +393,19 @@ else *val *= mult; } else - *val *= get_mult_time(str[len - 1]); + *val *= get_mult_time(str, len, is_seconds); return 0; } int check_str_bytes(const char *p, long long *val, void *data) { - return str_to_decimal(p, val, 1, data); + return str_to_decimal(p, val, 1, data, 0, 0); } -int check_str_time(const char *p, long long *val) +int check_str_time(const char *p, long long *val, int is_seconds) { - return str_to_decimal(p, val, 0, NULL); + return 
str_to_decimal(p, val, 0, NULL, is_seconds, 1); } void strip_blank_front(char **p) @@ -325,11 +443,11 @@ *(s + 1) = '\0'; } -static int check_range_bytes(const char *str, long *val, void *data) +static int check_range_bytes(const char *str, long long *val, void *data) { long long __val; - if (!str_to_decimal(str, &__val, 1, data)) { + if (!str_to_decimal(str, &__val, 1, data, 0, 0)) { *val = __val; return 0; } @@ -352,7 +470,7 @@ return 1; } -static int opt_len(const char *str) +static size_t opt_len(const char *str) { char *postfix; @@ -368,22 +486,61 @@ return max(strlen(vp->ival), opt_len(str)); } -#define val_store(ptr, val, off, or, data) \ +#define val_store(ptr, val, off, or, data, o) \ do { \ - ptr = td_var((data), (off)); \ + ptr = td_var((data), (o), (off)); \ if ((or)) \ *ptr |= (val); \ else \ *ptr = (val); \ } while (0) -static int __handle_option(struct fio_option *o, const char *ptr, void *data, - int first, int more, int curr) +static const char *opt_type_name(const struct fio_option *o) +{ + compiletime_assert(ARRAY_SIZE(opt_type_names) - 1 == FIO_OPT_UNSUPPORTED, + "opt_type_names[] index"); + + if (o->type <= FIO_OPT_UNSUPPORTED) + return opt_type_names[o->type]; + + return "OPT_UNKNOWN?"; +} + +static bool val_too_large(const struct fio_option *o, unsigned long long val, + bool is_uint) +{ + if (!o->maxval) + return false; + + if (is_uint) { + if ((int) val < 0) + return (int) val > (int) o->maxval; + return (unsigned int) val > o->maxval; + } + + return val > o->maxval; +} + +static bool val_too_small(const struct fio_option *o, unsigned long long val, + bool is_uint) +{ + if (!o->minval) + return false; + + if (is_uint) + return (int) val < o->minval; + + return val < o->minval; +} + +static int __handle_option(const struct fio_option *o, const char *ptr, + void *data, int first, int more, int curr) { int il=0, *ilp; fio_fp64_t *flp; long long ull, *ullp; - long ul1, ul2; + long ul2; + long long ull1, ull2; double uf; char **cp = NULL; int ret 
= 0, is_time = 0; @@ -391,8 +548,8 @@ struct value_pair posval[PARSE_MAX_VP]; int i, all_skipped = 1; - dprint(FD_PARSE, "__handle_option=%s, type=%d, ptr=%s\n", o->name, - o->type, ptr); + dprint(FD_PARSE, "__handle_option=%s, type=%s, ptr=%s\n", o->name, + opt_type_name(o), ptr); if (!ptr && o->type != FIO_OPT_STR_SET && o->type != FIO_OPT_STR) { log_err("Option %s requires an argument\n", o->name); @@ -401,6 +558,7 @@ switch (o->type) { case FIO_OPT_STR: + case FIO_OPT_STR_ULL: case FIO_OPT_STR_MULTI: { fio_opt_str_fn *fn = o->cb; @@ -412,18 +570,16 @@ if (!vp->ival || vp->ival[0] == '\0') continue; all_skipped = 0; + if (!ptr) + break; if (!strncmp(vp->ival, ptr, str_match_len(vp, ptr))) { ret = 0; - if (o->roff1) { - if (vp->or) - *(unsigned int *) o->roff1 |= vp->oval; - else - *(unsigned int *) o->roff1 = vp->oval; - } else { - if (!o->off1) - continue; - val_store(ilp, vp->oval, o->off1, vp->or, data); - } + if (!o->off1) + continue; + if (o->type == FIO_OPT_STR_ULL) + val_store(ullp, vp->oval, o->off1, vp->orval, data, o); + else + val_store(ilp, vp->oval, o->off1, vp->orval, data, o); continue; } } @@ -436,18 +592,23 @@ } case FIO_OPT_STR_VAL_TIME: is_time = 1; + /* fall through */ + case FIO_OPT_ULL: case FIO_OPT_INT: case FIO_OPT_STR_VAL: { fio_opt_str_val_fn *fn = o->cb; char tmp[128], *p; - strncpy(tmp, ptr, sizeof(tmp) - 1); + if (!is_time && o->is_time) + is_time = o->is_time; + + snprintf(tmp, sizeof(tmp), "%s", ptr); p = strchr(tmp, ','); if (p) *p = '\0'; if (is_time) - ret = check_str_time(tmp, &ull); + ret = check_str_time(tmp, &ull, o->is_seconds); else ret = check_str_bytes(tmp, &ull, data); @@ -455,15 +616,19 @@ if (ret) break; + if (o->pow2 && !is_power_of_2(ull)) { + log_err("%s: must be a power-of-2\n", o->name); + return 1; + } - if (o->maxval && ull > o->maxval) { - log_err("max value out of range: %llu" - " (%u max)\n", ull, o->maxval); + if (val_too_large(o, ull, o->type == FIO_OPT_INT)) { + log_err("%s: max value out of range: %llu" 
+ " (%llu max)\n", o->name, ull, o->maxval); return 1; } - if (o->minval && ull < o->minval) { - log_err("min value out of range: %llu" - " (%u min)\n", ull, o->minval); + if (val_too_small(o, ull, o->type == FIO_OPT_INT)) { + log_err("%s: min value out of range: %lld" + " (%d min)\n", o->name, ull, o->minval); return 1; } if (o->posval[0].ival) { @@ -490,50 +655,53 @@ ret = fn(data, &ull); else { if (o->type == FIO_OPT_INT) { - if (first) { - if (o->roff1) - *(unsigned int *) o->roff1 = ull; - else - val_store(ilp, ull, o->off1, 0, data); + if (first) + val_store(ilp, ull, o->off1, 0, data, o); + if (curr == 1) { + if (o->off2) + val_store(ilp, ull, o->off2, 0, data, o); } + if (curr == 2) { + if (o->off3) + val_store(ilp, ull, o->off3, 0, data, o); + } + if (!more) { + if (curr < 1) { + if (o->off2) + val_store(ilp, ull, o->off2, 0, data, o); + } + if (curr < 2) { + if (o->off3) + val_store(ilp, ull, o->off3, 0, data, o); + } + } + } else if (o->type == FIO_OPT_ULL) { + if (first) + val_store(ullp, ull, o->off1, 0, data, o); if (curr == 1) { - if (o->roff2) - *(unsigned int *) o->roff2 = ull; - else if (o->off2) - val_store(ilp, ull, o->off2, 0, data); + if (o->off2) + val_store(ullp, ull, o->off2, 0, data, o); } if (curr == 2) { - if (o->roff3) - *(unsigned int *) o->roff3 = ull; - else if (o->off3) - val_store(ilp, ull, o->off3, 0, data); + if (o->off3) + val_store(ullp, ull, o->off3, 0, data, o); } if (!more) { if (curr < 1) { - if (o->roff2) - *(unsigned int *) o->roff2 = ull; - else if (o->off2) - val_store(ilp, ull, o->off2, 0, data); + if (o->off2) + val_store(ullp, ull, o->off2, 0, data, o); } if (curr < 2) { - if (o->roff3) - *(unsigned int *) o->roff3 = ull; - else if (o->off3) - val_store(ilp, ull, o->off3, 0, data); + if (o->off3) + val_store(ullp, ull, o->off3, 0, data, o); } } } else { - if (first) { - if (o->roff1) - *(unsigned long long *) o->roff1 = ull; - else - val_store(ullp, ull, o->off1, 0, data); - } + if (first) + val_store(ullp, ull, 
o->off1, 0, data, o); if (!more) { - if (o->roff2) - *(unsigned long long *) o->roff2 = ull; - else if (o->off2) - val_store(ullp, ull, o->off2, 0, data); + if (o->off2) + val_store(ullp, ull, o->off2, 0, data, o); } } } @@ -547,11 +715,13 @@ ** Initialize precision to 0 and zero out list ** in case specified list is shorter than default */ - ul2 = 0; - ilp = td_var(data, o->off2); - *ilp = ul2; + if (o->off2) { + ul2 = 0; + ilp = td_var(data, o, o->off2); + *ilp = ul2; + } - flp = td_var(data, o->off1); + flp = td_var(data, o, o->off1); for(i = 0; i < o->maxlen; i++) flp[i].u.f = 0.0; } @@ -560,22 +730,24 @@ o->maxlen); return 1; } - if (!str_to_float(ptr, &uf)) { + if (!str_to_float(ptr, &uf, 0)) { /* this breaks if we ever have lists of times */ log_err("not a floating point value: %s\n", ptr); return 1; } - if (uf > o->maxfp) { - log_err("value out of range: %f" - " (range max: %f)\n", uf, o->maxfp); - return 1; - } - if (uf < o->minfp) { - log_err("value out of range: %f" - " (range min: %f)\n", uf, o->minfp); - return 1; + if (o->minfp || o->maxfp) { + if (uf > o->maxfp) { + log_err("value out of range: %f" + " (range max: %f)\n", uf, o->maxfp); + return 1; + } + if (uf < o->minfp) { + log_err("value out of range: %f" + " (range min: %f)\n", uf, o->minfp); + return 1; + } } - flp = td_var(data, o->off1); + flp = td_var(data, o, o->off1); flp[curr].u.f = uf; dprint(FD_PARSE, " out=%f\n", uf); @@ -592,9 +764,11 @@ while (*++cp2 != '\0' && *cp2 >= '0' && *cp2 <= '9') len++; - ilp = td_var(data, o->off2); - if (len > *ilp) - *ilp = len; + if (o->off2) { + ilp = td_var(data, o, o->off2); + if (len > *ilp) + *ilp = len; + } } break; @@ -602,12 +776,11 @@ case FIO_OPT_STR_STORE: { fio_opt_str_fn *fn = o->cb; - if (o->roff1 || o->off1) { - if (o->roff1) - cp = (char **) o->roff1; - else if (o->off1) - cp = td_var(data, o->off1); + if (!strlen(ptr)) + return 1; + if (o->off1) { + cp = td_var(data, o, o->off1); *cp = strdup(ptr); } @@ -619,7 +792,7 @@ ret = 1; for (i = 
0; i < PARSE_MAX_VP; i++) { vp = &posval[i]; - if (!vp->ival || vp->ival[0] == '\0') + if (!vp->ival || vp->ival[0] == '\0' || !cp) continue; all_skipped = 0; if (!strncmp(vp->ival, ptr, str_match_len(vp, ptr))) { @@ -655,7 +828,7 @@ char tmp[128]; char *p1, *p2; - strncpy(tmp, ptr, sizeof(tmp) - 1); + snprintf(tmp, sizeof(tmp), "%s", ptr); /* Handle bsrange with separate read,write values: */ p1 = strchr(tmp, ','); @@ -676,61 +849,43 @@ p1 = tmp; ret = 1; - if (!check_range_bytes(p1, &ul1, data) && - !check_range_bytes(p2, &ul2, data)) { + if (!check_range_bytes(p1, &ull1, data) && + !check_range_bytes(p2, &ull2, data)) { ret = 0; - if (ul1 > ul2) { - unsigned long foo = ul1; + if (ull1 > ull2) { + unsigned long long foo = ull1; - ul1 = ul2; - ul2 = foo; + ull1 = ull2; + ull2 = foo; } if (first) { - if (o->roff1) - *(unsigned int *) o->roff1 = ul1; - else - val_store(ilp, ul1, o->off1, 0, data); - if (o->roff2) - *(unsigned int *) o->roff2 = ul2; - else - val_store(ilp, ul2, o->off2, 0, data); + val_store(ullp, ull1, o->off1, 0, data, o); + val_store(ullp, ull2, o->off2, 0, data, o); } if (curr == 1) { - if (o->roff3 && o->roff4) { - *(unsigned int *) o->roff3 = ul1; - *(unsigned int *) o->roff4 = ul2; - } else if (o->off3 && o->off4) { - val_store(ilp, ul1, o->off3, 0, data); - val_store(ilp, ul2, o->off4, 0, data); + if (o->off3 && o->off4) { + val_store(ullp, ull1, o->off3, 0, data, o); + val_store(ullp, ull2, o->off4, 0, data, o); } } if (curr == 2) { - if (o->roff5 && o->roff6) { - *(unsigned int *) o->roff5 = ul1; - *(unsigned int *) o->roff6 = ul2; - } else if (o->off5 && o->off6) { - val_store(ilp, ul1, o->off5, 0, data); - val_store(ilp, ul2, o->off6, 0, data); + if (o->off5 && o->off6) { + val_store(ullp, ull1, o->off5, 0, data, o); + val_store(ullp, ull2, o->off6, 0, data, o); } } if (!more) { if (curr < 1) { - if (o->roff3 && o->roff4) { - *(unsigned int *) o->roff3 = ul1; - *(unsigned int *) o->roff4 = ul2; - } else if (o->off3 && o->off4) { - 
val_store(ilp, ul1, o->off3, 0, data); - val_store(ilp, ul2, o->off4, 0, data); + if (o->off3 && o->off4) { + val_store(ullp, ull1, o->off3, 0, data, o); + val_store(ullp, ull2, o->off4, 0, data, o); } } if (curr < 2) { - if (o->roff5 && o->roff6) { - *(unsigned int *) o->roff5 = ul1; - *(unsigned int *) o->roff6 = ul2; - } else if (o->off5 && o->off6) { - val_store(ilp, ul1, o->off5, 0, data); - val_store(ilp, ul2, o->off6, 0, data); + if (o->off5 && o->off6) { + val_store(ullp, ull1, o->off5, 0, data, o); + val_store(ullp, ull2, o->off6, 0, data, o); } } } @@ -755,7 +910,7 @@ break; if (o->maxval && il > (int) o->maxval) { - log_err("max value out of range: %d (%d max)\n", + log_err("max value out of range: %d (%llu max)\n", il, o->maxval); return 1; } @@ -771,24 +926,20 @@ if (fn) ret = fn(data, &il); else { - if (first) { - if (o->roff1) - *(unsigned int *)o->roff1 = il; - else - val_store(ilp, il, o->off1, 0, data); - } + if (first) + val_store(ilp, il, o->off1, 0, data, o); if (!more) { - if (o->roff2) - *(unsigned int *) o->roff2 = il; - else if (o->off2) - val_store(ilp, il, o->off2, 0, data); + if (o->off2) + val_store(ilp, il, o->off2, 0, data, o); } } break; } case FIO_OPT_DEPRECATED: - log_info("Option %s is deprecated\n", o->name); ret = 1; + /* fall through */ + case FIO_OPT_SOFT_DEPRECATED: + log_info("Option %s is deprecated\n", o->name); break; default: log_err("Bad option type %u\n", o->type); @@ -810,7 +961,8 @@ return ret; } -static int handle_option(struct fio_option *o, const char *__ptr, void *data) +static int handle_option(const struct fio_option *o, const char *__ptr, + void *data) { char *o_ptr, *ptr, *ptr2; int ret, done; @@ -835,6 +987,7 @@ if (ptr && (o->type != FIO_OPT_STR_STORE) && (o->type != FIO_OPT_STR) && + (o->type != FIO_OPT_STR_ULL) && (o->type != FIO_OPT_FLOAT_LIST)) { ptr2 = strchr(ptr, ','); if (ptr2 && *(ptr2 + 1) == '\0') @@ -870,10 +1023,34 @@ return ret; } -static struct fio_option *get_option(char *opt, - struct 
fio_option *options, char **post) +struct fio_option *find_option(struct fio_option *options, const char *opt) { struct fio_option *o; + + for (o = &options[0]; o->name; o++) { + if (!o_match(o, opt)) + continue; + if (o->type == FIO_OPT_UNSUPPORTED) { + log_err("Option <%s>: %s\n", o->name, o->help); + continue; + } + + return o; + } + + return NULL; +} + +const struct fio_option * +find_option_c(const struct fio_option *options, const char *opt) +{ + return find_option((struct fio_option *)options, opt); +} + +static const struct fio_option * +get_option(char *opt, const struct fio_option *options, char **post) +{ + const struct fio_option *o; char *ret; ret = strchr(opt, '='); @@ -883,9 +1060,9 @@ ret = opt; (*post)++; strip_blank_end(ret); - o = find_option(options, ret); + o = find_option_c(options, ret); } else { - o = find_option(options, opt); + o = find_option_c(options, opt); *post = NULL; } @@ -894,7 +1071,7 @@ static int opt_cmp(const void *p1, const void *p2) { - struct fio_option *o; + const struct fio_option *o; char *s, *foo; int prio1, prio2; @@ -918,33 +1095,55 @@ return prio2 - prio1; } -void sort_options(char **opts, struct fio_option *options, int num_opts) +void sort_options(char **opts, const struct fio_option *options, int num_opts) { __fio_options = options; qsort(opts, num_opts, sizeof(char *), opt_cmp); __fio_options = NULL; } +static void add_to_dump_list(const struct fio_option *o, + struct flist_head *dump_list, const char *post) +{ + struct print_option *p; + + if (!dump_list) + return; + + p = malloc(sizeof(*p)); + p->name = strdup(o->name); + if (post) + p->value = strdup(post); + else + p->value = NULL; + + flist_add_tail(&p->list, dump_list); +} + int parse_cmd_option(const char *opt, const char *val, - struct fio_option *options, void *data) + const struct fio_option *options, void *data, + struct flist_head *dump_list) { - struct fio_option *o; + const struct fio_option *o; - o = find_option(options, opt); + o = 
find_option_c(options, opt); if (!o) { log_err("Bad option <%s>\n", opt); return 1; } - if (!handle_option(o, val, data)) - return 0; + if (handle_option(o, val, data)) { + log_err("fio: failed parsing %s=%s\n", opt, val); + return 1; + } - log_err("fio: failed parsing %s=%s\n", opt, val); - return 1; + add_to_dump_list(o, dump_list, val); + return 0; } -int parse_option(char *opt, const char *input, - struct fio_option *options, struct fio_option **o, void *data) +int parse_option(char *opt, const char *input, const struct fio_option *options, + const struct fio_option **o, void *data, + struct flist_head *dump_list) { char *post; @@ -965,18 +1164,20 @@ return 1; } - if (!handle_option(*o, post, data)) - return 0; + if (handle_option(*o, post, data)) { + log_err("fio: failed parsing %s\n", input); + return 1; + } - log_err("fio: failed parsing %s\n", input); - return 1; + add_to_dump_list(*o, dump_list, post); + return 0; } /* * Option match, levenshtein distance. Handy for not quite remembering what * the option name is. */ -static int string_distance(const char *s1, const char *s2) +int string_distance(const char *s1, const char *s2) { unsigned int s1_len = strlen(s1); unsigned int s2_len = strlen(s2); @@ -994,11 +1195,13 @@ q[0] = p[0] + 1; for (j = 1; j <= s2_len; j++) { unsigned int sub = p[j - 1]; + unsigned int pmin; if (s1[i - 1] != s2[j - 1]) sub++; - q[j] = min(p[j] + 1, min(q[j - 1] + 1, sub)); + pmin = min(q[j - 1] + 1, sub); + q[j] = min(p[j] + 1, pmin); } r = p; p = q; @@ -1011,10 +1214,23 @@ return i; } -static struct fio_option *find_child(struct fio_option *options, - struct fio_option *o) +/* + * Make a guess of whether the distance from 's1' is significant enough + * to warrant printing the guess. We set this to a 1/2 match. 
+ */ +int string_distance_ok(const char *opt, int distance) { - struct fio_option *__o; + size_t len; + + len = strlen(opt); + len = (len + 1) / 2; + return distance <= len; +} + +static const struct fio_option *find_child(const struct fio_option *options, + const struct fio_option *o) +{ + const struct fio_option *__o; for (__o = options + 1; __o->name; __o++) if (__o->parent && !strcmp(__o->parent, o->name)) @@ -1023,7 +1239,8 @@ return NULL; } -static void __print_option(struct fio_option *o, struct fio_option *org, +static void __print_option(const struct fio_option *o, + const struct fio_option *org, int level) { char name[256], *p; @@ -1031,8 +1248,6 @@ if (!o) return; - if (!org) - org = o; p = name; depth = level; @@ -1044,10 +1259,10 @@ log_info("%-24s: %s\n", name, o->help); } -static void print_option(struct fio_option *o) +static void print_option(const struct fio_option *o) { - struct fio_option *parent; - struct fio_option *__o; + const struct fio_option *parent; + const struct fio_option *__o; unsigned int printed; unsigned int level; @@ -1068,9 +1283,9 @@ } while (printed); } -int show_cmd_help(struct fio_option *options, const char *name) +int show_cmd_help(const struct fio_option *options, const char *name) { - struct fio_option *o, *closest; + const struct fio_option *o, *closest; unsigned int best_dist = -1U; int found = 0; int show_all = 0; @@ -1083,7 +1298,8 @@ for (o = &options[0]; o->name; o++) { int match = 0; - if (o->type == FIO_OPT_DEPRECATED) + if (o->type == FIO_OPT_DEPRECATED || + o->type == FIO_OPT_SOFT_DEPRECATED) continue; if (!exec_profile && o->prof_name) continue; @@ -1144,9 +1360,9 @@ /* * Handle parsing of default parameters. 
*/ -void fill_default_options(void *data, struct fio_option *options) +void fill_default_options(void *data, const struct fio_option *options) { - struct fio_option *o; + const struct fio_option *o; dprint(FD_PARSE, "filling default options\n"); @@ -1155,10 +1371,13 @@ handle_option(o, o->def, data); } -void option_init(struct fio_option *o) +static void option_init(struct fio_option *o) { - if (o->type == FIO_OPT_DEPRECATED) + if (o->type == FIO_OPT_DEPRECATED || o->type == FIO_OPT_UNSUPPORTED || + o->type == FIO_OPT_SOFT_DEPRECATED) return; + if (o->name && !o->lname) + log_err("Option %s: missing long option name\n", o->name); if (o->type == FIO_OPT_BOOL) { o->minval = 0; o->maxval = 1; @@ -1167,28 +1386,21 @@ if (!o->maxval) o->maxval = UINT_MAX; } - if (o->type == FIO_OPT_FLOAT_LIST) { - o->minfp = DBL_MIN; - o->maxfp = DBL_MAX; + if (o->type == FIO_OPT_ULL) { + if (!o->maxval) + o->maxval = ULLONG_MAX; } - if (o->type == FIO_OPT_STR_SET && o->def) { + if (o->type == FIO_OPT_STR_SET && o->def && !o->no_warn_def) { log_err("Option %s: string set option with" " default will always be true\n", o->name); } - if (!o->cb && (!o->off1 && !o->roff1)) + if (!o->cb && !o->off1) log_err("Option %s: neither cb nor offset given\n", o->name); if (!o->category) { log_info("Option %s: no category defined. 
Setting to misc\n", o->name); o->category = FIO_OPT_C_GENERAL; o->group = FIO_OPT_G_INVALID; } - if (o->type == FIO_OPT_STR || o->type == FIO_OPT_STR_STORE || - o->type == FIO_OPT_STR_MULTI) - return; - if (o->cb && ((o->off1 || o->off2 || o->off3 || o->off4) || - (o->roff1 || o->roff2 || o->roff3 || o->roff4))) { - log_err("Option %s: both cb and offset given\n", o->name); - } } /* @@ -1208,18 +1420,35 @@ } } -void options_free(struct fio_option *options, void *data) +void options_mem_dupe(const struct fio_option *options, void *data) { - struct fio_option *o; + const struct fio_option *o; + char **ptr; + + dprint(FD_PARSE, "dup options\n"); + + for (o = &options[0]; o->name; o++) { + if (o->type != FIO_OPT_STR_STORE) + continue; + + ptr = td_var(data, o, o->off1); + if (*ptr) + *ptr = strdup(*ptr); + } +} + +void options_free(const struct fio_option *options, void *data) +{ + const struct fio_option *o; char **ptr; dprint(FD_PARSE, "free options\n"); for (o = &options[0]; o->name; o++) { - if (o->type != FIO_OPT_STR_STORE || !o->off1) + if (o->type != FIO_OPT_STR_STORE || !o->off1 || o->no_free) continue; - ptr = td_var(data, o->off1); + ptr = td_var(data, o, o->off1); if (*ptr) { free(*ptr); *ptr = NULL; diff -Nru fio-2.1.3/parse.h fio-3.16/parse.h --- fio-2.1.3/parse.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/parse.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,6 +1,7 @@ #ifndef FIO_PARSE_H #define FIO_PARSE_H +#include #include "flist.h" /* @@ -9,16 +10,20 @@ enum fio_opt_type { FIO_OPT_INVALID = 0, FIO_OPT_STR, + FIO_OPT_STR_ULL, FIO_OPT_STR_MULTI, FIO_OPT_STR_VAL, FIO_OPT_STR_VAL_TIME, FIO_OPT_STR_STORE, FIO_OPT_RANGE, FIO_OPT_INT, + FIO_OPT_ULL, FIO_OPT_BOOL, FIO_OPT_FLOAT_LIST, FIO_OPT_STR_SET, FIO_OPT_DEPRECATED, + FIO_OPT_SOFT_DEPRECATED, + FIO_OPT_UNSUPPORTED, /* keep this last */ }; /* @@ -26,14 +31,14 @@ */ struct value_pair { const char *ival; /* string option */ - unsigned int oval; /* output value */ + unsigned long long oval;/* output value 
*/ const char *help; /* help text for sub option */ - int or; /* OR value */ + int orval; /* OR value */ void *cb; /* sub-option callback */ }; -#define OPT_LEN_MAX 4096 -#define PARSE_MAX_VP 24 +#define OPT_LEN_MAX 8192 +#define PARSE_MAX_VP 32 /* * Option define @@ -49,8 +54,7 @@ unsigned int off4; unsigned int off5; unsigned int off6; - void *roff1, *roff2, *roff3, *roff4, *roff5, *roff6; - unsigned int maxval; /* max and min value */ + unsigned long long maxval; /* max and min value */ int minval; double maxfp; /* max and min floating value */ double minfp; @@ -67,30 +71,41 @@ int hide_on_set; /* hide on set, not on unset */ const char *inverse; /* if set, apply opposite action to this option */ struct fio_option *inv_opt; /* cached lookup */ - int (*verify)(struct fio_option *, void *); + int (*verify)(const struct fio_option *, void *); const char *prof_name; /* only valid for specific profile */ - unsigned int category; /* what type of option */ - unsigned int group; /* who to group with */ + void *prof_opts; + uint64_t category; /* what type of option */ + uint64_t group; /* who to group with */ void *gui_data; + int is_seconds; /* time value with seconds base */ + int is_time; /* time based value */ + int no_warn_def; + int pow2; /* must be a power-of-2 */ + int no_free; }; -typedef int (str_cb_fn)(void *, char *); - -extern int parse_option(char *, const char *, struct fio_option *, struct fio_option **, void *); -extern void sort_options(char **, struct fio_option *, int); -extern int parse_cmd_option(const char *t, const char *l, struct fio_option *, void *); -extern int show_cmd_help(struct fio_option *, const char *); -extern void fill_default_options(void *, struct fio_option *); -extern void option_init(struct fio_option *); +extern int parse_option(char *, const char *, const struct fio_option *, + const struct fio_option **, void *, + struct flist_head *); +extern void sort_options(char **, const struct fio_option *, int); +extern int 
parse_cmd_option(const char *t, const char *l, + const struct fio_option *, void *, + struct flist_head *); +extern int show_cmd_help(const struct fio_option *, const char *); +extern void fill_default_options(void *, const struct fio_option *); extern void options_init(struct fio_option *); -extern void options_free(struct fio_option *, void *); +extern void options_mem_dupe(const struct fio_option *, void *); +extern void options_free(const struct fio_option *, void *); extern void strip_blank_front(char **); extern void strip_blank_end(char *); -extern int str_to_decimal(const char *, long long *, int, void *); +extern int str_to_decimal(const char *, long long *, int, void *, int, int); extern int check_str_bytes(const char *p, long long *val, void *data); -extern int check_str_time(const char *p, long long *val); -extern int str_to_float(const char *str, double *val); +extern int check_str_time(const char *p, long long *val, int); +extern int str_to_float(const char *str, double *val, int is_time); + +extern int string_distance(const char *s1, const char *s2); +extern int string_distance_ok(const char *s1, int dist); /* * Handlers for the options @@ -98,13 +113,30 @@ typedef int (fio_opt_str_fn)(void *, const char *); typedef int (fio_opt_str_val_fn)(void *, long long *); typedef int (fio_opt_int_fn)(void *, int *); -typedef int (fio_opt_str_set_fn)(void *); -#define td_var(start, offset) ((void *) start + (offset)) +struct thread_options; +static inline void *td_var(void *to, const struct fio_option *o, + unsigned int offset) +{ + void *ret; + + if (o->prof_opts) + ret = o->prof_opts; + else + ret = to; + + return ret + offset; +} static inline int parse_is_percent(unsigned long long val) { return val <= -1ULL && val >= (-1ULL - 100ULL); } +struct print_option { + struct flist_head list; + char *name; + char *value; +}; + #endif diff -Nru fio-2.1.3/printing.c fio-3.16/printing.c --- fio-2.1.3/printing.c 2013-09-24 14:42:24.000000000 +0000 +++ 
fio-3.16/printing.c 2019-09-20 01:01:52.000000000 +0000 @@ -31,7 +31,7 @@ gpointer data) { cairo_t *cr; - char str[20]; + char str[32]; double x, y; cr = gtk_print_context_get_cairo_context(context); diff -Nru fio-2.1.3/profile.c fio-3.16/profile.c --- fio-2.1.3/profile.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/profile.c 2019-09-20 01:01:52.000000000 +0000 @@ -52,6 +52,7 @@ o = ops->options; while (o->name) { o->prof_name = ops->name; + o->prof_opts = ops->opt_data; if (add_option(o)) return 1; o++; diff -Nru fio-2.1.3/profile.h fio-3.16/profile.h --- fio-2.1.3/profile.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/profile.h 2019-09-20 01:01:52.000000000 +0000 @@ -10,10 +10,6 @@ int (*td_init)(struct thread_data *); void (*td_exit)(struct thread_data *); - int (*fill_io_u_off)(struct thread_data *, struct io_u *, unsigned int *); - int (*fill_io_u_size)(struct thread_data *, struct io_u *, unsigned int); - struct fio_file *(*get_next_file)(struct thread_data *); - int (*io_u_lat)(struct thread_data *, uint64_t); }; @@ -27,6 +23,7 @@ * Profile specific options */ struct fio_option *options; + void *opt_data; /* * Called after parsing options, to prepare 'cmdline' diff -Nru fio-2.1.3/profiles/act.c fio-3.16/profiles/act.c --- fio-2.1.3/profiles/act.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/profiles/act.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,6 +1,7 @@ #include "../fio.h" #include "../profile.h" #include "../parse.h" +#include "../optgroup.h" /* * 1x loads @@ -37,7 +38,7 @@ }; struct act_run_data { - struct fio_mutex *mutex; + struct fio_sem *sem; unsigned int pending; struct act_slice *slices; @@ -46,20 +47,12 @@ static struct act_run_data *act_run_data; struct act_prof_data { - struct timeval sample_tv; + struct timespec sample_tv; struct act_slice *slices; unsigned int cur_slice; unsigned int nr_slices; }; -static char *device_names; -static unsigned int load; -static unsigned int prep; -static unsigned int threads_per_queue; -static 
unsigned int num_read_blocks; -static unsigned int write_size; -static unsigned long long test_duration; - #define ACT_MAX_OPTS 128 static const char *act_opts[ACT_MAX_OPTS] = { "direct=1", @@ -74,21 +67,35 @@ static int act_add_opt(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2))); +struct act_options { + unsigned int pad; + char *device_names; + unsigned int load; + unsigned int prep; + unsigned int threads_per_queue; + unsigned int num_read_blocks; + unsigned int write_size; + unsigned long long test_duration; +}; + +static struct act_options act_options; + static struct fio_option options[] = { { .name = "device-names", .lname = "device-names", .type = FIO_OPT_STR_STORE, - .roff1 = &device_names, + .off1 = offsetof(struct act_options, device_names), .help = "Devices to use", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_ACT, + .no_free = true, }, { .name = "load", .lname = "Load multiplier", .type = FIO_OPT_INT, - .roff1 = &load, + .off1 = offsetof(struct act_options, load), .help = "ACT load multipler (default 1x)", .def = "1", .category = FIO_OPT_C_PROFILE, @@ -98,7 +105,7 @@ .name = "test-duration", .lname = "Test duration", .type = FIO_OPT_STR_VAL_TIME, - .roff1 = &test_duration, + .off1 = offsetof(struct act_options, test_duration), .help = "How long the entire test takes to run", .def = "24h", .category = FIO_OPT_C_PROFILE, @@ -108,7 +115,7 @@ .name = "threads-per-queue", .lname = "Number of read IO threads per device", .type = FIO_OPT_INT, - .roff1 = &threads_per_queue, + .off1 = offsetof(struct act_options, threads_per_queue), .help = "Number of read IO threads per device", .def = "8", .category = FIO_OPT_C_PROFILE, @@ -116,21 +123,21 @@ }, { .name = "read-req-num-512-blocks", - .lname = "Number of 512b blocks to read", + .lname = "Number of 512B blocks to read", .type = FIO_OPT_INT, - .roff1 = &num_read_blocks, - .help = "Number of 512b blocks to read at the time", + .off1 = offsetof(struct act_options, num_read_blocks), + 
.help = "Number of 512B blocks to read at the time", .def = "3", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_ACT, }, { .name = "large-block-op-kbytes", - .lname = "Size of large block ops (writes)", + .lname = "Size of large block ops in KiB (writes)", .type = FIO_OPT_INT, - .roff1 = &write_size, - .help = "Size of large block ops (writes)", - .def = "128k", + .off1 = offsetof(struct act_options, write_size), + .help = "Size of large block ops in KiB (writes)", + .def = "131072", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_ACT, }, @@ -138,7 +145,7 @@ .name = "prep", .lname = "Run ACT prep phase", .type = FIO_OPT_STR_SET, - .roff1 = &prep, + .off1 = offsetof(struct act_options, prep), .help = "Set to run ACT prep phase", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_ACT, @@ -171,6 +178,8 @@ static int act_add_rw(const char *dev, int reads) { + struct act_options *ao = &act_options; + if (act_add_opt("name=act-%s-%s", reads ? "read" : "write", dev)) return 1; if (act_add_opt("filename=%s", dev)) @@ -178,21 +187,21 @@ if (act_add_opt("rw=%s", reads ? 
"randread" : "randwrite")) return 1; if (reads) { - int rload = load * R_LOAD / threads_per_queue; + int rload = ao->load * R_LOAD / ao->threads_per_queue; - if (act_add_opt("numjobs=%u", threads_per_queue)) + if (act_add_opt("numjobs=%u", ao->threads_per_queue)) return 1; if (act_add_opt("rate_iops=%u", rload)) return 1; - if (act_add_opt("bs=%u", num_read_blocks * 512)) + if (act_add_opt("bs=%u", ao->num_read_blocks * 512)) return 1; } else { - const int rsize = write_size / (num_read_blocks * 512); - int wload = (load * W_LOAD + rsize - 1) / rsize; + const int rsize = ao->write_size / (ao->num_read_blocks * 512); + int wload = (ao->load * W_LOAD + rsize - 1) / rsize; if (act_add_opt("rate_iops=%u", wload)) return 1; - if (act_add_opt("bs=%u", write_size)) + if (act_add_opt("bs=%u", ao->write_size)) return 1; } @@ -206,7 +215,7 @@ return 1; if (act_add_opt("filename=%s", dev)) return 1; - if (act_add_opt("bs=1M")) + if (act_add_opt("bs=1048576")) return 1; if (act_add_opt("zero_buffers")) return 1; @@ -220,7 +229,7 @@ return 1; if (act_add_opt("filename=%s", dev)) return 1; - if (act_add_opt("bs=4k")) + if (act_add_opt("bs=4096")) return 1; if (act_add_opt("ioengine=libaio")) return 1; @@ -234,10 +243,10 @@ static int act_add_dev(const char *dev) { - if (prep) + if (act_options.prep) return act_add_dev_prep(dev); - if (act_add_opt("runtime=%llus", test_duration)) + if (act_add_opt("runtime=%llus", act_options.test_duration)) return 1; if (act_add_opt("time_based=1")) return 1; @@ -255,7 +264,7 @@ */ static int act_prep_cmdline(void) { - if (!device_names) { + if (!act_options.device_names) { log_err("act: you need to set IO target(s) with the " "device-names option.\n"); return 1; @@ -266,7 +275,7 @@ do { char *dev; - dev = strsep(&device_names, ","); + dev = strsep(&act_options.device_names, ","); if (!dev) break; @@ -279,14 +288,15 @@ return 0; } -static int act_io_u_lat(struct thread_data *td, uint64_t usec) +static int act_io_u_lat(struct thread_data *td, 
uint64_t nsec) { struct act_prof_data *apd = td->prof_data; struct act_slice *slice; + uint64_t usec = nsec / 1000ULL; int i, ret = 0; double perm; - if (prep) + if (act_options.prep) return 0; /* @@ -327,9 +337,9 @@ static void get_act_ref(void) { - fio_mutex_down(act_run_data->mutex); + fio_sem_down(act_run_data->sem); act_run_data->pending++; - fio_mutex_up(act_run_data->mutex); + fio_sem_up(act_run_data->sem); } static int show_slice(struct act_slice *slice, unsigned int slice_num) @@ -386,10 +396,10 @@ struct act_prof_data *apd = td->prof_data; unsigned int i, slice; - fio_mutex_down(act_run_data->mutex); + fio_sem_down(act_run_data->sem); if (!act_run_data->slices) { - act_run_data->slices = calloc(sizeof(struct act_slice), apd->nr_slices); + act_run_data->slices = calloc(apd->nr_slices, sizeof(struct act_slice)); act_run_data->nr_slices = apd->nr_slices; } @@ -406,7 +416,7 @@ if (!--act_run_data->pending) act_show_all_stats(); - fio_mutex_up(act_run_data->mutex); + fio_sem_up(act_run_data->sem); } static int act_td_init(struct thread_data *td) @@ -416,9 +426,9 @@ get_act_ref(); - apd = calloc(sizeof(*apd), 1); - nr_slices = (test_duration + SAMPLE_SEC - 1) / SAMPLE_SEC; - apd->slices = calloc(sizeof(struct act_slice), nr_slices); + apd = calloc(1, sizeof(*apd)); + nr_slices = (act_options.test_duration + SAMPLE_SEC - 1) / SAMPLE_SEC; + apd->slices = calloc(nr_slices, sizeof(struct act_slice)); apd->nr_slices = nr_slices; fio_gettime(&apd->sample_tv, NULL); td->prof_data = apd; @@ -445,6 +455,7 @@ .name = "act", .desc = "ACT Aerospike like benchmark", .options = options, + .opt_data = &act_options, .prep_cmd = act_prep_cmdline, .cmdline = act_opts, .io_ops = &act_io_ops, @@ -452,8 +463,8 @@ static void fio_init act_register(void) { - act_run_data = calloc(sizeof(*act_run_data), 1); - act_run_data->mutex = fio_mutex_init(FIO_MUTEX_UNLOCKED); + act_run_data = calloc(1, sizeof(*act_run_data)); + act_run_data->sem = fio_sem_init(FIO_SEM_UNLOCKED); if 
(register_profile(&act_profile)) log_err("fio: failed to register profile 'act'\n"); @@ -465,7 +476,7 @@ free((void *) act_opts[++org_idx]); unregister_profile(&act_profile); - fio_mutex_remove(act_run_data->mutex); + fio_sem_remove(act_run_data->sem); free(act_run_data->slices); free(act_run_data); act_run_data = NULL; diff -Nru fio-2.1.3/profiles/tiobench.c fio-3.16/profiles/tiobench.c --- fio-2.1.3/profiles/tiobench.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/profiles/tiobench.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,6 +1,7 @@ #include "../fio.h" #include "../profile.h" #include "../parse.h" +#include "../optgroup.h" static unsigned long long size; static unsigned int loops = 1; @@ -8,7 +9,7 @@ static unsigned int nthreads = 1; static char *dir; -char sz_idx[80], bs_idx[80], loop_idx[80], dir_idx[80], t_idx[80]; +static char sz_idx[80], bs_idx[80], loop_idx[80], dir_idx[80], t_idx[80]; static const char *tb_opts[] = { "buffered=0", sz_idx, bs_idx, loop_idx, dir_idx, t_idx, @@ -21,13 +22,24 @@ "name=randread", "stonewall", "rw=randread", NULL, }; +struct tiobench_options { + unsigned int pad; + unsigned long long size; + unsigned int loops; + unsigned int bs; + unsigned int nthreads; + char *dir; +}; + +static struct tiobench_options tiobench_options; + static struct fio_option options[] = { { .name = "size", .lname = "Tiobench size", .type = FIO_OPT_STR_VAL, - .roff1 = &size, - .help = "Size in MB", + .off1 = offsetof(struct tiobench_options, size), + .help = "Size in MiB", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_TIOBENCH, }, @@ -35,9 +47,9 @@ .name = "block", .lname = "Tiobench block", .type = FIO_OPT_INT, - .roff1 = &bs, + .off1 = offsetof(struct tiobench_options, bs), .help = "Block size in bytes", - .def = "4k", + .def = "4096", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_TIOBENCH, }, @@ -45,7 +57,7 @@ .name = "numruns", .lname = "Tiobench numruns", .type = FIO_OPT_INT, - .roff1 = &loops, + .off1 = offsetof(struct tiobench_options, 
loops), .help = "Number of runs", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_TIOBENCH, @@ -54,16 +66,17 @@ .name = "dir", .lname = "Tiobench directory", .type = FIO_OPT_STR_STORE, - .roff1 = &dir, + .off1 = offsetof(struct tiobench_options, dir), .help = "Test directory", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_TIOBENCH, + .no_free = true, }, { .name = "threads", .lname = "Tiobench threads", .type = FIO_OPT_INT, - .roff1 = &nthreads, + .off1 = offsetof(struct tiobench_options, nthreads), .help = "Number of Threads", .category = FIO_OPT_C_PROFILE, .group = FIO_OPT_G_TIOBENCH, @@ -79,7 +92,7 @@ static int tb_prep_cmdline(void) { /* - * tiobench uses size as MB, so multiply up + * tiobench uses size as MiB, so multiply up */ size *= 1024 * 1024ULL; if (size) @@ -102,9 +115,10 @@ static struct profile_ops tiobench_profile = { .name = "tiobench", .desc = "tiotest/tiobench benchmark", - .options = options, .prep_cmd = tb_prep_cmdline, .cmdline = tb_opts, + .options = options, + .opt_data = &tiobench_options, }; static void fio_init tiobench_register(void) diff -Nru fio-2.1.3/pshared.c fio-3.16/pshared.c --- fio-2.1.3/pshared.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/pshared.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,76 @@ +#include + +#include "log.h" +#include "pshared.h" + +int cond_init_pshared(pthread_cond_t *cond) +{ + pthread_condattr_t cattr; + int ret; + + ret = pthread_condattr_init(&cattr); + if (ret) { + log_err("pthread_condattr_init: %s\n", strerror(ret)); + return ret; + } + +#ifdef CONFIG_PSHARED + ret = pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED); + if (ret) { + log_err("pthread_condattr_setpshared: %s\n", strerror(ret)); + return ret; + } +#endif + ret = pthread_cond_init(cond, &cattr); + if (ret) { + log_err("pthread_cond_init: %s\n", strerror(ret)); + return ret; + } + + return 0; +} + +int mutex_init_pshared(pthread_mutex_t *mutex) +{ + pthread_mutexattr_t mattr; + int ret; + + ret = 
pthread_mutexattr_init(&mattr); + if (ret) { + log_err("pthread_mutexattr_init: %s\n", strerror(ret)); + return ret; + } + + /* + * Not all platforms support process shared mutexes (FreeBSD) + */ +#ifdef CONFIG_PSHARED + ret = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); + if (ret) { + log_err("pthread_mutexattr_setpshared: %s\n", strerror(ret)); + return ret; + } +#endif + ret = pthread_mutex_init(mutex, &mattr); + if (ret) { + log_err("pthread_mutex_init: %s\n", strerror(ret)); + return ret; + } + + return 0; +} + +int mutex_cond_init_pshared(pthread_mutex_t *mutex, pthread_cond_t *cond) +{ + int ret; + + ret = mutex_init_pshared(mutex); + if (ret) + return ret; + + ret = cond_init_pshared(cond); + if (ret) + return ret; + + return 0; +} diff -Nru fio-2.1.3/pshared.h fio-3.16/pshared.h --- fio-2.1.3/pshared.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/pshared.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,10 @@ +#ifndef FIO_PSHARED_H +#define FIO_PSHARED_H + +#include + +extern int mutex_init_pshared(pthread_mutex_t *); +extern int cond_init_pshared(pthread_cond_t *); +extern int mutex_cond_init_pshared(pthread_mutex_t *, pthread_cond_t *); + +#endif diff -Nru fio-2.1.3/rate-submit.c fio-3.16/rate-submit.c --- fio-2.1.3/rate-submit.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/rate-submit.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,295 @@ +/* + * Rated submission helpers + * + * Copyright (C) 2015 Jens Axboe + * + */ +#include "fio.h" +#include "ioengines.h" +#include "lib/getrusage.h" +#include "rate-submit.h" + +static void check_overlap(struct io_u *io_u) +{ + int i; + struct thread_data *td; + bool overlap = false; + + do { + /* + * Allow only one thread to check for overlap at a + * time to prevent two threads from thinking the coast + * is clear and then submitting IOs that overlap with + * each other + * + * If an overlap is found, release the lock and + * re-acquire it before checking again to give other + * threads a 
chance to make progress + * + * If an overlap is not found, release the lock when the + * io_u's IO_U_F_FLIGHT flag is set so that this io_u + * can be checked by other threads as they assess overlap + */ + pthread_mutex_lock(&overlap_check); + for_each_td(td, i) { + if (td->runstate <= TD_SETTING_UP || + td->runstate >= TD_FINISHING || + !td->o.serialize_overlap || + td->o.io_submit_mode != IO_MODE_OFFLOAD) + continue; + + overlap = in_flight_overlap(&td->io_u_all, io_u); + if (overlap) { + pthread_mutex_unlock(&overlap_check); + break; + } + } + } while (overlap); +} + +static int io_workqueue_fn(struct submit_worker *sw, + struct workqueue_work *work) +{ + struct io_u *io_u = container_of(work, struct io_u, work); + const enum fio_ddir ddir = io_u->ddir; + struct thread_data *td = sw->priv; + int ret, error; + + if (td->o.serialize_overlap) + check_overlap(io_u); + + dprint(FD_RATE, "io_u %p queued by %u\n", io_u, gettid()); + + io_u_set(td, io_u, IO_U_F_NO_FILE_PUT); + + td->cur_depth++; + + do { + ret = td_io_queue(td, io_u); + if (ret != FIO_Q_BUSY) + break; + ret = io_u_queued_complete(td, 1); + if (ret > 0) + td->cur_depth -= ret; + else if (ret < 0) + break; + io_u_clear(td, io_u, IO_U_F_FLIGHT); + } while (1); + + dprint(FD_RATE, "io_u %p ret %d by %u\n", io_u, ret, gettid()); + + error = io_queue_event(td, io_u, &ret, ddir, NULL, 0, NULL); + + if (ret == FIO_Q_COMPLETED) + td->cur_depth--; + else if (ret == FIO_Q_QUEUED) { + unsigned int min_evts; + + if (td->o.iodepth == 1) + min_evts = 1; + else + min_evts = 0; + + ret = io_u_queued_complete(td, min_evts); + if (ret > 0) + td->cur_depth -= ret; + } + + if (error || td->error) + pthread_cond_signal(&td->parent->free_cond); + + return 0; +} + +static bool io_workqueue_pre_sleep_flush_fn(struct submit_worker *sw) +{ + struct thread_data *td = sw->priv; + + if (td->error) + return false; + if (td->io_u_queued || td->cur_depth || td->io_u_in_flight) + return true; + + return false; +} + +static void 
io_workqueue_pre_sleep_fn(struct submit_worker *sw) +{ + struct thread_data *td = sw->priv; + int ret; + + ret = io_u_quiesce(td); + if (ret > 0) + td->cur_depth -= ret; +} + +static int io_workqueue_alloc_fn(struct submit_worker *sw) +{ + struct thread_data *td; + + td = calloc(1, sizeof(*td)); + sw->priv = td; + return 0; +} + +static void io_workqueue_free_fn(struct submit_worker *sw) +{ + free(sw->priv); + sw->priv = NULL; +} + +static int io_workqueue_init_worker_fn(struct submit_worker *sw) +{ + struct thread_data *parent = sw->wq->td; + struct thread_data *td = sw->priv; + + memcpy(&td->o, &parent->o, sizeof(td->o)); + memcpy(&td->ts, &parent->ts, sizeof(td->ts)); + td->o.uid = td->o.gid = -1U; + dup_files(td, parent); + td->eo = parent->eo; + fio_options_mem_dupe(td); + + if (ioengine_load(td)) + goto err; + + td->pid = gettid(); + + INIT_FLIST_HEAD(&td->io_log_list); + INIT_FLIST_HEAD(&td->io_hist_list); + INIT_FLIST_HEAD(&td->verify_list); + INIT_FLIST_HEAD(&td->trim_list); + td->io_hist_tree = RB_ROOT; + + td->o.iodepth = 1; + if (td_io_init(td)) + goto err_io_init; + + if (td->io_ops->post_init && td->io_ops->post_init(td)) + goto err_io_init; + + set_epoch_time(td, td->o.log_unix_epoch); + fio_getrusage(&td->ru_start); + clear_io_state(td, 1); + + td_set_runstate(td, TD_RUNNING); + td->flags |= TD_F_CHILD | TD_F_NEED_LOCK; + td->parent = parent; + return 0; + +err_io_init: + close_ioengine(td); +err: + return 1; + +} + +static void io_workqueue_exit_worker_fn(struct submit_worker *sw, + unsigned int *sum_cnt) +{ + struct thread_data *td = sw->priv; + + (*sum_cnt)++; + sum_thread_stats(&sw->wq->td->ts, &td->ts, *sum_cnt == 1); + + fio_options_free(td); + close_and_free_files(td); + if (td->io_ops) + close_ioengine(td); + td_set_runstate(td, TD_EXITED); +} + +#ifdef CONFIG_SFAA +static void sum_val(uint64_t *dst, uint64_t *src) +{ + if (*src) { + __sync_fetch_and_add(dst, *src); + *src = 0; + } +} +#else +static void sum_val(uint64_t *dst, uint64_t *src) 
+{ + if (*src) { + *dst += *src; + *src = 0; + } +} +#endif + +static void pthread_double_unlock(pthread_mutex_t *lock1, + pthread_mutex_t *lock2) +{ +#ifndef CONFIG_SFAA + pthread_mutex_unlock(lock1); + pthread_mutex_unlock(lock2); +#endif +} + +static void pthread_double_lock(pthread_mutex_t *lock1, pthread_mutex_t *lock2) +{ +#ifndef CONFIG_SFAA + if (lock1 < lock2) { + pthread_mutex_lock(lock1); + pthread_mutex_lock(lock2); + } else { + pthread_mutex_lock(lock2); + pthread_mutex_lock(lock1); + } +#endif +} + +static void sum_ddir(struct thread_data *dst, struct thread_data *src, + enum fio_ddir ddir) +{ + pthread_double_lock(&dst->io_wq.stat_lock, &src->io_wq.stat_lock); + + sum_val(&dst->io_bytes[ddir], &src->io_bytes[ddir]); + sum_val(&dst->io_blocks[ddir], &src->io_blocks[ddir]); + sum_val(&dst->this_io_blocks[ddir], &src->this_io_blocks[ddir]); + sum_val(&dst->this_io_bytes[ddir], &src->this_io_bytes[ddir]); + sum_val(&dst->bytes_done[ddir], &src->bytes_done[ddir]); + + pthread_double_unlock(&dst->io_wq.stat_lock, &src->io_wq.stat_lock); +} + +static void io_workqueue_update_acct_fn(struct submit_worker *sw) +{ + struct thread_data *src = sw->priv; + struct thread_data *dst = sw->wq->td; + + if (td_read(src)) + sum_ddir(dst, src, DDIR_READ); + if (td_write(src)) + sum_ddir(dst, src, DDIR_WRITE); + if (td_trim(src)) + sum_ddir(dst, src, DDIR_TRIM); + +} + +static struct workqueue_ops rated_wq_ops = { + .fn = io_workqueue_fn, + .pre_sleep_flush_fn = io_workqueue_pre_sleep_flush_fn, + .pre_sleep_fn = io_workqueue_pre_sleep_fn, + .update_acct_fn = io_workqueue_update_acct_fn, + .alloc_worker_fn = io_workqueue_alloc_fn, + .free_worker_fn = io_workqueue_free_fn, + .init_worker_fn = io_workqueue_init_worker_fn, + .exit_worker_fn = io_workqueue_exit_worker_fn, +}; + +int rate_submit_init(struct thread_data *td, struct sk_out *sk_out) +{ + if (td->o.io_submit_mode != IO_MODE_OFFLOAD) + return 0; + + return workqueue_init(td, &td->io_wq, &rated_wq_ops, td->o.iodepth, 
sk_out); +} + +void rate_submit_exit(struct thread_data *td) +{ + if (td->o.io_submit_mode != IO_MODE_OFFLOAD) + return; + + workqueue_exit(&td->io_wq); +} diff -Nru fio-2.1.3/rate-submit.h fio-3.16/rate-submit.h --- fio-2.1.3/rate-submit.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/rate-submit.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,7 @@ +#ifndef FIO_RATE_SUBMIT +#define FIO_RATE_SUBMIT + +int rate_submit_init(struct thread_data *, struct sk_out *); +void rate_submit_exit(struct thread_data *); + +#endif diff -Nru fio-2.1.3/README fio-3.16/README --- fio-2.1.3/README 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/README 2019-09-20 01:01:52.000000000 +0000 @@ -1,18 +1,31 @@ -fio ---- +Overview and history +-------------------- -fio is a tool that will spawn a number of threads or processes doing a -particular type of io action as specified by the user. fio takes a -number of global parameters, each inherited by the thread unless -otherwise parameters given to them overriding that setting is given. -The typical use of fio is to write a job file matching the io load -one wants to simulate. +Fio was originally written to save me the hassle of writing special test case +programs when I wanted to test a specific workload, either for performance +reasons or to find/reproduce a bug. The process of writing such a test app can +be tiresome, especially if you have to do it often. Hence I needed a tool that +would be able to simulate a given I/O workload without resorting to writing a +tailored test case again and again. + +A test work load is difficult to define, though. There can be any number of +processes or threads involved, and they can each be using their own way of +generating I/O. You could have someone dirtying large amounts of memory in an +memory mapped file, or maybe several threads issuing reads using asynchronous +I/O. fio needed to be flexible enough to simulate both of these cases, and many +more. 
+ +Fio spawns a number of threads or processes doing a particular type of I/O +action as specified by the user. fio takes a number of global parameters, each +inherited by the thread unless otherwise parameters given to them overriding +that setting is given. The typical use of fio is to write a job file matching +the I/O load one wants to simulate. Source ------ -fio resides in a git repo, the canonical place is: +Fio resides in a git repo, the canonical place is: git://git.kernel.dk/fio.git @@ -21,51 +34,37 @@ http://git.kernel.dk/fio.git -Snapshots are frequently generated and include the git meta data as well. +Snapshots are frequently generated and :file:`fio-git-*.tar.gz` include the git +meta data as well. Other tarballs are archives of official fio releases. Snapshots can download from: http://brick.kernel.dk/snaps/ +There are also two official mirrors. Both of these are automatically synced with +the main repository, when changes are pushed. If the main repo is down for some +reason, either one of these is safe to use as a backup: -Binary packages ---------------- + git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git -Debian: -Starting with Debian "Squeeze", fio packages are part of the official -Debian repository. http://packages.debian.org/search?keywords=fio + https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git -Ubuntu: -Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part -of the Ubuntu "universe" repository. -http://packages.ubuntu.com/search?keywords=fio - -Red Hat, CentOS & Co: -Dag Wieërs has RPMs for Red Hat related distros, find them here: -http://dag.wieers.com/rpm/packages/fio/ +or -Mandriva: -Mandriva has integrated fio into their package repository, so installing -on that distro should be as easy as typing 'urpmi fio'. + git://github.com/axboe/fio.git -Solaris: -Packages for Solaris are available from OpenCSW. 
Install their pkgutil -tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via -'pkgutil -i fio'. - -Windows: -Bruce Cran has fio packages for Windows at -http://www.bluestop.org/fio/ . + https://github.com/axboe/fio.git Mailing list ------------ The fio project mailing list is meant for anything related to fio including -general discussion, bug reporting, questions, and development. +general discussion, bug reporting, questions, and development. For bug reporting, +see REPORTING-BUGS. -An automated mail detailing recent commits is automatically sent to the -list at most daily. The list address is fio@vger.kernel.org, subscribe -by sending an email to majordomo@vger.kernel.org with +An automated mail detailing recent commits is automatically sent to the list at +most daily. The list address is fio@vger.kernel.org, subscribe by sending an +email to majordomo@vger.kernel.org with subscribe fio @@ -78,250 +77,165 @@ http://maillist.kernel.dk/fio-devel/ -Building --------- - -Just type 'configure', 'make' and 'make install'. +Author +------ -Note that GNU make is required. On BSD it's available from devel/gmake; -on Solaris it's in the SUNWgmake package. On platforms where GNU make -isn't the default, type 'gmake' instead of 'make'. - -Configure will print the enabled options. Note that on Linux based -platforms, the libaio development packages must be installed to use -the libaio engine. Depending on distro, it is usually called -libaio-devel or libaio-dev. +Fio was written by Jens Axboe to enable flexible testing of +the Linux I/O subsystem and schedulers. He got tired of writing specific test +applications to simulate a given workload, and found that the existing I/O +benchmark/test tools out there weren't flexible enough to do what he wanted. -For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required -to be installed. gfio isn't built automatically and can be enabled -with a --enable-gfio option to configure. 
+Jens Axboe 20060905 -To build FIO with a cross-compiler: - $ make clean - $ make CROSS_COMPILE=/path/to/toolchain/prefix -Configure will attempt to determine the target platform automatically. +Binary packages +--------------- -Windows -------- +Debian: + Starting with Debian "Squeeze", fio packages are part of the official + Debian repository. http://packages.debian.org/search?keywords=fio . -On Windows Cygwin (http://www.cygwin.com/) is required in order to -build fio. To create an MSI installer package install WiX 3.7 from -http://wixtoolset.org and run dobuild.cmd from the -os/windows directory. - -How to compile FIO on 64-bit Windows: - - 1. Install Cygwin (http://www.cygwin.com/setup.exe). Install 'make' and all - packages starting with 'mingw64-i686' and 'mingw64-x86_64'. - 2. Download ftp://sourceware.org/pub/pthreads-win32/prebuilt-dll-2-9-1-release/dll/x64/pthreadGC2.dll - and copy to the fio source directory. - 3. Open the Cygwin Terminal. - 4. Go to the fio directory (source files). - 5. Run 'make clean'. - 6. Run 'make'. - -To build fio on 32-bit Windows, download x86/pthreadGC2.dll instead and do -'./configure --build-32bit-win=yes' before 'make'. - -It's recommended that once built or installed, fio be run in a Command Prompt -or other 'native' console such as console2, since there are known to be display -and signal issues when running it under a Cygwin shell -(see http://code.google.com/p/mintty/issues/detail?id=56 for details). +Ubuntu: + Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part + of the Ubuntu "universe" repository. + http://packages.ubuntu.com/search?keywords=fio . + +Red Hat, Fedora, CentOS & Co: + Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio + packages are part of the Fedora/EPEL repositories. + https://apps.fedoraproject.org/packages/fio . +Mandriva: + Mandriva has integrated fio into their package repository, so installing + on that distro should be as easy as typing ``urpmi fio``. 
-Command line ------------- +Arch Linux: + An Arch Linux package is provided under the Community sub-repository: + https://www.archlinux.org/packages/?sort=&q=fio -$ fio - --debug Enable some debugging options (see below) - --parse-only Parse options only, don't start any IO - --output Write output to file - --runtime Runtime in seconds - --latency-log Generate per-job latency logs - --bandwidth-log Generate per-job bandwidth logs - --minimal Minimal (terse) output - --output-format=type Output format (terse,json,normal) - --terse-version=type Terse version output format (default 3, or 2 or 4). - --version Print version info and exit - --help Print this page - --cpuclock-test Perform test/validation of CPU clock - --cmdhelp=cmd Print command help, "all" for all of them - --enghelp=engine Print ioengine help, or list available ioengines - --enghelp=engine,cmd Print help for an ioengine cmd - --showcmd Turn a job file into command line options - --readonly Turn on safety read-only checks, preventing - writes - --eta=when When ETA estimate should be printed - May be "always", "never" or "auto" - --eta-newline=time Force a new line for every 'time' period passed - --status-interval=t Force full status dump every 't' period passed - --section=name Only run specified section in job file. - Multiple sections can be specified. - --alloc-size=kb Set smalloc pool to this size in kb (def 1024) - --warnings-fatal Fio parser warnings are fatal - --max-jobs Maximum number of threads/processes to support - --server=args Start backend server. See Client/Server section. - --client=host Connect to specified backend. - --idle-prof=option Report cpu idleness on a system or percpu basis - (option=system,percpu) or run unit work - calibration only (option=calibrate). - - -Any parameters following the options will be assumed to be job files, -unless they match a job file parameter. Multiple job files can be listed -and each job file will be regarded as a separate group. 
fio will stonewall -execution between each group. - -The --readonly option is an extra safety guard to prevent users from -accidentally starting a write workload when that is not desired. Fio -will only write if rw=write/randwrite/rw/randrw is given. This extra -safety net can be used as an extra precaution as --readonly will also -enable a write check in the io engine core to prevent writes due to -unknown user space bug(s). - -The --debug option triggers additional logging by fio. -Currently, additional logging is available for: - - process Dump info related to processes - file Dump info related to file actions - io Dump info related to IO queuing - mem Dump info related to memory allocations - blktrace Dump info related to blktrace setup - verify Dump info related to IO verification - all Enable all debug options - random Dump info related to random offset generation - parse Dump info related to option matching and parsing - diskutil Dump info related to disk utilization updates - job:x Dump info only related to job number x - mutex Dump info only related to mutex up/down ops - profile Dump info related to profile extensions - time Dump info related to internal time keeping - ? or help Show available debug options. - -One can specify multiple debug options: e.g. --debug=file,mem will enable -file and memory debugging. - -The --section option allows one to combine related jobs into one file. -E.g. one job file could define light, moderate, and heavy sections. Tell fio to -run only the "heavy" section by giving --section=heavy command line option. -One can also specify the "write" operations in one section and "verify" -operation in another section. The --section option only applies to job -sections. The reserved 'global' section is always parsed and used. - -The --alloc-size switch allows one to use a larger pool size for smalloc. -If running large jobs with randommap enabled, fio can run out of memory. 
-Smalloc is an internal allocator for shared structures from a fixed size -memory pool. The pool size defaults to 1024k and can grow to 128 pools. +Solaris: + Packages for Solaris are available from OpenCSW. Install their pkgutil + tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via + ``pkgutil -i fio``. -NOTE: While running .fio_smalloc.* backing store files are visible in /tmp. +Windows: + Rebecca Cran has fio packages for Windows at + https://www.bluestop.org/fio/ . The latest builds for Windows can also + be grabbed from https://ci.appveyor.com/project/axboe/fio by clicking + the latest x86 or x64 build, then selecting the ARTIFACTS tab. + +BSDs: + Packages for BSDs may be available from their binary package repositories. + Look for a package "fio" using their binary package managers. -Job file +Building -------- -See the HOWTO file for a complete description of job file syntax and -parameters. The --cmdhelp option also lists all options. If used with -an option argument, --cmdhelp will detail the given option. The job file -format is in the ini style format, as that is easy for the user to review -and modify. - -This README contains the terse version. Job files can describe big and -complex setups that are not possible with the command line. Job files -are a good practice even for simple jobs since the file provides an -easily accessed record of the workload and can include comments. - -See the examples/ directory for inspiration on how to write job files. Note -the copyright and license requirements currently apply to examples/ files. - - -Client/server ------------- - -Normally fio is invoked as a stand-alone application on the machine -where the IO workload should be generated. However, the frontend and -backend of fio can be run separately. Ie the fio server can generate -an IO workload on the "Device Under Test" while being controlled from -another machine. 
- -Start the server on the machine which has access to the storage DUT: - -fio --server=args - -where args defines what fio listens to. The arguments are of the form -'type,hostname or IP,port'. 'type' is either 'ip' (or ip4) for TCP/IP v4, -'ip6' for TCP/IP v6, or 'sock' for a local unix domain socket. -'hostname' is either a hostname or IP address, and 'port' is the port to -listen to (only valid for TCP/IP, not a local socket). Some examples: - -1) fio --server - - Start a fio server, listening on all interfaces on the default port (8765). - -2) fio --server=ip:hostname,4444 +Just type:: - Start a fio server, listening on IP belonging to hostname and on port 4444. + $ ./configure + $ make + $ make install + +Note that GNU make is required. On BSDs it's available from devel/gmake within +ports directory; on Solaris it's in the SUNWgmake package. On platforms where +GNU make isn't the default, type ``gmake`` instead of ``make``. + +Configure will print the enabled options. Note that on Linux based platforms, +the libaio development packages must be installed to use the libaio +engine. Depending on distro, it is usually called libaio-devel or libaio-dev. -3) fio --server=ip6:::1,4444 - - Start a fio server, listening on IPv6 localhost ::1 and on port 4444. - -4) fio --server=,4444 - - Start a fio server, listening on all interfaces on port 4444. - -5) fio --server=1.2.3.4 +For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required +to be installed. gfio isn't built automatically and can be enabled with a +``--enable-gfio`` option to configure. - Start a fio server, listening on IP 1.2.3.4 on the default port. +To build fio with a cross-compiler:: -6) fio --server=sock:/tmp/fio.sock + $ make clean + $ make CROSS_COMPILE=/path/to/toolchain/prefix - Start a fio server, listening on the local socket /tmp/fio.sock. +Configure will attempt to determine the target platform automatically. 
-Once a server is running, a "client" can connect to the fio server with: +It's possible to build fio for ESX as well, use the ``--esx`` switch to +configure. -fio --local-args --client= --remote-args -where --local-args are arguments for the client where it is -running, 'server' is the connect string, and --remote-args and -are sent to the server. The 'server' string follows the same format as it -does on the server side, to allow IP/hostname/socket and port strings. +Windows +~~~~~~~ -Fio can connect to multiple servers this way: +On Windows, Cygwin (http://www.cygwin.com/) is required in order to build +fio. To create an MSI installer package install WiX 3.8 from +http://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows` +directory. + +How to compile fio on 64-bit Windows: + + 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all + packages starting with **mingw64-x86_64**. Ensure + **mingw64-x86_64-zlib** are installed if you wish + to enable fio's log compression functionality. + 2. Open the Cygwin Terminal. + 3. Go to the fio directory (source files). + 4. Run ``make clean && make -j``. + +To build fio for 32-bit Windows, ensure the -i686 versions of the previously +mentioned -x86_64 packages are installed and run ``./configure +--build-32bit-win`` before ``make``. To build an fio that supports versions of +Windows below Windows 7/Windows Server 2008 R2 also add ``--target-win-ver=xp`` +to the end of the configure line that you run before doing ``make``. + +It's recommended that once built or installed, fio be run in a Command Prompt or +other 'native' console such as console2, since there are known to be display and +signal issues when running it under a Cygwin shell (see +https://github.com/mintty/mintty/issues/56 and +https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-programs +for details). 
+ + +Documentation +~~~~~~~~~~~~~ + +Fio uses Sphinx_ to generate documentation from the reStructuredText_ files. +To build HTML formatted documentation run ``make -C doc html`` and direct your +browser to :file:`./doc/output/html/index.html`. To build manual page run +``make -C doc man`` and then ``man doc/output/man/fio.1``. To see what other +output formats are supported run ``make -C doc help``. -fio --client= --client= +.. _reStructuredText: http://www.sphinx-doc.org/rest.html +.. _Sphinx: http://www.sphinx-doc.org Platforms --------- -Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, Windows -and FreeBSD. Some features and/or options may only be available on some of -the platforms, typically because those features only apply to that platform -(like the solarisaio engine, or the splice engine on Linux). +Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD, +Windows, FreeBSD, and DragonFly. Some features and/or options may only be +available on some of the platforms, typically because those features only apply +to that platform (like the solarisaio engine, or the splice engine on Linux). Some features are not available on FreeBSD/Solaris even if they could be -implemented, I'd be happy to take patches for that. An example of that is -disk utility statistics and (I think) huge page support, support for that -does exist in FreeBSD/Solaris. - -Fio uses pthread mutexes for signalling and locking and FreeBSD does not -support process shared pthread mutexes. As a result, only threads are -supported on FreeBSD. This could be fixed with sysv ipc locking or -other locking alternatives. - -Other *BSD platforms are untested, but fio should work there almost out -of the box. Since I don't do test runs or even compiles on those platforms, -your mileage may vary. Sending me patches for other platforms is greatly +implemented, I'd be happy to take patches for that. 
An example of that is disk +utility statistics and (I think) huge page support, support for that does exist +in FreeBSD/Solaris. + +Fio uses pthread mutexes for signalling and locking and some platforms do not +support process shared pthread mutexes. As a result, on such platforms only +threads are supported. This could be fixed with sysv ipc locking or other +locking alternatives. + +Other \*BSD platforms are untested, but fio should work there almost out of the +box. Since I don't do test runs or even compiles on those platforms, your +mileage may vary. Sending me patches for other platforms is greatly appreciated. There's a lot of value in having the same test/benchmark tool available on all platforms. -Note that POSIX aio is not enabled by default on AIX. Messages like these: +Note that POSIX aio is not enabled by default on AIX. Messages like these:: Symbol resolution failed for /usr/lib/libc.a(posix_aio.o) because: Symbol _posix_kaio_rdwr (number 2) is not exported from dependent module /unix. -indicate one needs to enable POSIX aio. Run the following commands as root: +indicate one needs to enable POSIX aio. Run the following commands as root:: # lsdev -C -l posix_aio0 posix_aio0 Defined Posix Asynchronous I/O @@ -329,20 +243,41 @@ # lsdev -C -l posix_aio0 posix_aio0 Available Posix Asynchronous I/O -POSIX aio should work now. To make the change permanent: +POSIX aio should work now. To make the change permanent:: # chdev -l posix_aio0 -P -a autoconfig='available' posix_aio0 changed -Author ------- +Running fio +----------- -Fio was written by Jens Axboe to enable flexible testing -of the Linux IO subsystem and schedulers. He got tired of writing -specific test applications to simulate a given workload, and found that -the existing io benchmark/test tools out there weren't flexible enough -to do what he wanted. 
+Running fio is normally the easiest part - you just give it the job file +(or job files) as parameters:: -Jens Axboe 20060905 + $ fio [options] [jobfile] ... + +and it will start doing what the *jobfile* tells it to do. You can give more +than one job file on the command line, fio will serialize the running of those +files. Internally that is the same as using the :option:`stonewall` parameter +described in the parameter section. + +If the job file contains only one job, you may as well just give the parameters +on the command line. The command line parameters are identical to the job +parameters, with a few extra that control global parameters. For example, for +the job file parameter :option:`iodepth=2 `, the mirror command line +option would be :option:`--iodepth 2 ` or :option:`--iodepth=2 +`. You can also use the command line for giving more than one job +entry. For each :option:`--name ` option that fio sees, it will start a +new job with that name. Command line entries following a +:option:`--name ` entry will apply to that job, until there are no more +entries or a new :option:`--name ` entry is seen. This is similar to the +job file options, where each option applies to the current job until a new [] +job entry is seen. + +fio does not need to run as root, except if the files or devices specified in +the job section requires that. Some other options may also be restricted, such +as memory locking, I/O scheduler switching, and decreasing the nice value. +If *jobfile* is specified as ``-``, the job file will be read from standard +input. diff -Nru fio-2.1.3/REPORTING-BUGS fio-3.16/REPORTING-BUGS --- fio-2.1.3/REPORTING-BUGS 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/REPORTING-BUGS 2019-09-20 01:01:52.000000000 +0000 @@ -2,8 +2,10 @@ --------------- If you notice anything that seems like a fio bug, please do send email -to the list (fio@vger.kernel.org, see README) about it. 
You'll need -to report at least: +to the list (fio@vger.kernel.org, see README) about it. If you are not +running the newest release of fio, upgrading first is recommended. + +When reporting a bug, you'll need to include: 1) A description of what you think the bug is 2) Environment (Linux distro version, kernel version). This is mostly @@ -12,4 +14,8 @@ 4) How to reproduce. Please include a full list of the parameters passed to fio and the job file used (if any). +A bug report can't have too much information. Any time information that +is left out and has to be asked for will add to the turn-around time +of getting to the bottom of the issue, and an eventual fix. + That's it! diff -Nru fio-2.1.3/rwlock.c fio-3.16/rwlock.c --- fio-2.1.3/rwlock.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/rwlock.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,83 @@ +#include +#include +#include +#include + +#include "log.h" +#include "rwlock.h" +#include "os/os.h" + +void fio_rwlock_write(struct fio_rwlock *lock) +{ + assert(lock->magic == FIO_RWLOCK_MAGIC); + pthread_rwlock_wrlock(&lock->lock); +} + +void fio_rwlock_read(struct fio_rwlock *lock) +{ + assert(lock->magic == FIO_RWLOCK_MAGIC); + pthread_rwlock_rdlock(&lock->lock); +} + +void fio_rwlock_unlock(struct fio_rwlock *lock) +{ + assert(lock->magic == FIO_RWLOCK_MAGIC); + pthread_rwlock_unlock(&lock->lock); +} + +void fio_rwlock_remove(struct fio_rwlock *lock) +{ + assert(lock->magic == FIO_RWLOCK_MAGIC); + pthread_rwlock_destroy(&lock->lock); + munmap((void *) lock, sizeof(*lock)); +} + +struct fio_rwlock *fio_rwlock_init(void) +{ + struct fio_rwlock *lock; + pthread_rwlockattr_t attr; + int ret; + + lock = (void *) mmap(NULL, sizeof(struct fio_rwlock), + PROT_READ | PROT_WRITE, + OS_MAP_ANON | MAP_SHARED, -1, 0); + if (lock == MAP_FAILED) { + perror("mmap rwlock"); + lock = NULL; + goto err; + } + + lock->magic = FIO_RWLOCK_MAGIC; + + ret = pthread_rwlockattr_init(&attr); + if (ret) { + 
log_err("pthread_rwlockattr_init: %s\n", strerror(ret)); + goto err; + } +#ifdef CONFIG_PSHARED + ret = pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); + if (ret) { + log_err("pthread_rwlockattr_setpshared: %s\n", strerror(ret)); + goto destroy_attr; + } + + ret = pthread_rwlock_init(&lock->lock, &attr); +#else + ret = pthread_rwlock_init(&lock->lock, NULL); +#endif + + if (ret) { + log_err("pthread_rwlock_init: %s\n", strerror(ret)); + goto destroy_attr; + } + + pthread_rwlockattr_destroy(&attr); + + return lock; +destroy_attr: + pthread_rwlockattr_destroy(&attr); +err: + if (lock) + fio_rwlock_remove(lock); + return NULL; +} diff -Nru fio-2.1.3/rwlock.h fio-3.16/rwlock.h --- fio-2.1.3/rwlock.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/rwlock.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,19 @@ +#ifndef FIO_RWLOCK_H +#define FIO_RWLOCK_H + +#include + +#define FIO_RWLOCK_MAGIC 0x52574c4fU + +struct fio_rwlock { + pthread_rwlock_t lock; + int magic; +}; + +extern void fio_rwlock_read(struct fio_rwlock *); +extern void fio_rwlock_write(struct fio_rwlock *); +extern void fio_rwlock_unlock(struct fio_rwlock *); +extern struct fio_rwlock *fio_rwlock_init(void); +extern void fio_rwlock_remove(struct fio_rwlock *); + +#endif diff -Nru fio-2.1.3/server.c fio-3.16/server.c --- fio-2.1.3/server.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/server.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,11 +1,8 @@ #include #include -#include #include -#include #include -#include -#include +#include #include #include #include @@ -22,15 +19,35 @@ #endif #include "fio.h" +#include "options.h" #include "server.h" #include "crc/crc16.h" #include "lib/ieee754.h" +#include "verify-state.h" +#include "smalloc.h" int fio_net_port = FIO_NET_PORT; -int exit_backend = 0; +bool exit_backend = false; + +enum { + SK_F_FREE = 1, + SK_F_COPY = 2, + SK_F_SIMPLE = 4, + SK_F_VEC = 8, + SK_F_INLINE = 16, +}; + +struct sk_entry { + struct flist_head list; /* link on sk_out->list */ 
+ int flags; /* SK_F_* */ + int opcode; /* Actual command fields */ + void *buf; + off_t size; + uint64_t tag; + struct flist_head next; /* Other sk_entry's, if linked command */ +}; -static int server_fd = -1; static char *fio_server_arg; static char *bind_sock; static struct sockaddr_in saddr_in; @@ -42,6 +59,9 @@ static unsigned int has_zlib = 0; #endif static unsigned int use_zlib; +static char me[128]; + +static pthread_key_t sk_out_key; struct fio_fork_item { struct flist_head list; @@ -51,11 +71,12 @@ pid_t pid; }; -/* Created on fork on new connection */ -static FLIST_HEAD(conn_list); - -/* Created on job fork from connection */ -static FLIST_HEAD(job_list); +struct cmd_reply { + struct fio_sem lock; + void *data; + size_t size; + int error; +}; static const char *fio_server_ops[FIO_NET_CMD_NR] = { "", @@ -74,10 +95,93 @@ "DISK_UTIL", "SERVER_START", "ADD_JOB", - "CMD_RUN" - "CMD_IOLOG", + "RUN", + "IOLOG", + "UPDATE_JOB", + "LOAD_FILE", + "VTRIGGER", + "SENDFILE", + "JOB_OPT", }; +static void sk_lock(struct sk_out *sk_out) +{ + fio_sem_down(&sk_out->lock); +} + +static void sk_unlock(struct sk_out *sk_out) +{ + fio_sem_up(&sk_out->lock); +} + +void sk_out_assign(struct sk_out *sk_out) +{ + if (!sk_out) + return; + + sk_lock(sk_out); + sk_out->refs++; + sk_unlock(sk_out); + pthread_setspecific(sk_out_key, sk_out); +} + +static void sk_out_free(struct sk_out *sk_out) +{ + __fio_sem_remove(&sk_out->lock); + __fio_sem_remove(&sk_out->wait); + __fio_sem_remove(&sk_out->xmit); + sfree(sk_out); +} + +static int __sk_out_drop(struct sk_out *sk_out) +{ + if (sk_out) { + int refs; + + sk_lock(sk_out); + assert(sk_out->refs != 0); + refs = --sk_out->refs; + sk_unlock(sk_out); + + if (!refs) { + sk_out_free(sk_out); + pthread_setspecific(sk_out_key, NULL); + return 0; + } + } + + return 1; +} + +void sk_out_drop(void) +{ + struct sk_out *sk_out; + + sk_out = pthread_getspecific(sk_out_key); + __sk_out_drop(sk_out); +} + +static void __fio_init_net_cmd(struct 
fio_net_cmd *cmd, uint16_t opcode, + uint32_t pdu_len, uint64_t tag) +{ + memset(cmd, 0, sizeof(*cmd)); + + cmd->version = __cpu_to_le16(FIO_SERVER_VER); + cmd->opcode = cpu_to_le16(opcode); + cmd->tag = cpu_to_le64(tag); + cmd->pdu_len = cpu_to_le32(pdu_len); +} + + +static void fio_init_net_cmd(struct fio_net_cmd *cmd, uint16_t opcode, + const void *pdu, uint32_t pdu_len, uint64_t tag) +{ + __fio_init_net_cmd(cmd, opcode, pdu_len, tag); + + if (pdu) + memcpy(&cmd->payload, pdu, pdu_len); +} + const char *fio_server_op(unsigned int op) { static char buf[32]; @@ -134,13 +238,10 @@ if (!total_len) return 0; - if (errno) - return -errno; - return 1; } -int fio_send_data(int sk, const void *p, unsigned int len) +static int fio_send_data(int sk, const void *p, unsigned int len) { struct iovec iov = { .iov_base = (void *) p, .iov_len = len }; @@ -149,10 +250,18 @@ return fio_sendv_data(sk, &iov, 1); } -int fio_recv_data(int sk, void *p, unsigned int len) +static int fio_recv_data(int sk, void *buf, unsigned int len, bool wait) { + int flags; + char *p = buf; + + if (wait) + flags = MSG_WAITALL; + else + flags = OS_MSG_DONTWAIT; + do { - int ret = recv(sk, p, len, MSG_WAITALL); + int ret = recv(sk, p, len, flags); if (ret > 0) { len -= ret; @@ -162,9 +271,11 @@ continue; } else if (!ret) break; - else if (errno == EAGAIN || errno == EINTR) - continue; - else + else if (errno == EAGAIN || errno == EINTR) { + if (wait) + continue; + break; + } else break; } while (!exit_backend); @@ -185,6 +296,8 @@ if (crc != cmd->cmd_crc16) { log_err("fio: server bad crc on command (got %x, wanted %x)\n", cmd->cmd_crc16, crc); + fprintf(f_err, "fio: server bad crc on command (got %x, wanted %x)\n", + cmd->cmd_crc16, crc); return 1; } @@ -199,6 +312,8 @@ break; default: log_err("fio: bad server cmd version %d\n", cmd->version); + fprintf(f_err, "fio: client/server version mismatch (%d != %d)\n", + cmd->version, FIO_SERVER_VER); return 1; } @@ -213,16 +328,16 @@ /* * Read (and defragment, 
if necessary) incoming commands */ -struct fio_net_cmd *fio_net_recv_cmd(int sk) +struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait) { - struct fio_net_cmd cmd, *cmdret = NULL; + struct fio_net_cmd cmd, *tmp, *cmdret = NULL; size_t cmd_size = 0, pdu_offset = 0; uint16_t crc; int ret, first = 1; void *pdu = NULL; do { - ret = fio_recv_data(sk, &cmd, sizeof(cmd)); + ret = fio_recv_data(sk, &cmd, sizeof(cmd), wait); if (ret) break; @@ -238,7 +353,19 @@ } else cmd_size += cmd.pdu_len; - cmdret = realloc(cmdret, cmd_size); + if (cmd_size / 1024 > FIO_SERVER_MAX_CMD_MB * 1024) { + log_err("fio: cmd+pdu too large (%llu)\n", (unsigned long long) cmd_size); + ret = 1; + break; + } + + tmp = realloc(cmdret, cmd_size); + if (!tmp) { + log_err("fio: server failed allocating cmd\n"); + ret = 1; + break; + } + cmdret = tmp; if (first) memcpy(cmdret, &cmd, sizeof(cmd)); @@ -253,8 +380,8 @@ break; /* There's payload, get it */ - pdu = (void *) cmdret->payload + pdu_offset; - ret = fio_recv_data(sk, pdu, cmd.pdu_len); + pdu = (char *) cmdret->payload + pdu_offset; + ret = fio_recv_data(sk, pdu, cmd.pdu_len, wait); if (ret) break; @@ -280,14 +407,14 @@ /* zero-terminate text input */ if (cmdret->pdu_len) { if (cmdret->opcode == FIO_NET_CMD_TEXT) { - struct cmd_text_pdu *pdu = (struct cmd_text_pdu *) cmdret->payload; - char *buf = (char *) pdu->buf; + struct cmd_text_pdu *__pdu = (struct cmd_text_pdu *) cmdret->payload; + char *buf = (char *) __pdu->buf; - buf[pdu->buf_len] = '\0'; + buf[__pdu->buf_len] = '\0'; } else if (cmdret->opcode == FIO_NET_CMD_JOB) { - struct cmd_job_pdu *pdu = (struct cmd_job_pdu *) cmdret->payload; - char *buf = (char *) pdu->buf; - int len = le32_to_cpu(pdu->buf_len); + struct cmd_job_pdu *__pdu = (struct cmd_job_pdu *) cmdret->payload; + char *buf = (char *) __pdu->buf; + int len = le32_to_cpu(__pdu->buf_len); buf[len] = '\0'; } @@ -314,7 +441,7 @@ reply = calloc(1, sizeof(*reply)); INIT_FLIST_HEAD(&reply->list); - gettimeofday(&reply->tv, NULL); + 
fio_gettime(&reply->ts, NULL); reply->saved_tag = tag; reply->opcode = opcode; @@ -329,7 +456,7 @@ free(reply); } -void fio_net_cmd_crc_pdu(struct fio_net_cmd *cmd, const void *pdu) +static void fio_net_cmd_crc_pdu(struct fio_net_cmd *cmd, const void *pdu) { uint32_t pdu_len; @@ -339,7 +466,7 @@ cmd->pdu_crc16 = __cpu_to_le16(fio_crc16(pdu, pdu_len)); } -void fio_net_cmd_crc(struct fio_net_cmd *cmd) +static void fio_net_cmd_crc(struct fio_net_cmd *cmd) { fio_net_cmd_crc_pdu(cmd, cmd->payload); } @@ -396,6 +523,64 @@ return ret; } +static struct sk_entry *fio_net_prep_cmd(uint16_t opcode, void *buf, + size_t size, uint64_t *tagptr, + int flags) +{ + struct sk_entry *entry; + + entry = smalloc(sizeof(*entry)); + if (!entry) + return NULL; + + INIT_FLIST_HEAD(&entry->next); + entry->opcode = opcode; + if (flags & SK_F_COPY) { + entry->buf = smalloc(size); + memcpy(entry->buf, buf, size); + } else + entry->buf = buf; + + entry->size = size; + if (tagptr) + entry->tag = *tagptr; + else + entry->tag = 0; + entry->flags = flags; + return entry; +} + +static int handle_sk_entry(struct sk_out *sk_out, struct sk_entry *entry); + +static void fio_net_queue_entry(struct sk_entry *entry) +{ + struct sk_out *sk_out = pthread_getspecific(sk_out_key); + + if (entry->flags & SK_F_INLINE) + handle_sk_entry(sk_out, entry); + else { + sk_lock(sk_out); + flist_add_tail(&entry->list, &sk_out->list); + sk_unlock(sk_out); + + fio_sem_up(&sk_out->wait); + } +} + +static int fio_net_queue_cmd(uint16_t opcode, void *buf, off_t size, + uint64_t *tagptr, int flags) +{ + struct sk_entry *entry; + + entry = fio_net_prep_cmd(opcode, buf, size, tagptr, flags); + if (entry) { + fio_net_queue_entry(entry); + return 0; + } + + return 1; +} + static int fio_net_send_simple_stack_cmd(int sk, uint16_t opcode, uint64_t tag) { struct fio_net_cmd cmd; @@ -432,6 +617,13 @@ return 0; } +static int fio_net_queue_quit(void) +{ + dprint(FD_NET, "server: sending quit\n"); + + return 
fio_net_queue_cmd(FIO_NET_CMD_QUIT, NULL, 0, NULL, SK_F_SIMPLE); +} + int fio_net_send_quit(int sk) { dprint(FD_NET, "server: sending quit\n"); @@ -439,8 +631,7 @@ return fio_net_send_simple_cmd(sk, FIO_NET_CMD_QUIT, 0, NULL); } -static int fio_net_send_ack(int sk, struct fio_net_cmd *cmd, int error, - int signal) +static int fio_net_send_ack(struct fio_net_cmd *cmd, int error, int signal) { struct cmd_end_pdu epdu; uint64_t tag = 0; @@ -450,13 +641,13 @@ epdu.error = __cpu_to_le32(error); epdu.signal = __cpu_to_le32(signal); - return fio_net_send_cmd(sk, FIO_NET_CMD_STOP, &epdu, sizeof(epdu), &tag, NULL); + return fio_net_queue_cmd(FIO_NET_CMD_STOP, &epdu, sizeof(epdu), &tag, SK_F_COPY); } -int fio_net_send_stop(int sk, int error, int signal) +static int fio_net_queue_stop(int error, int signal) { dprint(FD_NET, "server: sending stop (%d, %d)\n", error, signal); - return fio_net_send_ack(sk, NULL, error, signal); + return fio_net_send_ack(NULL, error, signal); } static void fio_server_add_fork_item(pid_t pid, struct flist_head *list) @@ -471,16 +662,16 @@ flist_add_tail(&ffi->list, list); } -static void fio_server_add_conn_pid(pid_t pid) +static void fio_server_add_conn_pid(struct flist_head *conn_list, pid_t pid) { - dprint(FD_NET, "server: forked off connection job (pid=%u)\n", pid); - fio_server_add_fork_item(pid, &conn_list); + dprint(FD_NET, "server: forked off connection job (pid=%u)\n", (int) pid); + fio_server_add_fork_item(pid, conn_list); } -static void fio_server_add_job_pid(pid_t pid) +static void fio_server_add_job_pid(struct flist_head *job_list, pid_t pid) { - dprint(FD_NET, "server: forked off job job (pid=%u)\n", pid); - fio_server_add_fork_item(pid, &job_list); + dprint(FD_NET, "server: forked off job job (pid=%u)\n", (int) pid); + fio_server_add_fork_item(pid, job_list); } static void fio_server_check_fork_item(struct fio_fork_item *ffi) @@ -490,7 +681,7 @@ ret = waitpid(ffi->pid, &status, WNOHANG); if (ret < 0) { if (errno == ECHILD) { - 
log_err("fio: connection pid %u disappeared\n", ffi->pid); + log_err("fio: connection pid %u disappeared\n", (int) ffi->pid); ffi->exited = 1; } else log_err("fio: waitpid: %s\n", strerror(errno)); @@ -507,20 +698,23 @@ } } -static void fio_server_fork_item_done(struct fio_fork_item *ffi) +static void fio_server_fork_item_done(struct fio_fork_item *ffi, bool stop) { - dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", ffi->pid, ffi->signal, ffi->exitval); + dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int) ffi->pid, ffi->signal, ffi->exitval); /* * Fold STOP and QUIT... */ - fio_net_send_stop(server_fd, ffi->exitval, ffi->signal); - fio_net_send_quit(server_fd); + if (stop) { + fio_net_queue_stop(ffi->exitval, ffi->signal); + fio_net_queue_quit(); + } + flist_del(&ffi->list); free(ffi); } -static void fio_server_check_fork_items(struct flist_head *list) +static void fio_server_check_fork_items(struct flist_head *list, bool stop) { struct flist_head *entry, *tmp; struct fio_fork_item *ffi; @@ -531,35 +725,62 @@ fio_server_check_fork_item(ffi); if (ffi->exited) - fio_server_fork_item_done(ffi); + fio_server_fork_item_done(ffi, stop); } } -static void fio_server_check_jobs(void) +static void fio_server_check_jobs(struct flist_head *job_list) +{ + fio_server_check_fork_items(job_list, true); +} + +static void fio_server_check_conns(struct flist_head *conn_list) { - fio_server_check_fork_items(&job_list); + fio_server_check_fork_items(conn_list, false); } -static void fio_server_check_conns(void) +static int handle_load_file_cmd(struct fio_net_cmd *cmd) { - fio_server_check_fork_items(&conn_list); + struct cmd_load_file_pdu *pdu = (struct cmd_load_file_pdu *) cmd->payload; + void *file_name = pdu->file; + struct cmd_start_pdu spdu; + + dprint(FD_NET, "server: loading local file %s\n", (char *) file_name); + + pdu->name_len = le16_to_cpu(pdu->name_len); + pdu->client_type = le16_to_cpu(pdu->client_type); + + if (parse_jobs_ini(file_name, 0, 0, 
pdu->client_type)) { + fio_net_queue_quit(); + return -1; + } + + spdu.jobs = cpu_to_le32(thread_number); + spdu.stat_outputs = cpu_to_le32(stat_number); + fio_net_queue_cmd(FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, SK_F_COPY); + return 0; } -static int handle_run_cmd(struct fio_net_cmd *cmd) +static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list, + struct fio_net_cmd *cmd) { pid_t pid; int ret; + sk_out_assign(sk_out); + + fio_time_init(); set_genesis_time(); pid = fork(); if (pid) { - fio_server_add_job_pid(pid); + fio_server_add_job_pid(job_list, pid); return 0; } - ret = fio_backend(); + ret = fio_backend(sk_out); free_threads_shm(); + sk_out_drop(); _exit(ret); } @@ -573,13 +794,14 @@ pdu->client_type = le32_to_cpu(pdu->client_type); if (parse_jobs_ini(buf, 1, 0, pdu->client_type)) { - fio_net_send_quit(server_fd); + fio_net_queue_quit(); return -1; } spdu.jobs = cpu_to_le32(thread_number); spdu.stat_outputs = cpu_to_le32(stat_number); - fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL); + + fio_net_queue_cmd(FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, SK_F_COPY); return 0; } @@ -610,7 +832,7 @@ } if (parse_cmd_line(clp->lines, argv, clp->client_type)) { - fio_net_send_quit(server_fd); + fio_net_queue_quit(); free(argv); return -1; } @@ -619,29 +841,32 @@ spdu.jobs = cpu_to_le32(thread_number); spdu.stat_outputs = cpu_to_le32(stat_number); - fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL); + + fio_net_queue_cmd(FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, SK_F_COPY); return 0; } static int handle_probe_cmd(struct fio_net_cmd *cmd) { struct cmd_client_probe_pdu *pdu = (struct cmd_client_probe_pdu *) cmd->payload; - struct cmd_probe_reply_pdu probe; uint64_t tag = cmd->tag; + struct cmd_probe_reply_pdu probe = { +#ifdef CONFIG_BIG_ENDIAN + .bigendian = 1, +#endif + .os = FIO_OS, + .arch = FIO_ARCH, + .bpp = sizeof(void *), + .cpus = __cpu_to_le32(cpus_online()), + }; 
dprint(FD_NET, "server: sending probe reply\n"); - memset(&probe, 0, sizeof(probe)); - gethostname((char *) probe.hostname, sizeof(probe.hostname)); -#ifdef CONFIG_BIG_ENDIAN - probe.bigendian = 1; -#endif - strncpy((char *) probe.fio_version, fio_version_string, sizeof(probe.fio_version)); + strcpy(me, (char *) pdu->server); - probe.os = FIO_OS; - probe.arch = FIO_ARCH; - probe.bpp = sizeof(void *); - probe.cpus = __cpu_to_le32(cpus_online()); + gethostname((char *) probe.hostname, sizeof(probe.hostname)); + snprintf((char *) probe.fio_version, sizeof(probe.fio_version), "%s", + fio_version_string); /* * If the client supports compression and we do too, then enable it @@ -654,63 +879,60 @@ use_zlib = 0; } - return fio_net_send_cmd(server_fd, FIO_NET_CMD_PROBE, &probe, sizeof(probe), &tag, NULL); + return fio_net_queue_cmd(FIO_NET_CMD_PROBE, &probe, sizeof(probe), &tag, SK_F_COPY); } static int handle_send_eta_cmd(struct fio_net_cmd *cmd) { struct jobs_eta *je; - size_t size; uint64_t tag = cmd->tag; + size_t size; int i; - if (!thread_number) - return 0; - - size = sizeof(*je) + thread_number * sizeof(char) + 1; - je = malloc(size); - memset(je, 0, size); - - if (!calc_thread_status(je, 1)) { - free(je); - return 0; - } - dprint(FD_NET, "server sending status\n"); - je->nr_running = cpu_to_le32(je->nr_running); - je->nr_ramp = cpu_to_le32(je->nr_ramp); - je->nr_pending = cpu_to_le32(je->nr_pending); - je->nr_setting_up = cpu_to_le32(je->nr_setting_up); - je->files_open = cpu_to_le32(je->files_open); + /* + * Fake ETA return if we don't have a local one, otherwise the client + * will end up timing out waiting for a response to the ETA request + */ + je = get_jobs_eta(true, &size); + if (!je) { + size = sizeof(*je); + je = calloc(1, size); + } else { + je->nr_running = cpu_to_le32(je->nr_running); + je->nr_ramp = cpu_to_le32(je->nr_ramp); + je->nr_pending = cpu_to_le32(je->nr_pending); + je->nr_setting_up = cpu_to_le32(je->nr_setting_up); + je->files_open = 
cpu_to_le32(je->files_open); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + je->m_rate[i] = cpu_to_le64(je->m_rate[i]); + je->t_rate[i] = cpu_to_le64(je->t_rate[i]); + je->m_iops[i] = cpu_to_le32(je->m_iops[i]); + je->t_iops[i] = cpu_to_le32(je->t_iops[i]); + je->rate[i] = cpu_to_le64(je->rate[i]); + je->iops[i] = cpu_to_le32(je->iops[i]); + } - for (i = 0; i < DDIR_RWDIR_CNT; i++) { - je->m_rate[i] = cpu_to_le32(je->m_rate[i]); - je->t_rate[i] = cpu_to_le32(je->t_rate[i]); - je->m_iops[i] = cpu_to_le32(je->m_iops[i]); - je->t_iops[i] = cpu_to_le32(je->t_iops[i]); - je->rate[i] = cpu_to_le32(je->rate[i]); - je->iops[i] = cpu_to_le32(je->iops[i]); - } - - je->elapsed_sec = cpu_to_le64(je->elapsed_sec); - je->eta_sec = cpu_to_le64(je->eta_sec); - je->nr_threads = cpu_to_le32(je->nr_threads); - je->is_pow2 = cpu_to_le32(je->is_pow2); - je->unit_base = cpu_to_le32(je->unit_base); + je->elapsed_sec = cpu_to_le64(je->elapsed_sec); + je->eta_sec = cpu_to_le64(je->eta_sec); + je->nr_threads = cpu_to_le32(je->nr_threads); + je->is_pow2 = cpu_to_le32(je->is_pow2); + je->unit_base = cpu_to_le32(je->unit_base); + } - fio_net_send_cmd(server_fd, FIO_NET_CMD_ETA, je, size, &tag, NULL); - free(je); + fio_net_queue_cmd(FIO_NET_CMD_ETA, je, size, &tag, SK_F_FREE); return 0; } -static int send_update_job_reply(int fd, uint64_t __tag, int error) +static int send_update_job_reply(uint64_t __tag, int error) { uint64_t tag = __tag; uint32_t pdu_error; pdu_error = __cpu_to_le32(error); - return fio_net_send_cmd(fd, FIO_NET_CMD_UPDATE_JOB, &pdu_error, sizeof(pdu_error), &tag, NULL); + return fio_net_queue_cmd(FIO_NET_CMD_UPDATE_JOB, &pdu_error, sizeof(pdu_error), &tag, SK_F_COPY); } static int handle_update_job_cmd(struct fio_net_cmd *cmd) @@ -724,17 +946,43 @@ dprint(FD_NET, "server: updating options for job %u\n", tnumber); if (!tnumber || tnumber > thread_number) { - send_update_job_reply(server_fd, cmd->tag, ENODEV); + send_update_job_reply(cmd->tag, ENODEV); return 0; } td = 
&threads[tnumber - 1]; convert_thread_options_to_cpu(&td->o, &pdu->top); - send_update_job_reply(server_fd, cmd->tag, 0); + send_update_job_reply(cmd->tag, 0); return 0; } -static int handle_command(struct fio_net_cmd *cmd) +static int handle_trigger_cmd(struct fio_net_cmd *cmd, struct flist_head *job_list) +{ + struct cmd_vtrigger_pdu *pdu = (struct cmd_vtrigger_pdu *) cmd->payload; + char *buf = (char *) pdu->cmd; + struct all_io_list *rep; + size_t sz; + + pdu->len = le16_to_cpu(pdu->len); + buf[pdu->len] = '\0'; + + rep = get_all_io_list(IO_LIST_ALL, &sz); + if (!rep) { + struct all_io_list state; + + state.threads = cpu_to_le64((uint64_t) 0); + fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, &state, sizeof(state), NULL, SK_F_COPY | SK_F_INLINE); + } else + fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, rep, sz, NULL, SK_F_FREE | SK_F_INLINE); + + fio_terminate_threads(TERMINATE_ALL); + fio_server_check_jobs(job_list); + exec_trigger(buf); + return 0; +} + +static int handle_command(struct sk_out *sk_out, struct flist_head *job_list, + struct fio_net_cmd *cmd) { int ret; @@ -745,10 +993,14 @@ switch (cmd->opcode) { case FIO_NET_CMD_QUIT: fio_terminate_threads(TERMINATE_ALL); - return -1; + ret = 0; + break; case FIO_NET_CMD_EXIT: - exit_backend = 1; + exit_backend = true; return -1; + case FIO_NET_CMD_LOAD_FILE: + ret = handle_load_file_cmd(cmd); + break; case FIO_NET_CMD_JOB: ret = handle_job_cmd(cmd); break; @@ -762,11 +1014,40 @@ ret = handle_send_eta_cmd(cmd); break; case FIO_NET_CMD_RUN: - ret = handle_run_cmd(cmd); + ret = handle_run_cmd(sk_out, job_list, cmd); break; case FIO_NET_CMD_UPDATE_JOB: ret = handle_update_job_cmd(cmd); break; + case FIO_NET_CMD_VTRIGGER: + ret = handle_trigger_cmd(cmd, job_list); + break; + case FIO_NET_CMD_SENDFILE: { + struct cmd_sendfile_reply *in; + struct cmd_reply *rep; + + rep = (struct cmd_reply *) (uintptr_t) cmd->tag; + + in = (struct cmd_sendfile_reply *) cmd->payload; + in->size = le32_to_cpu(in->size); + in->error = 
le32_to_cpu(in->error); + if (in->error) { + ret = 1; + rep->error = in->error; + } else { + ret = 0; + rep->data = smalloc(in->size); + if (!rep->data) { + ret = 1; + rep->error = ENOMEM; + } else { + rep->size = in->size; + memcpy(rep->data, in->data, in->size); + } + } + fio_sem_up(&rep->lock); + break; + } default: log_err("fio: unknown opcode: %s\n", fio_server_op(cmd->opcode)); ret = 1; @@ -775,37 +1056,171 @@ return ret; } -static int handle_connection(int sk) +/* + * Send a command with a separate PDU, not inlined in the command + */ +static int fio_send_cmd_ext_pdu(int sk, uint16_t opcode, const void *buf, + off_t size, uint64_t tag, uint32_t flags) +{ + struct fio_net_cmd cmd; + struct iovec iov[2]; + size_t this_len; + int ret; + + iov[0].iov_base = (void *) &cmd; + iov[0].iov_len = sizeof(cmd); + + do { + uint32_t this_flags = flags; + + this_len = size; + if (this_len > FIO_SERVER_MAX_FRAGMENT_PDU) + this_len = FIO_SERVER_MAX_FRAGMENT_PDU; + + if (this_len < size) + this_flags |= FIO_NET_CMD_F_MORE; + + __fio_init_net_cmd(&cmd, opcode, this_len, tag); + cmd.flags = __cpu_to_le32(this_flags); + fio_net_cmd_crc_pdu(&cmd, buf); + + iov[1].iov_base = (void *) buf; + iov[1].iov_len = this_len; + + ret = fio_sendv_data(sk, iov, 2); + size -= this_len; + buf += this_len; + } while (!ret && size); + + return ret; +} + +static void finish_entry(struct sk_entry *entry) +{ + if (entry->flags & SK_F_FREE) + free(entry->buf); + else if (entry->flags & SK_F_COPY) + sfree(entry->buf); + + sfree(entry); +} + +static void entry_set_flags(struct sk_entry *entry, struct flist_head *list, + unsigned int *flags) +{ + if (!flist_empty(list)) + *flags = FIO_NET_CMD_F_MORE; + else + *flags = 0; +} + +static int send_vec_entry(struct sk_out *sk_out, struct sk_entry *first) +{ + unsigned int flags; + int ret; + + entry_set_flags(first, &first->next, &flags); + + ret = fio_send_cmd_ext_pdu(sk_out->sk, first->opcode, first->buf, + first->size, first->tag, flags); + + while 
(!flist_empty(&first->next)) { + struct sk_entry *next; + + next = flist_first_entry(&first->next, struct sk_entry, list); + flist_del_init(&next->list); + + entry_set_flags(next, &first->next, &flags); + + ret += fio_send_cmd_ext_pdu(sk_out->sk, next->opcode, next->buf, + next->size, next->tag, flags); + finish_entry(next); + } + + return ret; +} + +static int handle_sk_entry(struct sk_out *sk_out, struct sk_entry *entry) +{ + int ret; + + fio_sem_down(&sk_out->xmit); + + if (entry->flags & SK_F_VEC) + ret = send_vec_entry(sk_out, entry); + else if (entry->flags & SK_F_SIMPLE) { + ret = fio_net_send_simple_cmd(sk_out->sk, entry->opcode, + entry->tag, NULL); + } else { + ret = fio_net_send_cmd(sk_out->sk, entry->opcode, entry->buf, + entry->size, &entry->tag, NULL); + } + + fio_sem_up(&sk_out->xmit); + + if (ret) + log_err("fio: failed handling cmd %s\n", fio_server_op(entry->opcode)); + + finish_entry(entry); + return ret; +} + +static int handle_xmits(struct sk_out *sk_out) +{ + struct sk_entry *entry; + FLIST_HEAD(list); + int ret = 0; + + sk_lock(sk_out); + if (flist_empty(&sk_out->list)) { + sk_unlock(sk_out); + return 0; + } + + flist_splice_init(&sk_out->list, &list); + sk_unlock(sk_out); + + while (!flist_empty(&list)) { + entry = flist_entry(list.next, struct sk_entry, list); + flist_del(&entry->list); + ret += handle_sk_entry(sk_out, entry); + } + + return ret; +} + +static int handle_connection(struct sk_out *sk_out) { struct fio_net_cmd *cmd = NULL; + FLIST_HEAD(job_list); int ret = 0; reset_fio_state(); - INIT_FLIST_HEAD(&job_list); - server_fd = sk; /* read forever */ while (!exit_backend) { struct pollfd pfd = { - .fd = sk, + .fd = sk_out->sk, .events = POLLIN, }; - ret = 0; do { int timeout = 1000; if (!flist_empty(&job_list)) timeout = 100; - ret = poll(&pfd, 1, timeout); + handle_xmits(sk_out); + + ret = poll(&pfd, 1, 0); if (ret < 0) { if (errno == EINTR) break; log_err("fio: poll: %s\n", strerror(errno)); break; } else if (!ret) { - 
fio_server_check_jobs(); + fio_server_check_jobs(&job_list); + fio_sem_down_timeout(&sk_out->wait, timeout); continue; } @@ -817,18 +1232,18 @@ } } while (!exit_backend); - fio_server_check_jobs(); + fio_server_check_jobs(&job_list); if (ret < 0) break; - cmd = fio_net_recv_cmd(sk); + cmd = fio_net_recv_cmd(sk_out->sk, true); if (!cmd) { ret = -1; break; } - ret = handle_command(cmd); + ret = handle_command(sk_out, &job_list, cmd); if (ret) break; @@ -839,24 +1254,68 @@ if (cmd) free(cmd); - close(sk); + handle_xmits(sk_out); + + close(sk_out->sk); + sk_out->sk = -1; + __sk_out_drop(sk_out); _exit(ret); } +/* get the address on this host bound by the input socket, + * whether it is ipv6 or ipv4 */ + +static int get_my_addr_str(int sk) +{ + struct sockaddr_in6 myaddr6 = { 0, }; + struct sockaddr_in myaddr4 = { 0, }; + struct sockaddr *sockaddr_p; + char *net_addr; + socklen_t len; + int ret; + + if (use_ipv6) { + len = sizeof(myaddr6); + sockaddr_p = (struct sockaddr * )&myaddr6; + net_addr = (char * )&myaddr6.sin6_addr; + } else { + len = sizeof(myaddr4); + sockaddr_p = (struct sockaddr * )&myaddr4; + net_addr = (char * )&myaddr4.sin_addr; + } + + ret = getsockname(sk, sockaddr_p, &len); + if (ret) { + log_err("fio: getsockname: %s\n", strerror(errno)); + return -1; + } + + if (!inet_ntop(use_ipv6?AF_INET6:AF_INET, net_addr, client_sockaddr_str, INET6_ADDRSTRLEN - 1)) { + log_err("inet_ntop: failed to convert addr to string\n"); + return -1; + } + + dprint(FD_NET, "fio server bound to addr %s\n", client_sockaddr_str); + return 0; +} + static int accept_loop(int listen_sk) { struct sockaddr_in addr; - socklen_t len = sizeof(addr); + struct sockaddr_in6 addr6; + socklen_t len = use_ipv6 ? 
sizeof(addr6) : sizeof(addr); struct pollfd pfd; - int ret = 0, sk, flags, exitval = 0; + int ret = 0, sk, exitval = 0; + FLIST_HEAD(conn_list); dprint(FD_NET, "server enter accept loop\n"); - flags = fcntl(listen_sk, F_GETFL); - flags |= O_NONBLOCK; - fcntl(listen_sk, F_SETFL, flags); + fio_set_fd_nonblocking(listen_sk, "server"); while (!exit_backend) { + struct sk_out *sk_out; + const char *from; + char buf[64]; pid_t pid; pfd.fd = listen_sk; @@ -874,7 +1333,7 @@ log_err("fio: poll: %s\n", strerror(errno)); break; } else if (!ret) { - fio_server_check_conns(); + fio_server_check_conns(&conn_list); continue; } @@ -882,28 +1341,56 @@ break; } while (!exit_backend); - fio_server_check_conns(); + fio_server_check_conns(&conn_list); if (exit_backend || ret < 0) break; - sk = accept(listen_sk, (struct sockaddr *) &addr, &len); + if (use_ipv6) + sk = accept(listen_sk, (struct sockaddr *) &addr6, &len); + else + sk = accept(listen_sk, (struct sockaddr *) &addr, &len); + if (sk < 0) { log_err("fio: accept: %s\n", strerror(errno)); return -1; } - dprint(FD_NET, "server: connect from %s\n", inet_ntoa(addr.sin_addr)); + if (use_ipv6) + from = inet_ntop(AF_INET6, (struct sockaddr *) &addr6.sin6_addr, buf, sizeof(buf)); + else + from = inet_ntop(AF_INET, (struct sockaddr *) &addr.sin_addr, buf, sizeof(buf)); + + dprint(FD_NET, "server: connect from %s\n", from); + + sk_out = scalloc(1, sizeof(*sk_out)); + if (!sk_out) { + close(sk); + return -1; + } + + sk_out->sk = sk; + INIT_FLIST_HEAD(&sk_out->list); + __fio_sem_init(&sk_out->lock, FIO_SEM_UNLOCKED); + __fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED); + __fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED); pid = fork(); if (pid) { close(sk); - fio_server_add_conn_pid(pid); + fio_server_add_conn_pid(&conn_list, pid); continue; } - /* exits */ - handle_connection(sk); + /* if error, it's already logged, non-fatal */ + get_my_addr_str(sk); + + /* + * Assign sk_out here, it'll be dropped in handle_connection() + * since that function 
calls _exit() when done + */ + sk_out_assign(sk_out); + handle_connection(sk_out); } return exitval; @@ -911,12 +1398,13 @@ int fio_server_text_output(int level, const char *buf, size_t len) { + struct sk_out *sk_out = pthread_getspecific(sk_out_key); struct cmd_text_pdu *pdu; unsigned int tlen; struct timeval tv; - if (server_fd == -1) - return log_local_buf(buf, len); + if (!sk_out || sk_out->sk == -1) + return -1; tlen = sizeof(*pdu) + len; pdu = malloc(tlen); @@ -930,7 +1418,7 @@ memcpy(pdu->buf, buf, len); - fio_net_send_cmd(server_fd, FIO_NET_CMD_TEXT, pdu, tlen, NULL, NULL); + fio_net_queue_cmd(FIO_NET_CMD_TEXT, pdu, tlen, NULL, SK_F_COPY); free(pdu); return len; } @@ -944,8 +1432,8 @@ /* * Encode to IEEE 754 for network transfer */ - dst->mean.u.i = __cpu_to_le64(fio_double_to_uint64(src->mean.u.f)); - dst->S.u.i = __cpu_to_le64(fio_double_to_uint64(src->S.u.f)); + dst->mean.u.i = cpu_to_le64(fio_double_to_uint64(src->mean.u.f)); + dst->S.u.i = cpu_to_le64(fio_double_to_uint64(src->S.u.f)); } static void convert_gs(struct group_run_stats *dst, struct group_run_stats *src) @@ -957,7 +1445,7 @@ dst->min_run[i] = cpu_to_le64(src->min_run[i]); dst->max_bw[i] = cpu_to_le64(src->max_bw[i]); dst->min_bw[i] = cpu_to_le64(src->min_bw[i]); - dst->io_kb[i] = cpu_to_le64(src->io_kb[i]); + dst->iobytes[i] = cpu_to_le64(src->iobytes[i]); dst->agg[i] = cpu_to_le64(src->agg[i]); } @@ -965,6 +1453,7 @@ dst->unit_base = cpu_to_le32(src->unit_base); dst->groupid = cpu_to_le32(src->groupid); dst->unified_rw_rep = cpu_to_le32(src->unified_rw_rep); + dst->sig_figs = cpu_to_le32(src->sig_figs); } /* @@ -975,14 +1464,17 @@ { struct cmd_ts_pdu p; int i, j; + void *ss_buf; + uint64_t *ss_iops, *ss_bw; dprint(FD_NET, "server sending end stats\n"); memset(&p, 0, sizeof(p)); - strcpy(p.ts.name, ts->name); - strcpy(p.ts.verror, ts->verror); - strcpy(p.ts.description, ts->description); + snprintf(p.ts.name, sizeof(p.ts.name), "%s", ts->name); + snprintf(p.ts.verror, sizeof(p.ts.verror), 
"%s", ts->verror); + snprintf(p.ts.description, sizeof(p.ts.description), "%s", + ts->description); p.ts.error = cpu_to_le32(ts->error); p.ts.thread_number = cpu_to_le32(ts->thread_number); @@ -996,6 +1488,7 @@ convert_io_stat(&p.ts.slat_stat[i], &ts->slat_stat[i]); convert_io_stat(&p.ts.lat_stat[i], &ts->lat_stat[i]); convert_io_stat(&p.ts.bw_stat[i], &ts->bw_stat[i]); + convert_io_stat(&p.ts.iops_stat[i], &ts->iops_stat[i]); } p.ts.usr_time = cpu_to_le64(ts->usr_time); @@ -1003,37 +1496,43 @@ p.ts.ctx = cpu_to_le64(ts->ctx); p.ts.minf = cpu_to_le64(ts->minf); p.ts.majf = cpu_to_le64(ts->majf); - p.ts.clat_percentiles = cpu_to_le64(ts->clat_percentiles); + p.ts.clat_percentiles = cpu_to_le32(ts->clat_percentiles); + p.ts.lat_percentiles = cpu_to_le32(ts->lat_percentiles); + p.ts.percentile_precision = cpu_to_le64(ts->percentile_precision); for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { fio_fp64_t *src = &ts->percentile_list[i]; fio_fp64_t *dst = &p.ts.percentile_list[i]; - dst->u.i = __cpu_to_le64(fio_double_to_uint64(src->u.f)); + dst->u.i = cpu_to_le64(fio_double_to_uint64(src->u.f)); } for (i = 0; i < FIO_IO_U_MAP_NR; i++) { - p.ts.io_u_map[i] = cpu_to_le32(ts->io_u_map[i]); - p.ts.io_u_submit[i] = cpu_to_le32(ts->io_u_submit[i]); - p.ts.io_u_complete[i] = cpu_to_le32(ts->io_u_complete[i]); + p.ts.io_u_map[i] = cpu_to_le64(ts->io_u_map[i]); + p.ts.io_u_submit[i] = cpu_to_le64(ts->io_u_submit[i]); + p.ts.io_u_complete[i] = cpu_to_le64(ts->io_u_complete[i]); } - for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) { - p.ts.io_u_lat_u[i] = cpu_to_le32(ts->io_u_lat_u[i]); - p.ts.io_u_lat_m[i] = cpu_to_le32(ts->io_u_lat_m[i]); - } + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + p.ts.io_u_lat_n[i] = cpu_to_le64(ts->io_u_lat_n[i]); + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) + p.ts.io_u_lat_u[i] = cpu_to_le64(ts->io_u_lat_u[i]); + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) + p.ts.io_u_lat_m[i] = cpu_to_le64(ts->io_u_lat_m[i]); for (i = 0; i < DDIR_RWDIR_CNT; i++) for (j = 0; j < 
FIO_IO_U_PLAT_NR; j++) - p.ts.io_u_plat[i][j] = cpu_to_le32(ts->io_u_plat[i][j]); + p.ts.io_u_plat[i][j] = cpu_to_le64(ts->io_u_plat[i][j]); for (i = 0; i < DDIR_RWDIR_CNT; i++) { p.ts.total_io_u[i] = cpu_to_le64(ts->total_io_u[i]); p.ts.short_io_u[i] = cpu_to_le64(ts->short_io_u[i]); + p.ts.drop_io_u[i] = cpu_to_le64(ts->drop_io_u[i]); } p.ts.total_submit = cpu_to_le64(ts->total_submit); p.ts.total_complete = cpu_to_le64(ts->total_complete); + p.ts.nr_zone_resets = cpu_to_le64(ts->nr_zone_resets); for (i = 0; i < DDIR_RWDIR_CNT; i++) { p.ts.io_bytes[i] = cpu_to_le64(ts->io_bytes[i]); @@ -1047,9 +1546,51 @@ p.ts.kb_base = cpu_to_le32(ts->kb_base); p.ts.unit_base = cpu_to_le32(ts->unit_base); + p.ts.latency_depth = cpu_to_le32(ts->latency_depth); + p.ts.latency_target = cpu_to_le64(ts->latency_target); + p.ts.latency_window = cpu_to_le64(ts->latency_window); + p.ts.latency_percentile.u.i = cpu_to_le64(fio_double_to_uint64(ts->latency_percentile.u.f)); + + p.ts.sig_figs = cpu_to_le32(ts->sig_figs); + + p.ts.nr_block_infos = cpu_to_le64(ts->nr_block_infos); + for (i = 0; i < p.ts.nr_block_infos; i++) + p.ts.block_infos[i] = cpu_to_le32(ts->block_infos[i]); + + p.ts.ss_dur = cpu_to_le64(ts->ss_dur); + p.ts.ss_state = cpu_to_le32(ts->ss_state); + p.ts.ss_head = cpu_to_le32(ts->ss_head); + p.ts.ss_limit.u.i = cpu_to_le64(fio_double_to_uint64(ts->ss_limit.u.f)); + p.ts.ss_slope.u.i = cpu_to_le64(fio_double_to_uint64(ts->ss_slope.u.f)); + p.ts.ss_deviation.u.i = cpu_to_le64(fio_double_to_uint64(ts->ss_deviation.u.f)); + p.ts.ss_criterion.u.i = cpu_to_le64(fio_double_to_uint64(ts->ss_criterion.u.f)); + + p.ts.cachehit = cpu_to_le64(ts->cachehit); + p.ts.cachemiss = cpu_to_le64(ts->cachemiss); + convert_gs(&p.rs, rs); - fio_net_send_cmd(server_fd, FIO_NET_CMD_TS, &p, sizeof(p), NULL, NULL); + dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state); + if (ts->ss_state & FIO_SS_DATA) { + dprint(FD_NET, "server sending steadystate ring buffers\n"); + + ss_buf = malloc(sizeof(p) + 
2*ts->ss_dur*sizeof(uint64_t)); + + memcpy(ss_buf, &p, sizeof(p)); + + ss_iops = (uint64_t *) ((struct cmd_ts_pdu *)ss_buf + 1); + ss_bw = ss_iops + (int) ts->ss_dur; + for (i = 0; i < ts->ss_dur; i++) { + ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]); + ss_bw[i] = cpu_to_le64(ts->ss_bw_data[i]); + } + + fio_net_queue_cmd(FIO_NET_CMD_TS, ss_buf, sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t), NULL, SK_F_COPY); + + free(ss_buf); + } + else + fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY); } void fio_server_send_gs(struct group_run_stats *rs) @@ -1059,7 +1600,48 @@ dprint(FD_NET, "server sending group run stats\n"); convert_gs(&gs, rs); - fio_net_send_cmd(server_fd, FIO_NET_CMD_GS, &gs, sizeof(gs), NULL, NULL); + fio_net_queue_cmd(FIO_NET_CMD_GS, &gs, sizeof(gs), NULL, SK_F_COPY); +} + +void fio_server_send_job_options(struct flist_head *opt_list, + unsigned int gid) +{ + struct cmd_job_option pdu; + struct flist_head *entry; + + if (flist_empty(opt_list)) + return; + + flist_for_each(entry, opt_list) { + struct print_option *p; + size_t len; + + p = flist_entry(entry, struct print_option, list); + memset(&pdu, 0, sizeof(pdu)); + + if (gid == -1U) { + pdu.global = __cpu_to_le16(1); + pdu.groupid = 0; + } else { + pdu.global = 0; + pdu.groupid = cpu_to_le32(gid); + } + len = strlen(p->name); + if (len >= sizeof(pdu.name)) { + len = sizeof(pdu.name) - 1; + pdu.truncated = __cpu_to_le16(1); + } + memcpy(pdu.name, p->name, len); + if (p->value) { + len = strlen(p->value); + if (len >= sizeof(pdu.value)) { + len = sizeof(pdu.value) - 1; + pdu.truncated = __cpu_to_le16(1); + } + memcpy(pdu.value, p->value, len); + } + fio_net_queue_cmd(FIO_NET_CMD_JOB_OPT, &pdu, sizeof(pdu), NULL, SK_F_COPY); + } } static void convert_agg(struct disk_util_agg *dst, struct disk_util_agg *src) @@ -1067,34 +1649,34 @@ int i; for (i = 0; i < 2; i++) { - dst->ios[i] = cpu_to_le32(src->ios[i]); - dst->merges[i] = cpu_to_le32(src->merges[i]); + dst->ios[i] = 
cpu_to_le64(src->ios[i]); + dst->merges[i] = cpu_to_le64(src->merges[i]); dst->sectors[i] = cpu_to_le64(src->sectors[i]); - dst->ticks[i] = cpu_to_le32(src->ticks[i]); + dst->ticks[i] = cpu_to_le64(src->ticks[i]); } - dst->io_ticks = cpu_to_le32(src->io_ticks); - dst->time_in_queue = cpu_to_le32(src->time_in_queue); + dst->io_ticks = cpu_to_le64(src->io_ticks); + dst->time_in_queue = cpu_to_le64(src->time_in_queue); dst->slavecount = cpu_to_le32(src->slavecount); - dst->max_util.u.i = __cpu_to_le64(fio_double_to_uint64(src->max_util.u.f)); + dst->max_util.u.i = cpu_to_le64(fio_double_to_uint64(src->max_util.u.f)); } static void convert_dus(struct disk_util_stat *dst, struct disk_util_stat *src) { int i; - strcpy((char *) dst->name, (char *) src->name); + snprintf((char *) dst->name, sizeof(dst->name), "%s", src->name); for (i = 0; i < 2; i++) { - dst->ios[i] = cpu_to_le32(src->ios[i]); - dst->merges[i] = cpu_to_le32(src->merges[i]); - dst->sectors[i] = cpu_to_le64(src->sectors[i]); - dst->ticks[i] = cpu_to_le32(src->ticks[i]); + dst->s.ios[i] = cpu_to_le64(src->s.ios[i]); + dst->s.merges[i] = cpu_to_le64(src->s.merges[i]); + dst->s.sectors[i] = cpu_to_le64(src->s.sectors[i]); + dst->s.ticks[i] = cpu_to_le64(src->s.ticks[i]); } - dst->io_ticks = cpu_to_le32(src->io_ticks); - dst->time_in_queue = cpu_to_le32(src->time_in_queue); - dst->msec = cpu_to_le64(src->msec); + dst->s.io_ticks = cpu_to_le64(src->s.io_ticks); + dst->s.time_in_queue = cpu_to_le64(src->s.time_in_queue); + dst->s.msec = cpu_to_le64(src->s.msec); } void fio_server_send_du(void) @@ -1113,148 +1695,437 @@ convert_dus(&pdu.dus, &du->dus); convert_agg(&pdu.agg, &du->agg); - fio_net_send_cmd(server_fd, FIO_NET_CMD_DU, &pdu, sizeof(pdu), NULL, NULL); + fio_net_queue_cmd(FIO_NET_CMD_DU, &pdu, sizeof(pdu), NULL, SK_F_COPY); } } +#ifdef CONFIG_ZLIB + +static inline void __fio_net_prep_tail(z_stream *stream, void *out_pdu, + struct sk_entry **last_entry, + struct sk_entry *first) +{ + unsigned int this_len = 
FIO_SERVER_MAX_FRAGMENT_PDU - stream->avail_out; + + *last_entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len, + NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE); + if (*last_entry) + flist_add_tail(&(*last_entry)->list, &first->next); +} + /* - * Send a command with a separate PDU, not inlined in the command + * Deflates the next input given, creating as many new packets in the + * linked list as necessary. */ -static int fio_send_cmd_ext_pdu(int sk, uint16_t opcode, const void *buf, - off_t size, uint64_t tag, uint32_t flags) +static int __deflate_pdu_buffer(void *next_in, unsigned int next_sz, void **out_pdu, + struct sk_entry **last_entry, z_stream *stream, + struct sk_entry *first) { - struct fio_net_cmd cmd; - struct iovec iov[2]; + int ret; - iov[0].iov_base = &cmd; - iov[0].iov_len = sizeof(cmd); - iov[1].iov_base = (void *) buf; - iov[1].iov_len = size; + stream->next_in = next_in; + stream->avail_in = next_sz; + do { + if (!stream->avail_out) { + __fio_net_prep_tail(stream, *out_pdu, last_entry, first); + if (*last_entry == NULL) + return 1; + + *out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU); - __fio_init_net_cmd(&cmd, opcode, size, tag); - cmd.flags = __cpu_to_le32(flags); - fio_net_cmd_crc_pdu(&cmd, buf); + stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU; + stream->next_out = *out_pdu; + } + + ret = deflate(stream, Z_BLOCK); - return fio_sendv_data(sk, iov, 2); + if (ret < 0) { + free(*out_pdu); + return 1; + } + } while (stream->avail_in); + + return 0; } -static int fio_send_iolog_gz(struct cmd_iolog_pdu *pdu, struct io_log *log) +static int __fio_append_iolog_gz_hist(struct sk_entry *first, struct io_log *log, + struct io_logs *cur_log, z_stream *stream) { - int ret = 0; -#ifdef CONFIG_ZLIB - z_stream stream; + struct sk_entry *entry; void *out_pdu; + int ret, i, j; + int sample_sz = log_entry_sz(log); - /* - * Dirty - since the log is potentially huge, compress it into - * FIO_SERVER_MAX_FRAGMENT_PDU chunks and let the receiving - * side 
defragment it. - */ out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU); + stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU; + stream->next_out = out_pdu; - stream.zalloc = Z_NULL; - stream.zfree = Z_NULL; - stream.opaque = Z_NULL; + for (i = 0; i < cur_log->nr_samples; i++) { + struct io_sample *s; + struct io_u_plat_entry *cur_plat_entry, *prev_plat_entry; + uint64_t *cur_plat, *prev_plat; - if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK) { - ret = 1; - goto err; + s = get_sample(log, cur_log, i); + ret = __deflate_pdu_buffer(s, sample_sz, &out_pdu, &entry, stream, first); + if (ret) + return ret; + + /* Do the subtraction on server side so that client doesn't have to + * reconstruct our linked list from packets. + */ + cur_plat_entry = s->data.plat_entry; + prev_plat_entry = flist_first_entry(&cur_plat_entry->list, struct io_u_plat_entry, list); + cur_plat = cur_plat_entry->io_u_plat; + prev_plat = prev_plat_entry->io_u_plat; + + for (j = 0; j < FIO_IO_U_PLAT_NR; j++) { + cur_plat[j] -= prev_plat[j]; + } + + flist_del(&prev_plat_entry->list); + free(prev_plat_entry); + + ret = __deflate_pdu_buffer(cur_plat_entry, sizeof(*cur_plat_entry), + &out_pdu, &entry, stream, first); + + if (ret) + return ret; } - stream.next_in = (void *) log->log; - stream.avail_in = log->nr_samples * sizeof(struct io_sample); + __fio_net_prep_tail(stream, out_pdu, &entry, first); + return entry == NULL; +} + +static int __fio_append_iolog_gz(struct sk_entry *first, struct io_log *log, + struct io_logs *cur_log, z_stream *stream) +{ + unsigned int this_len; + void *out_pdu; + int ret; + + if (log->log_type == IO_LOG_TYPE_HIST) + return __fio_append_iolog_gz_hist(first, log, cur_log, stream); + + stream->next_in = (void *) cur_log->log; + stream->avail_in = cur_log->nr_samples * log_entry_sz(log); do { - unsigned int this_len, flags = 0; - int ret; + struct sk_entry *entry; + + /* + * Dirty - since the log is potentially huge, compress it into + * FIO_SERVER_MAX_FRAGMENT_PDU chunks and 
let the receiving + * side defragment it. + */ + out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU); + + stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU; + stream->next_out = out_pdu; + ret = deflate(stream, Z_BLOCK); + /* may be Z_OK, or Z_STREAM_END */ + if (ret < 0) { + free(out_pdu); + return 1; + } + + this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream->avail_out; + + entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len, + NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE); + if (!entry) { + free(out_pdu); + return 1; + } + flist_add_tail(&entry->list, &first->next); + } while (stream->avail_in); + + return 0; +} + +static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log) +{ + z_stream stream = { + .zalloc = Z_NULL, + .zfree = Z_NULL, + .opaque = Z_NULL, + }; + int ret = 0; + + if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK) + return 1; + + while (!flist_empty(&log->io_logs)) { + struct io_logs *cur_log; + + cur_log = flist_first_entry(&log->io_logs, struct io_logs, list); + flist_del_init(&cur_log->list); + + ret = __fio_append_iolog_gz(first, log, cur_log, &stream); + if (ret) + break; + } + + ret = deflate(&stream, Z_FINISH); + + while (ret != Z_STREAM_END) { + struct sk_entry *entry; + unsigned int this_len; + void *out_pdu; + out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU); stream.avail_out = FIO_SERVER_MAX_FRAGMENT_PDU; stream.next_out = out_pdu; + ret = deflate(&stream, Z_FINISH); /* may be Z_OK, or Z_STREAM_END */ - if (ret < 0) - goto err_zlib; + if (ret < 0) { + free(out_pdu); + break; + } this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream.avail_out; - if (stream.avail_in) - flags = FIO_NET_CMD_F_MORE; + entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len, + NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE); + if (!entry) { + free(out_pdu); + break; + } + flist_add_tail(&entry->list, &first->next); + } while (ret != Z_STREAM_END); - ret = fio_send_cmd_ext_pdu(server_fd, FIO_NET_CMD_IOLOG, - out_pdu, this_len, 0, flags); - if (ret) - 
goto err_zlib; - } while (stream.avail_in); + ret = deflateEnd(&stream); + if (ret == Z_OK) + return 0; -err_zlib: - deflateEnd(&stream); -err: - free(out_pdu); + return 1; +} +#else +static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log) +{ + return 1; +} #endif + +static int fio_append_gz_chunks(struct sk_entry *first, struct io_log *log) +{ + struct sk_entry *entry; + struct flist_head *node; + int ret = 0; + + pthread_mutex_lock(&log->chunk_lock); + flist_for_each(node, &log->chunk_list) { + struct iolog_compress *c; + + c = flist_entry(node, struct iolog_compress, list); + entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, c->buf, c->len, + NULL, SK_F_VEC | SK_F_INLINE); + if (!entry) { + ret = 1; + break; + } + flist_add_tail(&entry->list, &first->next); + } + pthread_mutex_unlock(&log->chunk_lock); + return ret; +} + +static int fio_append_text_log(struct sk_entry *first, struct io_log *log) +{ + struct sk_entry *entry; + int ret = 0; + + while (!flist_empty(&log->io_logs)) { + struct io_logs *cur_log; + size_t size; + + cur_log = flist_first_entry(&log->io_logs, struct io_logs, list); + flist_del_init(&cur_log->list); + + size = cur_log->nr_samples * log_entry_sz(log); + + entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, cur_log->log, size, + NULL, SK_F_VEC | SK_F_INLINE); + if (!entry) { + ret = 1; + break; + } + flist_add_tail(&entry->list, &first->next); + } + return ret; } int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name) { - struct cmd_iolog_pdu pdu; - int i, ret = 0; + struct cmd_iolog_pdu pdu = { + .nr_samples = cpu_to_le64(iolog_nr_samples(log)), + .thread_number = cpu_to_le32(td->thread_number), + .log_type = cpu_to_le32(log->log_type), + .log_hist_coarseness = cpu_to_le32(log->hist_coarseness), + }; + struct sk_entry *first; + struct flist_head *entry; + int ret = 0; + + if (!flist_empty(&log->chunk_list)) + pdu.compressed = __cpu_to_le32(STORE_COMPRESSED); + else if (use_zlib) + pdu.compressed = 
__cpu_to_le32(XMIT_COMPRESSED); + else + pdu.compressed = 0; + + snprintf((char *) pdu.name, sizeof(pdu.name), "%s", name); + + /* + * We can't do this for a pre-compressed log, but for that case, + * log->nr_samples is zero anyway. + */ + flist_for_each(entry, &log->io_logs) { + struct io_logs *cur_log; + int i; + + cur_log = flist_entry(entry, struct io_logs, list); + + for (i = 0; i < cur_log->nr_samples; i++) { + struct io_sample *s = get_sample(log, cur_log, i); - pdu.thread_number = cpu_to_le32(td->thread_number); - pdu.nr_samples = __cpu_to_le32(log->nr_samples); - pdu.log_type = cpu_to_le32(log->log_type); - pdu.compressed = cpu_to_le32(use_zlib); - strcpy((char *) pdu.name, name); - - for (i = 0; i < log->nr_samples; i++) { - struct io_sample *s = &log->log[i]; - - s->time = cpu_to_le64(s->time); - s->val = cpu_to_le64(s->val); - s->ddir = cpu_to_le32(s->ddir); - s->bs = cpu_to_le32(s->bs); + s->time = cpu_to_le64(s->time); + s->data.val = cpu_to_le64(s->data.val); + s->__ddir = cpu_to_le32(s->__ddir); + s->bs = cpu_to_le64(s->bs); + + if (log->log_offset) { + struct io_sample_offset *so = (void *) s; + + so->offset = cpu_to_le64(so->offset); + } + } } /* - * Send header first, it's not compressed. + * Assemble header entry first */ - ret = fio_send_cmd_ext_pdu(server_fd, FIO_NET_CMD_IOLOG, &pdu, - sizeof(pdu), 0, FIO_NET_CMD_F_MORE); - if (ret) - return ret; + first = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, &pdu, sizeof(pdu), NULL, SK_F_VEC | SK_F_INLINE | SK_F_COPY); + if (!first) + return 1; /* - * Now send actual log, compress if we can, otherwise just plain + * Now append actual log entries. If log compression was enabled on + * the job, just send out the compressed chunks directly. If we + * have a plain log, compress if we can, then send. Otherwise, send + * the plain text output. 
*/ - if (use_zlib) - return fio_send_iolog_gz(&pdu, log); + if (!flist_empty(&log->chunk_list)) + ret = fio_append_gz_chunks(first, log); + else if (use_zlib) + ret = fio_append_iolog_gz(first, log); + else + ret = fio_append_text_log(first, log); - return fio_send_cmd_ext_pdu(server_fd, FIO_NET_CMD_IOLOG, log->log, - log->nr_samples * sizeof(struct io_sample), 0, 0); + fio_net_queue_entry(first); + return ret; } void fio_server_send_add_job(struct thread_data *td) { - struct cmd_add_job_pdu pdu; + struct cmd_add_job_pdu pdu = { + .thread_number = cpu_to_le32(td->thread_number), + .groupid = cpu_to_le32(td->groupid), + }; - memset(&pdu, 0, sizeof(pdu)); - pdu.thread_number = cpu_to_le32(td->thread_number); - pdu.groupid = cpu_to_le32(td->groupid); convert_thread_options_to_net(&pdu.top, &td->o); - fio_net_send_cmd(server_fd, FIO_NET_CMD_ADD_JOB, &pdu, sizeof(pdu), NULL, NULL); + fio_net_queue_cmd(FIO_NET_CMD_ADD_JOB, &pdu, sizeof(pdu), NULL, + SK_F_COPY); } void fio_server_send_start(struct thread_data *td) { - assert(server_fd != -1); + struct sk_out *sk_out = pthread_getspecific(sk_out_key); + + assert(sk_out->sk != -1); + + fio_net_queue_cmd(FIO_NET_CMD_SERVER_START, NULL, 0, NULL, SK_F_SIMPLE); +} + +int fio_server_get_verify_state(const char *name, int threadnumber, + void **datap) +{ + struct thread_io_list *s; + struct cmd_sendfile out; + struct cmd_reply *rep; + uint64_t tag; + void *data; + int ret; - fio_net_send_simple_cmd(server_fd, FIO_NET_CMD_SERVER_START, 0, NULL); + dprint(FD_NET, "server: request verify state\n"); + + rep = smalloc(sizeof(*rep)); + if (!rep) + return ENOMEM; + + __fio_sem_init(&rep->lock, FIO_SEM_LOCKED); + rep->data = NULL; + rep->error = 0; + + verify_state_gen_name((char *) out.path, sizeof(out.path), name, me, + threadnumber); + tag = (uint64_t) (uintptr_t) rep; + fio_net_queue_cmd(FIO_NET_CMD_SENDFILE, &out, sizeof(out), &tag, + SK_F_COPY); + + /* + * Wait for the backend to receive the reply + */ + if 
(fio_sem_down_timeout(&rep->lock, 10000)) { + log_err("fio: timed out waiting for reply\n"); + ret = ETIMEDOUT; + goto fail; + } + + if (rep->error) { + log_err("fio: failure on receiving state file %s: %s\n", + out.path, strerror(rep->error)); + ret = rep->error; +fail: + *datap = NULL; + sfree(rep); + fio_net_queue_quit(); + return ret; + } + + /* + * The format is verify_state_hdr, then thread_io_list. Verify + * the header, and the thread_io_list checksum + */ + s = rep->data + sizeof(struct verify_state_hdr); + if (verify_state_hdr(rep->data, s)) { + ret = EILSEQ; + goto fail; + } + + /* + * Don't need the header from now, copy just the thread_io_list + */ + ret = 0; + rep->size -= sizeof(struct verify_state_hdr); + data = malloc(rep->size); + memcpy(data, s, rep->size); + *datap = data; + + sfree(rep->data); + __fio_sem_remove(&rep->lock); + sfree(rep); + return ret; } static int fio_init_server_ip(void) { struct sockaddr *addr; socklen_t socklen; + char buf[80]; + const char *str; int sk, opt; if (use_ipv6) @@ -1269,30 +2140,36 @@ opt = 1; if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, (void *)&opt, sizeof(opt)) < 0) { - log_err("fio: setsockopt: %s\n", strerror(errno)); + log_err("fio: setsockopt(REUSEADDR): %s\n", strerror(errno)); close(sk); return -1; } #ifdef SO_REUSEPORT - if (setsockopt(sk, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)) < 0) { - log_err("fio: setsockopt: %s\n", strerror(errno)); - close(sk); - return -1; - } + /* + * Not fatal if fails, so just ignore it if that happens + */ + setsockopt(sk, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); #endif if (use_ipv6) { + void *src = &saddr_in6.sin6_addr; + addr = (struct sockaddr *) &saddr_in6; socklen = sizeof(saddr_in6); saddr_in6.sin6_family = AF_INET6; + str = inet_ntop(AF_INET6, src, buf, sizeof(buf)); } else { + void *src = &saddr_in.sin_addr; + addr = (struct sockaddr *) &saddr_in; socklen = sizeof(saddr_in); saddr_in.sin_family = AF_INET; + str = inet_ntop(AF_INET, src, buf, sizeof(buf)); } 
if (bind(sk, addr, socklen) < 0) { log_err("fio: bind: %s\n", strerror(errno)); + log_info("fio: failed with IPv%c %s\n", use_ipv6 ? '6' : '4', str); close(sk); return -1; } @@ -1315,10 +2192,8 @@ mode = umask(000); - memset(&addr, 0, sizeof(addr)); addr.sun_family = AF_UNIX; - strcpy(addr.sun_path, bind_sock); - unlink(bind_sock); + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", bind_sock); len = sizeof(addr.sun_family) + strlen(bind_sock) + 1; @@ -1347,9 +2222,11 @@ if (sk < 0) return sk; + memset(bind_str, 0, sizeof(bind_str)); + if (!bind_sock) { char *p, port[16]; - const void *src; + void *src; int af; if (use_ipv6) { @@ -1366,55 +2243,52 @@ if (p) strcat(p, port); else - strcpy(bind_str, port); + snprintf(bind_str, sizeof(bind_str), "%s", port); } else - strcpy(bind_str, bind_sock); + snprintf(bind_str, sizeof(bind_str), "%s", bind_sock); log_info("fio: server listening on %s\n", bind_str); - if (listen(sk, 0) < 0) { + if (listen(sk, 4) < 0) { log_err("fio: listen: %s\n", strerror(errno)); + close(sk); return -1; } return sk; } -int fio_server_parse_host(const char *host, int *ipv6, struct in_addr *inp, +int fio_server_parse_host(const char *host, int ipv6, struct in_addr *inp, struct in6_addr *inp6) { int ret = 0; - if (*ipv6) + if (ipv6) ret = inet_pton(AF_INET6, host, inp6); else ret = inet_pton(AF_INET, host, inp); if (ret != 1) { - struct hostent *hent; + struct addrinfo *res, hints = { + .ai_family = ipv6 ? 
AF_INET6 : AF_INET, + .ai_socktype = SOCK_STREAM, + }; - hent = gethostbyname(host); - if (!hent) { - log_err("fio: failed to resolve <%s>\n", host); - return 0; + ret = getaddrinfo(host, NULL, &hints, &res); + if (ret) { + log_err("fio: failed to resolve <%s> (%s)\n", host, + gai_strerror(ret)); + return 1; } - if (*ipv6) { - if (hent->h_addrtype != AF_INET6) { - log_info("fio: falling back to IPv4\n"); - *ipv6 = 0; - } else - memcpy(inp6, hent->h_addr_list[0], 16); - } - if (!*ipv6) { - if (hent->h_addrtype != AF_INET) { - log_err("fio: lookup type mismatch\n"); - return 0; - } - memcpy(inp, hent->h_addr_list[0], 4); - } + if (ipv6) + memcpy(inp6, &((struct sockaddr_in6 *) res->ai_addr)->sin6_addr, sizeof(*inp6)); + else + memcpy(inp, &((struct sockaddr_in *) res->ai_addr)->sin_addr, sizeof(*inp)); + ret = 1; + freeaddrinfo(res); } return !(ret == 1); @@ -1432,7 +2306,7 @@ * For local domain sockets: * *ptr is the filename, *is_sock is 1. */ -int fio_server_parse_string(const char *str, char **ptr, int *is_sock, +int fio_server_parse_string(const char *str, char **ptr, bool *is_sock, int *port, struct in_addr *inp, struct in6_addr *inp6, int *ipv6) { @@ -1441,13 +2315,13 @@ int lport = 0; *ptr = NULL; - *is_sock = 0; + *is_sock = false; *port = fio_net_port; *ipv6 = 0; if (!strncmp(str, "sock:", 5)) { *ptr = strdup(str + 5); - *is_sock = 1; + *is_sock = true; return 0; } @@ -1476,7 +2350,7 @@ } /* - * If no port seen yet, check if there's a last ':' at the end + * If no port seen yet, check if there's a last ',' at the end */ if (!lport) { portp = strchr(host, ','); @@ -1499,7 +2373,7 @@ *ptr = strdup(host); - if (fio_server_parse_host(*ptr, ipv6, inp, inp6)) { + if (fio_server_parse_host(*ptr, *ipv6, inp, inp6)) { free(*ptr); *ptr = NULL; return 1; @@ -1526,7 +2400,8 @@ static int fio_handle_server_arg(void) { int port = fio_net_port; - int is_sock, ret = 0; + bool is_sock; + int ret = 0; saddr_in.sin_addr.s_addr = htonl(INADDR_ANY); @@ -1549,6 +2424,38 @@ 
return ret; } +static void sig_int(int sig) +{ + if (bind_sock) + unlink(bind_sock); +} + +static void set_sig_handlers(void) +{ + struct sigaction act = { + .sa_handler = sig_int, + .sa_flags = SA_RESTART, + }; + + sigaction(SIGINT, &act, NULL); +} + +void fio_server_destroy_sk_key(void) +{ + pthread_key_delete(sk_out_key); +} + +int fio_server_create_sk_key(void) +{ + if (pthread_key_create(&sk_out_key, NULL)) { + log_err("fio: can't create sk_out backend key\n"); + return 1; + } + + pthread_setspecific(sk_out_key, NULL); + return 0; +} + static int fio_server(void) { int sk, ret; @@ -1562,6 +2469,8 @@ if (sk < 0) return -1; + set_sig_handlers(); + ret = accept_loop(sk); close(sk); @@ -1578,11 +2487,15 @@ void fio_server_got_signal(int signal) { + struct sk_out *sk_out = pthread_getspecific(sk_out_key); + + assert(sk_out); + if (signal == SIGPIPE) - server_fd = -1; + sk_out->sk = -1; else { log_info("\nfio: terminating on signal %d\n", signal); - exit_backend = 1; + exit_backend = true; } } @@ -1647,23 +2560,24 @@ if (check_existing_pidfile(pidfile)) { log_err("fio: pidfile %s exists and server appears alive\n", pidfile); + free(pidfile); return -1; } pid = fork(); if (pid < 0) { - log_err("fio: failed server fork: %s", strerror(errno)); + log_err("fio: failed server fork: %s\n", strerror(errno)); free(pidfile); return -1; } else if (pid) { - int ret = write_pid(pid, pidfile); - - exit(ret); + ret = write_pid(pid, pidfile); + free(pidfile); + _exit(ret); } setsid(); openlog("fio", LOG_NDELAY|LOG_NOWAIT|LOG_PID, LOG_USER); - log_syslog = 1; + log_syslog = true; close(STDIN_FILENO); close(STDOUT_FILENO); close(STDERR_FILENO); diff -Nru fio-2.1.3/server.h fio-3.16/server.h --- fio-2.1.3/server.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/server.h 2019-09-20 01:01:52.000000000 +0000 @@ -7,11 +7,21 @@ #include #include "stat.h" -#include "os/os.h" #include "diskutil.h" #define FIO_NET_PORT 8765 +struct sk_out { + unsigned int refs; /* frees sk_out when it drops 
to zero. + * protected by below ->lock */ + + int sk; /* socket fd to talk to client */ + struct fio_sem lock; /* protects ref and below list */ + struct flist_head list; /* list of pending transmit work */ + struct fio_sem wait; /* wake backend when items added to list */ + struct fio_sem xmit; /* held while sending data */ +}; + /* * On-wire encoding is little endian */ @@ -32,15 +42,16 @@ struct fio_net_cmd_reply { struct flist_head list; - struct timeval tv; + struct timespec ts; uint64_t saved_tag; uint16_t opcode; }; enum { - FIO_SERVER_VER = 25, + FIO_SERVER_VER = 80, FIO_SERVER_MAX_FRAGMENT_PDU = 1024, + FIO_SERVER_MAX_CMD_MB = 2048, FIO_NET_CMD_QUIT = 1, FIO_NET_CMD_EXIT = 2, @@ -60,7 +71,11 @@ FIO_NET_CMD_RUN = 16, FIO_NET_CMD_IOLOG = 17, FIO_NET_CMD_UPDATE_JOB = 18, - FIO_NET_CMD_NR = 19, + FIO_NET_CMD_LOAD_FILE = 19, + FIO_NET_CMD_VTRIGGER = 20, + FIO_NET_CMD_SENDFILE = 21, + FIO_NET_CMD_JOB_OPT = 22, + FIO_NET_CMD_NR = 23, FIO_NET_CMD_F_MORE = 1UL << 0, @@ -75,6 +90,31 @@ FIO_PROBE_FLAG_ZLIB = 1UL << 0, }; +struct cmd_sendfile { + uint8_t path[FIO_NET_NAME_MAX]; +}; + +struct cmd_sendfile_reply { + uint32_t size; + uint32_t error; + uint8_t data[0]; +}; + +/* + * Client sends this to server on VTRIGGER, server sends back a full + * all_io_list structure. 
+ */ +struct cmd_vtrigger_pdu { + uint16_t len; + uint8_t cmd[]; +}; + +struct cmd_load_file_pdu { + uint16_t name_len; + uint16_t client_type; + uint8_t file[]; +}; + struct cmd_ts_pdu { struct thread_stat ts; struct group_run_stats rs; @@ -87,6 +127,7 @@ struct cmd_client_probe_pdu { uint64_t flags; + uint8_t server[128]; }; struct cmd_probe_reply_pdu { @@ -141,72 +182,57 @@ uint8_t buf[0]; }; +enum { + XMIT_COMPRESSED = 1U, + STORE_COMPRESSED = 2U, +}; + struct cmd_iolog_pdu { + uint64_t nr_samples; uint32_t thread_number; - uint32_t nr_samples; uint32_t log_type; uint32_t compressed; + uint32_t log_offset; + uint32_t log_hist_coarseness; uint8_t name[FIO_NET_NAME_MAX]; struct io_sample samples[0]; }; +struct cmd_job_option { + uint16_t global; + uint16_t truncated; + uint32_t groupid; + uint8_t name[64]; + uint8_t value[128]; +}; + extern int fio_start_server(char *); extern int fio_server_text_output(int, const char *, size_t); extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *); extern int fio_net_send_simple_cmd(int, uint16_t, uint64_t, struct flist_head *); extern void fio_server_set_arg(const char *); -extern int fio_server_parse_string(const char *, char **, int *, int *, struct in_addr *, struct in6_addr *, int *); -extern int fio_server_parse_host(const char *, int *, struct in_addr *, struct in6_addr *); +extern int fio_server_parse_string(const char *, char **, bool *, int *, struct in_addr *, struct in6_addr *, int *); +extern int fio_server_parse_host(const char *, int, struct in_addr *, struct in6_addr *); extern const char *fio_server_op(unsigned int); extern void fio_server_got_signal(int); -struct thread_stat; -struct group_run_stats; extern void fio_server_send_ts(struct thread_stat *, struct group_run_stats *); extern void fio_server_send_gs(struct group_run_stats *); extern void fio_server_send_du(void); -extern void fio_server_idle_loop(void); +extern void fio_server_send_job_options(struct 
flist_head *, unsigned int); +extern int fio_server_get_verify_state(const char *, int, void **); -extern int fio_clients_connect(void); -extern int fio_clients_send_ini(const char *); -extern void fio_client_add_cmd_option(void *, const char *); -extern void fio_client_add_ini_file(void *, const char *); - -extern int fio_recv_data(int sk, void *p, unsigned int len); -extern int fio_send_data(int sk, const void *p, unsigned int len); -extern void fio_net_cmd_crc(struct fio_net_cmd *); -extern void fio_net_cmd_crc_pdu(struct fio_net_cmd *, const void *); -extern struct fio_net_cmd *fio_net_recv_cmd(int sk); +extern struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait); extern int fio_send_iolog(struct thread_data *, struct io_log *, const char *); extern void fio_server_send_add_job(struct thread_data *); extern void fio_server_send_start(struct thread_data *); -extern int fio_net_send_stop(int sk, int error, int signal); extern int fio_net_send_quit(int sk); -extern int exit_backend; -extern int fio_net_port; +extern int fio_server_create_sk_key(void); +extern void fio_server_destroy_sk_key(void); -static inline void __fio_init_net_cmd(struct fio_net_cmd *cmd, uint16_t opcode, - uint32_t pdu_len, uint64_t tag) -{ - memset(cmd, 0, sizeof(*cmd)); - - cmd->version = __cpu_to_le16(FIO_SERVER_VER); - cmd->opcode = cpu_to_le16(opcode); - cmd->tag = cpu_to_le64(tag); - cmd->pdu_len = cpu_to_le32(pdu_len); -} - - -static inline void fio_init_net_cmd(struct fio_net_cmd *cmd, uint16_t opcode, - const void *pdu, uint32_t pdu_len, - uint64_t tag) -{ - __fio_init_net_cmd(cmd, opcode, pdu_len, tag); - - if (pdu) - memcpy(&cmd->payload, pdu, pdu_len); -} +extern bool exit_backend; +extern int fio_net_port; #endif diff -Nru fio-2.1.3/smalloc.c fio-3.16/smalloc.c --- fio-2.1.3/smalloc.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/smalloc.c 2019-09-20 01:01:52.000000000 +0000 @@ -3,19 +3,14 @@ * that can be shared across processes and threads */ #include -#include -#include 
#include #include -#include -#include -#include -#include -#include -#include "mutex.h" -#include "arch/arch.h" +#include "fio.h" +#include "fio_sem.h" #include "os/os.h" +#include "smalloc.h" +#include "log.h" #define SMALLOC_REDZONE /* define to detect memory corruption */ @@ -23,17 +18,21 @@ #define SMALLOC_BPI (sizeof(unsigned int) * 8) #define SMALLOC_BPL (SMALLOC_BPB * SMALLOC_BPI) -#define INITIAL_SIZE 8192*1024 /* new pool size */ -#define MAX_POOLS 128 /* maximum number of pools to setup */ +#define INITIAL_SIZE 16*1024*1024 /* new pool size */ +#define INITIAL_POOLS 8 /* maximum number of pools to setup */ + +#define MAX_POOLS 16 #define SMALLOC_PRE_RED 0xdeadbeefU #define SMALLOC_POST_RED 0x5aa55aa5U unsigned int smalloc_pool_size = INITIAL_SIZE; -const int int_mask = sizeof(int) - 1; +#ifdef SMALLOC_REDZONE +static const int int_mask = sizeof(int) - 1; +#endif struct pool { - struct fio_mutex *lock; /* protects this pool */ + struct fio_sem *lock; /* protects this pool */ void *map; /* map of blocks */ unsigned int *bitmap; /* blocks free/busy map */ size_t free_blocks; /* free blocks */ @@ -49,40 +48,15 @@ #endif }; -static struct pool mp[MAX_POOLS]; +/* + * This suppresses the voluminous potential bitmap printout when + * smalloc encounters an OOM error + */ +static const bool enable_smalloc_debug = false; + +static struct pool *mp; static unsigned int nr_pools; static unsigned int last_pool; -static struct fio_rwlock *lock; - -static inline void pool_lock(struct pool *pool) -{ - fio_mutex_down(pool->lock); -} - -static inline void pool_unlock(struct pool *pool) -{ - fio_mutex_up(pool->lock); -} - -static inline void global_read_lock(void) -{ - fio_rwlock_read(lock); -} - -static inline void global_read_unlock(void) -{ - fio_rwlock_unlock(lock); -} - -static inline void global_write_lock(void) -{ - fio_rwlock_write(lock); -} - -static inline void global_write_unlock(void) -{ - fio_rwlock_unlock(lock); -} static inline int ptr_valid(struct pool *pool, 
void *ptr) { @@ -176,11 +150,15 @@ return ffz(word) + start; } -static int add_pool(struct pool *pool, unsigned int alloc_size) +static bool add_pool(struct pool *pool, unsigned int alloc_size) { int bitmap_blocks; + int mmap_flags; void *ptr; + if (nr_pools == MAX_POOLS) + return false; + #ifdef SMALLOC_REDZONE alloc_size += sizeof(unsigned int); #endif @@ -195,37 +173,66 @@ pool->mmap_size = alloc_size; pool->nr_blocks = bitmap_blocks; - pool->free_blocks = bitmap_blocks * SMALLOC_BPB; + pool->free_blocks = bitmap_blocks * SMALLOC_BPI; + + mmap_flags = OS_MAP_ANON; +#ifdef CONFIG_ESX + mmap_flags |= MAP_PRIVATE; +#else + mmap_flags |= MAP_SHARED; +#endif + ptr = mmap(NULL, alloc_size, PROT_READ|PROT_WRITE, mmap_flags, -1, 0); - ptr = mmap(NULL, alloc_size, PROT_READ|PROT_WRITE, - MAP_SHARED | OS_MAP_ANON, -1, 0); if (ptr == MAP_FAILED) goto out_fail; - memset(ptr, 0, alloc_size); pool->map = ptr; - pool->bitmap = (void *) ptr + (pool->nr_blocks * SMALLOC_BPL); + pool->bitmap = (unsigned int *)((char *) ptr + (pool->nr_blocks * SMALLOC_BPL)); + memset(pool->bitmap, 0, bitmap_blocks * sizeof(unsigned int)); - pool->lock = fio_mutex_init(FIO_MUTEX_UNLOCKED); + pool->lock = fio_sem_init(FIO_SEM_UNLOCKED); if (!pool->lock) goto out_fail; nr_pools++; - return 0; + return true; out_fail: - fprintf(stderr, "smalloc: failed adding pool\n"); + log_err("smalloc: failed adding pool\n"); if (pool->map) munmap(pool->map, pool->mmap_size); - return 1; + return false; } void sinit(void) { - int ret; + bool ret; + int i; + + /* + * sinit() can be called more than once if alloc-size is + * set. But we want to allocate space for the struct pool + * instances only once. 
+ */ + if (!mp) { + mp = (struct pool *) mmap(NULL, + MAX_POOLS * sizeof(struct pool), + PROT_READ | PROT_WRITE, + OS_MAP_ANON | MAP_SHARED, -1, 0); + + assert(mp != MAP_FAILED); + } + + for (i = 0; i < INITIAL_POOLS; i++) { + ret = add_pool(&mp[nr_pools], smalloc_pool_size); + if (!ret) + break; + } - lock = fio_rwlock_init(); - ret = add_pool(&mp[0], INITIAL_SIZE); - assert(!ret); + /* + * If we added at least one pool, we should be OK for most + * cases. + */ + assert(i); } static void cleanup_pool(struct pool *pool) @@ -237,7 +244,7 @@ munmap(pool->map, pool->mmap_size); if (pool->lock) - fio_mutex_remove(pool->lock); + fio_sem_remove(pool->lock); } void scleanup(void) @@ -247,8 +254,7 @@ for (i = 0; i < nr_pools; i++) cleanup_pool(&mp[i]); - if (lock) - fio_rwlock_remove(lock); + munmap(mp, MAX_POOLS * sizeof(struct pool)); } #ifdef SMALLOC_REDZONE @@ -257,7 +263,7 @@ uintptr_t ptr; ptr = (uintptr_t) hdr + hdr->size - sizeof(unsigned int); - ptr = (ptr + int_mask) & ~int_mask; + ptr = (uintptr_t) PTR_ALIGN(ptr, int_mask); return (void *) ptr; } @@ -275,14 +281,14 @@ unsigned int *postred = postred_ptr(hdr); if (hdr->prered != SMALLOC_PRE_RED) { - fprintf(stderr, "smalloc pre redzone destroyed!\n"); - fprintf(stderr, " ptr=%p, prered=%x, expected %x\n", + log_err("smalloc pre redzone destroyed!\n" + " ptr=%p, prered=%x, expected %x\n", hdr, hdr->prered, SMALLOC_PRE_RED); assert(0); } if (*postred != SMALLOC_POST_RED) { - fprintf(stderr, "smalloc post redzone destroyed!\n"); - fprintf(stderr, " ptr=%p, postred=%x, expected %x\n", + log_err("smalloc post redzone destroyed!\n" + " ptr=%p, postred=%x, expected %x\n", hdr, *postred, SMALLOC_POST_RED); assert(0); } @@ -317,12 +323,12 @@ i = offset / SMALLOC_BPL; idx = (offset % SMALLOC_BPL) / SMALLOC_BPB; - pool_lock(pool); + fio_sem_down(pool->lock); clear_blocks(pool, i, idx, size_to_blocks(hdr->size)); if (i < pool->next_non_full) pool->next_non_full = i; pool->free_blocks += size_to_blocks(hdr->size); - 
pool_unlock(pool); + fio_sem_up(pool->lock); } void sfree(void *ptr) @@ -333,8 +339,6 @@ if (!ptr) return; - global_read_lock(); - for (i = 0; i < nr_pools; i++) { if (ptr_valid(&mp[i], ptr)) { pool = &mp[i]; @@ -342,10 +346,31 @@ } } - global_read_unlock(); + if (pool) { + sfree_pool(pool, ptr); + return; + } - assert(pool); - sfree_pool(pool, ptr); + log_err("smalloc: ptr %p not from smalloc pool\n", ptr); +} + +static unsigned int find_best_index(struct pool *pool) +{ + unsigned int i; + + assert(pool->free_blocks); + + for (i = pool->next_non_full; pool->bitmap[i] == -1U; i++) { + if (i == pool->nr_blocks - 1) { + unsigned int j; + + for (j = 0; j < pool->nr_blocks; j++) + if (pool->bitmap[j] != -1U) + return j; + } + } + + return i; } static void *__smalloc_pool(struct pool *pool, size_t size) @@ -356,21 +381,22 @@ unsigned int last_idx; void *ret = NULL; - pool_lock(pool); + fio_sem_down(pool->lock); nr_blocks = size_to_blocks(size); if (nr_blocks > pool->free_blocks) goto fail; - i = pool->next_non_full; + pool->next_non_full = find_best_index(pool); + last_idx = 0; offset = -1U; + i = pool->next_non_full; while (i < pool->nr_blocks) { unsigned int idx; if (pool->bitmap[i] == -1U) { i++; - pool->next_non_full = i; last_idx = 0; continue; } @@ -399,14 +425,13 @@ ret = pool->map + offset; } fail: - pool_unlock(pool); + fio_sem_up(pool->lock); return ret; } -static void *smalloc_pool(struct pool *pool, size_t size) +static size_t size_to_alloc_size(size_t size) { size_t alloc_size = size + sizeof(struct block_hdr); - void *ptr; /* * Round to int alignment, so that the postred pointer will @@ -417,6 +442,14 @@ alloc_size = (alloc_size + int_mask) & ~int_mask; #endif + return alloc_size; +} + +static void *smalloc_pool(struct pool *pool, size_t size) +{ + size_t alloc_size = size_to_alloc_size(size); + void *ptr; + ptr = __smalloc_pool(pool, alloc_size); if (ptr) { struct block_hdr *hdr = ptr; @@ -431,50 +464,117 @@ return ptr; } -void *smalloc(size_t size) 
+static void smalloc_print_bitmap(struct pool *pool) +{ + size_t nr_blocks = pool->nr_blocks; + unsigned int *bitmap = pool->bitmap; + unsigned int i, j; + char *buffer; + + if (!enable_smalloc_debug) + return; + + buffer = malloc(SMALLOC_BPI + 1); + if (!buffer) + return; + buffer[SMALLOC_BPI] = '\0'; + + for (i = 0; i < nr_blocks; i++) { + unsigned int line = bitmap[i]; + + /* skip completely full lines */ + if (line == -1U) + continue; + + for (j = 0; j < SMALLOC_BPI; j++) + if ((1 << j) & line) + buffer[SMALLOC_BPI-1-j] = '1'; + else + buffer[SMALLOC_BPI-1-j] = '0'; + + log_err("smalloc: bitmap %5u, %s\n", i, buffer); + } + + free(buffer); +} + +void smalloc_debug(size_t size) { unsigned int i; + size_t alloc_size = size_to_alloc_size(size); + size_t alloc_blocks; + + alloc_blocks = size_to_blocks(alloc_size); + + if (size) + log_err("smalloc: size = %lu, alloc_size = %lu, blocks = %lu\n", + (unsigned long) size, (unsigned long) alloc_size, + (unsigned long) alloc_blocks); + for (i = 0; i < nr_pools; i++) { + log_err("smalloc: pool %u, free/total blocks %u/%u\n", i, + (unsigned int) (mp[i].free_blocks), + (unsigned int) (mp[i].nr_blocks*sizeof(unsigned int)*8)); + if (size && mp[i].free_blocks >= alloc_blocks) { + void *ptr = smalloc_pool(&mp[i], size); + if (ptr) { + sfree(ptr); + last_pool = i; + log_err("smalloc: smalloc_pool %u succeeded\n", i); + } else { + log_err("smalloc: smalloc_pool %u failed\n", i); + log_err("smalloc: next_non_full=%u, nr_blocks=%u\n", + (unsigned int) mp[i].next_non_full, (unsigned int) mp[i].nr_blocks); + smalloc_print_bitmap(&mp[i]); + } + } + } +} + +void *smalloc(size_t size) +{ + unsigned int i, end_pool; if (size != (unsigned int) size) return NULL; - global_write_lock(); i = last_pool; + end_pool = nr_pools; do { - for (; i < nr_pools; i++) { + for (; i < end_pool; i++) { void *ptr = smalloc_pool(&mp[i], size); if (ptr) { last_pool = i; - global_write_unlock(); return ptr; } } if (last_pool) { - last_pool = 0; + end_pool = 
last_pool; + last_pool = i = 0; continue; } - if (nr_pools + 1 > MAX_POOLS) - break; - else { - i = nr_pools; - if (add_pool(&mp[nr_pools], size)) - goto out; - } + break; } while (1); -out: - global_write_unlock(); + log_err("smalloc: OOM. Consider using --alloc-size to increase the " + "shared memory available.\n"); + smalloc_debug(size); return NULL; } +void *scalloc(size_t nmemb, size_t size) +{ + return smalloc(nmemb * size); +} + char *smalloc_strdup(const char *str) { - char *ptr; + char *ptr = NULL; ptr = smalloc(strlen(str) + 1); - strcpy(ptr, str); + if (ptr) + strcpy(ptr, str); return ptr; } diff -Nru fio-2.1.3/smalloc.h fio-3.16/smalloc.h --- fio-2.1.3/smalloc.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/smalloc.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,11 +1,15 @@ #ifndef FIO_SMALLOC_H #define FIO_SMALLOC_H +#include + extern void *smalloc(size_t); +extern void *scalloc(size_t, size_t); extern void sfree(void *); extern char *smalloc_strdup(const char *); extern void sinit(void); extern void scleanup(void); +extern void smalloc_debug(size_t); extern unsigned int smalloc_pool_size; diff -Nru fio-2.1.3/stat.c fio-3.16/stat.c --- fio-2.1.3/stat.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/stat.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,10 +1,7 @@ #include #include #include -#include #include -#include -#include #include #include "fio.h" @@ -13,17 +10,34 @@ #include "json.h" #include "lib/getrusage.h" #include "idletime.h" +#include "lib/pow2.h" +#include "lib/output_buffer.h" +#include "helper_thread.h" +#include "smalloc.h" +#include "zbd.h" -static struct fio_mutex *stat_mutex; +#define LOG_MSEC_SLACK 1 + +struct fio_sem *stat_sem; + +void clear_rusage_stat(struct thread_data *td) +{ + struct thread_stat *ts = &td->ts; + + fio_getrusage(&td->ru_start); + ts->usr_time = ts->sys_time = 0; + ts->ctx = 0; + ts->minf = ts->majf = 0; +} void update_rusage_stat(struct thread_data *td) { struct thread_stat *ts = &td->ts; 
fio_getrusage(&td->ru_end); - ts->usr_time += mtime_since(&td->ru_start.ru_utime, + ts->usr_time += mtime_since_tv(&td->ru_start.ru_utime, &td->ru_end.ru_utime); - ts->sys_time += mtime_since(&td->ru_start.ru_stime, + ts->sys_time += mtime_since_tv(&td->ru_start.ru_stime, &td->ru_end.ru_stime); ts->ctx += td->ru_end.ru_nvcsw + td->ru_end.ru_nivcsw - (td->ru_start.ru_nvcsw + td->ru_start.ru_nivcsw); @@ -42,7 +56,7 @@ * group by looking at the index bits. * */ -static unsigned int plat_val_to_idx(unsigned int val) +static unsigned int plat_val_to_idx(unsigned long long val) { unsigned int msb, error_bits, base, offset, idx; @@ -50,7 +64,7 @@ if (val == 0) msb = 0; else - msb = (sizeof(val)*8) - __builtin_clz(val) - 1; + msb = (sizeof(val)*8) - __builtin_clzll(val) - 1; /* * MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use @@ -82,9 +96,10 @@ * Convert the given index of the bucket array to the value * represented by the bucket */ -static unsigned int plat_idx_to_val(unsigned int idx) +static unsigned long long plat_idx_to_val(unsigned int idx) { - unsigned int error_bits, k, base; + unsigned int error_bits; + unsigned long long k, base; assert(idx < FIO_IO_U_PLAT_NR); @@ -95,7 +110,7 @@ /* Find the group and compute the minimum value of that group */ error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1; - base = 1 << (error_bits + FIO_IO_U_PLAT_BITS); + base = ((unsigned long long) 1) << (error_bits + FIO_IO_U_PLAT_BITS); /* Find its bucket number of the group */ k = idx % FIO_IO_U_PLAT_VAL; @@ -118,17 +133,16 @@ return cmp; } -unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, - fio_fp64_t *plist, unsigned int **output, - unsigned int *maxv, unsigned int *minv) +unsigned int calc_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, + fio_fp64_t *plist, unsigned long long **output, + unsigned long long *maxv, unsigned long long *minv) { - unsigned long sum = 0; + unsigned long long sum = 0; unsigned int len, i, j = 0; - unsigned int 
oval_len = 0; - unsigned int *ovals = NULL; - int is_last; + unsigned long long *ovals = NULL; + bool is_last; - *minv = -1U; + *minv = -1ULL; *maxv = 0; len = 0; @@ -146,27 +160,26 @@ if (len > 1) qsort((void *)plist, len, sizeof(plist[0]), double_cmp); + ovals = malloc(len * sizeof(*ovals)); + if (!ovals) + return 0; + /* * Calculate bucket values, note down max and min values */ - is_last = 0; + is_last = false; for (i = 0; i < FIO_IO_U_PLAT_NR && !is_last; i++) { sum += io_u_plat[i]; - while (sum >= (plist[j].u.f / 100.0 * nr)) { + while (sum >= ((long double) plist[j].u.f / 100.0 * nr)) { assert(plist[j].u.f <= 100.0); - if (j == oval_len) { - oval_len += 100; - ovals = realloc(ovals, oval_len * sizeof(unsigned int)); - } - ovals[j] = plat_idx_to_val(i); if (ovals[j] < *minv) *minv = ovals[j]; if (ovals[j] > *maxv) *maxv = ovals[j]; - is_last = (j == len - 1); + is_last = (j == len - 1) != 0; if (is_last) break; @@ -174,6 +187,9 @@ } } + if (!is_last) + log_err("fio: error calculating latency percentiles\n"); + *output = ovals; return len; } @@ -181,58 +197,64 @@ /* * Find and display the p-th percentile of clat */ -static void show_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, - fio_fp64_t *plist, unsigned int precision) -{ - unsigned int len, j = 0, minv, maxv; - unsigned int *ovals; - int is_last, per_line, scale_down; +static void show_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, + fio_fp64_t *plist, unsigned int precision, + const char *pre, struct buf_output *out) +{ + unsigned int divisor, len, i, j = 0; + unsigned long long minv, maxv; + unsigned long long *ovals; + int per_line, scale_down, time_width; + bool is_last; char fmt[32]; len = calc_clat_percentiles(io_u_plat, nr, plist, &ovals, &maxv, &minv); - if (!len) + if (!len || !ovals) goto out; /* - * We default to usecs, but if the value range is such that we - * should scale down to msecs, do that. 
+ * We default to nsecs, but if the value range is such that we + * should scale down to usecs or msecs, do that. */ - if (minv > 2000 && maxv > 99999) { + if (minv > 2000000 && maxv > 99999999ULL) { + scale_down = 2; + divisor = 1000000; + log_buf(out, " %s percentiles (msec):\n |", pre); + } else if (minv > 2000 && maxv > 99999) { scale_down = 1; - log_info(" clat percentiles (msec):\n |"); + divisor = 1000; + log_buf(out, " %s percentiles (usec):\n |", pre); } else { scale_down = 0; - log_info(" clat percentiles (usec):\n |"); + divisor = 1; + log_buf(out, " %s percentiles (nsec):\n |", pre); } - snprintf(fmt, sizeof(fmt), "%%1.%uf", precision); - per_line = (80 - 7) / (precision + 14); - for (j = 0; j < len; j++) { - char fbuf[16], *ptr = fbuf; + time_width = max(5, (int) (log10(maxv / divisor) + 1)); + snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3, + precision, time_width); + /* fmt will be something like " %5.2fth=[%4llu]%c" */ + per_line = (80 - 7) / (precision + 10 + time_width); + for (j = 0; j < len; j++) { /* for formatting */ if (j != 0 && (j % per_line) == 0) - log_info(" |"); + log_buf(out, " |"); /* end of the list */ - is_last = (j == len - 1); - - if (plist[j].u.f < 10.0) - ptr += sprintf(fbuf, " "); + is_last = (j == len - 1) != 0; - snprintf(ptr, sizeof(fbuf), fmt, plist[j].u.f); - - if (scale_down) + for (i = 0; i < scale_down; i++) ovals[j] = (ovals[j] + 999) / 1000; - log_info(" %sth=[%5u]%c", fbuf, ovals[j], is_last ? '\n' : ','); + log_buf(out, fmt, plist[j].u.f, ovals[j], is_last ? 
'\n' : ','); if (is_last) break; if ((j % per_line) == per_line - 1) /* for formatting */ - log_info("\n"); + log_buf(out, "\n"); } out: @@ -240,13 +262,13 @@ free(ovals); } -int calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max, - double *mean, double *dev) +bool calc_lat(struct io_stat *is, unsigned long long *min, + unsigned long long *max, double *mean, double *dev) { double n = (double) is->samples; if (n == 0) - return 0; + return false; *min = is->min_val; *max = is->max_val; @@ -257,16 +279,17 @@ else *dev = 0; - return 1; + return true; } -void show_group_stats(struct group_run_stats *rs) +void show_group_stats(struct group_run_stats *rs, struct buf_output *out) { - char *p1, *p2, *p3, *p4; - const char *ddir_str[] = { " READ", " WRITE" , " TRIM"}; + char *io, *agg, *min, *max; + char *ioalt, *aggalt, *minalt, *maxalt; + const char *str[] = { " READ", " WRITE" , " TRIM"}; int i; - log_info("\nRun status group %d (all jobs):\n", rs->groupid); + log_buf(out, "\nRun status group %d (all jobs):\n", rs->groupid); for (i = 0; i < DDIR_RWDIR_CNT; i++) { const int i2p = is_power_of_2(rs->kb_base); @@ -274,26 +297,32 @@ if (!rs->max_run[i]) continue; - p1 = num2str(rs->io_kb[i], 6, rs->kb_base, i2p, 8); - p2 = num2str(rs->agg[i], 6, rs->kb_base, i2p, rs->unit_base); - p3 = num2str(rs->min_bw[i], 6, rs->kb_base, i2p, rs->unit_base); - p4 = num2str(rs->max_bw[i], 6, rs->kb_base, i2p, rs->unit_base); - - log_info("%s: io=%s, aggrb=%s/s, minb=%s/s, maxb=%s/s," - " mint=%llumsec, maxt=%llumsec\n", - rs->unified_rw_rep ? 
" MIXED" : ddir_str[i], - p1, p2, p3, p4, + io = num2str(rs->iobytes[i], rs->sig_figs, 1, i2p, N2S_BYTE); + ioalt = num2str(rs->iobytes[i], rs->sig_figs, 1, !i2p, N2S_BYTE); + agg = num2str(rs->agg[i], rs->sig_figs, 1, i2p, rs->unit_base); + aggalt = num2str(rs->agg[i], rs->sig_figs, 1, !i2p, rs->unit_base); + min = num2str(rs->min_bw[i], rs->sig_figs, 1, i2p, rs->unit_base); + minalt = num2str(rs->min_bw[i], rs->sig_figs, 1, !i2p, rs->unit_base); + max = num2str(rs->max_bw[i], rs->sig_figs, 1, i2p, rs->unit_base); + maxalt = num2str(rs->max_bw[i], rs->sig_figs, 1, !i2p, rs->unit_base); + log_buf(out, "%s: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n", + rs->unified_rw_rep ? " MIXED" : str[i], + agg, aggalt, min, max, minalt, maxalt, io, ioalt, (unsigned long long) rs->min_run[i], (unsigned long long) rs->max_run[i]); - free(p1); - free(p2); - free(p3); - free(p4); + free(io); + free(agg); + free(min); + free(max); + free(ioalt); + free(aggalt); + free(minalt); + free(maxalt); } } -void stat_calc_dist(unsigned int *map, unsigned long total, double *io_u_dist) +void stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist) { int i; @@ -312,7 +341,7 @@ } static void stat_calc_lat(struct thread_stat *ts, double *dst, - unsigned int *src, int nr) + uint64_t *src, int nr) { unsigned long total = ddir_rw_sum(ts->total_io_u); int i; @@ -331,6 +360,28 @@ } } +/* + * To keep the terse format unaltered, add all of the ns latency + * buckets to the first us latency bucket + */ +static void stat_calc_lat_nu(struct thread_stat *ts, double *io_u_lat_u) +{ + unsigned long ntotal = 0, total = ddir_rw_sum(ts->total_io_u); + int i; + + stat_calc_lat(ts, io_u_lat_u, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR); + + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + ntotal += ts->io_u_lat_n[i]; + + io_u_lat_u[0] += 100.0 * (double) ntotal / (double) total; +} + +void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat) +{ + stat_calc_lat(ts, io_u_lat, ts->io_u_lat_n, 
FIO_IO_U_LAT_N_NR); +} + void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat) { stat_calc_lat(ts, io_u_lat, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR); @@ -341,19 +392,22 @@ stat_calc_lat(ts, io_u_lat, ts->io_u_lat_m, FIO_IO_U_LAT_M_NR); } -static void display_lat(const char *name, unsigned long min, unsigned long max, - double mean, double dev) +static void display_lat(const char *name, unsigned long long min, + unsigned long long max, double mean, double dev, + struct buf_output *out) { - const char *base = "(usec)"; + const char *base = "(nsec)"; char *minp, *maxp; - if (!usec_to_msec(&min, &max, &mean, &dev)) + if (nsec_to_msec(&min, &max, &mean, &dev)) base = "(msec)"; + else if (nsec_to_usec(&min, &max, &mean, &dev)) + base = "(usec)"; - minp = num2str(min, 6, 1, 0, 0); - maxp = num2str(max, 6, 1, 0, 0); + minp = num2str(min, 6, 1, 0, N2S_NONE); + maxp = num2str(max, 6, 1, 0, N2S_NONE); - log_info(" %s %s: min=%s, max=%s, avg=%5.02f," + log_buf(out, " %s %s: min=%s, max=%s, avg=%5.02f," " stdev=%5.02f\n", name, base, minp, maxp, mean, dev); free(minp); @@ -361,15 +415,27 @@ } static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, - int ddir) + int ddir, struct buf_output *out) { - const char *ddir_str[] = { "read ", "write", "trim" }; - unsigned long min, max, runt; - unsigned long long bw, iops; + unsigned long runt; + unsigned long long min, max, bw, iops; double mean, dev; - char *io_p, *bw_p, *iops_p; + char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL; int i2p; + if (ddir_sync(ddir)) { + if (calc_lat(&ts->sync_stat, &min, &max, &mean, &dev)) { + log_buf(out, " %s:\n", "fsync/fdatasync/sync_file_range"); + display_lat(io_ddir_name(ddir), min, max, mean, dev, out); + show_clat_percentiles(ts->io_u_sync_plat, + ts->sync_stat.samples, + ts->percentile_list, + ts->percentile_precision, + io_ddir_name(ddir), out); + } + return; + } + assert(ddir_rw(ddir)); if (!ts->runtime[ddir]) @@ -379,37 +445,76 @@ runt = ts->runtime[ddir]; 
bw = (1000 * ts->io_bytes[ddir]) / runt; - io_p = num2str(ts->io_bytes[ddir], 6, 1, i2p, 8); - bw_p = num2str(bw, 6, 1, i2p, ts->unit_base); + io_p = num2str(ts->io_bytes[ddir], ts->sig_figs, 1, i2p, N2S_BYTE); + bw_p = num2str(bw, ts->sig_figs, 1, i2p, ts->unit_base); + bw_p_alt = num2str(bw, ts->sig_figs, 1, !i2p, ts->unit_base); iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt; - iops_p = num2str(iops, 6, 1, 0, 0); - - log_info(" %s: io=%s, bw=%s/s, iops=%s, runt=%6llumsec\n", - rs->unified_rw_rep ? "mixed" : ddir_str[ddir], - io_p, bw_p, iops_p, - (unsigned long long) ts->runtime[ddir]); + iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_NONE); + if (ddir == DDIR_WRITE) + post_st = zbd_write_status(ts); + else if (ddir == DDIR_READ && ts->cachehit && ts->cachemiss) { + uint64_t total; + double hit; + + total = ts->cachehit + ts->cachemiss; + hit = (double) ts->cachehit / (double) total; + hit *= 100.0; + if (asprintf(&post_st, "; Cachehit=%0.2f%%", hit) < 0) + post_st = NULL; + } + + log_buf(out, " %s: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n", + rs->unified_rw_rep ? "mixed" : io_ddir_name(ddir), + iops_p, bw_p, bw_p_alt, io_p, + (unsigned long long) ts->runtime[ddir], + post_st ? : ""); + free(post_st); free(io_p); free(bw_p); + free(bw_p_alt); free(iops_p); if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) - display_lat("slat", min, max, mean, dev); + display_lat("slat", min, max, mean, dev, out); if (calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev)) - display_lat("clat", min, max, mean, dev); + display_lat("clat", min, max, mean, dev, out); if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) - display_lat(" lat", min, max, mean, dev); + display_lat(" lat", min, max, mean, dev, out); + + if (ts->clat_percentiles || ts->lat_percentiles) { + const char *name = ts->clat_percentiles ? 
"clat" : " lat"; + uint64_t samples; + + if (ts->clat_percentiles) + samples = ts->clat_stat[ddir].samples; + else + samples = ts->lat_stat[ddir].samples; - if (ts->clat_percentiles) { show_clat_percentiles(ts->io_u_plat[ddir], - ts->clat_stat[ddir].samples, + samples, ts->percentile_list, - ts->percentile_precision); + ts->percentile_precision, name, out); } if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) { double p_of_agg = 100.0, fkb_base = (double)rs->kb_base; - const char *bw_str = (rs->unit_base == 1 ? "Kbit" : "KB"); + const char *bw_str; + + if ((rs->unit_base == 1) && i2p) + bw_str = "Kibit"; + else if (rs->unit_base == 1) + bw_str = "kbit"; + else if (i2p) + bw_str = "KiB"; + else + bw_str = "kB"; + + if (rs->agg[ddir]) { + p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024); + if (p_of_agg > 100.0) + p_of_agg = 100.0; + } if (rs->unit_base == 1) { min *= 8.0; @@ -418,123 +523,311 @@ dev *= 8.0; } - if (rs->agg[ddir]) { - p_of_agg = mean * 100 / (double) rs->agg[ddir]; - if (p_of_agg > 100.0) - p_of_agg = 100.0; - } - if (mean > fkb_base * fkb_base) { min /= fkb_base; max /= fkb_base; mean /= fkb_base; dev /= fkb_base; - bw_str = (rs->unit_base == 1 ? "Mbit" : "MB"); + bw_str = (rs->unit_base == 1 ? 
"Mibit" : "MiB"); } - log_info(" bw (%-4s/s): min=%5lu, max=%5lu, per=%3.2f%%," - " avg=%5.02f, stdev=%5.02f\n", bw_str, min, max, - p_of_agg, mean, dev); + log_buf(out, " bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, " + "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n", + bw_str, min, max, p_of_agg, mean, dev, + (&ts->bw_stat[ddir])->samples); + } + if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) { + log_buf(out, " iops : min=%5llu, max=%5llu, " + "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n", + min, max, mean, dev, (&ts->iops_stat[ddir])->samples); } } -static int show_lat(double *io_u_lat, int nr, const char **ranges, - const char *msg) +static bool show_lat(double *io_u_lat, int nr, const char **ranges, + const char *msg, struct buf_output *out) { - int new_line = 1, i, line = 0, shown = 0; + bool new_line = true, shown = false; + int i, line = 0; for (i = 0; i < nr; i++) { if (io_u_lat[i] <= 0.0) continue; - shown = 1; + shown = true; if (new_line) { if (line) - log_info("\n"); - log_info(" lat (%s) : ", msg); - new_line = 0; + log_buf(out, "\n"); + log_buf(out, " lat (%s) : ", msg); + new_line = false; line = 0; } if (line) - log_info(", "); - log_info("%s%3.2f%%", ranges[i], io_u_lat[i]); + log_buf(out, ", "); + log_buf(out, "%s%3.2f%%", ranges[i], io_u_lat[i]); line++; if (line == 5) - new_line = 1; + new_line = true; } if (shown) - log_info("\n"); + log_buf(out, "\n"); + + return true; +} + +static void show_lat_n(double *io_u_lat_n, struct buf_output *out) +{ + const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", + "250=", "500=", "750=", "1000=", }; - return shown; + show_lat(io_u_lat_n, FIO_IO_U_LAT_N_NR, ranges, "nsec", out); } -static void show_lat_u(double *io_u_lat_u) +static void show_lat_u(double *io_u_lat_u, struct buf_output *out) { const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", "250=", "500=", "750=", "1000=", }; - show_lat(io_u_lat_u, FIO_IO_U_LAT_U_NR, ranges, "usec"); + show_lat(io_u_lat_u, 
FIO_IO_U_LAT_U_NR, ranges, "usec", out); } -static void show_lat_m(double *io_u_lat_m) +static void show_lat_m(double *io_u_lat_m, struct buf_output *out) { const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", "250=", "500=", "750=", "1000=", "2000=", ">=2000=", }; - show_lat(io_u_lat_m, FIO_IO_U_LAT_M_NR, ranges, "msec"); + show_lat(io_u_lat_m, FIO_IO_U_LAT_M_NR, ranges, "msec", out); } -static void show_latencies(struct thread_stat *ts) +static void show_latencies(struct thread_stat *ts, struct buf_output *out) { + double io_u_lat_n[FIO_IO_U_LAT_N_NR]; double io_u_lat_u[FIO_IO_U_LAT_U_NR]; double io_u_lat_m[FIO_IO_U_LAT_M_NR]; + stat_calc_lat_n(ts, io_u_lat_n); stat_calc_lat_u(ts, io_u_lat_u); stat_calc_lat_m(ts, io_u_lat_m); - show_lat_u(io_u_lat_u); - show_lat_m(io_u_lat_m); + show_lat_n(io_u_lat_n, out); + show_lat_u(io_u_lat_u, out); + show_lat_m(io_u_lat_m, out); +} + +static int block_state_category(int block_state) +{ + switch (block_state) { + case BLOCK_STATE_UNINIT: + return 0; + case BLOCK_STATE_TRIMMED: + case BLOCK_STATE_WRITTEN: + return 1; + case BLOCK_STATE_WRITE_FAILURE: + case BLOCK_STATE_TRIM_FAILURE: + return 2; + default: + /* Silence compile warning on some BSDs and have a return */ + assert(0); + return -1; + } +} + +static int compare_block_infos(const void *bs1, const void *bs2) +{ + uint64_t block1 = *(uint64_t *)bs1; + uint64_t block2 = *(uint64_t *)bs2; + int state1 = BLOCK_INFO_STATE(block1); + int state2 = BLOCK_INFO_STATE(block2); + int bscat1 = block_state_category(state1); + int bscat2 = block_state_category(state2); + int cycles1 = BLOCK_INFO_TRIMS(block1); + int cycles2 = BLOCK_INFO_TRIMS(block2); + + if (bscat1 < bscat2) + return -1; + if (bscat1 > bscat2) + return 1; + + if (cycles1 < cycles2) + return -1; + if (cycles1 > cycles2) + return 1; + + if (state1 < state2) + return -1; + if (state1 > state2) + return 1; + + assert(block1 == block2); + return 0; +} + +static int calc_block_percentiles(int nr_block_infos, 
uint32_t *block_infos, + fio_fp64_t *plist, unsigned int **percentiles, + unsigned int *types) +{ + int len = 0; + int i, nr_uninit; + + qsort(block_infos, nr_block_infos, sizeof(uint32_t), compare_block_infos); + + while (len < FIO_IO_U_LIST_MAX_LEN && plist[len].u.f != 0.0) + len++; + + if (!len) + return 0; + + /* + * Sort the percentile list. Note that it may already be sorted if + * we are using the default values, but since it's a short list this + * isn't a worry. Also note that this does not work for NaN values. + */ + if (len > 1) + qsort((void *)plist, len, sizeof(plist[0]), double_cmp); + + /* Start only after the uninit entries end */ + for (nr_uninit = 0; + nr_uninit < nr_block_infos + && BLOCK_INFO_STATE(block_infos[nr_uninit]) == BLOCK_STATE_UNINIT; + nr_uninit ++) + ; + + if (nr_uninit == nr_block_infos) + return 0; + + *percentiles = calloc(len, sizeof(**percentiles)); + + for (i = 0; i < len; i++) { + int idx = (plist[i].u.f * (nr_block_infos - nr_uninit) / 100) + + nr_uninit; + (*percentiles)[i] = BLOCK_INFO_TRIMS(block_infos[idx]); + } + + memset(types, 0, sizeof(*types) * BLOCK_STATE_COUNT); + for (i = 0; i < nr_block_infos; i++) + types[BLOCK_INFO_STATE(block_infos[i])]++; + + return len; +} + +static const char *block_state_names[] = { + [BLOCK_STATE_UNINIT] = "unwritten", + [BLOCK_STATE_TRIMMED] = "trimmed", + [BLOCK_STATE_WRITTEN] = "written", + [BLOCK_STATE_TRIM_FAILURE] = "trim failure", + [BLOCK_STATE_WRITE_FAILURE] = "write failure", +}; + +static void show_block_infos(int nr_block_infos, uint32_t *block_infos, + fio_fp64_t *plist, struct buf_output *out) +{ + int len, pos, i; + unsigned int *percentiles = NULL; + unsigned int block_state_counts[BLOCK_STATE_COUNT]; + + len = calc_block_percentiles(nr_block_infos, block_infos, plist, + &percentiles, block_state_counts); + + log_buf(out, " block lifetime percentiles :\n |"); + pos = 0; + for (i = 0; i < len; i++) { + uint32_t block_info = percentiles[i]; +#define LINE_LENGTH 75 + char 
str[LINE_LENGTH]; + int strln = snprintf(str, LINE_LENGTH, " %3.2fth=%u%c", + plist[i].u.f, block_info, + i == len - 1 ? '\n' : ','); + assert(strln < LINE_LENGTH); + if (pos + strln > LINE_LENGTH) { + pos = 0; + log_buf(out, "\n |"); + } + log_buf(out, "%s", str); + pos += strln; +#undef LINE_LENGTH + } + if (percentiles) + free(percentiles); + + log_buf(out, " states :"); + for (i = 0; i < BLOCK_STATE_COUNT; i++) + log_buf(out, " %s=%u%c", + block_state_names[i], block_state_counts[i], + i == BLOCK_STATE_COUNT - 1 ? '\n' : ','); } -void show_thread_status(struct thread_stat *ts, struct group_run_stats *rs) +static void show_ss_normal(struct thread_stat *ts, struct buf_output *out) +{ + char *p1, *p1alt, *p2; + unsigned long long bw_mean, iops_mean; + const int i2p = is_power_of_2(ts->kb_base); + + if (!ts->ss_dur) + return; + + bw_mean = steadystate_bw_mean(ts); + iops_mean = steadystate_iops_mean(ts); + + p1 = num2str(bw_mean / ts->kb_base, ts->sig_figs, ts->kb_base, i2p, ts->unit_base); + p1alt = num2str(bw_mean / ts->kb_base, ts->sig_figs, ts->kb_base, !i2p, ts->unit_base); + p2 = num2str(iops_mean, ts->sig_figs, 1, 0, N2S_NONE); + + log_buf(out, " steadystate : attained=%s, bw=%s (%s), iops=%s, %s%s=%.3f%s\n", + ts->ss_state & FIO_SS_ATTAINED ? "yes" : "no", + p1, p1alt, p2, + ts->ss_state & FIO_SS_IOPS ? "iops" : "bw", + ts->ss_state & FIO_SS_SLOPE ? " slope": " mean dev", + ts->ss_criterion.u.f, + ts->ss_state & FIO_SS_PCT ? 
"%" : ""); + + free(p1); + free(p1alt); + free(p2); +} + +static void show_thread_status_normal(struct thread_stat *ts, + struct group_run_stats *rs, + struct buf_output *out) { double usr_cpu, sys_cpu; unsigned long runtime; double io_u_dist[FIO_IO_U_MAP_NR]; time_t time_p; - char time_buf[64]; + char time_buf[32]; - if (!(ts->io_bytes[DDIR_READ] + ts->io_bytes[DDIR_WRITE] + - ts->io_bytes[DDIR_TRIM]) && !(ts->total_io_u[DDIR_READ] + - ts->total_io_u[DDIR_WRITE] + ts->total_io_u[DDIR_TRIM])) + if (!ddir_rw_sum(ts->io_bytes) && !ddir_rw_sum(ts->total_io_u)) return; + + memset(time_buf, 0, sizeof(time_buf)); time(&time_p); os_ctime_r((const time_t *) &time_p, time_buf, sizeof(time_buf)); if (!ts->error) { - log_info("%s: (groupid=%d, jobs=%d): err=%2d: pid=%d: %s", + log_buf(out, "%s: (groupid=%d, jobs=%d): err=%2d: pid=%d: %s", ts->name, ts->groupid, ts->members, ts->error, (int) ts->pid, time_buf); } else { - log_info("%s: (groupid=%d, jobs=%d): err=%2d (%s): pid=%d: %s", + log_buf(out, "%s: (groupid=%d, jobs=%d): err=%2d (%s): pid=%d: %s", ts->name, ts->groupid, ts->members, ts->error, ts->verror, (int) ts->pid, time_buf); } if (strlen(ts->description)) - log_info(" Description : [%s]\n", ts->description); + log_buf(out, " Description : [%s]\n", ts->description); if (ts->io_bytes[DDIR_READ]) - show_ddir_status(rs, ts, DDIR_READ); + show_ddir_status(rs, ts, DDIR_READ, out); if (ts->io_bytes[DDIR_WRITE]) - show_ddir_status(rs, ts, DDIR_WRITE); + show_ddir_status(rs, ts, DDIR_WRITE, out); if (ts->io_bytes[DDIR_TRIM]) - show_ddir_status(rs, ts, DDIR_TRIM); + show_ddir_status(rs, ts, DDIR_TRIM, out); + + show_latencies(ts, out); - show_latencies(ts); + if (ts->sync_stat.samples) + show_ddir_status(rs, ts, DDIR_SYNC, out); runtime = ts->total_run_time; if (runtime) { @@ -547,56 +840,75 @@ sys_cpu = 0; } - log_info(" cpu : usr=%3.2f%%, sys=%3.2f%%, ctx=%llu," + log_buf(out, " cpu : usr=%3.2f%%, sys=%3.2f%%, ctx=%llu," " majf=%llu, minf=%llu\n", usr_cpu, sys_cpu, 
(unsigned long long) ts->ctx, (unsigned long long) ts->majf, (unsigned long long) ts->minf); stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); - log_info(" IO depths : 1=%3.1f%%, 2=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%," + log_buf(out, " IO depths : 1=%3.1f%%, 2=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%," " 16=%3.1f%%, 32=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3], io_u_dist[4], io_u_dist[5], io_u_dist[6]); stat_calc_dist(ts->io_u_submit, ts->total_submit, io_u_dist); - log_info(" submit : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%," + log_buf(out, " submit : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%," " 32=%3.1f%%, 64=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3], io_u_dist[4], io_u_dist[5], io_u_dist[6]); stat_calc_dist(ts->io_u_complete, ts->total_complete, io_u_dist); - log_info(" complete : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%," + log_buf(out, " complete : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%," " 32=%3.1f%%, 64=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3], io_u_dist[4], io_u_dist[5], io_u_dist[6]); - log_info(" issued : total=r=%llu/w=%llu/d=%llu," - " short=r=%llu/w=%llu/d=%llu\n", + log_buf(out, " issued rwts: total=%llu,%llu,%llu,%llu" + " short=%llu,%llu,%llu,0" + " dropped=%llu,%llu,%llu,0\n", (unsigned long long) ts->total_io_u[0], (unsigned long long) ts->total_io_u[1], (unsigned long long) ts->total_io_u[2], + (unsigned long long) ts->total_io_u[3], (unsigned long long) ts->short_io_u[0], (unsigned long long) ts->short_io_u[1], - (unsigned long long) ts->short_io_u[2]); + (unsigned long long) ts->short_io_u[2], + (unsigned long long) ts->drop_io_u[0], + (unsigned long long) ts->drop_io_u[1], + (unsigned long long) ts->drop_io_u[2]); if (ts->continue_on_error) { - log_info(" errors : total=%llu, first_error=%d/<%s>\n", + log_buf(out, " errors : total=%llu, first_error=%d/<%s>\n", (unsigned long long)ts->total_err_count, 
ts->first_error, strerror(ts->first_error)); } + if (ts->latency_depth) { + log_buf(out, " latency : target=%llu, window=%llu, percentile=%.2f%%, depth=%u\n", + (unsigned long long)ts->latency_target, + (unsigned long long)ts->latency_window, + ts->latency_percentile.u.f, + ts->latency_depth); + } + + if (ts->nr_block_infos) + show_block_infos(ts->nr_block_infos, ts->block_infos, + ts->percentile_list, out); + + if (ts->ss_dur) + show_ss_normal(ts, out); } static void show_ddir_status_terse(struct thread_stat *ts, - struct group_run_stats *rs, int ddir) + struct group_run_stats *rs, int ddir, + int ver, struct buf_output *out) { - unsigned long min, max; - unsigned long long bw, iops; - unsigned int *ovals = NULL; + unsigned long long min, max, minv, maxv, bw, iops; + unsigned long long *ovals = NULL; double mean, dev; - unsigned int len, minv, maxv; - int i; + unsigned int len; + int i, bw_stat; assert(ddir_rw(ddir)); @@ -604,25 +916,25 @@ if (ts->runtime[ddir]) { uint64_t runt = ts->runtime[ddir]; - bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; + bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; /* KiB/s */ iops = (1000 * (uint64_t) ts->total_io_u[ddir]) / runt; } - log_info(";%llu;%llu;%llu;%llu", + log_buf(out, ";%llu;%llu;%llu;%llu", (unsigned long long) ts->io_bytes[ddir] >> 10, bw, iops, (unsigned long long) ts->runtime[ddir]); if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) - log_info(";%lu;%lu;%f;%f", min, max, mean, dev); + log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000); else - log_info(";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0); + log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); if (calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev)) - log_info(";%lu;%lu;%f;%f", min, max, mean, dev); + log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000); else - log_info(";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0); + log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); - if (ts->clat_percentiles) { + if 
(ts->clat_percentiles || ts->lat_percentiles) { len = calc_clat_percentiles(ts->io_u_plat[ddir], ts->clat_stat[ddir].samples, ts->percentile_list, &ovals, &maxv, @@ -632,127 +944,204 @@ for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { if (i >= len) { - log_info(";0%%=0"); + log_buf(out, ";0%%=0"); continue; } - log_info(";%f%%=%u", ts->percentile_list[i].u.f, ovals[i]); + log_buf(out, ";%f%%=%llu", ts->percentile_list[i].u.f, ovals[i]/1000); } if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) - log_info(";%lu;%lu;%f;%f", min, max, mean, dev); + log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000); else - log_info(";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0); + log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); if (ovals) free(ovals); - if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) { + bw_stat = calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev); + if (bw_stat) { double p_of_agg = 100.0; if (rs->agg[ddir]) { - p_of_agg = mean * 100 / (double) rs->agg[ddir]; + p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024); if (p_of_agg > 100.0) p_of_agg = 100.0; } - log_info(";%lu;%lu;%f%%;%f;%f", min, max, p_of_agg, mean, dev); + log_buf(out, ";%llu;%llu;%f%%;%f;%f", min, max, p_of_agg, mean, dev); } else - log_info(";%lu;%lu;%f%%;%f;%f", 0UL, 0UL, 0.0, 0.0, 0.0); + log_buf(out, ";%llu;%llu;%f%%;%f;%f", 0ULL, 0ULL, 0.0, 0.0, 0.0); + + if (ver == 5) { + if (bw_stat) + log_buf(out, ";%" PRIu64, (&ts->bw_stat[ddir])->samples); + else + log_buf(out, ";%lu", 0UL); + + if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) + log_buf(out, ";%llu;%llu;%f;%f;%" PRIu64, min, max, + mean, dev, (&ts->iops_stat[ddir])->samples); + else + log_buf(out, ";%llu;%llu;%f;%f;%lu", 0ULL, 0ULL, 0.0, 0.0, 0UL); + } } static void add_ddir_status_json(struct thread_stat *ts, struct group_run_stats *rs, int ddir, struct json_object *parent) { - unsigned long min, max; - unsigned long long bw, iops; - unsigned int *ovals = NULL; - double mean, dev; - 
unsigned int len, minv, maxv; + unsigned long long min, max, minv, maxv; + unsigned long long bw_bytes, bw; + unsigned long long *ovals = NULL; + double mean, dev, iops; + unsigned int len; int i; - const char *ddirname[] = {"read", "write", "trim"}; - struct json_object *dir_object, *tmp_object, *percentile_object; + struct json_object *dir_object, *tmp_object, *percentile_object, *clat_bins_object = NULL; char buf[120]; double p_of_agg = 100.0; - assert(ddir_rw(ddir)); + assert(ddir_rw(ddir) || ddir_sync(ddir)); if (ts->unified_rw_rep && ddir != DDIR_READ) return; dir_object = json_create_object(); json_object_add_value_object(parent, - ts->unified_rw_rep ? "mixed" : ddirname[ddir], dir_object); - - iops = bw = 0; - if (ts->runtime[ddir]) { - uint64_t runt = ts->runtime[ddir]; + ts->unified_rw_rep ? "mixed" : io_ddir_name(ddir), dir_object); - bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; - iops = (1000 * (uint64_t) ts->total_io_u[ddir]) / runt; - } + if (ddir_rw(ddir)) { + bw_bytes = 0; + bw = 0; + iops = 0.0; + if (ts->runtime[ddir]) { + uint64_t runt = ts->runtime[ddir]; + + bw_bytes = ((1000 * ts->io_bytes[ddir]) / runt); /* Bytes/s */ + bw = bw_bytes / 1024; /* KiB/s */ + iops = (1000.0 * (uint64_t) ts->total_io_u[ddir]) / runt; + } - json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir] >> 10); - json_object_add_value_int(dir_object, "bw", bw); - json_object_add_value_int(dir_object, "iops", iops); - json_object_add_value_int(dir_object, "runtime", ts->runtime[ddir]); + json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir]); + json_object_add_value_int(dir_object, "io_kbytes", ts->io_bytes[ddir] >> 10); + json_object_add_value_int(dir_object, "bw_bytes", bw_bytes); + json_object_add_value_int(dir_object, "bw", bw); + json_object_add_value_float(dir_object, "iops", iops); + json_object_add_value_int(dir_object, "runtime", ts->runtime[ddir]); + json_object_add_value_int(dir_object, "total_ios", ts->total_io_u[ddir]); + 
json_object_add_value_int(dir_object, "short_ios", ts->short_io_u[ddir]); + json_object_add_value_int(dir_object, "drop_ios", ts->drop_io_u[ddir]); + + if (!calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) { + min = max = 0; + mean = dev = 0.0; + } + tmp_object = json_create_object(); + json_object_add_value_object(dir_object, "slat_ns", tmp_object); + json_object_add_value_int(tmp_object, "min", min); + json_object_add_value_int(tmp_object, "max", max); + json_object_add_value_float(tmp_object, "mean", mean); + json_object_add_value_float(tmp_object, "stddev", dev); + + if (!calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev)) { + min = max = 0; + mean = dev = 0.0; + } + tmp_object = json_create_object(); + json_object_add_value_object(dir_object, "clat_ns", tmp_object); + json_object_add_value_int(tmp_object, "min", min); + json_object_add_value_int(tmp_object, "max", max); + json_object_add_value_float(tmp_object, "mean", mean); + json_object_add_value_float(tmp_object, "stddev", dev); + } else { + if (!calc_lat(&ts->sync_stat, &min, &max, &mean, &dev)) { + min = max = 0; + mean = dev = 0.0; + } - if (!calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) { - min = max = 0; - mean = dev = 0.0; + tmp_object = json_create_object(); + json_object_add_value_object(dir_object, "lat_ns", tmp_object); + json_object_add_value_int(dir_object, "total_ios", ts->total_io_u[DDIR_SYNC]); + json_object_add_value_int(tmp_object, "min", min); + json_object_add_value_int(tmp_object, "max", max); + json_object_add_value_float(tmp_object, "mean", mean); + json_object_add_value_float(tmp_object, "stddev", dev); } - tmp_object = json_create_object(); - json_object_add_value_object(dir_object, "slat", tmp_object); - json_object_add_value_int(tmp_object, "min", min); - json_object_add_value_int(tmp_object, "max", max); - json_object_add_value_float(tmp_object, "mean", mean); - json_object_add_value_float(tmp_object, "stddev", dev); - if (!calc_lat(&ts->clat_stat[ddir], 
&min, &max, &mean, &dev)) { - min = max = 0; - mean = dev = 0.0; - } - tmp_object = json_create_object(); - json_object_add_value_object(dir_object, "clat", tmp_object); - json_object_add_value_int(tmp_object, "min", min); - json_object_add_value_int(tmp_object, "max", max); - json_object_add_value_float(tmp_object, "mean", mean); - json_object_add_value_float(tmp_object, "stddev", dev); + if (ts->clat_percentiles || ts->lat_percentiles) { + if (ddir_rw(ddir)) { + uint64_t samples; - if (ts->clat_percentiles) { - len = calc_clat_percentiles(ts->io_u_plat[ddir], - ts->clat_stat[ddir].samples, + if (ts->clat_percentiles) + samples = ts->clat_stat[ddir].samples; + else + samples = ts->lat_stat[ddir].samples; + + len = calc_clat_percentiles(ts->io_u_plat[ddir], + samples, ts->percentile_list, &ovals, + &maxv, &minv); + } else { + len = calc_clat_percentiles(ts->io_u_sync_plat, + ts->sync_stat.samples, ts->percentile_list, &ovals, &maxv, &minv); + } + + if (len > FIO_IO_U_LIST_MAX_LEN) + len = FIO_IO_U_LIST_MAX_LEN; } else len = 0; percentile_object = json_create_object(); - json_object_add_value_object(tmp_object, "percentile", percentile_object); - for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { - if (i >= len) { - json_object_add_value_int(percentile_object, "0.00", 0); - continue; - } + if (ts->clat_percentiles) + json_object_add_value_object(tmp_object, "percentile", percentile_object); + for (i = 0; i < len; i++) { snprintf(buf, sizeof(buf), "%f", ts->percentile_list[i].u.f); json_object_add_value_int(percentile_object, (const char *)buf, ovals[i]); } + if (output_format & FIO_OUTPUT_JSON_PLUS) { + clat_bins_object = json_create_object(); + if (ts->clat_percentiles) + json_object_add_value_object(tmp_object, "bins", clat_bins_object); + + for(i = 0; i < FIO_IO_U_PLAT_NR; i++) { + if (ddir_rw(ddir)) { + if (ts->io_u_plat[ddir][i]) { + snprintf(buf, sizeof(buf), "%llu", plat_idx_to_val(i)); + json_object_add_value_int(clat_bins_object, (const char *)buf, 
ts->io_u_plat[ddir][i]); + } + } else { + if (ts->io_u_sync_plat[i]) { + snprintf(buf, sizeof(buf), "%llu", plat_idx_to_val(i)); + json_object_add_value_int(clat_bins_object, (const char *)buf, ts->io_u_sync_plat[i]); + } + } + } + } + + if (!ddir_rw(ddir)) + return; + if (!calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) { min = max = 0; mean = dev = 0.0; } tmp_object = json_create_object(); - json_object_add_value_object(dir_object, "lat", tmp_object); + json_object_add_value_object(dir_object, "lat_ns", tmp_object); json_object_add_value_int(tmp_object, "min", min); json_object_add_value_int(tmp_object, "max", max); json_object_add_value_float(tmp_object, "mean", mean); json_object_add_value_float(tmp_object, "stddev", dev); + if (ts->lat_percentiles) + json_object_add_value_object(tmp_object, "percentile", percentile_object); + if (output_format & FIO_OUTPUT_JSON_PLUS && ts->lat_percentiles) + json_object_add_value_object(tmp_object, "bins", clat_bins_object); + if (ovals) free(ovals); if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) { if (rs->agg[ddir]) { - p_of_agg = mean * 100 / (double) rs->agg[ddir]; + p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024); if (p_of_agg > 100.0) p_of_agg = 100.0; } @@ -762,13 +1151,37 @@ } json_object_add_value_int(dir_object, "bw_min", min); json_object_add_value_int(dir_object, "bw_max", max); - json_object_add_value_float(dir_object, "bw_agg", mean); + json_object_add_value_float(dir_object, "bw_agg", p_of_agg); json_object_add_value_float(dir_object, "bw_mean", mean); json_object_add_value_float(dir_object, "bw_dev", dev); + json_object_add_value_int(dir_object, "bw_samples", + (&ts->bw_stat[ddir])->samples); + + if (!calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) { + min = max = 0; + mean = dev = 0.0; + } + json_object_add_value_int(dir_object, "iops_min", min); + json_object_add_value_int(dir_object, "iops_max", max); + json_object_add_value_float(dir_object, "iops_mean", mean); + 
json_object_add_value_float(dir_object, "iops_stddev", dev); + json_object_add_value_int(dir_object, "iops_samples", + (&ts->iops_stat[ddir])->samples); + + if (ts->cachehit + ts->cachemiss) { + uint64_t total; + double hit; + + total = ts->cachehit + ts->cachemiss; + hit = (double) ts->cachehit / (double) total; + hit *= 100.0; + json_object_add_value_float(dir_object, "cachehit", hit); + } } -static void show_thread_status_terse_v2(struct thread_stat *ts, - struct group_run_stats *rs) +static void show_thread_status_terse_all(struct thread_stat *ts, + struct group_run_stats *rs, int ver, + struct buf_output *out) { double io_u_dist[FIO_IO_U_MAP_NR]; double io_u_lat_u[FIO_IO_U_LAT_U_NR]; @@ -777,13 +1190,19 @@ int i; /* General Info */ - log_info("2;%s;%d;%d", ts->name, ts->groupid, ts->error); + if (ver == 2) + log_buf(out, "2;%s;%d;%d", ts->name, ts->groupid, ts->error); + else + log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string, + ts->name, ts->groupid, ts->error); + /* Log Read Status */ - show_ddir_status_terse(ts, rs, DDIR_READ); + show_ddir_status_terse(ts, rs, DDIR_READ, ver, out); /* Log Write Status */ - show_ddir_status_terse(ts, rs, DDIR_WRITE); + show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out); /* Log Trim Status */ - show_ddir_status_terse(ts, rs, DDIR_TRIM); + if (ver == 2 || ver == 4 || ver == 5) + show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out); /* CPU Usage */ if (ts->total_run_time) { @@ -796,124 +1215,102 @@ sys_cpu = 0; } - log_info(";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu, + log_buf(out, ";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu, (unsigned long long) ts->ctx, (unsigned long long) ts->majf, (unsigned long long) ts->minf); /* Calc % distribution of IO depths, usecond, msecond latency */ stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); - stat_calc_lat_u(ts, io_u_lat_u); + stat_calc_lat_nu(ts, io_u_lat_u); stat_calc_lat_m(ts, io_u_lat_m); /* Only show fixed 7 I/O depth levels*/ - 
log_info(";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%", + log_buf(out, ";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%", io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3], io_u_dist[4], io_u_dist[5], io_u_dist[6]); /* Microsecond latency */ for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) - log_info(";%3.2f%%", io_u_lat_u[i]); + log_buf(out, ";%3.2f%%", io_u_lat_u[i]); /* Millisecond latency */ for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) - log_info(";%3.2f%%", io_u_lat_m[i]); + log_buf(out, ";%3.2f%%", io_u_lat_m[i]); + + /* disk util stats, if any */ + if (ver >= 3 && is_running_backend()) + show_disk_util(1, NULL, out); + /* Additional output if continue_on_error set - default off*/ if (ts->continue_on_error) - log_info(";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error); - log_info("\n"); + log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error); /* Additional output if description is set */ - if (ts->description) - log_info(";%s", ts->description); + if (strlen(ts->description)) { + if (ver == 2) + log_buf(out, "\n"); + log_buf(out, ";%s", ts->description); + } - log_info("\n"); + log_buf(out, "\n"); } -static void show_thread_status_terse_v3_v4(struct thread_stat *ts, - struct group_run_stats *rs, int ver) +static void json_add_job_opts(struct json_object *root, const char *name, + struct flist_head *opt_list) { - double io_u_dist[FIO_IO_U_MAP_NR]; - double io_u_lat_u[FIO_IO_U_LAT_U_NR]; - double io_u_lat_m[FIO_IO_U_LAT_M_NR]; - double usr_cpu, sys_cpu; - int i; - - /* General Info */ - log_info("%d;%s;%s;%d;%d", ver, fio_version_string, - ts->name, ts->groupid, ts->error); - /* Log Read Status */ - show_ddir_status_terse(ts, rs, DDIR_READ); - /* Log Write Status */ - show_ddir_status_terse(ts, rs, DDIR_WRITE); - /* Log Trim Status */ - if (ver == 4) - show_ddir_status_terse(ts, rs, DDIR_TRIM); - - /* CPU Usage */ - if (ts->total_run_time) { - double runt = (double) ts->total_run_time; - - usr_cpu = 
(double) ts->usr_time * 100 / runt; - sys_cpu = (double) ts->sys_time * 100 / runt; - } else { - usr_cpu = 0; - sys_cpu = 0; - } - - log_info(";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu, - (unsigned long long) ts->ctx, - (unsigned long long) ts->majf, - (unsigned long long) ts->minf); - - /* Calc % distribution of IO depths, usecond, msecond latency */ - stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); - stat_calc_lat_u(ts, io_u_lat_u); - stat_calc_lat_m(ts, io_u_lat_m); - - /* Only show fixed 7 I/O depth levels*/ - log_info(";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%", - io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3], - io_u_dist[4], io_u_dist[5], io_u_dist[6]); - - /* Microsecond latency */ - for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) - log_info(";%3.2f%%", io_u_lat_u[i]); - /* Millisecond latency */ - for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) - log_info(";%3.2f%%", io_u_lat_m[i]); + struct json_object *dir_object; + struct flist_head *entry; + struct print_option *p; - /* disk util stats, if any */ - show_disk_util(1, NULL); + if (flist_empty(opt_list)) + return; - /* Additional output if continue_on_error set - default off*/ - if (ts->continue_on_error) - log_info(";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error); + dir_object = json_create_object(); + json_object_add_value_object(root, name, dir_object); - /* Additional output if description is set */ - if (strlen(ts->description)) - log_info(";%s", ts->description); + flist_for_each(entry, opt_list) { + const char *pos = ""; - log_info("\n"); + p = flist_entry(entry, struct print_option, list); + if (p->value) + pos = p->value; + json_object_add_value_string(dir_object, p->name, pos); + } } static struct json_object *show_thread_status_json(struct thread_stat *ts, - struct group_run_stats *rs) + struct group_run_stats *rs, + struct flist_head *opt_list) { struct json_object *root, *tmp; + struct jobs_eta *je; double io_u_dist[FIO_IO_U_MAP_NR]; + 
double io_u_lat_n[FIO_IO_U_LAT_N_NR]; double io_u_lat_u[FIO_IO_U_LAT_U_NR]; double io_u_lat_m[FIO_IO_U_LAT_M_NR]; double usr_cpu, sys_cpu; int i; + size_t size; root = json_create_object(); json_object_add_value_string(root, "jobname", ts->name); json_object_add_value_int(root, "groupid", ts->groupid); json_object_add_value_int(root, "error", ts->error); + /* ETA Info */ + je = get_jobs_eta(true, &size); + if (je) { + json_object_add_value_int(root, "eta", je->eta_sec); + json_object_add_value_int(root, "elapsed", je->elapsed_sec); + } + + if (opt_list) + json_add_job_opts(root, "job options", opt_list); + add_ddir_status_json(ts, rs, DDIR_READ, root); add_ddir_status_json(ts, rs, DDIR_WRITE, root); add_ddir_status_json(ts, rs, DDIR_TRIM, root); + add_ddir_status_json(ts, rs, DDIR_SYNC, root); /* CPU Usage */ if (ts->total_run_time) { @@ -925,18 +1322,15 @@ usr_cpu = 0; sys_cpu = 0; } + json_object_add_value_int(root, "job_runtime", ts->total_run_time); json_object_add_value_float(root, "usr_cpu", usr_cpu); json_object_add_value_float(root, "sys_cpu", sys_cpu); json_object_add_value_int(root, "ctx", ts->ctx); json_object_add_value_int(root, "majf", ts->majf); json_object_add_value_int(root, "minf", ts->minf); - - /* Calc % distribution of IO depths, usecond, msecond latency */ + /* Calc % distribution of IO depths */ stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); - stat_calc_lat_u(ts, io_u_lat_u); - stat_calc_lat_m(ts, io_u_lat_m); - tmp = json_create_object(); json_object_add_value_object(root, "iodepth_level", tmp); /* Only show fixed 7 I/O depth levels*/ @@ -949,54 +1343,209 @@ json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]); } + /* Calc % distribution of submit IO depths */ + stat_calc_dist(ts->io_u_submit, ts->total_submit, io_u_dist); tmp = json_create_object(); - json_object_add_value_object(root, "latency_us", tmp); - /* Microsecond latency */ - for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) { - const char *ranges[] = { 
"2", "4", "10", "20", "50", "100", - "250", "500", "750", "1000", }; - json_object_add_value_float(tmp, ranges[i], io_u_lat_u[i]); + json_object_add_value_object(root, "iodepth_submit", tmp); + /* Only show fixed 7 I/O depth levels*/ + for (i = 0; i < 7; i++) { + char name[20]; + if (i == 0) + snprintf(name, 20, "0"); + else if (i < 6) + snprintf(name, 20, "%d", 1 << (i+1)); + else + snprintf(name, 20, ">=%d", 1 << i); + json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]); } - /* Millisecond latency */ + + /* Calc % distribution of completion IO depths */ + stat_calc_dist(ts->io_u_complete, ts->total_complete, io_u_dist); tmp = json_create_object(); - json_object_add_value_object(root, "latency_ms", tmp); - for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) { - const char *ranges[] = { "2", "4", "10", "20", "50", "100", - "250", "500", "750", "1000", "2000", - ">=2000", }; - json_object_add_value_float(tmp, ranges[i], io_u_lat_m[i]); + json_object_add_value_object(root, "iodepth_complete", tmp); + /* Only show fixed 7 I/O depth levels*/ + for (i = 0; i < 7; i++) { + char name[20]; + if (i == 0) + snprintf(name, 20, "0"); + else if (i < 6) + snprintf(name, 20, "%d", 1 << (i+1)); + else + snprintf(name, 20, ">=%d", 1 << i); + json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]); } - /* Additional output if continue_on_error set - default off*/ + /* Calc % distribution of nsecond, usecond, msecond latency */ + stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist); + stat_calc_lat_n(ts, io_u_lat_n); + stat_calc_lat_u(ts, io_u_lat_u); + stat_calc_lat_m(ts, io_u_lat_m); + + /* Nanosecond latency */ + tmp = json_create_object(); + json_object_add_value_object(root, "latency_ns", tmp); + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) { + const char *ranges[] = { "2", "4", "10", "20", "50", "100", + "250", "500", "750", "1000", }; + json_object_add_value_float(tmp, ranges[i], io_u_lat_n[i]); + } + /* Microsecond latency */ + tmp = 
json_create_object(); + json_object_add_value_object(root, "latency_us", tmp); + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) { + const char *ranges[] = { "2", "4", "10", "20", "50", "100", + "250", "500", "750", "1000", }; + json_object_add_value_float(tmp, ranges[i], io_u_lat_u[i]); + } + /* Millisecond latency */ + tmp = json_create_object(); + json_object_add_value_object(root, "latency_ms", tmp); + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) { + const char *ranges[] = { "2", "4", "10", "20", "50", "100", + "250", "500", "750", "1000", "2000", + ">=2000", }; + json_object_add_value_float(tmp, ranges[i], io_u_lat_m[i]); + } + + /* Additional output if continue_on_error set - default off*/ if (ts->continue_on_error) { json_object_add_value_int(root, "total_err", ts->total_err_count); - json_object_add_value_int(root, "total_err", ts->first_error); + json_object_add_value_int(root, "first_error", ts->first_error); + } + + if (ts->latency_depth) { + json_object_add_value_int(root, "latency_depth", ts->latency_depth); + json_object_add_value_int(root, "latency_target", ts->latency_target); + json_object_add_value_float(root, "latency_percentile", ts->latency_percentile.u.f); + json_object_add_value_int(root, "latency_window", ts->latency_window); } /* Additional output if description is set */ if (strlen(ts->description)) json_object_add_value_string(root, "desc", ts->description); + if (ts->nr_block_infos) { + /* Block error histogram and types */ + int len; + unsigned int *percentiles = NULL; + unsigned int block_state_counts[BLOCK_STATE_COUNT]; + + len = calc_block_percentiles(ts->nr_block_infos, ts->block_infos, + ts->percentile_list, + &percentiles, block_state_counts); + + if (len) { + struct json_object *block, *percentile_object, *states; + int state; + block = json_create_object(); + json_object_add_value_object(root, "block", block); + + percentile_object = json_create_object(); + json_object_add_value_object(block, "percentiles", + percentile_object); + for (i = 
0; i < len; i++) { + char buf[20]; + snprintf(buf, sizeof(buf), "%f", + ts->percentile_list[i].u.f); + json_object_add_value_int(percentile_object, + (const char *)buf, + percentiles[i]); + } + + states = json_create_object(); + json_object_add_value_object(block, "states", states); + for (state = 0; state < BLOCK_STATE_COUNT; state++) { + json_object_add_value_int(states, + block_state_names[state], + block_state_counts[state]); + } + free(percentiles); + } + } + + if (ts->ss_dur) { + struct json_object *data; + struct json_array *iops, *bw; + int j, k, l; + char ss_buf[64]; + + snprintf(ss_buf, sizeof(ss_buf), "%s%s:%f%s", + ts->ss_state & FIO_SS_IOPS ? "iops" : "bw", + ts->ss_state & FIO_SS_SLOPE ? "_slope" : "", + (float) ts->ss_limit.u.f, + ts->ss_state & FIO_SS_PCT ? "%" : ""); + + tmp = json_create_object(); + json_object_add_value_object(root, "steadystate", tmp); + json_object_add_value_string(tmp, "ss", ss_buf); + json_object_add_value_int(tmp, "duration", (int)ts->ss_dur); + json_object_add_value_int(tmp, "attained", (ts->ss_state & FIO_SS_ATTAINED) > 0); + + snprintf(ss_buf, sizeof(ss_buf), "%f%s", (float) ts->ss_criterion.u.f, + ts->ss_state & FIO_SS_PCT ? "%" : ""); + json_object_add_value_string(tmp, "criterion", ss_buf); + json_object_add_value_float(tmp, "max_deviation", ts->ss_deviation.u.f); + json_object_add_value_float(tmp, "slope", ts->ss_slope.u.f); + + data = json_create_object(); + json_object_add_value_object(tmp, "data", data); + bw = json_create_array(); + iops = json_create_array(); + + /* + ** if ss was attained or the buffer is not full, + ** ss->head points to the first element in the list. + ** otherwise it actually points to the second element + ** in the list + */ + if ((ts->ss_state & FIO_SS_ATTAINED) || !(ts->ss_state & FIO_SS_BUFFER_FULL)) + j = ts->ss_head; + else + j = ts->ss_head == 0 ? 
ts->ss_dur - 1 : ts->ss_head - 1; + for (l = 0; l < ts->ss_dur; l++) { + k = (j + l) % ts->ss_dur; + json_array_add_value_int(bw, ts->ss_bw_data[k]); + json_array_add_value_int(iops, ts->ss_iops_data[k]); + } + json_object_add_value_int(data, "bw_mean", steadystate_bw_mean(ts)); + json_object_add_value_int(data, "iops_mean", steadystate_iops_mean(ts)); + json_object_add_value_array(data, "iops", iops); + json_object_add_value_array(data, "bw", bw); + } + return root; } static void show_thread_status_terse(struct thread_stat *ts, - struct group_run_stats *rs) + struct group_run_stats *rs, + struct buf_output *out) { - if (terse_version == 2) - show_thread_status_terse_v2(ts, rs); - else if (terse_version == 3 || terse_version == 4) - show_thread_status_terse_v3_v4(ts, rs, terse_version); + if (terse_version >= 2 && terse_version <= 5) + show_thread_status_terse_all(ts, rs, terse_version, out); else log_err("fio: bad terse version!? %d\n", terse_version); } -static void sum_stat(struct io_stat *dst, struct io_stat *src, int nr) +struct json_object *show_thread_status(struct thread_stat *ts, + struct group_run_stats *rs, + struct flist_head *opt_list, + struct buf_output *out) { - double mean, S; + struct json_object *ret = NULL; - if (src->samples == 0) - return; + if (output_format & FIO_OUTPUT_TERSE) + show_thread_status_terse(ts, rs, out); + if (output_format & FIO_OUTPUT_JSON) + ret = show_thread_status_json(ts, rs, opt_list); + if (output_format & FIO_OUTPUT_NORMAL) + show_thread_status_normal(ts, rs, out); + + return ret; +} + +static void __sum_stat(struct io_stat *dst, struct io_stat *src, bool first) +{ + double mean, S; dst->min_val = min(dst->min_val, src->min_val); dst->max_val = max(dst->max_val, src->max_val); @@ -1006,7 +1555,7 @@ * */ - if (nr == 1) { + if (first) { mean = src->mean.u.f; S = src->S.u.f; } else { @@ -1024,6 +1573,39 @@ dst->samples += src->samples; dst->mean.u.f = mean; dst->S.u.f = S; + +} + +/* + * We sum two kinds of stats - one 
that is time based, in which case we + * apply the proper summing technique, and then one that is iops/bw + * numbers. For group_reporting, we should just add those up, not make + * them the mean of everything. + */ +static void sum_stat(struct io_stat *dst, struct io_stat *src, bool first, + bool pure_sum) +{ + if (src->samples == 0) + return; + + if (!pure_sum) { + __sum_stat(dst, src, first); + return; + } + + if (first) { + dst->min_val = src->min_val; + dst->max_val = src->max_val; + dst->samples = src->samples; + dst->mean.u.f = src->mean.u.f; + dst->S.u.f = src->S.u.f; + } else { + dst->min_val += src->min_val; + dst->max_val += src->max_val; + dst->samples += src->samples; + dst->mean.u.f += src->mean.u.f; + dst->S.u.f += src->S.u.f; + } } void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src) @@ -1040,67 +1622,92 @@ if (dst->min_bw[i] && dst->min_bw[i] > src->min_bw[i]) dst->min_bw[i] = src->min_bw[i]; - dst->io_kb[i] += src->io_kb[i]; + dst->iobytes[i] += src->iobytes[i]; dst->agg[i] += src->agg[i]; } + if (!dst->kb_base) + dst->kb_base = src->kb_base; + if (!dst->unit_base) + dst->unit_base = src->unit_base; + if (!dst->sig_figs) + dst->sig_figs = src->sig_figs; } -void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, int nr) +void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, + bool first) { int l, k; for (l = 0; l < DDIR_RWDIR_CNT; l++) { if (!dst->unified_rw_rep) { - sum_stat(&dst->clat_stat[l], &src->clat_stat[l], nr); - sum_stat(&dst->slat_stat[l], &src->slat_stat[l], nr); - sum_stat(&dst->lat_stat[l], &src->lat_stat[l], nr); - sum_stat(&dst->bw_stat[l], &src->bw_stat[l], nr); + sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false); + sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first, false); + sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first, false); + sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first, true); + sum_stat(&dst->iops_stat[l], &src->iops_stat[l], first, 
true); dst->io_bytes[l] += src->io_bytes[l]; if (dst->runtime[l] < src->runtime[l]) dst->runtime[l] = src->runtime[l]; } else { - sum_stat(&dst->clat_stat[0], &src->clat_stat[l], nr); - sum_stat(&dst->slat_stat[0], &src->slat_stat[l], nr); - sum_stat(&dst->lat_stat[0], &src->lat_stat[l], nr); - sum_stat(&dst->bw_stat[0], &src->bw_stat[l], nr); + sum_stat(&dst->clat_stat[0], &src->clat_stat[l], first, false); + sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first, false); + sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first, false); + sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first, true); + sum_stat(&dst->iops_stat[0], &src->iops_stat[l], first, true); dst->io_bytes[0] += src->io_bytes[l]; if (dst->runtime[0] < src->runtime[l]) dst->runtime[0] = src->runtime[l]; + + /* + * We're summing to the same destination, so override + * 'first' after the first iteration of the loop + */ + first = false; } } + sum_stat(&dst->sync_stat, &src->sync_stat, first, false); dst->usr_time += src->usr_time; dst->sys_time += src->sys_time; dst->ctx += src->ctx; dst->majf += src->majf; dst->minf += src->minf; - for (k = 0; k < FIO_IO_U_MAP_NR; k++) + for (k = 0; k < FIO_IO_U_MAP_NR; k++) { dst->io_u_map[k] += src->io_u_map[k]; - for (k = 0; k < FIO_IO_U_MAP_NR; k++) dst->io_u_submit[k] += src->io_u_submit[k]; - for (k = 0; k < FIO_IO_U_MAP_NR; k++) dst->io_u_complete[k] += src->io_u_complete[k]; + } + + for (k = 0; k < FIO_IO_U_LAT_N_NR; k++) + dst->io_u_lat_n[k] += src->io_u_lat_n[k]; for (k = 0; k < FIO_IO_U_LAT_U_NR; k++) dst->io_u_lat_u[k] += src->io_u_lat_u[k]; for (k = 0; k < FIO_IO_U_LAT_M_NR; k++) dst->io_u_lat_m[k] += src->io_u_lat_m[k]; + for (k = 0; k < FIO_IO_U_PLAT_NR; k++) + dst->io_u_sync_plat[k] += src->io_u_sync_plat[k]; + for (k = 0; k < DDIR_RWDIR_CNT; k++) { if (!dst->unified_rw_rep) { dst->total_io_u[k] += src->total_io_u[k]; dst->short_io_u[k] += src->short_io_u[k]; + dst->drop_io_u[k] += src->drop_io_u[k]; } else { dst->total_io_u[0] += src->total_io_u[k]; 
dst->short_io_u[0] += src->short_io_u[k]; + dst->drop_io_u[0] += src->drop_io_u[k]; } } + dst->total_io_u[DDIR_SYNC] += src->total_io_u[DDIR_SYNC]; + for (k = 0; k < DDIR_RWDIR_CNT; k++) { int m; @@ -1115,6 +1722,9 @@ dst->total_run_time += src->total_run_time; dst->total_submit += src->total_submit; dst->total_complete += src->total_complete; + dst->nr_zone_resets += src->nr_zone_resets; + dst->cachehit += src->cachehit; + dst->cachemiss += src->cachemiss; } void init_group_run_stat(struct group_run_stats *gs) @@ -1137,20 +1747,24 @@ ts->clat_stat[j].min_val = -1UL; ts->slat_stat[j].min_val = -1UL; ts->bw_stat[j].min_val = -1UL; + ts->iops_stat[j].min_val = -1UL; } + ts->sync_stat.min_val = -1UL; ts->groupid = -1; } -static void __show_run_stats(void) +void __show_run_stats(void) { struct group_run_stats *runstats, *rs; struct thread_data *td; struct thread_stat *threadstats, *ts; - int i, j, nr_ts, last_ts, idx; - int kb_base_warned = 0; - int unit_base_warned = 0; + int i, j, k, nr_ts, last_ts, idx; + bool kb_base_warned = false; + bool unit_base_warned = false; struct json_object *root = NULL; struct json_array *array = NULL; + struct buf_output output[FIO_OUTPUT_NR]; + struct flist_head **opt_lists; runstats = malloc(sizeof(struct group_run_stats) * (groupid + 1)); @@ -1170,20 +1784,27 @@ } if (last_ts == td->groupid) continue; + if (!td->o.stats) + continue; last_ts = td->groupid; nr_ts++; } threadstats = malloc(nr_ts * sizeof(struct thread_stat)); + opt_lists = malloc(nr_ts * sizeof(struct flist_head *)); - for (i = 0; i < nr_ts; i++) + for (i = 0; i < nr_ts; i++) { init_thread_stat(&threadstats[i]); + opt_lists[i] = NULL; + } j = 0; last_ts = -1; idx = 0; for_each_td(td, i) { + if (!td->o.stats) + continue; if (idx && (!td->o.group_reporting || (td->o.group_reporting && last_ts != td->groupid))) { idx = 0; @@ -1195,8 +1816,10 @@ ts = &threadstats[j]; ts->clat_percentiles = td->o.clat_percentiles; + ts->lat_percentiles = td->o.lat_percentiles; 
ts->percentile_precision = td->o.percentile_precision; memcpy(ts->percentile_list, td->o.percentile_list, sizeof(td->o.percentile_list)); + opt_lists[j] = &td->opt_list; idx++; ts->members++; @@ -1205,12 +1828,13 @@ /* * These are per-group shared already */ - strncpy(ts->name, td->o.name, FIO_JOBNAME_SIZE); + snprintf(ts->name, sizeof(ts->name), "%s", td->o.name); if (td->o.description) - strncpy(ts->description, td->o.description, - FIO_JOBNAME_SIZE); + snprintf(ts->description, + sizeof(ts->description), "%s", + td->o.description); else - memset(ts->description, 0, FIO_JOBNAME_SIZE); + memset(ts->description, 0, FIO_JOBDESC_SIZE); /* * If multiple entries in this group, this is @@ -1226,15 +1850,16 @@ ts->kb_base = td->o.kb_base; ts->unit_base = td->o.unit_base; + ts->sig_figs = td->o.sig_figs; ts->unified_rw_rep = td->o.unified_rw_rep; } else if (ts->kb_base != td->o.kb_base && !kb_base_warned) { log_info("fio: kb_base differs for jobs in group, using" " %u as the base\n", ts->kb_base); - kb_base_warned = 1; + kb_base_warned = true; } else if (ts->unit_base != td->o.unit_base && !unit_base_warned) { log_info("fio: unit_base differs for jobs in group, using" " %u as the base\n", ts->unit_base); - unit_base_warned = 1; + unit_base_warned = true; } ts->continue_on_error = td->o.continue_on_error; @@ -1244,23 +1869,51 @@ if (!td->error && td->o.continue_on_error && td->first_error) { ts->error = td->first_error; - strcpy(ts->verror, td->verror); + snprintf(ts->verror, sizeof(ts->verror), "%s", + td->verror); } else if (td->error) { ts->error = td->error; - strcpy(ts->verror, td->verror); + snprintf(ts->verror, sizeof(ts->verror), "%s", + td->verror); } } - sum_thread_stats(ts, &td->ts, idx); + ts->latency_depth = td->latency_qd; + ts->latency_target = td->o.latency_target; + ts->latency_percentile = td->o.latency_percentile; + ts->latency_window = td->o.latency_window; + + ts->nr_block_infos = td->ts.nr_block_infos; + for (k = 0; k < ts->nr_block_infos; k++) + 
ts->block_infos[k] = td->ts.block_infos[k]; + + sum_thread_stats(ts, &td->ts, idx == 1); + + if (td->o.ss_dur) { + ts->ss_state = td->ss.state; + ts->ss_dur = td->ss.dur; + ts->ss_head = td->ss.head; + ts->ss_bw_data = td->ss.bw_data; + ts->ss_iops_data = td->ss.iops_data; + ts->ss_limit.u.f = td->ss.limit; + ts->ss_slope.u.f = td->ss.slope; + ts->ss_deviation.u.f = td->ss.deviation; + ts->ss_criterion.u.f = td->ss.criterion; + } + else + ts->ss_dur = ts->ss_state = 0; } for (i = 0; i < nr_ts; i++) { unsigned long long bw; ts = &threadstats[i]; + if (ts->groupid == -1) + continue; rs = &runstats[ts->groupid]; rs->kb_base = ts->kb_base; rs->unit_base = ts->unit_base; + rs->sig_figs = ts->sig_figs; rs->unified_rw_rep += ts->unified_rw_rep; for (j = 0; j < DDIR_RWDIR_CNT; j++) { @@ -1272,19 +1925,14 @@ rs->max_run[j] = ts->runtime[j]; bw = 0; - if (ts->runtime[j]) { - unsigned long runt = ts->runtime[j]; - unsigned long long kb; - - kb = ts->io_bytes[j] / rs->kb_base; - bw = kb * 1000 / runt; - } + if (ts->runtime[j]) + bw = ts->io_bytes[j] * 1000 / ts->runtime[j]; if (bw < rs->min_bw[j]) rs->min_bw[j] = bw; if (bw > rs->max_bw[j]) rs->max_bw[j] = bw; - rs->io_kb[j] += ts->io_bytes[j] / rs->kb_base; + rs->iobytes[j] += ts->io_bytes[j]; } } @@ -1295,45 +1943,75 @@ for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { if (rs->max_run[ddir]) - rs->agg[ddir] = (rs->io_kb[ddir] * 1000) / + rs->agg[ddir] = (rs->iobytes[ddir] * 1000) / rs->max_run[ddir]; } } + for (i = 0; i < FIO_OUTPUT_NR; i++) + buf_output_init(&output[i]); + /* * don't overwrite last signal output */ - if (output_format == FIO_OUTPUT_NORMAL) - log_info("\n"); - else if (output_format == FIO_OUTPUT_JSON) { + if (output_format & FIO_OUTPUT_NORMAL) + log_buf(&output[__FIO_OUTPUT_NORMAL], "\n"); + if (output_format & FIO_OUTPUT_JSON) { + struct thread_data *global; + char time_buf[32]; + struct timeval now; + unsigned long long ms_since_epoch; + time_t tv_sec; + + gettimeofday(&now, NULL); + ms_since_epoch = 
(unsigned long long)(now.tv_sec) * 1000 + + (unsigned long long)(now.tv_usec) / 1000; + + tv_sec = now.tv_sec; + os_ctime_r(&tv_sec, time_buf, sizeof(time_buf)); + if (time_buf[strlen(time_buf) - 1] == '\n') + time_buf[strlen(time_buf) - 1] = '\0'; + root = json_create_object(); json_object_add_value_string(root, "fio version", fio_version_string); + json_object_add_value_int(root, "timestamp", now.tv_sec); + json_object_add_value_int(root, "timestamp_ms", ms_since_epoch); + json_object_add_value_string(root, "time", time_buf); + global = get_global_options(); + json_add_job_opts(root, "global options", &global->opt_list); array = json_create_array(); json_object_add_value_array(root, "jobs", array); } + if (is_backend) + fio_server_send_job_options(&get_global_options()->opt_list, -1U); + for (i = 0; i < nr_ts; i++) { ts = &threadstats[i]; rs = &runstats[ts->groupid]; - if (is_backend) + if (is_backend) { + fio_server_send_job_options(opt_lists[i], i); fio_server_send_ts(ts, rs); - else if (output_format == FIO_OUTPUT_TERSE) - show_thread_status_terse(ts, rs); - else if (output_format == FIO_OUTPUT_JSON) { - struct json_object *tmp = show_thread_status_json(ts, rs); - json_array_add_value_object(array, tmp); - } else - show_thread_status(ts, rs); + } else { + if (output_format & FIO_OUTPUT_TERSE) + show_thread_status_terse(ts, rs, &output[__FIO_OUTPUT_TERSE]); + if (output_format & FIO_OUTPUT_JSON) { + struct json_object *tmp = show_thread_status_json(ts, rs, opt_lists[i]); + json_array_add_value_object(array, tmp); + } + if (output_format & FIO_OUTPUT_NORMAL) + show_thread_status_normal(ts, rs, &output[__FIO_OUTPUT_NORMAL]); + } } - if (output_format == FIO_OUTPUT_JSON) { + if (!is_backend && (output_format & FIO_OUTPUT_JSON)) { /* disk util stats, if any */ - show_disk_util(1, root); + show_disk_util(1, root, &output[__FIO_OUTPUT_JSON]); - show_idle_prof_stats(FIO_OUTPUT_JSON, root); + show_idle_prof_stats(FIO_OUTPUT_JSON, root, &output[__FIO_OUTPUT_JSON]); - 
json_print_object(root); - log_info("\n"); + json_print_object(root, &output[__FIO_OUTPUT_JSON]); + log_buf(&output[__FIO_OUTPUT_JSON], "\n"); json_free_object(root); } @@ -1343,59 +2021,66 @@ rs->groupid = i; if (is_backend) fio_server_send_gs(rs); - else if (output_format == FIO_OUTPUT_NORMAL) - show_group_stats(rs); + else if (output_format & FIO_OUTPUT_NORMAL) + show_group_stats(rs, &output[__FIO_OUTPUT_NORMAL]); } if (is_backend) fio_server_send_du(); - else if (output_format == FIO_OUTPUT_NORMAL) { - show_disk_util(0, NULL); - show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL); + else if (output_format & FIO_OUTPUT_NORMAL) { + show_disk_util(0, NULL, &output[__FIO_OUTPUT_NORMAL]); + show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL, &output[__FIO_OUTPUT_NORMAL]); + } + + for (i = 0; i < FIO_OUTPUT_NR; i++) { + struct buf_output *out = &output[i]; + + log_info_buf(out->buf, out->buflen); + buf_output_free(out); } + fio_idle_prof_cleanup(); + log_info_flush(); free(runstats); free(threadstats); + free(opt_lists); } -void show_run_stats(void) -{ - fio_mutex_down(stat_mutex); - __show_run_stats(); - fio_mutex_up(stat_mutex); -} - -static void *__show_running_run_stats(void fio_unused *arg) +void __show_running_run_stats(void) { struct thread_data *td; unsigned long long *rt; - struct timeval tv; + struct timespec ts; int i; + fio_sem_down(stat_sem); + rt = malloc(thread_number * sizeof(unsigned long long)); - fio_gettime(&tv, NULL); + fio_gettime(&ts, NULL); for_each_td(td, i) { - rt[i] = mtime_since(&td->start, &tv); - if (td_read(td) && td->io_bytes[DDIR_READ]) - td->ts.runtime[DDIR_READ] += rt[i]; - if (td_write(td) && td->io_bytes[DDIR_WRITE]) - td->ts.runtime[DDIR_WRITE] += rt[i]; - if (td_trim(td) && td->io_bytes[DDIR_TRIM]) - td->ts.runtime[DDIR_TRIM] += rt[i]; - td->update_rusage = 1; td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ]; td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE]; td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM]; - 
td->ts.total_run_time = mtime_since(&td->epoch, &tv); + td->ts.total_run_time = mtime_since(&td->epoch, &ts); + + rt[i] = mtime_since(&td->start, &ts); + if (td_read(td) && td->ts.io_bytes[DDIR_READ]) + td->ts.runtime[DDIR_READ] += rt[i]; + if (td_write(td) && td->ts.io_bytes[DDIR_WRITE]) + td->ts.runtime[DDIR_WRITE] += rt[i]; + if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM]) + td->ts.runtime[DDIR_TRIM] += rt[i]; } for_each_td(td, i) { + if (td->runstate >= TD_EXITED) + continue; if (td->rusage_sem) { td->update_rusage = 1; - fio_mutex_down(td->rusage_sem); + fio_sem_down(td->rusage_sem); } td->update_rusage = 0; } @@ -1403,42 +2088,23 @@ __show_run_stats(); for_each_td(td, i) { - if (td_read(td) && td->io_bytes[DDIR_READ]) + if (td_read(td) && td->ts.io_bytes[DDIR_READ]) td->ts.runtime[DDIR_READ] -= rt[i]; - if (td_write(td) && td->io_bytes[DDIR_WRITE]) + if (td_write(td) && td->ts.io_bytes[DDIR_WRITE]) td->ts.runtime[DDIR_WRITE] -= rt[i]; - if (td_trim(td) && td->io_bytes[DDIR_TRIM]) + if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM]) td->ts.runtime[DDIR_TRIM] -= rt[i]; } free(rt); - fio_mutex_up(stat_mutex); - return NULL; + fio_sem_up(stat_sem); } -/* - * Called from signal handler. It _should_ be safe to just run this inline - * in the sig handler, but we should be disturbing the system less by just - * creating a thread to do it. 
- */ -void show_running_run_stats(void) -{ - pthread_t thread; +static bool status_interval_init; +static struct timespec status_time; +static bool status_file_disabled; - fio_mutex_down(stat_mutex); - - if (!pthread_create(&thread, NULL, __show_running_run_stats, NULL)) { - pthread_detach(thread); - return; - } - - fio_mutex_up(stat_mutex); -} - -static int status_interval_init; -static struct timeval status_time; - -#define FIO_STATUS_FILE "/tmp/fio-dump-status" +#define FIO_STATUS_FILE "fio-dump-status" static int check_status_file(void) { @@ -1446,9 +2112,15 @@ const char *temp_dir; char fio_status_file_path[PATH_MAX]; + if (status_file_disabled) + return 0; + temp_dir = getenv("TMPDIR"); - if (temp_dir == NULL) + if (temp_dir == NULL) { temp_dir = getenv("TEMP"); + if (temp_dir && strlen(temp_dir) >= PATH_MAX) + temp_dir = NULL; + } if (temp_dir == NULL) temp_dir = "/tmp"; @@ -1457,7 +2129,13 @@ if (stat(fio_status_file_path, &sb)) return 0; - unlink(fio_status_file_path); + if (unlink(fio_status_file_path) < 0) { + log_err("fio: failed to unlink %s: %s\n", fio_status_file_path, + strerror(errno)); + log_err("fio: disabling status file updates\n"); + status_file_disabled = true; + } + return 1; } @@ -1466,7 +2144,7 @@ if (status_interval) { if (!status_interval_init) { fio_gettime(&status_time, NULL); - status_interval_init = 1; + status_interval_init = true; } else if (mtime_since_now(&status_time) >= status_interval) { show_running_run_stats(); fio_gettime(&status_time, NULL); @@ -1479,7 +2157,7 @@ } } -static inline void add_stat_sample(struct io_stat *is, unsigned long data) +static inline void add_stat_sample(struct io_stat *is, unsigned long long data) { double val = data; double delta; @@ -1498,43 +2176,282 @@ is->samples++; } -static void __add_log_sample(struct io_log *iolog, unsigned long val, - enum fio_ddir ddir, unsigned int bs, - unsigned long t) +/* + * Return a struct io_logs, which is added to the tail of the log + * list for 'iolog'. 
+ */ +static struct io_logs *get_new_log(struct io_log *iolog) +{ + size_t new_size, new_samples; + struct io_logs *cur_log; + + /* + * Cap the size at MAX_LOG_ENTRIES, so we don't keep doubling + * forever + */ + if (!iolog->cur_log_max) + new_samples = DEF_LOG_ENTRIES; + else { + new_samples = iolog->cur_log_max * 2; + if (new_samples > MAX_LOG_ENTRIES) + new_samples = MAX_LOG_ENTRIES; + } + + new_size = new_samples * log_entry_sz(iolog); + + cur_log = smalloc(sizeof(*cur_log)); + if (cur_log) { + INIT_FLIST_HEAD(&cur_log->list); + cur_log->log = malloc(new_size); + if (cur_log->log) { + cur_log->nr_samples = 0; + cur_log->max_samples = new_samples; + flist_add_tail(&cur_log->list, &iolog->io_logs); + iolog->cur_log_max = new_samples; + return cur_log; + } + sfree(cur_log); + } + + return NULL; +} + +/* + * Add and return a new log chunk, or return current log if big enough + */ +static struct io_logs *regrow_log(struct io_log *iolog) +{ + struct io_logs *cur_log; + int i; + + if (!iolog || iolog->disabled) + goto disable; + + cur_log = iolog_cur_log(iolog); + if (!cur_log) { + cur_log = get_new_log(iolog); + if (!cur_log) + return NULL; + } + + if (cur_log->nr_samples < cur_log->max_samples) + return cur_log; + + /* + * No room for a new sample. If we're compressing on the fly, flush + * out the current chunk + */ + if (iolog->log_gz) { + if (iolog_cur_flush(iolog, cur_log)) { + log_err("fio: failed flushing iolog! Will stop logging.\n"); + return NULL; + } + } + + /* + * Get a new log array, and add to our list + */ + cur_log = get_new_log(iolog); + if (!cur_log) { + log_err("fio: failed extending iolog! 
Will stop logging.\n"); + return NULL; + } + + if (!iolog->pending || !iolog->pending->nr_samples) + return cur_log; + + /* + * Flush pending items to new log + */ + for (i = 0; i < iolog->pending->nr_samples; i++) { + struct io_sample *src, *dst; + + src = get_sample(iolog, iolog->pending, i); + dst = get_sample(iolog, cur_log, i); + memcpy(dst, src, log_entry_sz(iolog)); + } + cur_log->nr_samples = iolog->pending->nr_samples; + + iolog->pending->nr_samples = 0; + return cur_log; +disable: + if (iolog) + iolog->disabled = true; + return NULL; +} + +void regrow_logs(struct thread_data *td) +{ + regrow_log(td->slat_log); + regrow_log(td->clat_log); + regrow_log(td->clat_hist_log); + regrow_log(td->lat_log); + regrow_log(td->bw_log); + regrow_log(td->iops_log); + td->flags &= ~TD_F_REGROW_LOGS; +} + +static struct io_logs *get_cur_log(struct io_log *iolog) +{ + struct io_logs *cur_log; + + cur_log = iolog_cur_log(iolog); + if (!cur_log) { + cur_log = get_new_log(iolog); + if (!cur_log) + return NULL; + } + + if (cur_log->nr_samples < cur_log->max_samples) + return cur_log; + + /* + * Out of space. If we're in IO offload mode, or we're not doing + * per unit logging (hence logging happens outside of the IO thread + * as well), add a new log chunk inline. If we're doing inline + * submissions, flag 'td' as needing a log regrow and we'll take + * care of it on the submission side. 
+ */ + if ((iolog->td && iolog->td->o.io_submit_mode == IO_MODE_OFFLOAD) || + !per_unit_log(iolog)) + return regrow_log(iolog); + + if (iolog->td) + iolog->td->flags |= TD_F_REGROW_LOGS; + if (iolog->pending) + assert(iolog->pending->nr_samples < iolog->pending->max_samples); + return iolog->pending; +} + +static void __add_log_sample(struct io_log *iolog, union io_sample_data data, + enum fio_ddir ddir, unsigned long long bs, + unsigned long t, uint64_t offset) { - const int nr_samples = iolog->nr_samples; + struct io_logs *cur_log; + + if (iolog->disabled) + return; + if (flist_empty(&iolog->io_logs)) + iolog->avg_last[ddir] = t; + + cur_log = get_cur_log(iolog); + if (cur_log) { + struct io_sample *s; + + s = get_sample(iolog, cur_log, cur_log->nr_samples); + + s->data = data; + s->time = t + (iolog->td ? iolog->td->unix_epoch : 0); + io_sample_set_ddir(iolog, s, ddir); + s->bs = bs; - if (!iolog->nr_samples) - iolog->avg_last = t; + if (iolog->log_offset) { + struct io_sample_offset *so = (void *) s; - if (iolog->nr_samples == iolog->max_samples) { - int new_size = sizeof(struct io_sample) * iolog->max_samples*2; + so->offset = offset; + } - iolog->log = realloc(iolog->log, new_size); - iolog->max_samples <<= 1; + cur_log->nr_samples++; + return; } - iolog->log[nr_samples].val = val; - iolog->log[nr_samples].time = t; - iolog->log[nr_samples].ddir = ddir; - iolog->log[nr_samples].bs = bs; - iolog->nr_samples++; + iolog->disabled = true; } static inline void reset_io_stat(struct io_stat *ios) { - ios->max_val = ios->min_val = ios->samples = 0; + ios->min_val = -1ULL; + ios->max_val = ios->samples = 0; ios->mean.u.f = ios->S.u.f = 0; } -static void add_log_sample(struct thread_data *td, struct io_log *iolog, - unsigned long val, enum fio_ddir ddir, - unsigned int bs) +void reset_io_stats(struct thread_data *td) +{ + struct thread_stat *ts = &td->ts; + int i, j; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + reset_io_stat(&ts->clat_stat[i]); + 
reset_io_stat(&ts->slat_stat[i]); + reset_io_stat(&ts->lat_stat[i]); + reset_io_stat(&ts->bw_stat[i]); + reset_io_stat(&ts->iops_stat[i]); + + ts->io_bytes[i] = 0; + ts->runtime[i] = 0; + ts->total_io_u[i] = 0; + ts->short_io_u[i] = 0; + ts->drop_io_u[i] = 0; + + for (j = 0; j < FIO_IO_U_PLAT_NR; j++) { + ts->io_u_plat[i][j] = 0; + if (!i) + ts->io_u_sync_plat[j] = 0; + } + } + + ts->total_io_u[DDIR_SYNC] = 0; + + for (i = 0; i < FIO_IO_U_MAP_NR; i++) { + ts->io_u_map[i] = 0; + ts->io_u_submit[i] = 0; + ts->io_u_complete[i] = 0; + } + + for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) + ts->io_u_lat_n[i] = 0; + for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) + ts->io_u_lat_u[i] = 0; + for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) + ts->io_u_lat_m[i] = 0; + + ts->total_submit = 0; + ts->total_complete = 0; + ts->nr_zone_resets = 0; + ts->cachehit = ts->cachemiss = 0; +} + +static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir, + unsigned long elapsed, bool log_max) +{ + /* + * Note an entry in the log. Use the mean from the logged samples, + * making sure to properly round up. Only write a log entry if we + * had actual samples done. 
+ */ + if (iolog->avg_window[ddir].samples) { + union io_sample_data data; + + if (log_max) + data.val = iolog->avg_window[ddir].max_val; + else + data.val = iolog->avg_window[ddir].mean.u.f + 0.50; + + __add_log_sample(iolog, data, ddir, 0, elapsed, 0); + } + + reset_io_stat(&iolog->avg_window[ddir]); +} + +static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed, + bool log_max) +{ + int ddir; + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) + __add_stat_to_log(iolog, ddir, elapsed, log_max); +} + +static unsigned long add_log_sample(struct thread_data *td, + struct io_log *iolog, + union io_sample_data data, + enum fio_ddir ddir, unsigned long long bs, + uint64_t offset) { unsigned long elapsed, this_window; if (!ddir_rw(ddir)) - return; + return 0; elapsed = mtime_since_now(&td->epoch); @@ -1542,56 +2459,55 @@ * If no time averaging, just add the log sample. */ if (!iolog->avg_msec) { - __add_log_sample(iolog, val, ddir, bs, elapsed); - return; + __add_log_sample(iolog, data, ddir, bs, elapsed, offset); + return 0; } /* * Add the sample. If the time period has passed, then * add that entry to the log and clear. */ - add_stat_sample(&iolog->avg_window[ddir], val); + add_stat_sample(&iolog->avg_window[ddir], data.val); /* * If period hasn't passed, adding the above sample is all we * need to do. */ - this_window = elapsed - iolog->avg_last; - if (this_window < iolog->avg_msec) - return; + this_window = elapsed - iolog->avg_last[ddir]; + if (elapsed < iolog->avg_last[ddir]) + return iolog->avg_last[ddir] - elapsed; + else if (this_window < iolog->avg_msec) { + unsigned long diff = iolog->avg_msec - this_window; - /* - * Note an entry in the log. Use the mean from the logged samples, - * making sure to properly round up. Only write a log entry if we - * had actual samples done. 
- */ - if (iolog->avg_window[DDIR_READ].samples) { - unsigned long mr; - - mr = iolog->avg_window[DDIR_READ].mean.u.f + 0.50; - __add_log_sample(iolog, mr, DDIR_READ, 0, elapsed); + if (inline_log(iolog) || diff > LOG_MSEC_SLACK) + return diff; } - if (iolog->avg_window[DDIR_WRITE].samples) { - unsigned long mw; - mw = iolog->avg_window[DDIR_WRITE].mean.u.f + 0.50; - __add_log_sample(iolog, mw, DDIR_WRITE, 0, elapsed); - } - if (iolog->avg_window[DDIR_TRIM].samples) { - unsigned long mw; + __add_stat_to_log(iolog, ddir, elapsed, td->o.log_max != 0); - mw = iolog->avg_window[DDIR_TRIM].mean.u.f + 0.50; - __add_log_sample(iolog, mw, DDIR_TRIM, 0, elapsed); - } + iolog->avg_last[ddir] = elapsed - (this_window - iolog->avg_msec); + return iolog->avg_msec; +} +void finalize_logs(struct thread_data *td, bool unit_logs) +{ + unsigned long elapsed; - reset_io_stat(&iolog->avg_window[DDIR_READ]); - reset_io_stat(&iolog->avg_window[DDIR_WRITE]); - reset_io_stat(&iolog->avg_window[DDIR_TRIM]); - iolog->avg_last = elapsed; + elapsed = mtime_since_now(&td->epoch); + + if (td->clat_log && unit_logs) + _add_stat_to_log(td->clat_log, elapsed, td->o.log_max != 0); + if (td->slat_log && unit_logs) + _add_stat_to_log(td->slat_log, elapsed, td->o.log_max != 0); + if (td->lat_log && unit_logs) + _add_stat_to_log(td->lat_log, elapsed, td->o.log_max != 0); + if (td->bw_log && (unit_logs == per_unit_log(td->bw_log))) + _add_stat_to_log(td->bw_log, elapsed, td->o.log_max != 0); + if (td->iops_log && (unit_logs == per_unit_log(td->iops_log))) + _add_stat_to_log(td->iops_log, elapsed, td->o.log_max != 0); } -void add_agg_sample(unsigned long val, enum fio_ddir ddir, unsigned int bs) +void add_agg_sample(union io_sample_data data, enum fio_ddir ddir, unsigned long long bs) { struct io_log *iolog; @@ -1599,136 +2515,305 @@ return; iolog = agg_io_log[ddir]; - __add_log_sample(iolog, val, ddir, bs, mtime_since_genesis()); + __add_log_sample(iolog, data, ddir, bs, mtime_since_genesis(), 0); +} + 
+void add_sync_clat_sample(struct thread_stat *ts, unsigned long long nsec) +{ + unsigned int idx = plat_val_to_idx(nsec); + assert(idx < FIO_IO_U_PLAT_NR); + + ts->io_u_sync_plat[idx]++; + add_stat_sample(&ts->sync_stat, nsec); } static void add_clat_percentile_sample(struct thread_stat *ts, - unsigned long usec, enum fio_ddir ddir) + unsigned long long nsec, enum fio_ddir ddir) { - unsigned int idx = plat_val_to_idx(usec); + unsigned int idx = plat_val_to_idx(nsec); assert(idx < FIO_IO_U_PLAT_NR); ts->io_u_plat[ddir][idx]++; } void add_clat_sample(struct thread_data *td, enum fio_ddir ddir, - unsigned long usec, unsigned int bs) + unsigned long long nsec, unsigned long long bs, + uint64_t offset) { + const bool needs_lock = td_async_processing(td); + unsigned long elapsed, this_window; struct thread_stat *ts = &td->ts; + struct io_log *iolog = td->clat_hist_log; - if (!ddir_rw(ddir)) - return; + if (needs_lock) + __td_io_u_lock(td); - add_stat_sample(&ts->clat_stat[ddir], usec); + add_stat_sample(&ts->clat_stat[ddir], nsec); if (td->clat_log) - add_log_sample(td, td->clat_log, usec, ddir, bs); + add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs, + offset); if (ts->clat_percentiles) - add_clat_percentile_sample(ts, usec, ddir); + add_clat_percentile_sample(ts, nsec, ddir); + + if (iolog && iolog->hist_msec) { + struct io_hist *hw = &iolog->hist_window[ddir]; + + hw->samples++; + elapsed = mtime_since_now(&td->epoch); + if (!hw->hist_last) + hw->hist_last = elapsed; + this_window = elapsed - hw->hist_last; + + if (this_window >= iolog->hist_msec) { + uint64_t *io_u_plat; + struct io_u_plat_entry *dst; + + /* + * Make a byte-for-byte copy of the latency histogram + * stored in td->ts.io_u_plat[ddir], recording it in a + * log sample. Note that the matching call to free() is + * located in iolog.c after printing this sample to the + * log file. 
+ */ + io_u_plat = (uint64_t *) td->ts.io_u_plat[ddir]; + dst = malloc(sizeof(struct io_u_plat_entry)); + memcpy(&(dst->io_u_plat), io_u_plat, + FIO_IO_U_PLAT_NR * sizeof(unsigned int)); + flist_add(&dst->list, &hw->list); + __add_log_sample(iolog, sample_plat(dst), ddir, bs, + elapsed, offset); + + /* + * Update the last time we recorded as being now, minus + * any drift in time we encountered before actually + * making the record. + */ + hw->hist_last = elapsed - (this_window - iolog->hist_msec); + hw->samples = 0; + } + } + + if (needs_lock) + __td_io_u_unlock(td); } void add_slat_sample(struct thread_data *td, enum fio_ddir ddir, - unsigned long usec, unsigned int bs) + unsigned long usec, unsigned long long bs, uint64_t offset) { + const bool needs_lock = td_async_processing(td); struct thread_stat *ts = &td->ts; if (!ddir_rw(ddir)) return; + if (needs_lock) + __td_io_u_lock(td); + add_stat_sample(&ts->slat_stat[ddir], usec); if (td->slat_log) - add_log_sample(td, td->slat_log, usec, ddir, bs); + add_log_sample(td, td->slat_log, sample_val(usec), ddir, bs, offset); + + if (needs_lock) + __td_io_u_unlock(td); } void add_lat_sample(struct thread_data *td, enum fio_ddir ddir, - unsigned long usec, unsigned int bs) + unsigned long long nsec, unsigned long long bs, + uint64_t offset) { + const bool needs_lock = td_async_processing(td); struct thread_stat *ts = &td->ts; if (!ddir_rw(ddir)) return; - add_stat_sample(&ts->lat_stat[ddir], usec); + if (needs_lock) + __td_io_u_lock(td); + + add_stat_sample(&ts->lat_stat[ddir], nsec); if (td->lat_log) - add_log_sample(td, td->lat_log, usec, ddir, bs); + add_log_sample(td, td->lat_log, sample_val(nsec), ddir, bs, + offset); + + if (ts->lat_percentiles) + add_clat_percentile_sample(ts, nsec, ddir); + + if (needs_lock) + __td_io_u_unlock(td); } -void add_bw_sample(struct thread_data *td, enum fio_ddir ddir, unsigned int bs, - struct timeval *t) +void add_bw_sample(struct thread_data *td, struct io_u *io_u, + unsigned int 
bytes, unsigned long long spent) { + const bool needs_lock = td_async_processing(td); struct thread_stat *ts = &td->ts; + unsigned long rate; + + if (spent) + rate = (unsigned long) (bytes * 1000000ULL / spent); + else + rate = 0; + + if (needs_lock) + __td_io_u_lock(td); + + add_stat_sample(&ts->bw_stat[io_u->ddir], rate); + + if (td->bw_log) + add_log_sample(td, td->bw_log, sample_val(rate), io_u->ddir, + bytes, io_u->offset); + + td->stat_io_bytes[io_u->ddir] = td->this_io_bytes[io_u->ddir]; + + if (needs_lock) + __td_io_u_unlock(td); +} + +static int __add_samples(struct thread_data *td, struct timespec *parent_tv, + struct timespec *t, unsigned int avg_time, + uint64_t *this_io_bytes, uint64_t *stat_io_bytes, + struct io_stat *stat, struct io_log *log, + bool is_kb) +{ + const bool needs_lock = td_async_processing(td); unsigned long spent, rate; + enum fio_ddir ddir; + unsigned long next, next_log; - if (!ddir_rw(ddir)) - return; + next_log = avg_time; - spent = mtime_since(&td->bw_sample_time, t); - if (spent < td->o.bw_avg_time) - return; + spent = mtime_since(parent_tv, t); + if (spent < avg_time && avg_time - spent >= LOG_MSEC_SLACK) + return avg_time - spent; + + if (needs_lock) + __td_io_u_lock(td); /* * Compute both read and write rates for the interval. 
*/ - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { uint64_t delta; - delta = td->this_io_bytes[ddir] - td->stat_io_bytes[ddir]; + delta = this_io_bytes[ddir] - stat_io_bytes[ddir]; if (!delta) continue; /* No entries for interval */ - rate = delta * 1000 / spent / 1024; - add_stat_sample(&ts->bw_stat[ddir], rate); + if (spent) { + if (is_kb) + rate = delta * 1000 / spent / 1024; /* KiB/s */ + else + rate = (delta * 1000) / spent; + } else + rate = 0; + + add_stat_sample(&stat[ddir], rate); + + if (log) { + unsigned long long bs = 0; - if (td->bw_log) - add_log_sample(td, td->bw_log, rate, ddir, bs); + if (td->o.min_bs[ddir] == td->o.max_bs[ddir]) + bs = td->o.min_bs[ddir]; - td->stat_io_bytes[ddir] = td->this_io_bytes[ddir]; + next = add_log_sample(td, log, sample_val(rate), ddir, bs, 0); + next_log = min(next_log, next); + } + + stat_io_bytes[ddir] = this_io_bytes[ddir]; } - fio_gettime(&td->bw_sample_time, NULL); + timespec_add_msec(parent_tv, avg_time); + + if (needs_lock) + __td_io_u_unlock(td); + + if (spent <= avg_time) + next = avg_time; + else + next = avg_time - (1 + spent - avg_time); + + return min(next, next_log); +} + +static int add_bw_samples(struct thread_data *td, struct timespec *t) +{ + return __add_samples(td, &td->bw_sample_time, t, td->o.bw_avg_time, + td->this_io_bytes, td->stat_io_bytes, + td->ts.bw_stat, td->bw_log, true); } -void add_iops_sample(struct thread_data *td, enum fio_ddir ddir, unsigned int bs, - struct timeval *t) +void add_iops_sample(struct thread_data *td, struct io_u *io_u, + unsigned int bytes) { + const bool needs_lock = td_async_processing(td); struct thread_stat *ts = &td->ts; - unsigned long spent, iops; - if (!ddir_rw(ddir)) - return; + if (needs_lock) + __td_io_u_lock(td); - spent = mtime_since(&td->iops_sample_time, t); - if (spent < td->o.iops_avg_time) - return; + add_stat_sample(&ts->iops_stat[io_u->ddir], 1); - /* - * Compute both read and write rates 
for the interval. - */ - for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) { - uint64_t delta; + if (td->iops_log) + add_log_sample(td, td->iops_log, sample_val(1), io_u->ddir, + bytes, io_u->offset); - delta = td->this_io_blocks[ddir] - td->stat_io_blocks[ddir]; - if (!delta) - continue; /* No entries for interval */ + td->stat_io_blocks[io_u->ddir] = td->this_io_blocks[io_u->ddir]; - iops = (delta * 1000) / spent; - add_stat_sample(&ts->iops_stat[ddir], iops); + if (needs_lock) + __td_io_u_unlock(td); +} - if (td->iops_log) - add_log_sample(td, td->iops_log, iops, ddir, bs); +static int add_iops_samples(struct thread_data *td, struct timespec *t) +{ + return __add_samples(td, &td->iops_sample_time, t, td->o.iops_avg_time, + td->this_io_blocks, td->stat_io_blocks, + td->ts.iops_stat, td->iops_log, false); +} - td->stat_io_blocks[ddir] = td->this_io_blocks[ddir]; +/* + * Returns msecs to next event + */ +int calc_log_samples(void) +{ + struct thread_data *td; + unsigned int next = ~0U, tmp; + struct timespec now; + int i; + + fio_gettime(&now, NULL); + + for_each_td(td, i) { + if (!td->o.stats) + continue; + if (in_ramp_time(td) || + !(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING)) { + next = min(td->o.iops_avg_time, td->o.bw_avg_time); + continue; + } + if (!td->bw_log || + (td->bw_log && !per_unit_log(td->bw_log))) { + tmp = add_bw_samples(td, &now); + if (tmp < next) + next = tmp; + } + if (!td->iops_log || + (td->iops_log && !per_unit_log(td->iops_log))) { + tmp = add_iops_samples(td, &now); + if (tmp < next) + next = tmp; + } } - fio_gettime(&td->iops_sample_time, NULL); + return next == ~0U ? 0 : next; } void stat_init(void) { - stat_mutex = fio_mutex_init(FIO_MUTEX_UNLOCKED); + stat_sem = fio_sem_init(FIO_SEM_UNLOCKED); } void stat_exit(void) @@ -1737,6 +2822,25 @@ * When we have the mutex, we know out-of-band access to it * have ended. 
*/ - fio_mutex_down(stat_mutex); - fio_mutex_remove(stat_mutex); + fio_sem_down(stat_sem); + fio_sem_remove(stat_sem); +} + +/* + * Called from signal handler. Wake up status thread. + */ +void show_running_run_stats(void) +{ + helper_do_stat(); +} + +uint32_t *io_u_block_info(struct thread_data *td, struct io_u *io_u) +{ + /* Ignore io_u's which span multiple blocks--they will just get + * inaccurate counts. */ + int idx = (io_u->offset - io_u->file->file_offset) + / td->o.bs[DDIR_TRIM]; + uint32_t *info = &td->ts.block_infos[idx]; + assert(idx < td->ts.nr_block_infos); + return info; } diff -Nru fio-2.1.3/stat.h fio-3.16/stat.h --- fio-2.1.3/stat.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/stat.h 2019-09-20 01:01:52.000000000 +0000 @@ -2,26 +2,39 @@ #define FIO_STAT_H #include "iolog.h" +#include "lib/output_buffer.h" struct group_run_stats { uint64_t max_run[DDIR_RWDIR_CNT], min_run[DDIR_RWDIR_CNT]; uint64_t max_bw[DDIR_RWDIR_CNT], min_bw[DDIR_RWDIR_CNT]; - uint64_t io_kb[DDIR_RWDIR_CNT]; + uint64_t iobytes[DDIR_RWDIR_CNT]; uint64_t agg[DDIR_RWDIR_CNT]; uint32_t kb_base; uint32_t unit_base; + uint32_t sig_figs; uint32_t groupid; uint32_t unified_rw_rep; -}; +} __attribute__((packed)); /* * How many depth levels to log */ #define FIO_IO_U_MAP_NR 7 +#define FIO_IO_U_LAT_N_NR 10 #define FIO_IO_U_LAT_U_NR 10 #define FIO_IO_U_LAT_M_NR 12 /* + * Constants for clat percentiles + */ +#define FIO_IO_U_PLAT_BITS 6 +#define FIO_IO_U_PLAT_VAL (1 << FIO_IO_U_PLAT_BITS) +#define FIO_IO_U_PLAT_GROUP_NR 29 +#define FIO_IO_U_PLAT_NR (FIO_IO_U_PLAT_GROUP_NR * FIO_IO_U_PLAT_VAL) +#define FIO_IO_U_LIST_MAX_LEN 20 /* The size of the default and user-specified + list of percentiles */ + +/* * Aggregate clat samples to report percentile(s) of them. * * EXECUTIVE SUMMARY @@ -32,7 +45,7 @@ * * FIO_IO_U_PLAT_GROUP_NR and FIO_IO_U_PLAT_BITS determine the maximum * range being tracked for latency samples. 
The maximum value tracked - * accurately will be 2^(GROUP_NR + PLAT_BITS -1) microseconds. + * accurately will be 2^(GROUP_NR + PLAT_BITS - 1) nanoseconds. * * FIO_IO_U_PLAT_GROUP_NR and FIO_IO_U_PLAT_BITS determine the memory * requirement of storing those aggregate counts. The memory used will @@ -96,24 +109,40 @@ * 3 8 2 [256,511] 64 * 4 9 3 [512,1023] 64 * ... ... ... [...,...] ... - * 18 23 17 [8838608,+inf]** 64 + * 28 33 27 [8589934592,+inf]** 64 * * * Special cases: when n < (M-1) or when n == (M-1), in both cases, * the value cannot be rounded off. Use all bits of the sample as * index. * - * ** If a sample's MSB is greater than 23, it will be counted as 23. + * ** If a sample's MSB is greater than 33, it will be counted as 33. */ -#define FIO_IO_U_PLAT_BITS 6 -#define FIO_IO_U_PLAT_VAL (1 << FIO_IO_U_PLAT_BITS) -#define FIO_IO_U_PLAT_GROUP_NR 19 -#define FIO_IO_U_PLAT_NR (FIO_IO_U_PLAT_GROUP_NR * FIO_IO_U_PLAT_VAL) -#define FIO_IO_U_LIST_MAX_LEN 20 /* The size of the default and user-specified - list of percentiles */ +/* + * Trim cycle count measurements + */ +#define MAX_NR_BLOCK_INFOS 8192 +#define BLOCK_INFO_STATE_SHIFT 29 +#define BLOCK_INFO_TRIMS(block_info) \ + ((block_info) & ((1 << BLOCK_INFO_STATE_SHIFT) - 1)) +#define BLOCK_INFO_STATE(block_info) \ + ((block_info) >> BLOCK_INFO_STATE_SHIFT) +#define BLOCK_INFO(state, trim_cycles) \ + ((trim_cycles) | ((unsigned int) (state) << BLOCK_INFO_STATE_SHIFT)) +#define BLOCK_INFO_SET_STATE(block_info, state) \ + BLOCK_INFO(state, BLOCK_INFO_TRIMS(block_info)) +enum block_info_state { + BLOCK_STATE_UNINIT, + BLOCK_STATE_TRIMMED, + BLOCK_STATE_WRITTEN, + BLOCK_STATE_TRIM_FAILURE, + BLOCK_STATE_WRITE_FAILURE, + BLOCK_STATE_COUNT, +}; #define MAX_PATTERN_SIZE 512 #define FIO_JOBNAME_SIZE 128 +#define FIO_JOBDESC_SIZE 256 #define FIO_VERROR_SIZE 128 struct thread_stat { @@ -123,13 +152,14 @@ uint32_t thread_number; uint32_t groupid; uint32_t pid; - char description[FIO_JOBNAME_SIZE]; + char 
description[FIO_JOBDESC_SIZE]; uint32_t members; uint32_t unified_rw_rep; /* * bandwidth and latency stats */ + struct io_stat sync_stat __attribute__((aligned(8)));/* fsync etc stats */ struct io_stat clat_stat[DDIR_RWDIR_CNT]; /* completion latency */ struct io_stat slat_stat[DDIR_RWDIR_CNT]; /* submission latency */ struct io_stat lat_stat[DDIR_RWDIR_CNT]; /* total latency */ @@ -147,18 +177,23 @@ /* * IO depth and latency stats */ - uint64_t clat_percentiles; + uint32_t clat_percentiles; + uint32_t lat_percentiles; uint64_t percentile_precision; fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN]; - uint32_t io_u_map[FIO_IO_U_MAP_NR]; - uint32_t io_u_submit[FIO_IO_U_MAP_NR]; - uint32_t io_u_complete[FIO_IO_U_MAP_NR]; - uint32_t io_u_lat_u[FIO_IO_U_LAT_U_NR]; - uint32_t io_u_lat_m[FIO_IO_U_LAT_M_NR]; - uint32_t io_u_plat[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR]; - uint64_t total_io_u[3]; - uint64_t short_io_u[3]; + uint64_t io_u_map[FIO_IO_U_MAP_NR]; + uint64_t io_u_submit[FIO_IO_U_MAP_NR]; + uint64_t io_u_complete[FIO_IO_U_MAP_NR]; + uint64_t io_u_lat_n[FIO_IO_U_LAT_N_NR]; + uint64_t io_u_lat_u[FIO_IO_U_LAT_U_NR]; + uint64_t io_u_lat_m[FIO_IO_U_LAT_M_NR]; + uint64_t io_u_plat[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR]; + uint64_t io_u_sync_plat[FIO_IO_U_PLAT_NR]; + + uint64_t total_io_u[DDIR_RWDIR_SYNC_CNT]; + uint64_t short_io_u[DDIR_RWDIR_CNT]; + uint64_t drop_io_u[DDIR_RWDIR_CNT]; uint64_t total_submit; uint64_t total_complete; @@ -169,69 +204,176 @@ /* * IO Error related stats */ - uint16_t continue_on_error; - uint64_t total_err_count; + union { + uint16_t continue_on_error; + uint32_t pad2; + }; uint32_t first_error; + uint64_t total_err_count; + + /* ZBD stats */ + uint64_t nr_zone_resets; + + uint64_t nr_block_infos; + uint32_t block_infos[MAX_NR_BLOCK_INFOS]; uint32_t kb_base; uint32_t unit_base; -}; -struct jobs_eta { - uint32_t nr_running; - uint32_t nr_ramp; - uint32_t nr_pending; - uint32_t nr_setting_up; - uint32_t files_open; - uint32_t m_rate[DDIR_RWDIR_CNT], 
t_rate[DDIR_RWDIR_CNT]; - uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT]; - uint32_t rate[DDIR_RWDIR_CNT]; - uint32_t iops[DDIR_RWDIR_CNT]; - uint64_t elapsed_sec; - uint64_t eta_sec; - uint32_t is_pow2; - uint32_t unit_base; + uint32_t latency_depth; + uint32_t pad3; + uint64_t latency_target; + fio_fp64_t latency_percentile; + uint64_t latency_window; + + uint32_t sig_figs; + + uint64_t ss_dur; + uint32_t ss_state; + uint32_t ss_head; + + fio_fp64_t ss_limit; + fio_fp64_t ss_slope; + fio_fp64_t ss_deviation; + fio_fp64_t ss_criterion; + + union { + uint64_t *ss_iops_data; + uint64_t pad4; + }; + + union { + uint64_t *ss_bw_data; + uint64_t pad5; + }; + + uint64_t cachehit; + uint64_t cachemiss; +} __attribute__((packed)); + +#define JOBS_ETA { \ + uint32_t nr_running; \ + uint32_t nr_ramp; \ + \ + uint32_t nr_pending; \ + uint32_t nr_setting_up; \ + \ + uint64_t m_rate[DDIR_RWDIR_CNT]; \ + uint64_t t_rate[DDIR_RWDIR_CNT]; \ + uint64_t rate[DDIR_RWDIR_CNT]; \ + uint32_t m_iops[DDIR_RWDIR_CNT]; \ + uint32_t t_iops[DDIR_RWDIR_CNT]; \ + uint32_t iops[DDIR_RWDIR_CNT]; \ + uint32_t pad; \ + uint64_t elapsed_sec; \ + uint64_t eta_sec; \ + uint32_t is_pow2; \ + uint32_t unit_base; \ + \ + uint32_t sig_figs; \ + \ + uint32_t files_open; \ + \ + /* \ + * Network 'copy' of run_str[] \ + */ \ + uint32_t nr_threads; \ + uint32_t pad2; \ + uint8_t run_str[]; \ +} - /* - * Network 'copy' of run_str[] - */ - uint32_t nr_threads; - uint8_t run_str[]; +struct jobs_eta JOBS_ETA; +struct jobs_eta_packed JOBS_ETA __attribute__((packed)); + +struct io_u_plat_entry { + struct flist_head list; + uint64_t io_u_plat[FIO_IO_U_PLAT_NR]; }; +extern struct fio_sem *stat_sem; + +extern struct jobs_eta *get_jobs_eta(bool force, size_t *size); + extern void stat_init(void); extern void stat_exit(void); -extern void show_thread_status(struct thread_stat *ts, struct group_run_stats *rs); -extern void show_group_stats(struct group_run_stats *rs); -extern int calc_thread_status(struct 
jobs_eta *je, int force); +extern struct json_object * show_thread_status(struct thread_stat *ts, struct group_run_stats *rs, struct flist_head *, struct buf_output *); +extern void show_group_stats(struct group_run_stats *rs, struct buf_output *); +extern bool calc_thread_status(struct jobs_eta *je, int force); extern void display_thread_status(struct jobs_eta *je); -extern void show_run_stats(void); +extern void __show_run_stats(void); +extern void __show_running_run_stats(void); extern void show_running_run_stats(void); extern void check_for_running_stats(void); -extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, int nr); +extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first); extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src); extern void init_thread_stat(struct thread_stat *ts); extern void init_group_run_stat(struct group_run_stats *gs); extern void eta_to_str(char *str, unsigned long eta_sec); -extern int calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max, double *mean, double *dev); -extern unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, fio_fp64_t *plist, unsigned int **output, unsigned int *maxv, unsigned int *minv); +extern bool calc_lat(struct io_stat *is, unsigned long long *min, unsigned long long *max, double *mean, double *dev); +extern unsigned int calc_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, fio_fp64_t *plist, unsigned long long **output, unsigned long long *maxv, unsigned long long *minv); +extern void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat); extern void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat); extern void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat); -extern void stat_calc_dist(unsigned int *map, unsigned long total, double *io_u_dist); - -static inline int usec_to_msec(unsigned long *min, unsigned long *max, - double *mean, double 
*dev) +extern void stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist); +extern void reset_io_stats(struct thread_data *); +extern void update_rusage_stat(struct thread_data *); +extern void clear_rusage_stat(struct thread_data *); + +extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long, + unsigned long long, uint64_t); +extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long, + unsigned long long, uint64_t); +extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long, + unsigned long long, uint64_t); +extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned long long); +extern void add_iops_sample(struct thread_data *, struct io_u *, + unsigned int); +extern void add_bw_sample(struct thread_data *, struct io_u *, + unsigned int, unsigned long long); +extern void add_sync_clat_sample(struct thread_stat *ts, + unsigned long long nsec); +extern int calc_log_samples(void); + +extern struct io_log *agg_io_log[DDIR_RWDIR_CNT]; +extern bool write_bw_log; + +static inline bool nsec_to_usec(unsigned long long *min, + unsigned long long *max, double *mean, + double *dev) { - if (*min > 1000 && *max > 1000 && *mean > 1000.0 && *dev > 1000.0) { + if (*min > 2000 && *max > 99999 && *dev > 1000.0) { *min /= 1000; *max /= 1000; *mean /= 1000.0; *dev /= 1000.0; - return 0; + return true; } - return 1; + return false; } +static inline bool nsec_to_msec(unsigned long long *min, + unsigned long long *max, double *mean, + double *dev) +{ + if (*min > 2000000 && *max > 99999999ULL && *dev > 1000000.0) { + *min /= 1000000; + *max /= 1000000; + *mean /= 1000000.0; + *dev /= 1000000.0; + return true; + } + + return false; +} + +/* + * Worst level condensing would be 1:5, so allow enough room for that + */ +#define __THREAD_RUNSTR_SZ(nr) ((nr) * 5) +#define THREAD_RUNSTR_SZ __THREAD_RUNSTR_SZ(thread_number) + +uint32_t *io_u_block_info(struct thread_data *td, struct io_u *io_u); 
+ #endif diff -Nru fio-2.1.3/steadystate.c fio-3.16/steadystate.c --- fio-2.1.3/steadystate.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/steadystate.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,379 @@ +#include + +#include "fio.h" +#include "steadystate.h" + +bool steadystate_enabled = false; + +void steadystate_free(struct thread_data *td) +{ + free(td->ss.iops_data); + free(td->ss.bw_data); + td->ss.iops_data = NULL; + td->ss.bw_data = NULL; +} + +static void steadystate_alloc(struct thread_data *td) +{ + td->ss.bw_data = calloc(td->ss.dur, sizeof(uint64_t)); + td->ss.iops_data = calloc(td->ss.dur, sizeof(uint64_t)); + + td->ss.state |= FIO_SS_DATA; +} + +void steadystate_setup(void) +{ + struct thread_data *td, *prev_td; + int i, prev_groupid; + + if (!steadystate_enabled) + return; + + /* + * if group reporting is enabled, identify the last td + * for each group and use it for storing steady state + * data + */ + prev_groupid = -1; + prev_td = NULL; + for_each_td(td, i) { + if (!td->ss.dur) + continue; + + if (!td->o.group_reporting) { + steadystate_alloc(td); + continue; + } + + if (prev_groupid != td->groupid) { + if (prev_td) + steadystate_alloc(prev_td); + prev_groupid = td->groupid; + } + prev_td = td; + } + + if (prev_td && prev_td->o.group_reporting) + steadystate_alloc(prev_td); +} + +static bool steadystate_slope(uint64_t iops, uint64_t bw, + struct thread_data *td) +{ + int i, j; + double result; + struct steadystate_data *ss = &td->ss; + uint64_t new_val; + + ss->bw_data[ss->tail] = bw; + ss->iops_data[ss->tail] = iops; + + if (ss->state & FIO_SS_IOPS) + new_val = iops; + else + new_val = bw; + + if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) { + if (!(ss->state & FIO_SS_BUFFER_FULL)) { + /* first time through */ + for(i = 0, ss->sum_y = 0; i < ss->dur; i++) { + if (ss->state & FIO_SS_IOPS) + ss->sum_y += ss->iops_data[i]; + else + ss->sum_y += ss->bw_data[i]; + j = (ss->head + i) % ss->dur; + if (ss->state & 
FIO_SS_IOPS) + ss->sum_xy += i * ss->iops_data[j]; + else + ss->sum_xy += i * ss->bw_data[j]; + } + ss->state |= FIO_SS_BUFFER_FULL; + } else { /* easy to update the sums */ + ss->sum_y -= ss->oldest_y; + ss->sum_y += new_val; + ss->sum_xy = ss->sum_xy - ss->sum_y + ss->dur * new_val; + } + + if (ss->state & FIO_SS_IOPS) + ss->oldest_y = ss->iops_data[ss->head]; + else + ss->oldest_y = ss->bw_data[ss->head]; + + /* + * calculate slope as (sum_xy - sum_x * sum_y / n) / (sum_(x^2) + * - (sum_x)^2 / n) This code assumes that all x values are + * equally spaced when they are often off by a few milliseconds. + * This assumption greatly simplifies the calculations. + */ + ss->slope = (ss->sum_xy - (double) ss->sum_x * ss->sum_y / ss->dur) / + (ss->sum_x_sq - (double) ss->sum_x * ss->sum_x / ss->dur); + if (ss->state & FIO_SS_PCT) + ss->criterion = 100.0 * ss->slope / (ss->sum_y / ss->dur); + else + ss->criterion = ss->slope; + + dprint(FD_STEADYSTATE, "sum_y: %llu, sum_xy: %llu, slope: %f, " + "criterion: %f, limit: %f\n", + (unsigned long long) ss->sum_y, + (unsigned long long) ss->sum_xy, + ss->slope, ss->criterion, ss->limit); + + result = ss->criterion * (ss->criterion < 0.0 ? 
-1.0 : 1.0); + if (result < ss->limit) + return true; + } + + ss->tail = (ss->tail + 1) % ss->dur; + if (ss->tail <= ss->head) + ss->head = (ss->head + 1) % ss->dur; + + return false; +} + +static bool steadystate_deviation(uint64_t iops, uint64_t bw, + struct thread_data *td) +{ + int i; + double diff; + double mean; + + struct steadystate_data *ss = &td->ss; + + ss->bw_data[ss->tail] = bw; + ss->iops_data[ss->tail] = iops; + + if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) { + if (!(ss->state & FIO_SS_BUFFER_FULL)) { + /* first time through */ + for(i = 0, ss->sum_y = 0; i < ss->dur; i++) + if (ss->state & FIO_SS_IOPS) + ss->sum_y += ss->iops_data[i]; + else + ss->sum_y += ss->bw_data[i]; + ss->state |= FIO_SS_BUFFER_FULL; + } else { /* easy to update the sum */ + ss->sum_y -= ss->oldest_y; + if (ss->state & FIO_SS_IOPS) + ss->sum_y += ss->iops_data[ss->tail]; + else + ss->sum_y += ss->bw_data[ss->tail]; + } + + if (ss->state & FIO_SS_IOPS) + ss->oldest_y = ss->iops_data[ss->head]; + else + ss->oldest_y = ss->bw_data[ss->head]; + + mean = (double) ss->sum_y / ss->dur; + ss->deviation = 0.0; + + for (i = 0; i < ss->dur; i++) { + if (ss->state & FIO_SS_IOPS) + diff = ss->iops_data[i] - mean; + else + diff = ss->bw_data[i] - mean; + ss->deviation = max(ss->deviation, diff * (diff < 0.0 ? 
-1.0 : 1.0)); + } + + if (ss->state & FIO_SS_PCT) + ss->criterion = 100.0 * ss->deviation / mean; + else + ss->criterion = ss->deviation; + + dprint(FD_STEADYSTATE, "sum_y: %llu, mean: %f, max diff: %f, " + "objective: %f, limit: %f\n", + (unsigned long long) ss->sum_y, mean, + ss->deviation, ss->criterion, ss->limit); + + if (ss->criterion < ss->limit) + return true; + } + + ss->tail = (ss->tail + 1) % ss->dur; + if (ss->tail <= ss->head) + ss->head = (ss->head + 1) % ss->dur; + + return false; +} + +void steadystate_check(void) +{ + int i, j, ddir, prev_groupid, group_ramp_time_over = 0; + unsigned long rate_time; + struct thread_data *td, *td2; + struct timespec now; + uint64_t group_bw = 0, group_iops = 0; + uint64_t td_iops, td_bytes; + bool ret; + + prev_groupid = -1; + for_each_td(td, i) { + const bool needs_lock = td_async_processing(td); + struct steadystate_data *ss = &td->ss; + + if (!ss->dur || td->runstate <= TD_SETTING_UP || + td->runstate >= TD_EXITED || !ss->state || + ss->state & FIO_SS_ATTAINED) + continue; + + td_iops = 0; + td_bytes = 0; + if (!td->o.group_reporting || + (td->o.group_reporting && td->groupid != prev_groupid)) { + group_bw = 0; + group_iops = 0; + group_ramp_time_over = 0; + } + prev_groupid = td->groupid; + + fio_gettime(&now, NULL); + if (ss->ramp_time && !(ss->state & FIO_SS_RAMP_OVER)) { + /* + * Begin recording data one second after ss->ramp_time + * has elapsed + */ + if (utime_since(&td->epoch, &now) >= (ss->ramp_time + 1000000L)) + ss->state |= FIO_SS_RAMP_OVER; + } + + if (needs_lock) + __td_io_u_lock(td); + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + td_iops += td->io_blocks[ddir]; + td_bytes += td->io_bytes[ddir]; + } + + if (needs_lock) + __td_io_u_unlock(td); + + rate_time = mtime_since(&ss->prev_time, &now); + memcpy(&ss->prev_time, &now, sizeof(now)); + + /* + * Begin monitoring when job starts but don't actually use + * data in checking stopping criterion until ss->ramp_time is + * over. 
This ensures that we will have a sane value in + * prev_iops/bw the first time through after ss->ramp_time + * is done. + */ + if (ss->state & FIO_SS_RAMP_OVER) { + group_bw += 1000 * (td_bytes - ss->prev_bytes) / rate_time; + group_iops += 1000 * (td_iops - ss->prev_iops) / rate_time; + ++group_ramp_time_over; + } + ss->prev_iops = td_iops; + ss->prev_bytes = td_bytes; + + if (td->o.group_reporting && !(ss->state & FIO_SS_DATA)) + continue; + + /* + * Don't begin checking criterion until ss->ramp_time is over + * for at least one thread in group + */ + if (!group_ramp_time_over) + continue; + + dprint(FD_STEADYSTATE, "steadystate_check() thread: %d, " + "groupid: %u, rate_msec: %ld, " + "iops: %llu, bw: %llu, head: %d, tail: %d\n", + i, td->groupid, rate_time, + (unsigned long long) group_iops, + (unsigned long long) group_bw, + ss->head, ss->tail); + + if (ss->state & FIO_SS_SLOPE) + ret = steadystate_slope(group_iops, group_bw, td); + else + ret = steadystate_deviation(group_iops, group_bw, td); + + if (ret) { + if (td->o.group_reporting) { + for_each_td(td2, j) { + if (td2->groupid == td->groupid) { + td2->ss.state |= FIO_SS_ATTAINED; + fio_mark_td_terminate(td2); + } + } + } else { + ss->state |= FIO_SS_ATTAINED; + fio_mark_td_terminate(td); + } + } + } +} + +int td_steadystate_init(struct thread_data *td) +{ + struct steadystate_data *ss = &td->ss; + struct thread_options *o = &td->o; + struct thread_data *td2; + int j; + + memset(ss, 0, sizeof(*ss)); + + if (o->ss_dur) { + steadystate_enabled = true; + o->ss_dur /= 1000000L; + + /* put all steady state info in one place */ + ss->dur = o->ss_dur; + ss->limit = o->ss_limit.u.f; + ss->ramp_time = o->ss_ramp_time; + + ss->state = o->ss_state; + if (!td->ss.ramp_time) + ss->state |= FIO_SS_RAMP_OVER; + + ss->sum_x = o->ss_dur * (o->ss_dur - 1) / 2; + ss->sum_x_sq = (o->ss_dur - 1) * (o->ss_dur) * (2*o->ss_dur - 1) / 6; + } + + /* make sure that ss options are consistent within reporting group */ + 
for_each_td(td2, j) { + if (td2->groupid == td->groupid) { + struct steadystate_data *ss2 = &td2->ss; + + if (ss2->dur != ss->dur || + ss2->limit != ss->limit || + ss2->ramp_time != ss->ramp_time || + ss2->state != ss->state || + ss2->sum_x != ss->sum_x || + ss2->sum_x_sq != ss->sum_x_sq) { + td_verror(td, EINVAL, "job rejected: steadystate options must be consistent within reporting groups"); + return 1; + } + } + } + + return 0; +} + +uint64_t steadystate_bw_mean(struct thread_stat *ts) +{ + int i; + uint64_t sum; + + if (!ts->ss_dur) + return 0; + + for (i = 0, sum = 0; i < ts->ss_dur; i++) + sum += ts->ss_bw_data[i]; + + return sum / ts->ss_dur; +} + +uint64_t steadystate_iops_mean(struct thread_stat *ts) +{ + int i; + uint64_t sum; + + if (!ts->ss_dur) + return 0; + + for (i = 0, sum = 0; i < ts->ss_dur; i++) + sum += ts->ss_iops_data[i]; + + return sum / ts->ss_dur; +} diff -Nru fio-2.1.3/steadystate.h fio-3.16/steadystate.h --- fio-2.1.3/steadystate.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/steadystate.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,69 @@ +#ifndef FIO_STEADYSTATE_H +#define FIO_STEADYSTATE_H + +#include "thread_options.h" + +extern void steadystate_free(struct thread_data *); +extern void steadystate_check(void); +extern void steadystate_setup(void); +extern int td_steadystate_init(struct thread_data *); +extern uint64_t steadystate_bw_mean(struct thread_stat *); +extern uint64_t steadystate_iops_mean(struct thread_stat *); + +extern bool steadystate_enabled; + +struct steadystate_data { + double limit; + unsigned long long dur; + unsigned long long ramp_time; + + uint32_t state; + + unsigned int head; + unsigned int tail; + uint64_t *iops_data; + uint64_t *bw_data; + + double slope; + double deviation; + double criterion; + + uint64_t sum_y; + uint64_t sum_x; + uint64_t sum_x_sq; + uint64_t sum_xy; + uint64_t oldest_y; + + struct timespec prev_time; + uint64_t prev_iops; + uint64_t prev_bytes; +}; + +enum { + __FIO_SS_IOPS = 0, + 
__FIO_SS_BW, + __FIO_SS_SLOPE, + __FIO_SS_ATTAINED, + __FIO_SS_RAMP_OVER, + __FIO_SS_DATA, + __FIO_SS_PCT, + __FIO_SS_BUFFER_FULL, +}; + +enum { + FIO_SS_IOPS = 1 << __FIO_SS_IOPS, + FIO_SS_BW = 1 << __FIO_SS_BW, + FIO_SS_SLOPE = 1 << __FIO_SS_SLOPE, + FIO_SS_ATTAINED = 1 << __FIO_SS_ATTAINED, + FIO_SS_RAMP_OVER = 1 << __FIO_SS_RAMP_OVER, + FIO_SS_DATA = 1 << __FIO_SS_DATA, + FIO_SS_PCT = 1 << __FIO_SS_PCT, + FIO_SS_BUFFER_FULL = 1 << __FIO_SS_BUFFER_FULL, + + FIO_SS_IOPS_SLOPE = FIO_SS_IOPS | FIO_SS_SLOPE, + FIO_SS_BW_SLOPE = FIO_SS_BW | FIO_SS_SLOPE, +}; + +#define STEADYSTATE_MSEC 1000 + +#endif diff -Nru fio-2.1.3/STEADYSTATE-TODO fio-3.16/STEADYSTATE-TODO --- fio-2.1.3/STEADYSTATE-TODO 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/STEADYSTATE-TODO 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,14 @@ +Known issues/TODO (for steady-state) + +- Allow user to specify the frequency of measurements + +- Better documentation for output + +- Report read, write, trim IOPS/BW separately + +- Semantics for the ring buffer ss->head are confusing. ss->head points + to the beginning of the buffer up through the point where the buffer + is filled for the first time. afterwards, when a new element is added, + ss->head is advanced to point to the second element in the buffer. if + steady state is attained upon adding a new element, ss->head is not + advanced so it actually does point to the head of the buffer. 
diff -Nru fio-2.1.3/t/arch.c fio-3.16/t/arch.c --- fio-2.1.3/t/arch.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/arch.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,5 @@ +#include "../arch/arch.h" + +unsigned long arch_flags = 0; +bool tsc_reliable; +int arch_random; diff -Nru fio-2.1.3/t/axmap.c fio-3.16/t/axmap.c --- fio-2.1.3/t/axmap.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/t/axmap.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,43 +1,141 @@ #include #include -#include -#include -#include #include #include "../lib/lfsr.h" #include "../lib/axmap.h" -void *smalloc(size_t size) +static int test_regular(uint64_t size, int seed) { - return malloc(size); + struct fio_lfsr lfsr; + struct axmap *map; + int err; + + printf("Using %llu entries...", (unsigned long long) size); + fflush(stdout); + + lfsr_init(&lfsr, size, seed, seed & 0xF); + map = axmap_new(size); + err = 0; + + while (size--) { + uint64_t val; + + if (lfsr_next(&lfsr, &val)) { + printf("lfsr: short loop\n"); + err = 1; + break; + } + if (axmap_isset(map, val)) { + printf("bit already set\n"); + err = 1; + break; + } + axmap_set(map, val); + if (!axmap_isset(map, val)) { + printf("bit not set\n"); + err = 1; + break; + } + } + + if (err) + return err; + + printf("pass!\n"); + axmap_free(map); + return 0; } -void sfree(void *ptr) +static int check_next_free(struct axmap *map, uint64_t start, uint64_t expected) { - free(ptr); + + uint64_t ff; + + ff = axmap_next_free(map, start); + if (ff != expected) { + printf("axmap_next_free broken: Expected %llu, got %llu\n", + (unsigned long long)expected, (unsigned long long) ff); + return 1; + } + return 0; } -static int test_regular(size_t size, int seed) +static int test_next_free(uint64_t size, int seed) { struct fio_lfsr lfsr; struct axmap *map; - size_t osize; - uint64_t ff; - int err; + uint64_t osize; + uint64_t ff, lastfree; + int err, i; - printf("Using %llu entries...", (unsigned long long) size); + printf("Test next_free %llu entries...", 
(unsigned long long) size); fflush(stdout); + map = axmap_new(size); + err = 0; + + + /* Empty map. Next free after 0 should be 1. */ + if (check_next_free(map, 0, 1)) + err = 1; + + /* Empty map. Next free after 63 should be 64. */ + if (check_next_free(map, 63, 64)) + err = 1; + + /* Empty map. Next free after size - 2 should be size - 1 */ + if (check_next_free(map, size - 2, size - 1)) + err = 1; + + /* Empty map. Next free after size - 1 should be 0 */ + if (check_next_free(map, size - 1, 0)) + err = 1; + + /* Empty map. Next free after 63 should be 64. */ + if (check_next_free(map, 63, 64)) + err = 1; + + + /* Bit 63 set. Next free after 62 should be 64. */ + axmap_set(map, 63); + if (check_next_free(map, 62, 64)) + err = 1; + + /* Last bit set. Next free after size - 2 should be 0. */ + axmap_set(map, size - 1); + if (check_next_free(map, size - 2, 0)) + err = 1; + + /* Last bit set. Next free after size - 1 should be 0. */ + if (check_next_free(map, size - 1, 0)) + err = 1; + + /* Last 64 bits set. Next free after size - 66 or size - 65 should be 0. */ + for (i=size - 65; i < size; i++) + axmap_set(map, i); + if (check_next_free(map, size - 66, 0)) + err = 1; + if (check_next_free(map, size - 65, 0)) + err = 1; + + /* Last 64 bits set. Next free after size - 67 should be size - 66. 
*/ + if (check_next_free(map, size - 67, size - 66)) + err = 1; + + axmap_free(map); + + /* Start with a fresh map and mostly fill it up */ lfsr_init(&lfsr, size, seed, seed & 0xF); map = axmap_new(size); osize = size; - err = 0; + /* Leave 1 entry free */ + size--; while (size--) { uint64_t val; - if (lfsr_next(&lfsr, &val, osize)) { + if (lfsr_next(&lfsr, &val)) { printf("lfsr: short loop\n"); err = 1; break; @@ -55,21 +153,50 @@ } } - if (err) - return err; + /* Get last free bit */ + lastfree = axmap_next_free(map, 0); + if (lastfree == -1ULL) { + printf("axmap_next_free broken: Couldn't find last free bit\n"); + err = 1; + } + + /* Start with last free bit and test wrap-around */ + ff = axmap_next_free(map, lastfree); + if (ff != lastfree) { + printf("axmap_next_free broken: wrap-around test #1 failed\n"); + err = 1; + } + + /* Start with last bit and test wrap-around */ + ff = axmap_next_free(map, osize - 1); + if (ff != lastfree) { + printf("axmap_next_free broken: wrap-around test #2 failed\n"); + err = 1; + } + + /* Set last free bit */ + axmap_set(map, lastfree); + ff = axmap_next_free(map, 0); + if (ff != -1ULL) { + printf("axmap_next_free broken: Expected -1 from full map\n"); + err = 1; + } ff = axmap_next_free(map, osize); - if (ff != (uint64_t) -1ULL) { - printf("axmap_next_free broken: got %llu\n", (unsigned long long) ff); - return 1; + if (ff != -1ULL) { + printf("axmap_next_free broken: Expected -1 from out of bounds request\n"); + err = 1; } + if (err) + return err; + printf("pass!\n"); axmap_free(map); return 0; } -static int test_multi(size_t size, unsigned int bit_off) +static int test_multi(uint64_t size, unsigned int bit_off) { unsigned int map_size = size; struct axmap *map; @@ -120,9 +247,155 @@ return err; } +struct overlap_test { + unsigned int start; + unsigned int nr; + unsigned int ret; +}; + +static int test_overlap(void) +{ + struct overlap_test tests[] = { + { + .start = 0, + .nr = 0, + .ret = 0, + }, + { + .start = 16, + .nr = 
16, + .ret = 16, + }, + { + .start = 16, + .nr = 0, + .ret = 0, + }, + { + .start = 0, + .nr = 32, + .ret = 16, + }, + { + .start = 48, + .nr = 32, + .ret = 32, + }, + { + .start = 32, + .nr = 32, + .ret = 16, + }, + { + .start = 79, + .nr = 1, + .ret = 0, + }, + { + .start = 80, + .nr = 21, + .ret = 21, + }, + { + .start = 102, + .nr = 1, + .ret = 1, + }, + { + .start = 101, + .nr = 3, + .ret = 1, + }, + { + .start = 106, + .nr = 4, + .ret = 4, + }, + { + .start = 105, + .nr = 3, + .ret = 1, + }, + { + .start = 120, + .nr = 4, + .ret = 4, + }, + { + .start = 118, + .nr = 2, + .ret = 2, + }, + { + .start = 118, + .nr = 2, + .ret = 0, + }, + { + .start = 1100, + .nr = 1, + .ret = 1, + }, + { + .start = 1000, + .nr = 256, + .ret = 100, + }, + { + .start = 22684, + .nr = 1, + .ret = 1, + }, + { + .start = 22670, + .nr = 60, + .ret = 14, + }, + { + .start = 22670, + .nr = 60, + .ret = 0, + }, + { + .start = -1U, + }, + }; + struct axmap *map; + int entries, i, ret, err = 0; + + entries = 0; + for (i = 0; tests[i].start != -1U; i++) { + unsigned int this = tests[i].start + tests[i].nr; + + if (this > entries) + entries = this; + } + + printf("Test overlaps...\n"); + fflush(stdout); + + map = axmap_new(entries); + + for (i = 0; tests[i].start != -1U; i++) { + struct overlap_test *t = &tests[i]; + + printf("\tstart=%6u, nr=%3u: ", t->start, t->nr); + ret = axmap_set_nr(map, t->start, t->nr); + if (ret != t->ret) { + printf("%3d (FAIL, wanted %d)\n", ret, t->ret); + err = 1; + break; + } + printf("%3d (PASS)\n", ret); + } + + axmap_free(map); + return err; +} + int main(int argc, char *argv[]) { - size_t size = (1UL << 23) - 200; + uint64_t size = (1ULL << 23) - 200; int seed = 1; if (argc > 1) { @@ -137,6 +410,18 @@ return 2; if (test_multi(size, 17)) return 3; + if (test_overlap()) + return 4; + if (test_next_free(size, seed)) + return 5; + + /* Test 3 levels, all full: 64*64*64 */ + if (test_next_free(64*64*64, seed)) + return 6; + + /* Test 4 levels, with 2 inner 
levels not full */ + if (test_next_free(((((64*64)-63)*64)-63)*64*12, seed)) + return 7; return 0; } diff -Nru fio-2.1.3/t/btrace2fio.c fio-3.16/t/btrace2fio.c --- fio-2.1.3/t/btrace2fio.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/btrace2fio.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,1144 @@ +#include +#include +#include +#include +#include + +#include "../io_ddir.h" +#include "../flist.h" +#include "../hash.h" +#include "../fifo.h" +#include "../blktrace_api.h" +#include "../os/os.h" +#include "../log.h" +#include "../minmax.h" +#include "../oslib/linux-dev-lookup.h" + +#define TRACE_FIFO_SIZE 8192 + +static unsigned int rt_threshold = 1000000; +static unsigned int ios_threshold = 10; +static unsigned int rate_threshold; +static unsigned int set_rate; +static unsigned int max_depth = 256; +static int output_ascii = 1; +static char *filename; + +static char **add_opts; +static int n_add_opts; + +/* + * Collapse defaults + */ +static unsigned int collapse_entries = 0; +static unsigned int depth_diff = 1; +static unsigned int random_diff = 5; + +struct bs { + unsigned int bs; + unsigned int nr; + int merges; +}; + +struct trace_file { + char *name; + int major, minor; +}; + +struct btrace_out { + unsigned long ios[DDIR_RWDIR_CNT]; + unsigned long merges[DDIR_RWDIR_CNT]; + + uint64_t last_end[DDIR_RWDIR_CNT]; + uint64_t seq[DDIR_RWDIR_CNT]; + + struct bs *bs[DDIR_RWDIR_CNT]; + unsigned int nr_bs[DDIR_RWDIR_CNT]; + + int inflight; + unsigned int depth; + int depth_disabled; + int complete_seen; + + uint64_t first_ttime[DDIR_RWDIR_CNT]; + uint64_t last_ttime[DDIR_RWDIR_CNT]; + uint64_t kib[DDIR_RWDIR_CNT]; + + uint64_t start_delay; +}; + +struct btrace_pid { + struct flist_head hash_list; + struct flist_head pid_list; + pid_t pid; + + pid_t *merge_pids; + unsigned int nr_merge_pids; + + struct trace_file *files; + int nr_files; + unsigned int last_major, last_minor; + int numjobs; + int ignore; + + struct btrace_out o; +}; + +struct inflight { + struct 
flist_head list; + struct btrace_pid *p; + uint64_t end_sector; +}; + +#define PID_HASH_BITS 10 +#define PID_HASH_SIZE (1U << PID_HASH_BITS) + +static struct flist_head pid_hash[PID_HASH_SIZE]; +static FLIST_HEAD(pid_list); + +#define INFLIGHT_HASH_BITS 8 +#define INFLIGHT_HASH_SIZE (1U << INFLIGHT_HASH_BITS) +static struct flist_head inflight_hash[INFLIGHT_HASH_SIZE]; + +static uint64_t first_ttime = -1ULL; + +static struct inflight *inflight_find(uint64_t sector) +{ + struct flist_head *inflight_list; + struct flist_head *e; + + inflight_list = &inflight_hash[hash_long(sector, INFLIGHT_HASH_BITS)]; + + flist_for_each(e, inflight_list) { + struct inflight *i = flist_entry(e, struct inflight, list); + + if (i->end_sector == sector) + return i; + } + + return NULL; +} + +static void inflight_remove(struct inflight *i) +{ + struct btrace_out *o = &i->p->o; + + o->inflight--; + assert(o->inflight >= 0); + flist_del(&i->list); + free(i); +} + +static void __inflight_add(struct inflight *i) +{ + struct flist_head *list; + + list = &inflight_hash[hash_long(i->end_sector, INFLIGHT_HASH_BITS)]; + flist_add_tail(&i->list, list); +} + +static void inflight_add(struct btrace_pid *p, uint64_t sector, uint32_t len) +{ + struct btrace_out *o = &p->o; + struct inflight *i; + + i = calloc(1, sizeof(*i)); + i->p = p; + o->inflight++; + if (!o->depth_disabled) { + o->depth = max((int) o->depth, o->inflight); + if (o->depth >= max_depth && !o->complete_seen) { + o->depth_disabled = 1; + o->depth = max_depth; + } + } + i->end_sector = sector + (len >> 9); + __inflight_add(i); +} + +static void inflight_merge(struct inflight *i, int rw, unsigned int size) +{ + i->p->o.merges[rw]++; + if (size) { + i->end_sector += (size >> 9); + flist_del(&i->list); + __inflight_add(i); + } +} + +/* + * fifo refill frontend, to avoid reading data in trace sized bites + */ +static int refill_fifo(struct fifo *fifo, int fd) +{ + char buf[TRACE_FIFO_SIZE]; + unsigned int total; + int ret; + + total = 
sizeof(buf); + if (total > fifo_room(fifo)) + total = fifo_room(fifo); + + ret = read(fd, buf, total); + if (ret < 0) { + perror("read refill"); + return -1; + } + + if (ret > 0) + ret = fifo_put(fifo, buf, ret); + + return ret; +} + +/* + * Retrieve 'len' bytes from the fifo, refilling if necessary. + */ +static int trace_fifo_get(struct fifo *fifo, int fd, void *buf, + unsigned int len) +{ + if (fifo_len(fifo) < len) { + int ret = refill_fifo(fifo, fd); + + if (ret < 0) + return ret; + } + + return fifo_get(fifo, buf, len); +} + +/* + * Just discard the pdu by seeking past it. + */ +static int discard_pdu(struct fifo *fifo, int fd, struct blk_io_trace *t) +{ + if (t->pdu_len == 0) + return 0; + + return trace_fifo_get(fifo, fd, NULL, t->pdu_len); +} + +static int handle_trace_notify(struct blk_io_trace *t) +{ + switch (t->action) { + case BLK_TN_PROCESS: + //printf("got process notify: %x, %d\n", t->action, t->pid); + break; + case BLK_TN_TIMESTAMP: + //printf("got timestamp notify: %x, %d\n", t->action, t->pid); + break; + case BLK_TN_MESSAGE: + break; + default: + log_err("unknown trace act %x\n", t->action); + return 1; + } + + return 0; +} + +static void __add_bs(struct btrace_out *o, unsigned int len, int rw) +{ + o->bs[rw] = realloc(o->bs[rw], (o->nr_bs[rw] + 1) * sizeof(struct bs)); + o->bs[rw][o->nr_bs[rw]].bs = len; + o->bs[rw][o->nr_bs[rw]].nr = 1; + o->nr_bs[rw]++; +} + +static void add_bs(struct btrace_out *o, unsigned int len, int rw) +{ + struct bs *bs = o->bs[rw]; + int i; + + if (!o->nr_bs[rw]) { + __add_bs(o, len, rw); + return; + } + + for (i = 0; i < o->nr_bs[rw]; i++) { + if (bs[i].bs == len) { + bs[i].nr++; + return; + } + } + + __add_bs(o, len, rw); +} + +#define FMINORBITS 20 +#define FMINORMASK ((1U << FMINORBITS) - 1) +#define FMAJOR(dev) ((unsigned int) ((dev) >> FMINORBITS)) +#define FMINOR(dev) ((unsigned int) ((dev) & FMINORMASK)) + +static int btrace_add_file(struct btrace_pid *p, uint32_t devno) +{ + unsigned int maj = 
FMAJOR(devno); + unsigned int min = FMINOR(devno); + struct trace_file *f; + unsigned int i; + char dev[256]; + + if (filename) + return 0; + if (p->last_major == maj && p->last_minor == min) + return 0; + + p->last_major = maj; + p->last_minor = min; + + /* + * check for this file in our list + */ + for (i = 0; i < p->nr_files; i++) { + f = &p->files[i]; + + if (f->major == maj && f->minor == min) + return 0; + } + + strcpy(dev, "/dev"); + if (!blktrace_lookup_device(NULL, dev, maj, min)) { + log_err("fio: failed to find device %u/%u\n", maj, min); + if (!output_ascii) { + log_err("fio: use -d to specify device\n"); + return 1; + } + return 0; + } + + p->files = realloc(p->files, (p->nr_files + 1) * sizeof(*f)); + f = &p->files[p->nr_files]; + f->name = strdup(dev); + f->major = maj; + f->minor = min; + p->nr_files++; + return 0; +} + +static int t_to_rwdir(struct blk_io_trace *t) +{ + if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) + return DDIR_TRIM; + + return (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; +} + +static int handle_trace_discard(struct blk_io_trace *t, struct btrace_pid *p) +{ + struct btrace_out *o = &p->o; + + if (btrace_add_file(p, t->device)) + return 1; + + if (o->first_ttime[2] == -1ULL) + o->first_ttime[2] = t->time; + + o->ios[DDIR_TRIM]++; + add_bs(o, t->bytes, DDIR_TRIM); + return 0; +} + +static int handle_trace_fs(struct blk_io_trace *t, struct btrace_pid *p) +{ + struct btrace_out *o = &p->o; + int rw; + + if (btrace_add_file(p, t->device)) + return 1; + + first_ttime = min(first_ttime, (uint64_t) t->time); + + rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; + + if (o->first_ttime[rw] == -1ULL) + o->first_ttime[rw] = t->time; + + add_bs(o, t->bytes, rw); + o->ios[rw]++; + + if (t->sector == o->last_end[rw] || o->last_end[rw] == -1ULL) + o->seq[rw]++; + + o->last_end[rw] = t->sector + (t->bytes >> 9); + return 0; +} + +static int handle_queue_trace(struct blk_io_trace *t, struct btrace_pid *p) +{ + if (t->action & 
BLK_TC_ACT(BLK_TC_NOTIFY)) + return handle_trace_notify(t); + else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) + return handle_trace_discard(t, p); + else + return handle_trace_fs(t, p); +} + +static int handle_trace(struct blk_io_trace *t, struct btrace_pid *p) +{ + unsigned int act = t->action & 0xffff; + int ret = 0; + + if (act == __BLK_TA_QUEUE) { + inflight_add(p, t->sector, t->bytes); + ret = handle_queue_trace(t, p); + } else if (act == __BLK_TA_BACKMERGE) { + struct inflight *i; + + i = inflight_find(t->sector + (t->bytes >> 9)); + if (i) + inflight_remove(i); + + i = inflight_find(t->sector); + if (i) + inflight_merge(i, t_to_rwdir(t), t->bytes); + } else if (act == __BLK_TA_FRONTMERGE) { + struct inflight *i; + + i = inflight_find(t->sector + (t->bytes >> 9)); + if (i) + inflight_remove(i); + + i = inflight_find(t->sector); + if (i) + inflight_merge(i, t_to_rwdir(t), 0); + } else if (act == __BLK_TA_COMPLETE) { + struct inflight *i; + + i = inflight_find(t->sector + (t->bytes >> 9)); + if (i) { + i->p->o.kib[t_to_rwdir(t)] += (t->bytes >> 10); + i->p->o.complete_seen = 1; + inflight_remove(i); + } + } + + return ret; +} + +static void byteswap_trace(struct blk_io_trace *t) +{ + t->magic = fio_swap32(t->magic); + t->sequence = fio_swap32(t->sequence); + t->time = fio_swap64(t->time); + t->sector = fio_swap64(t->sector); + t->bytes = fio_swap32(t->bytes); + t->action = fio_swap32(t->action); + t->pid = fio_swap32(t->pid); + t->device = fio_swap32(t->device); + t->cpu = fio_swap32(t->cpu); + t->error = fio_swap16(t->error); + t->pdu_len = fio_swap16(t->pdu_len); +} + +static struct btrace_pid *pid_hash_find(pid_t pid, struct flist_head *list) +{ + struct flist_head *e; + struct btrace_pid *p; + + flist_for_each(e, list) { + p = flist_entry(e, struct btrace_pid, hash_list); + if (p->pid == pid) + return p; + } + + return NULL; +} + +static struct btrace_pid *pid_hash_get(pid_t pid) +{ + struct flist_head *hash_list; + struct btrace_pid *p; + + hash_list = 
&pid_hash[hash_long(pid, PID_HASH_BITS)]; + + p = pid_hash_find(pid, hash_list); + if (!p) { + int i; + + p = calloc(1, sizeof(*p)); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + p->o.first_ttime[i] = -1ULL; + p->o.last_ttime[i] = -1ULL; + p->o.last_end[i] = -1ULL; + } + + p->pid = pid; + p->numjobs = 1; + flist_add_tail(&p->hash_list, hash_list); + flist_add_tail(&p->pid_list, &pid_list); + } + + return p; +} + +/* + * Load a blktrace file by reading all the blk_io_trace entries, and storing + * them as io_pieces like the fio text version would do. + */ +static int load_blktrace(const char *fname, int need_swap) +{ + struct btrace_pid *p; + unsigned long traces; + struct blk_io_trace t; + struct fifo *fifo; + int fd, ret = 0; + + fd = open(fname, O_RDONLY); + if (fd < 0) { + perror("open trace file\n"); + return 1; + } + + fifo = fifo_alloc(TRACE_FIFO_SIZE); + + traces = 0; + do { + ret = trace_fifo_get(fifo, fd, &t, sizeof(t)); + if (ret < 0) + goto err; + else if (!ret) + break; + else if (ret < (int) sizeof(t)) { + log_err("fio: short fifo get\n"); + break; + } + + if (need_swap) + byteswap_trace(&t); + + if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) { + log_err("fio: bad magic in blktrace data: %x\n", t.magic); + goto err; + } + if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) { + log_err("fio: bad blktrace version %d\n", t.magic & 0xff); + goto err; + } + ret = discard_pdu(fifo, fd, &t); + if (ret < 0) { + log_err("blktrace lseek\n"); + goto err; + } else if (t.pdu_len != ret) { + log_err("fio: discarded %d of %d\n", ret, t.pdu_len); + goto err; + } + + p = pid_hash_get(t.pid); + ret = handle_trace(&t, p); + if (ret) + break; + p->o.last_ttime[t_to_rwdir(&t)] = t.time; + traces++; + } while (1); + + fifo_free(fifo); + close(fd); + + if (ret) + return ret; + + if (output_ascii) + printf("Traces loaded: %lu\n", traces); + + return 0; +err: + close(fd); + fifo_free(fifo); + return 1; +} + +static int bs_cmp(const void *ba, const void *bb) +{ + const struct bs 
*bsa = ba; + const struct bs *bsb = bb; + + return bsb->nr - bsa->nr; +} + +static unsigned long o_to_kib_rate(struct btrace_out *o, int rw) +{ + uint64_t usec = (o->last_ttime[rw] - o->first_ttime[rw]) / 1000ULL; + uint64_t val; + + if (!usec) + return 0; + + usec /= 1000; + if (!usec) + return 0; + + val = o->kib[rw] * 1000ULL; + return val / usec; +} + +static uint64_t o_first_ttime(struct btrace_out *o) +{ + uint64_t first; + + first = min(o->first_ttime[0], o->first_ttime[1]); + return min(first, o->first_ttime[2]); +} + +static uint64_t o_longest_ttime(struct btrace_out *o) +{ + uint64_t ret = 0; + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + uint64_t diff; + + diff = o->last_ttime[i] - o->first_ttime[i]; + ret = max(diff, ret); + } + + return ret; +} + +static void __output_p_ascii(struct btrace_pid *p, unsigned long *ios) +{ + const char *msg[] = { "reads", "writes", "trims" }; + struct btrace_out *o = &p->o; + unsigned long total, usec; + int i, j; + + printf("[pid:\t%u", p->pid); + if (p->nr_merge_pids) + for (i = 0; i < p->nr_merge_pids; i++) + printf(", %u", p->merge_pids[i]); + printf("]\n"); + + total = ddir_rw_sum(o->ios); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + float perc; + + if (!o->ios[i]) + continue; + + ios[i] += o->ios[i] + o->merges[i]; + printf("%s\n", msg[i]); + perc = ((float) o->ios[i] * 100.0) / (float) total; + printf("\tios: %lu (perc=%3.2f%%)\n", o->ios[i], perc); + perc = ((float) o->merges[i] * 100.0) / (float) total; + printf("\tmerges: %lu (perc=%3.2f%%)\n", o->merges[i], perc); + perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i]; + printf("\tseq: %lu (perc=%3.2f%%)\n", (unsigned long) o->seq[i], perc); + printf("\trate: %lu KiB/sec\n", o_to_kib_rate(o, i)); + + for (j = 0; j < o->nr_bs[i]; j++) { + struct bs *bs = &o->bs[i][j]; + + perc = (((float) bs->nr * 100.0) / (float) o->ios[i]); + printf("\tbs=%u, perc=%3.2f%%\n", bs->bs, perc); + } + } + + printf("depth:\t%u\n", o->depth); + usec = o_longest_ttime(o) / 
1000ULL; + printf("usec:\t%lu (delay=%llu)\n", usec, (unsigned long long) o->start_delay); + + printf("files:\t"); + for (i = 0; i < p->nr_files; i++) + printf("%s,", p->files[i].name); + printf("\n"); + + printf("\n"); +} + +static int __output_p_fio(struct btrace_pid *p, unsigned long *ios) +{ + struct btrace_out *o = &p->o; + unsigned long total; + unsigned long long time; + float perc; + int i, j; + + if ((o->ios[0] + o->ios[1]) && o->ios[2]) { + log_err("fio: trace has both read/write and trim\n"); + return 1; + } + if (!p->nr_files) { + log_err("fio: no devices found\n"); + return 1; + } + + printf("[pid%u", p->pid); + if (p->nr_merge_pids) + for (i = 0; i < p->nr_merge_pids; i++) + printf(",pid%u", p->merge_pids[i]); + printf("]\n"); + + printf("numjobs=%u\n", p->numjobs); + printf("direct=1\n"); + if (o->depth == 1) + printf("ioengine=sync\n"); + else + printf("ioengine=libaio\niodepth=%u\n", o->depth); + + if (o->ios[0] && !o->ios[1]) + printf("rw=randread\n"); + else if (!o->ios[0] && o->ios[1]) + printf("rw=randwrite\n"); + else if (o->ios[2]) + printf("rw=randtrim\n"); + else { + printf("rw=randrw\n"); + total = ddir_rw_sum(o->ios); + perc = ((float) o->ios[0] * 100.0) / (float) total; + printf("rwmixread=%u\n", (int) floor(perc + 0.50)); + } + + printf("percentage_random="); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (o->seq[i] && o->ios[i]) { + perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i]; + if (perc >= 99.0) + perc = 100.0; + } else + perc = 100.0; + + if (i) + printf(","); + perc = 100.0 - perc; + printf("%u", (int) floor(perc + 0.5)); + } + printf("\n"); + + printf("filename="); + for (i = 0; i < p->nr_files; i++) { + if (i) + printf(":"); + printf("%s", p->files[i].name); + } + printf("\n"); + + if (o->start_delay / 1000000ULL) + printf("startdelay=%llus\n", o->start_delay / 1000000ULL); + + time = o_longest_ttime(o); + time = (time + 1000000000ULL - 1) / 1000000000ULL; + printf("runtime=%llus\n", time); + + printf("bssplit="); + for 
(i = 0; i < DDIR_RWDIR_CNT; i++) { + + if (i && o->nr_bs[i - 1] && o->nr_bs[i]) + printf(","); + + for (j = 0; j < o->nr_bs[i]; j++) { + struct bs *bs = &o->bs[i][j]; + + perc = (((float) bs->nr * 100.0) / (float) o->ios[i]); + if (perc < 1.00) + continue; + if (j) + printf(":"); + if (j + 1 == o->nr_bs[i]) + printf("%u/", bs->bs); + else + printf("%u/%u", bs->bs, (int) floor(perc + 0.5)); + } + } + printf("\n"); + + if (set_rate) { + printf("rate="); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + unsigned long rate; + + rate = o_to_kib_rate(o, i); + if (i) + printf(","); + if (rate) + printf("%luk", rate); + } + printf("\n"); + } + + if (n_add_opts) + for (i = 0; i < n_add_opts; i++) + printf("%s\n", add_opts[i]); + + printf("\n"); + return 0; +} + +static int __output_p(struct btrace_pid *p, unsigned long *ios) +{ + struct btrace_out *o = &p->o; + int i, ret = 0; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (o->nr_bs[i] <= 1) + continue; + qsort(o->bs[i], o->nr_bs[i], sizeof(struct bs), bs_cmp); + } + + if (filename) { + p->files = malloc(sizeof(struct trace_file)); + p->nr_files++; + p->files[0].name = filename; + } + + if (output_ascii) + __output_p_ascii(p, ios); + else + ret = __output_p_fio(p, ios); + + return ret; +} + +static void remove_ddir(struct btrace_out *o, int rw) +{ + o->ios[rw] = 0; +} + +static int prune_entry(struct btrace_out *o) +{ + unsigned long rate; + uint64_t time; + int i; + + if (ddir_rw_sum(o->ios) < ios_threshold) + return 1; + + time = o_longest_ttime(o) / 1000ULL; + if (time < rt_threshold) + return 1; + + rate = 0; + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + unsigned long this_rate; + + this_rate = o_to_kib_rate(o, i); + if (this_rate < rate_threshold) { + remove_ddir(o, i); + this_rate = 0; + } + rate += this_rate; + } + + if (rate < rate_threshold) + return 1; + + return 0; +} + +static int entry_cmp(void *priv, struct flist_head *a, struct flist_head *b) +{ + struct btrace_pid *pa = flist_entry(a, struct btrace_pid, pid_list); + 
struct btrace_pid *pb = flist_entry(b, struct btrace_pid, pid_list); + + return ddir_rw_sum(pb->o.ios) - ddir_rw_sum(pa->o.ios); +} + +static void free_p(struct btrace_pid *p) +{ + struct btrace_out *o = &p->o; + int i; + + for (i = 0; i < p->nr_files; i++) { + if (p->files[i].name && p->files[i].name != filename) + free(p->files[i].name); + } + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + free(o->bs[i]); + + free(p->files); + flist_del(&p->pid_list); + flist_del(&p->hash_list); + free(p); +} + +static int entries_close(struct btrace_pid *pida, struct btrace_pid *pidb) +{ + float perca, percb, fdiff; + int i, idiff; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if ((pida->o.ios[i] && !pidb->o.ios[i]) || + (pidb->o.ios[i] && !pida->o.ios[i])) + return 0; + if (pida->o.ios[i] && pidb->o.ios[i]) { + perca = ((float) pida->o.seq[i] * 100.0) / (float) pida->o.ios[i]; + percb = ((float) pidb->o.seq[i] * 100.0) / (float) pidb->o.ios[i]; + fdiff = perca - percb; + if (fabs(fdiff) > random_diff) + return 0; + } + + idiff = pida->o.depth - pidb->o.depth; + if (abs(idiff) > depth_diff) + return 0; + } + + return 1; +} + +static void merge_bs(struct bs **bsap, unsigned int *nr_bsap, + struct bs *bsb, unsigned int nr_bsb) +{ + struct bs *bsa = *bsap; + unsigned int nr_bsa = *nr_bsap; + int a, b; + + for (b = 0; b < nr_bsb; b++) { + int next, found = 0; + + for (a = 0; a < nr_bsa; a++) { + if (bsb[b].bs != bsa[a].bs) + continue; + + bsa[a].nr += bsb[b].nr; + bsa[a].merges += bsb[b].merges; + found = 1; + break; + } + + if (found) + continue; + + next = *nr_bsap; + bsa = realloc(bsa, (next + 1) * sizeof(struct bs)); + bsa[next].bs = bsb[b].bs; + bsa[next].nr = bsb[b].nr; + (*nr_bsap)++; + *bsap = bsa; + } +} + +static int merge_entries(struct btrace_pid *pida, struct btrace_pid *pidb) +{ + int i; + + if (!entries_close(pida, pidb)) + return 0; + + pida->nr_merge_pids++; + pida->merge_pids = realloc(pida->merge_pids, pida->nr_merge_pids * sizeof(pid_t)); + 
pida->merge_pids[pida->nr_merge_pids - 1] = pidb->pid; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + struct btrace_out *oa = &pida->o; + struct btrace_out *ob = &pidb->o; + + oa->ios[i] += ob->ios[i]; + oa->merges[i] += ob->merges[i]; + oa->seq[i] += ob->seq[i]; + oa->kib[i] += ob->kib[i]; + oa->first_ttime[i] = min(oa->first_ttime[i], ob->first_ttime[i]); + oa->last_ttime[i] = max(oa->last_ttime[i], ob->last_ttime[i]); + merge_bs(&oa->bs[i], &oa->nr_bs[i], ob->bs[i], ob->nr_bs[i]); + } + + pida->o.start_delay = min(pida->o.start_delay, pidb->o.start_delay); + pida->o.depth = (pida->o.depth + pidb->o.depth) / 2; + return 1; +} + +static void check_merges(struct btrace_pid *p, struct flist_head *pidlist) +{ + struct flist_head *e, *tmp; + + if (p->ignore) + return; + + flist_for_each_safe(e, tmp, pidlist) { + struct btrace_pid *pidb; + + pidb = flist_entry(e, struct btrace_pid, pid_list); + if (pidb == p) + continue; + + if (merge_entries(p, pidb)) { + pidb->ignore = 1; + p->numjobs++; + } + } +} + +static int output_p(void) +{ + unsigned long ios[DDIR_RWDIR_CNT]; + struct flist_head *e, *tmp; + int depth_disabled = 0; + int ret = 0; + + flist_for_each_safe(e, tmp, &pid_list) { + struct btrace_pid *p; + + p = flist_entry(e, struct btrace_pid, pid_list); + if (prune_entry(&p->o)) { + free_p(p); + continue; + } + p->o.start_delay = (o_first_ttime(&p->o) / 1000ULL) - first_ttime; + depth_disabled += p->o.depth_disabled; + } + + if (collapse_entries) { + struct btrace_pid *p; + + flist_for_each_safe(e, tmp, &pid_list) { + p = flist_entry(e, struct btrace_pid, pid_list); + check_merges(p, &pid_list); + } + + flist_for_each_safe(e, tmp, &pid_list) { + p = flist_entry(e, struct btrace_pid, pid_list); + if (p->ignore) + free_p(p); + } + } + + if (depth_disabled) + log_err("fio: missing completion traces, depths capped at %u\n", max_depth); + + memset(ios, 0, sizeof(ios)); + + flist_sort(NULL, &pid_list, entry_cmp); + + flist_for_each(e, &pid_list) { + struct btrace_pid *p; + 
+ p = flist_entry(e, struct btrace_pid, pid_list); + ret |= __output_p(p, ios); + if (ret && !output_ascii) + break; + } + + if (output_ascii) + printf("Total: reads=%lu, writes=%lu\n", ios[0], ios[1]); + + return ret; +} + +static int usage(char *argv[]) +{ + log_err("%s: [options] \n", argv[0]); + log_err("\t-t\tUsec threshold to ignore task\n"); + log_err("\t-n\tNumber IOS threshold to ignore task\n"); + log_err("\t-f\tFio job file output\n"); + log_err("\t-d\tUse this file/device for replay\n"); + log_err("\t-r\tIgnore jobs with less than this KiB/sec rate\n"); + log_err("\t-R\tSet rate in fio job (def=%u)\n", set_rate); + log_err("\t-D\tCap queue depth at this value (def=%u)\n", max_depth); + log_err("\t-c\tCollapse \"identical\" jobs (def=%u)\n", collapse_entries); + log_err("\t-u\tDepth difference for collapse (def=%u)\n", depth_diff); + log_err("\t-x\tRandom difference for collapse (def=%u)\n", random_diff); + log_err("\t-a\tAdditional fio option to add to job file\n"); + return 1; +} + +static int trace_needs_swap(const char *trace_file, int *swap) +{ + struct blk_io_trace t; + int fd, ret; + + *swap = -1; + + fd = open(trace_file, O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + ret = read(fd, &t, sizeof(t)); + if (ret < 0) { + close(fd); + perror("read"); + return 1; + } else if (ret != sizeof(t)) { + close(fd); + log_err("fio: short read on trace file\n"); + return 1; + } + + close(fd); + + if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) + *swap = 0; + else { + /* + * Maybe it needs to be endian swapped... 
+ */ + t.magic = fio_swap32(t.magic); + if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) + *swap = 1; + } + + if (*swap == -1) { + log_err("fio: blktrace appears corrupt\n"); + return 1; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + int need_swap, i, c; + + if (argc < 2) + return usage(argv); + + while ((c = getopt(argc, argv, "t:n:fd:r:RD:c:u:x:a:")) != -1) { + switch (c) { + case 'R': + set_rate = 1; + break; + case 'r': + rate_threshold = atoi(optarg); + break; + case 't': + rt_threshold = atoi(optarg); + break; + case 'n': + ios_threshold = atoi(optarg); + break; + case 'f': + output_ascii = 0; + break; + case 'd': + filename = strdup(optarg); + break; + case 'D': + max_depth = atoi(optarg); + break; + case 'c': + collapse_entries = atoi(optarg); + break; + case 'u': + depth_diff = atoi(optarg); + break; + case 'x': + random_diff = atoi(optarg); + break; + case 'a': + add_opts = realloc(add_opts, (n_add_opts + 1) * sizeof(char *)); + add_opts[n_add_opts] = strdup(optarg); + n_add_opts++; + break; + case '?': + default: + return usage(argv); + } + } + + if (argc == optind) + return usage(argv); + + if (trace_needs_swap(argv[optind], &need_swap)) + return 1; + + for (i = 0; i < PID_HASH_SIZE; i++) + INIT_FLIST_HEAD(&pid_hash[i]); + for (i = 0; i < INFLIGHT_HASH_SIZE; i++) + INIT_FLIST_HEAD(&inflight_hash[i]); + + load_blktrace(argv[optind], need_swap); + first_ttime /= 1000ULL; + + return output_p(); +} diff -Nru fio-2.1.3/t/debug.c fio-3.16/t/debug.c --- fio-2.1.3/t/debug.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/debug.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,14 @@ +#include + +FILE *f_err; +struct timespec *fio_ts = NULL; +unsigned long fio_debug = 0; + +void __dprint(int type, const char *str, ...) 
+{ +} + +void debug_init(void) +{ + f_err = stderr; +} diff -Nru fio-2.1.3/t/debug.h fio-3.16/t/debug.h --- fio-2.1.3/t/debug.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/debug.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,6 @@ +#ifndef FIO_DEBUG_INC_H +#define FIO_DEBUG_INC_H + +extern void debug_init(void); + +#endif diff -Nru fio-2.1.3/t/dedupe.c fio-3.16/t/dedupe.c --- fio-2.1.3/t/dedupe.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/dedupe.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,596 @@ +/* + * Small tool to check for dedupable blocks in a file or device. Basically + * just scans the filename for extents of the given size, checksums them, + * and orders them up. + */ +#include +#include +#include +#include +#include +#include + +#include "../flist.h" +#include "../log.h" +#include "../fio_sem.h" +#include "../smalloc.h" +#include "../minmax.h" +#include "../crc/md5.h" +#include "../lib/memalign.h" +#include "../os/os.h" +#include "../gettime.h" +#include "../fio_time.h" +#include "../lib/rbtree.h" + +#include "../lib/bloom.h" +#include "debug.h" + +struct worker_thread { + pthread_t thread; + + volatile int done; + + int fd; + uint64_t cur_offset; + uint64_t size; + + unsigned long items; + unsigned long dupes; + int err; +}; + +struct extent { + struct flist_head list; + uint64_t offset; +}; + +struct chunk { + struct fio_rb_node rb_node; + uint64_t count; + uint32_t hash[MD5_HASH_WORDS]; + struct flist_head extent_list[0]; +}; + +struct item { + uint64_t offset; + uint32_t hash[MD5_HASH_WORDS]; +}; + +static struct rb_root rb_root; +static struct bloom *bloom; +static struct fio_sem *rb_lock; + +static unsigned int blocksize = 4096; +static unsigned int num_threads; +static unsigned int chunk_size = 1048576; +static unsigned int dump_output; +static unsigned int odirect; +static unsigned int collision_check; +static unsigned int print_progress = 1; +static unsigned int use_bloom = 1; + +static uint64_t total_size; +static uint64_t 
cur_offset; +static struct fio_sem *size_lock; + +static struct fio_file file; + +static uint64_t get_size(struct fio_file *f, struct stat *sb) +{ + uint64_t ret; + + if (S_ISBLK(sb->st_mode)) { + unsigned long long bytes = 0; + + if (blockdev_size(f, &bytes)) { + log_err("dedupe: failed getting bdev size\n"); + return 0; + } + ret = bytes; + } else + ret = sb->st_size; + + return (ret & ~((uint64_t)blocksize - 1)); +} + +static int get_work(uint64_t *offset, uint64_t *size) +{ + uint64_t this_chunk; + int ret = 1; + + fio_sem_down(size_lock); + + if (cur_offset < total_size) { + *offset = cur_offset; + this_chunk = min((uint64_t)chunk_size, total_size - cur_offset); + *size = this_chunk; + cur_offset += this_chunk; + ret = 0; + } + + fio_sem_up(size_lock); + return ret; +} + +static int __read_block(int fd, void *buf, off_t offset, size_t count) +{ + ssize_t ret; + + ret = pread(fd, buf, count, offset); + if (ret < 0) { + perror("pread"); + return 1; + } else if (!ret) + return 1; + else if (ret != count) { + log_err("dedupe: short read on block\n"); + return 1; + } + + return 0; +} + +static int read_block(int fd, void *buf, off_t offset) +{ + return __read_block(fd, buf, offset, blocksize); +} + +static void add_item(struct chunk *c, struct item *i) +{ + /* + * Save some memory and don't add extent items, if we don't + * use them. 
+ */ + if (dump_output || collision_check) { + struct extent *e; + + e = malloc(sizeof(*e)); + e->offset = i->offset; + flist_add_tail(&e->list, &c->extent_list[0]); + } + + c->count++; +} + +static int col_check(struct chunk *c, struct item *i) +{ + struct extent *e; + char *cbuf, *ibuf; + int ret = 1; + + cbuf = fio_memalign(blocksize, blocksize, false); + ibuf = fio_memalign(blocksize, blocksize, false); + + e = flist_entry(c->extent_list[0].next, struct extent, list); + if (read_block(file.fd, cbuf, e->offset)) + goto out; + + if (read_block(file.fd, ibuf, i->offset)) + goto out; + + ret = memcmp(ibuf, cbuf, blocksize); +out: + fio_memfree(cbuf, blocksize, false); + fio_memfree(ibuf, blocksize, false); + return ret; +} + +static struct chunk *alloc_chunk(void) +{ + struct chunk *c; + + if (collision_check || dump_output) { + c = malloc(sizeof(struct chunk) + sizeof(struct flist_head)); + INIT_FLIST_HEAD(&c->extent_list[0]); + } else + c = malloc(sizeof(struct chunk)); + + return c; +} + +static void insert_chunk(struct item *i) +{ + struct fio_rb_node **p, *parent; + struct chunk *c; + int diff; + + p = &rb_root.rb_node; + parent = NULL; + while (*p) { + parent = *p; + + c = rb_entry(parent, struct chunk, rb_node); + diff = memcmp(i->hash, c->hash, sizeof(i->hash)); + if (diff < 0) + p = &(*p)->rb_left; + else if (diff > 0) + p = &(*p)->rb_right; + else { + int ret; + + if (!collision_check) + goto add; + + fio_sem_up(rb_lock); + ret = col_check(c, i); + fio_sem_down(rb_lock); + + if (!ret) + goto add; + + p = &(*p)->rb_right; + } + } + + c = alloc_chunk(); + RB_CLEAR_NODE(&c->rb_node); + c->count = 0; + memcpy(c->hash, i->hash, sizeof(i->hash)); + rb_link_node(&c->rb_node, parent, p); + rb_insert_color(&c->rb_node, &rb_root); +add: + add_item(c, i); +} + +static void insert_chunks(struct item *items, unsigned int nitems, + uint64_t *ndupes) +{ + int i; + + fio_sem_down(rb_lock); + + for (i = 0; i < nitems; i++) { + if (bloom) { + unsigned int s; + int r; + + s 
= sizeof(items[i].hash) / sizeof(uint32_t); + r = bloom_set(bloom, items[i].hash, s); + *ndupes += r; + } else + insert_chunk(&items[i]); + } + + fio_sem_up(rb_lock); +} + +static void crc_buf(void *buf, uint32_t *hash) +{ + struct fio_md5_ctx ctx = { .hash = hash }; + + fio_md5_init(&ctx); + fio_md5_update(&ctx, buf, blocksize); + fio_md5_final(&ctx); +} + +static unsigned int read_blocks(int fd, void *buf, off_t offset, size_t size) +{ + if (__read_block(fd, buf, offset, size)) + return 0; + + return size / blocksize; +} + +static int do_work(struct worker_thread *thread, void *buf) +{ + unsigned int nblocks, i; + off_t offset; + int nitems = 0; + uint64_t ndupes = 0; + struct item *items; + + offset = thread->cur_offset; + + nblocks = read_blocks(thread->fd, buf, offset, min(thread->size, (uint64_t)chunk_size)); + if (!nblocks) + return 1; + + items = malloc(sizeof(*items) * nblocks); + + for (i = 0; i < nblocks; i++) { + void *thisptr = buf + (i * blocksize); + + items[i].offset = offset; + crc_buf(thisptr, items[i].hash); + offset += blocksize; + nitems++; + } + + insert_chunks(items, nitems, &ndupes); + + free(items); + thread->items += nitems; + thread->dupes += ndupes; + return 0; +} + +static void *thread_fn(void *data) +{ + struct worker_thread *thread = data; + void *buf; + + buf = fio_memalign(blocksize, chunk_size, false); + + do { + if (get_work(&thread->cur_offset, &thread->size)) { + thread->err = 1; + break; + } + if (do_work(thread, buf)) { + thread->err = 1; + break; + } + } while (1); + + thread->done = 1; + fio_memfree(buf, chunk_size, false); + return NULL; +} + +static void show_progress(struct worker_thread *threads, unsigned long total) +{ + unsigned long last_nitems = 0; + struct timespec last_tv; + + fio_gettime(&last_tv, NULL); + + while (print_progress) { + unsigned long this_items; + unsigned long nitems = 0; + uint64_t tdiff; + float perc; + int some_done = 0; + int i; + + for (i = 0; i < num_threads; i++) { + nitems += 
threads[i].items; + some_done = threads[i].done; + if (some_done) + break; + } + + if (some_done) + break; + + perc = (float) nitems / (float) total; + perc *= 100.0; + this_items = nitems - last_nitems; + this_items *= blocksize; + tdiff = mtime_since_now(&last_tv); + if (tdiff) { + this_items = (this_items * 1000) / (tdiff * 1024); + printf("%3.2f%% done (%luKiB/sec)\r", perc, this_items); + last_nitems = nitems; + fio_gettime(&last_tv, NULL); + } else + printf("%3.2f%% done\r", perc); + fflush(stdout); + usleep(250000); + }; +} + +static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size, + uint64_t *nextents, uint64_t *nchunks) +{ + struct worker_thread *threads; + unsigned long nitems, total_items; + int i, err = 0; + + total_size = dev_size; + total_items = dev_size / blocksize; + cur_offset = 0; + size_lock = fio_sem_init(FIO_SEM_UNLOCKED); + + threads = malloc(num_threads * sizeof(struct worker_thread)); + for (i = 0; i < num_threads; i++) { + memset(&threads[i], 0, sizeof(struct worker_thread)); + threads[i].fd = f->fd; + + err = pthread_create(&threads[i].thread, NULL, thread_fn, &threads[i]); + if (err) { + log_err("fio: thread startup failed\n"); + break; + } + } + + show_progress(threads, total_items); + + nitems = 0; + *nextents = 0; + *nchunks = 1; + for (i = 0; i < num_threads; i++) { + void *ret; + pthread_join(threads[i].thread, &ret); + nitems += threads[i].items; + *nchunks += threads[i].dupes; + } + + printf("Threads(%u): %lu items processed\n", num_threads, nitems); + + *nextents = nitems; + *nchunks = nitems - *nchunks; + + fio_sem_remove(size_lock); + free(threads); + return err; +} + +static int dedupe_check(const char *filename, uint64_t *nextents, + uint64_t *nchunks) +{ + uint64_t dev_size; + struct stat sb; + int flags; + + flags = O_RDONLY; + if (odirect) + flags |= OS_O_DIRECT; + + memset(&file, 0, sizeof(file)); + file.file_name = strdup(filename); + + file.fd = open(filename, flags); + if (file.fd == -1) { + 
perror("open"); + goto err; + } + + if (fstat(file.fd, &sb) < 0) { + perror("fstat"); + goto err; + } + + dev_size = get_size(&file, &sb); + if (!dev_size) + goto err; + + if (use_bloom) { + uint64_t bloom_entries; + + bloom_entries = 8 * (dev_size / blocksize); + bloom = bloom_new(bloom_entries); + } + + printf("Will check <%s>, size <%llu>, using %u threads\n", filename, (unsigned long long) dev_size, num_threads); + + return run_dedupe_threads(&file, dev_size, nextents, nchunks); +err: + if (file.fd != -1) + close(file.fd); + free(file.file_name); + return 1; +} + +static void show_chunk(struct chunk *c) +{ + struct flist_head *n; + struct extent *e; + + printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1], c->hash[2], c->hash[3], (unsigned long) c->count); + flist_for_each(n, &c->extent_list[0]) { + e = flist_entry(n, struct extent, list); + printf("\toffset %llu\n", (unsigned long long) e->offset); + } +} + +static void show_stat(uint64_t nextents, uint64_t nchunks) +{ + double perc, ratio; + + printf("Extents=%lu, Unique extents=%lu\n", (unsigned long) nextents, (unsigned long) nchunks); + + if (nchunks) { + ratio = (double) nextents / (double) nchunks; + printf("De-dupe ratio: 1:%3.2f\n", ratio - 1.0); + } else + printf("De-dupe ratio: 1:infinite\n"); + + perc = 1.00 - ((double) nchunks / (double) nextents); + perc *= 100.0; + printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50)); + +} + +static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks) +{ + struct fio_rb_node *n; + + *nchunks = *nextents = 0; + + n = rb_first(&rb_root); + if (!n) + return; + + do { + struct chunk *c; + + c = rb_entry(n, struct chunk, rb_node); + (*nchunks)++; + *nextents += c->count; + + if (dump_output) + show_chunk(c); + + } while ((n = rb_next(n)) != NULL); +} + +static int usage(char *argv[]) +{ + log_err("Check for dedupable blocks on a device/file\n\n"); + log_err("%s: [options] \n", argv[0]); + log_err("\t-b\tChunk size to use\n"); + 
log_err("\t-t\tNumber of threads to use\n"); + log_err("\t-d\tFull extent/chunk debug output\n"); + log_err("\t-o\tUse O_DIRECT\n"); + log_err("\t-c\tFull collision check\n"); + log_err("\t-B\tUse probabilistic bloom filter\n"); + log_err("\t-p\tPrint progress indicator\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + uint64_t nextents = 0, nchunks = 0; + int c, ret; + + arch_init(argv); + debug_init(); + + while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) { + switch (c) { + case 'b': + blocksize = atoi(optarg); + break; + case 't': + num_threads = atoi(optarg); + break; + case 'd': + dump_output = atoi(optarg); + break; + case 'o': + odirect = atoi(optarg); + break; + case 'c': + collision_check = atoi(optarg); + break; + case 'p': + print_progress = atoi(optarg); + break; + case 'B': + use_bloom = atoi(optarg); + break; + case '?': + default: + return usage(argv); + } + } + + if (collision_check || dump_output) + use_bloom = 0; + + if (!num_threads) + num_threads = cpus_online(); + + if (argc == optind) + return usage(argv); + + sinit(); + + rb_root = RB_ROOT; + rb_lock = fio_sem_init(FIO_SEM_UNLOCKED); + + ret = dedupe_check(argv[optind], &nextents, &nchunks); + + if (!ret) { + if (!bloom) + iter_rb_tree(&nextents, &nchunks); + + show_stat(nextents, nchunks); + } + + fio_sem_remove(rb_lock); + if (bloom) + bloom_free(bloom); + scleanup(); + return ret; +} diff -Nru fio-2.1.3/t/gen-rand.c fio-3.16/t/gen-rand.c --- fio-2.1.3/t/gen-rand.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/gen-rand.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,62 @@ +#include +#include +#include +#include + +#include "../lib/types.h" +#include "../lib/rand.h" +#include "../log.h" + +int main(int argc, char *argv[]) +{ + struct frand_state s; + uint64_t i, start, end, nvalues; + unsigned long *buckets, index, pass, fail; + double p, dev, mean, vmin, vmax; + + if (argc < 4) { + log_err("%s: start end nvalues\n", argv[0]); + return 1; + } + + start = 
strtoul(argv[1], NULL, 10); + end = strtoul(argv[2], NULL, 10); + + if (start >= end) { + log_err("%s: start must be smaller than end\n", argv[0]); + return 1; + } + index = 1 + end - start; + buckets = calloc(index, sizeof(unsigned long)); + + nvalues = strtoul(argv[3], NULL, 10); + + init_rand(&s, false); + + for (i = 0; i < nvalues; i++) { + int v = rand_between(&s, start, end); + + buckets[v - start]++; + } + + p = 1.0 / index; + dev = sqrt(nvalues * p * (1.0 - p)); + mean = nvalues * p; + vmin = mean - dev; + vmax = mean + dev; + + pass = fail = 0; + for (i = 0; i < index; i++) { + if (buckets[i] < vmin || buckets[i] > vmax) { + printf("FAIL bucket%4lu: val=%8lu (%.1f < %.1f > %.1f)\n", (unsigned long) i + 1, buckets[i], vmin, mean, vmax); + fail++; + } else { + printf("PASS bucket%4lu: val=%8lu (%.1f < %.1f > %.1f)\n", (unsigned long) i + 1, buckets[i], vmin, mean, vmax); + pass++; + } + } + + printf("Passes=%lu, Fail=%lu\n", pass, fail); + free(buckets); + return 0; +} diff -Nru fio-2.1.3/t/genzipf.c fio-3.16/t/genzipf.c --- fio-2.1.3/t/genzipf.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/t/genzipf.c 2019-09-20 01:01:52.000000000 +0000 @@ -3,10 +3,10 @@ * what an access pattern would look like. * * For instance, the following would generate a zipf distribution - * with theta 1.2, using 100,000 values and split the reporting into - * 20 buckets: + * with theta 1.2, using 262144 (1 GiB / 4096) values and split the + * reporting into 20 buckets: * - * t/genzipf zipf 1.2 100000 20 + * ./t/fio-genzipf -t zipf -i 1.2 -g 1 -b 4096 -o 20 * * Only the distribution type (zipf or pareto) and spread input need * to be given, if not given defaults are used. 
@@ -14,16 +14,15 @@ */ #include #include -#include #include #include #include "../lib/zipf.h" +#include "../lib/gauss.h" #include "../flist.h" #include "../hash.h" -#define DEF_NR 1000000 -#define DEF_NR_OUTPUT 23 +#define DEF_NR_OUTPUT 20 struct node { struct flist_head list; @@ -39,23 +38,34 @@ TYPE_NONE = 0, TYPE_ZIPF, TYPE_PARETO, + TYPE_NORMAL, +}; +static const char *dist_types[] = { "None", "Zipf", "Pareto", "Normal" }; + +enum { + OUTPUT_NORMAL, + OUTPUT_CSV, }; -static const char *dist_types[] = { "None", "Zipf", "Pareto" }; static int dist_type = TYPE_ZIPF; -static unsigned long gb_size = 500; +static unsigned long gib_size = 500; static unsigned long block_size = 4096; static unsigned long output_nranges = DEF_NR_OUTPUT; static double percentage; static double dist_val; -static int output_csv = 0; +static int output_type = OUTPUT_NORMAL; #define DEF_ZIPF_VAL 1.2 #define DEF_PARETO_VAL 0.3 +static unsigned int hashv(unsigned long long val) +{ + return jhash(&val, sizeof(val), 0) & (hash_size - 1); +} + static struct node *hash_lookup(unsigned long long val) { - struct flist_head *l = &hash[hash_long(val, hash_bits)]; + struct flist_head *l = &hash[hashv(val)]; struct flist_head *entry; struct node *n; @@ -68,14 +78,13 @@ return NULL; } -static struct node *hash_insert(struct node *n, unsigned long long val) +static void hash_insert(struct node *n, unsigned long long val) { - struct flist_head *l = &hash[hash_long(val, hash_bits)]; + struct flist_head *l = &hash[hashv(val)]; n->val = val; n->hits = 1; flist_add_tail(&n->list, l); - return n; } static void usage(void) @@ -83,11 +92,12 @@ printf("genzipf: test zipf/pareto values for fio input\n"); printf("\t-h\tThis help screen\n"); printf("\t-p\tGenerate size of data set that are hit by this percentage\n"); - printf("\t-t\tDistribution type (zipf or pareto)\n"); - printf("\t-i\tDistribution algorithm input (zipf theta or pareto power)\n"); + printf("\t-t\tDistribution type (zipf, pareto, or normal)\n"); + 
printf("\t-i\tDistribution algorithm input (zipf theta, pareto power,\n" + "\t\tor normal %% deviation)\n"); printf("\t-b\tBlock size of a given range (in bytes)\n"); printf("\t-g\tSize of data set (in gigabytes)\n"); - printf("\t-o\tNumber of output columns\n"); + printf("\t-o\tNumber of output rows\n"); printf("\t-c\tOutput ranges in CSV format\n"); } @@ -112,13 +122,15 @@ dist_type = TYPE_ZIPF; else if (!strncmp(optarg, "pareto", 6)) dist_type = TYPE_PARETO; + else if (!strncmp(optarg, "normal", 6)) + dist_type = TYPE_NORMAL; else { printf("wrong dist type: %s\n", optarg); return 1; } break; case 'g': - gb_size = strtoul(optarg, NULL, 10); + gib_size = strtoul(optarg, NULL, 10); break; case 'i': dist_val = atof(optarg); @@ -128,7 +140,7 @@ output_nranges = strtoul(optarg, NULL, 10); break; case 'c': - output_csv = 1; + output_type = OUTPUT_CSV; break; default: printf("bad option %c\n", c); @@ -168,29 +180,128 @@ return n2->hits - n1->hits; } +static void output_csv(struct node *nodes, unsigned long nnodes) +{ + unsigned long i; + + printf("rank, count\n"); + for (i = 0; i < nnodes; i++) + printf("%lu, %lu\n", i, nodes[i].hits); +} + +static void output_normal(struct node *nodes, unsigned long nnodes, + unsigned long nranges) +{ + unsigned long i, j, cur_vals, interval_step, next_interval, total_vals; + unsigned long blocks = percentage * nnodes / 100; + double hit_percent_sum = 0; + unsigned long long hit_sum = 0; + double perc, perc_i; + struct output_sum *output_sums; + + interval_step = (nnodes - 1) / output_nranges + 1; + next_interval = interval_step; + output_sums = malloc(output_nranges * sizeof(struct output_sum)); + + for (i = 0; i < output_nranges; i++) { + output_sums[i].output = 0.0; + output_sums[i].nranges = 0; + } + + j = total_vals = cur_vals = 0; + + for (i = 0; i < nnodes; i++) { + struct output_sum *os = &output_sums[j]; + struct node *node = &nodes[i]; + cur_vals += node->hits; + total_vals += node->hits; + os->nranges += node->hits; + if (i 
== (next_interval) -1 || i == nnodes - 1) { + os->output = (double) cur_vals / (double) nranges; + os->output *= 100.0; + cur_vals = 0; + next_interval += interval_step; + j++; + } + + if (percentage) { + if (total_vals >= blocks) { + double cs = (double) i * block_size / (1024.0 * 1024.0); + char p = 'M'; + + if (cs > 1024.0) { + cs /= 1024.0; + p = 'G'; + } + if (cs > 1024.0) { + cs /= 1024.0; + p = 'T'; + } + + printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p); + percentage = 0.0; + } + } + } + + perc_i = 100.0 / (double)output_nranges; + perc = 0.0; + + printf("\n Rows Hits %% Sum %% # Hits Size\n"); + printf("-----------------------------------------------------------------------\n"); + for (i = 0; i < output_nranges; i++) { + struct output_sum *os = &output_sums[i]; + double gb = (double)os->nranges * block_size / 1024.0; + char p = 'K'; + + if (gb > 1024.0) { + p = 'M'; + gb /= 1024.0; + } + if (gb > 1024.0) { + p = 'G'; + gb /= 1024.0; + } + + perc += perc_i; + hit_percent_sum += os->output; + hit_sum += os->nranges; + printf("%s %6.2f%%\t%6.2f%%\t\t%6.2f%%\t\t%8u\t%6.2f%c\n", + i ? 
"|->" : "Top", perc, os->output, hit_percent_sum, + os->nranges, gb, p); + } + + printf("-----------------------------------------------------------------------\n"); + printf("Total\t\t\t\t\t\t%8llu\n", hit_sum); + free(output_sums); +} + int main(int argc, char *argv[]) { unsigned long offset; - unsigned long i, j, k, nr_vals, cur_vals, interval, total_vals, nnodes; unsigned long long nranges; - struct output_sum *output_sums; + unsigned long nnodes; struct node *nodes; - double perc, perc_i; struct zipf_state zs; + struct gauss_state gs; + int i, j; if (parse_options(argc, argv)) return 1; - if( !output_csv ) - printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size); + if (output_type != OUTPUT_CSV) + printf("Generating %s distribution with %f input and %lu GiB size and %lu block_size.\n", + dist_types[dist_type], dist_val, gib_size, block_size); - nranges = gb_size * 1024 * 1024 * 1024ULL; + nranges = gib_size * 1024 * 1024 * 1024ULL; nranges /= block_size; if (dist_type == TYPE_ZIPF) zipf_init(&zs, nranges, dist_val, 1); - else + else if (dist_type == TYPE_PARETO) pareto_init(&zs, nranges, dist_val, 1); + else + gauss_init(&gs, nranges, dist_val, 1); hash_bits = 0; hash_size = nranges; @@ -199,19 +310,21 @@ hash_size = 1 << hash_bits; - hash = malloc(hash_size * sizeof(struct flist_head)); + hash = calloc(hash_size, sizeof(struct flist_head)); for (i = 0; i < hash_size; i++) INIT_FLIST_HEAD(&hash[i]); nodes = malloc(nranges * sizeof(struct node)); - for (nr_vals = i = j = 0; i < nranges; i++) { + for (i = j = 0; i < nranges; i++) { struct node *n; if (dist_type == TYPE_ZIPF) offset = zipf_next(&zs); - else + else if (dist_type == TYPE_PARETO) offset = pareto_next(&zs); + else + offset = gauss_next(&gs); n = hash_lookup(offset); if (n) @@ -220,101 +333,15 @@ hash_insert(&nodes[j], offset); j++; } - - nr_vals++; } qsort(nodes, j, sizeof(struct node), node_cmp); nnodes = j; - 
nr_vals = nnodes; - - if (output_csv) { - printf("rank, count\n"); - for (k = 0; k < nnodes; k++) - printf("%lu, %lu\n", k, nodes[k].hits); - } else { - interval = (nr_vals + output_nranges - 1) / output_nranges; - - output_sums = malloc(output_nranges * sizeof(struct output_sum)); - for (i = 0; i < output_nranges; i++) { - output_sums[i].output = 0.0; - output_sums[i].nranges = 1; - } - - total_vals = i = j = cur_vals = 0; - - for (k = 0; k < nnodes; k++) { - struct output_sum *os = &output_sums[j]; - struct node *node = &nodes[k]; - - if (i >= interval) { - os->output = - (double)(cur_vals + 1) / (double)nranges; - os->output *= 100.0; - j++; - cur_vals = node->hits; - interval += - (nr_vals + output_nranges - - 1) / output_nranges; - } else { - cur_vals += node->hits; - os->nranges += node->hits; - } - - i++; - total_vals += node->hits; - - if (percentage) { - unsigned long blocks = - percentage * nranges / 100; - - if (total_vals >= blocks) { - double cs = - i * block_size / (1024 * 1024); - char p = 'M'; - - if (cs > 1024.0) { - cs /= 1024.0; - p = 'G'; - } - if (cs > 1024.0) { - cs /= 1024.0; - p = 'T'; - } - - printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p); - percentage = 0.0; - } - } - } - - perc_i = 100.0 / (double)output_nranges; - perc = 0.0; - printf("\n Rows Hits No Hits Size\n"); - printf("--------------------------------------------------------\n"); - for (i = 0; i < j; i++) { - struct output_sum *os = &output_sums[i]; - double gb = (double)os->nranges * block_size / 1024.0; - char p = 'K'; - - if (gb > 1024.0) { - p = 'M'; - gb /= 1024.0; - } - if (gb > 1024.0) { - p = 'G'; - gb /= 1024.0; - } - - perc += perc_i; - printf("%s %6.2f%%\t%6.2f%%\t\t%8u\t%6.2f%c\n", - i ? 
"|->" : "Top", perc, os->output, os->nranges, - gb, p); - } - - free(output_sums); - } + if (output_type == OUTPUT_CSV) + output_csv(nodes, nnodes); + else + output_normal(nodes, nnodes, nranges); free(hash); free(nodes); diff -Nru fio-2.1.3/t/io_uring.c fio-3.16/t/io_uring.c --- fio-2.1.3/t/io_uring.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/io_uring.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,627 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../arch/arch.h" +#include "../lib/types.h" +#include "../os/linux/io_uring.h" + +#define min(a, b) ((a < b) ? (a) : (b)) + +struct io_sq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + unsigned *flags; + unsigned *array; +}; + +struct io_cq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + struct io_uring_cqe *cqes; +}; + +#define DEPTH 128 +#define BATCH_SUBMIT 32 +#define BATCH_COMPLETE 32 + +#define BS 4096 + +#define MAX_FDS 16 + +static unsigned sq_ring_mask, cq_ring_mask; + +struct file { + unsigned long max_blocks; + unsigned pending_ios; + int real_fd; + int fixed_fd; +}; + +struct submitter { + pthread_t thread; + int ring_fd; + struct drand48_data rand; + struct io_sq_ring sq_ring; + struct io_uring_sqe *sqes; + struct io_cq_ring cq_ring; + int inflight; + unsigned long reaps; + unsigned long done; + unsigned long calls; + volatile int finish; + + __s32 *fds; + + struct file files[MAX_FDS]; + unsigned nr_files; + unsigned cur_file; + struct iovec iovecs[]; +}; + +static struct submitter *submitter; +static volatile int finish; + +static int depth = DEPTH; +static int batch_submit = BATCH_SUBMIT; +static int batch_complete = BATCH_COMPLETE; +static int polled = 1; /* use IO polling */ +static int fixedbufs = 1; /* use fixed user buffers */ 
+static int register_files = 1; /* use fixed files */ +static int buffered = 0; /* use buffered IO, not O_DIRECT */ +static int sq_thread_poll = 0; /* use kernel submission/poller thread */ +static int sq_thread_cpu = -1; /* pin above thread to this CPU */ +static int do_nop = 0; /* no-op SQ ring commands */ + +static int io_uring_register_buffers(struct submitter *s) +{ + if (do_nop) + return 0; + + return syscall(__NR_sys_io_uring_register, s->ring_fd, + IORING_REGISTER_BUFFERS, s->iovecs, depth); +} + +static int io_uring_register_files(struct submitter *s) +{ + int i; + + if (do_nop) + return 0; + + s->fds = calloc(s->nr_files, sizeof(__s32)); + for (i = 0; i < s->nr_files; i++) { + s->fds[i] = s->files[i].real_fd; + s->files[i].fixed_fd = i; + } + + return syscall(__NR_sys_io_uring_register, s->ring_fd, + IORING_REGISTER_FILES, s->fds, s->nr_files); +} + +static int io_uring_setup(unsigned entries, struct io_uring_params *p) +{ + return syscall(__NR_sys_io_uring_setup, entries, p); +} + +static int io_uring_enter(struct submitter *s, unsigned int to_submit, + unsigned int min_complete, unsigned int flags) +{ + return syscall(__NR_sys_io_uring_enter, s->ring_fd, to_submit, + min_complete, flags, NULL, 0); +} + +static int gettid(void) +{ + return syscall(__NR_gettid); +} + +static unsigned file_depth(struct submitter *s) +{ + return (depth + s->nr_files - 1) / s->nr_files; +} + +static void init_io(struct submitter *s, unsigned index) +{ + struct io_uring_sqe *sqe = &s->sqes[index]; + unsigned long offset; + struct file *f; + long r; + + if (do_nop) { + sqe->opcode = IORING_OP_NOP; + return; + } + + if (s->nr_files == 1) { + f = &s->files[0]; + } else { + f = &s->files[s->cur_file]; + if (f->pending_ios >= file_depth(s)) { + s->cur_file++; + if (s->cur_file == s->nr_files) + s->cur_file = 0; + f = &s->files[s->cur_file]; + } + } + f->pending_ios++; + + lrand48_r(&s->rand, &r); + offset = (r % (f->max_blocks - 1)) * BS; + + if (register_files) { + sqe->flags = 
IOSQE_FIXED_FILE; + sqe->fd = f->fixed_fd; + } else { + sqe->flags = 0; + sqe->fd = f->real_fd; + } + if (fixedbufs) { + sqe->opcode = IORING_OP_READ_FIXED; + sqe->addr = (unsigned long) s->iovecs[index].iov_base; + sqe->len = BS; + sqe->buf_index = index; + } else { + sqe->opcode = IORING_OP_READV; + sqe->addr = (unsigned long) &s->iovecs[index]; + sqe->len = 1; + sqe->buf_index = 0; + } + sqe->ioprio = 0; + sqe->off = offset; + sqe->user_data = (unsigned long) f; +} + +static int prep_more_ios(struct submitter *s, int max_ios) +{ + struct io_sq_ring *ring = &s->sq_ring; + unsigned index, tail, next_tail, prepped = 0; + + next_tail = tail = *ring->tail; + do { + next_tail++; + read_barrier(); + if (next_tail == *ring->head) + break; + + index = tail & sq_ring_mask; + init_io(s, index); + ring->array[index] = index; + prepped++; + tail = next_tail; + } while (prepped < max_ios); + + if (*ring->tail != tail) { + /* order tail store with writes to sqes above */ + write_barrier(); + *ring->tail = tail; + write_barrier(); + } + return prepped; +} + +static int get_file_size(struct file *f) +{ + struct stat st; + + if (fstat(f->real_fd, &st) < 0) + return -1; + if (S_ISBLK(st.st_mode)) { + unsigned long long bytes; + + if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) + return -1; + + f->max_blocks = bytes / BS; + return 0; + } else if (S_ISREG(st.st_mode)) { + f->max_blocks = st.st_size / BS; + return 0; + } + + return -1; +} + +static int reap_events(struct submitter *s) +{ + struct io_cq_ring *ring = &s->cq_ring; + struct io_uring_cqe *cqe; + unsigned head, reaped = 0; + + head = *ring->head; + do { + struct file *f; + + read_barrier(); + if (head == *ring->tail) + break; + cqe = &ring->cqes[head & cq_ring_mask]; + if (!do_nop) { + f = (struct file *) (uintptr_t) cqe->user_data; + f->pending_ios--; + if (cqe->res != BS) { + printf("io: unexpected ret=%d\n", cqe->res); + if (polled && cqe->res == -EOPNOTSUPP) + printf("Your filesystem/driver/kernel doesn't support 
polled IO\n"); + return -1; + } + } + reaped++; + head++; + } while (1); + + s->inflight -= reaped; + *ring->head = head; + write_barrier(); + return reaped; +} + +static void *submitter_fn(void *data) +{ + struct submitter *s = data; + struct io_sq_ring *ring = &s->sq_ring; + int ret, prepped; + + printf("submitter=%d\n", gettid()); + + srand48_r(pthread_self(), &s->rand); + + prepped = 0; + do { + int to_wait, to_submit, this_reap, to_prep; + + if (!prepped && s->inflight < depth) { + to_prep = min(depth - s->inflight, batch_submit); + prepped = prep_more_ios(s, to_prep); + } + s->inflight += prepped; +submit_more: + to_submit = prepped; +submit: + if (to_submit && (s->inflight + to_submit <= depth)) + to_wait = 0; + else + to_wait = min(s->inflight + to_submit, batch_complete); + + /* + * Only need to call io_uring_enter if we're not using SQ thread + * poll, or if IORING_SQ_NEED_WAKEUP is set. + */ + if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) { + unsigned flags = 0; + + if (to_wait) + flags = IORING_ENTER_GETEVENTS; + if ((*ring->flags & IORING_SQ_NEED_WAKEUP)) + flags |= IORING_ENTER_SQ_WAKEUP; + ret = io_uring_enter(s, to_submit, to_wait, flags); + s->calls++; + } + + /* + * For non SQ thread poll, we already got the events we needed + * through the io_uring_enter() above. For SQ thread poll, we + * need to loop here until we find enough events. 
+ */ + this_reap = 0; + do { + int r; + r = reap_events(s); + if (r == -1) { + s->finish = 1; + break; + } else if (r > 0) + this_reap += r; + } while (sq_thread_poll && this_reap < to_wait); + s->reaps += this_reap; + + if (ret >= 0) { + if (!ret) { + to_submit = 0; + if (s->inflight) + goto submit; + continue; + } else if (ret < to_submit) { + int diff = to_submit - ret; + + s->done += ret; + prepped -= diff; + goto submit_more; + } + s->done += ret; + prepped = 0; + continue; + } else if (ret < 0) { + if (errno == EAGAIN) { + if (s->finish) + break; + if (this_reap) + goto submit; + to_submit = 0; + goto submit; + } + printf("io_submit: %s\n", strerror(errno)); + break; + } + } while (!s->finish); + + finish = 1; + return NULL; +} + +static void sig_int(int sig) +{ + printf("Exiting on signal %d\n", sig); + submitter->finish = 1; + finish = 1; +} + +static void arm_sig_int(void) +{ + struct sigaction act; + + memset(&act, 0, sizeof(act)); + act.sa_handler = sig_int; + act.sa_flags = SA_RESTART; + sigaction(SIGINT, &act, NULL); +} + +static int setup_ring(struct submitter *s) +{ + struct io_sq_ring *sring = &s->sq_ring; + struct io_cq_ring *cring = &s->cq_ring; + struct io_uring_params p; + int ret, fd; + void *ptr; + + memset(&p, 0, sizeof(p)); + + if (polled && !do_nop) + p.flags |= IORING_SETUP_IOPOLL; + if (sq_thread_poll) { + p.flags |= IORING_SETUP_SQPOLL; + if (sq_thread_cpu != -1) { + p.flags |= IORING_SETUP_SQ_AFF; + p.sq_thread_cpu = sq_thread_cpu; + } + } + + fd = io_uring_setup(depth, &p); + if (fd < 0) { + perror("io_uring_setup"); + return 1; + } + s->ring_fd = fd; + + if (fixedbufs) { + ret = io_uring_register_buffers(s); + if (ret < 0) { + perror("io_uring_register_buffers"); + return 1; + } + } + + if (register_files) { + ret = io_uring_register_files(s); + if (ret < 0) { + perror("io_uring_register_files"); + return 1; + } + } + + ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, 
fd, + IORING_OFF_SQ_RING); + printf("sq_ring ptr = 0x%p\n", ptr); + sring->head = ptr + p.sq_off.head; + sring->tail = ptr + p.sq_off.tail; + sring->ring_mask = ptr + p.sq_off.ring_mask; + sring->ring_entries = ptr + p.sq_off.ring_entries; + sring->flags = ptr + p.sq_off.flags; + sring->array = ptr + p.sq_off.array; + sq_ring_mask = *sring->ring_mask; + + s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + IORING_OFF_SQES); + printf("sqes ptr = 0x%p\n", s->sqes); + + ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + IORING_OFF_CQ_RING); + printf("cq_ring ptr = 0x%p\n", ptr); + cring->head = ptr + p.cq_off.head; + cring->tail = ptr + p.cq_off.tail; + cring->ring_mask = ptr + p.cq_off.ring_mask; + cring->ring_entries = ptr + p.cq_off.ring_entries; + cring->cqes = ptr + p.cq_off.cqes; + cq_ring_mask = *cring->ring_mask; + return 0; +} + +static void file_depths(char *buf) +{ + struct submitter *s = submitter; + char *p; + int i; + + buf[0] = '\0'; + p = buf; + for (i = 0; i < s->nr_files; i++) { + struct file *f = &s->files[i]; + + if (i + 1 == s->nr_files) + p += sprintf(p, "%d", f->pending_ios); + else + p += sprintf(p, "%d, ", f->pending_ios); + } +} + +static void usage(char *argv) +{ + printf("%s [options] -- [filenames]\n" + " -d : IO Depth, default %d\n" + " -s : Batch submit, default %d\n" + " -c : Batch complete, default %d\n", + argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE); + exit(0); +} + +int main(int argc, char *argv[]) +{ + struct submitter *s; + unsigned long done, calls, reap; + int err, i, flags, fd, opt; + char *fdepths; + void *ret; + + if (!do_nop && argc < 2) { + printf("%s: filename [options]\n", argv[0]); + return 1; + } + + while ((opt = getopt(argc, argv, "d:s:c:h?")) != -1) { + switch (opt) { + case 'd': + depth = atoi(optarg); + break; + case 's': + batch_submit = atoi(optarg); + break; + case 
'c': + batch_complete = atoi(optarg); + break; + case 'h': + case '?': + default: + usage(argv[0]); + break; + } + } + + submitter = malloc(sizeof(*submitter) + depth * sizeof(struct iovec)); + memset(submitter, 0, sizeof(*submitter) + depth * sizeof(struct iovec)); + s = submitter; + + flags = O_RDONLY | O_NOATIME; + if (!buffered) + flags |= O_DIRECT; + + i = optind; + while (!do_nop && i < argc) { + struct file *f; + + if (s->nr_files == MAX_FDS) { + printf("Max number of files (%d) reached\n", MAX_FDS); + break; + } + fd = open(argv[i], flags); + if (fd < 0) { + perror("open"); + return 1; + } + + f = &s->files[s->nr_files]; + f->real_fd = fd; + if (get_file_size(f)) { + printf("failed getting size of device/file\n"); + return 1; + } + if (f->max_blocks <= 1) { + printf("Zero file/device size?\n"); + return 1; + } + f->max_blocks--; + + printf("Added file %s\n", argv[i]); + s->nr_files++; + i++; + } + + if (fixedbufs) { + struct rlimit rlim; + + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { + perror("setrlimit"); + return 1; + } + } + + arm_sig_int(); + + for (i = 0; i < depth; i++) { + void *buf; + + if (posix_memalign(&buf, BS, BS)) { + printf("failed alloc\n"); + return 1; + } + s->iovecs[i].iov_base = buf; + s->iovecs[i].iov_len = BS; + } + + err = setup_ring(s); + if (err) { + printf("ring setup failed: %s, %d\n", strerror(errno), err); + return 1; + } + printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered); + printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", depth, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); + + pthread_create(&s->thread, NULL, submitter_fn, s); + + fdepths = malloc(8 * s->nr_files); + reap = calls = done = 0; + do { + unsigned long this_done = 0; + unsigned long this_reap = 0; + unsigned long this_call = 0; + unsigned long rpc = 0, ipc = 0; + + sleep(1); + this_done += s->done; + this_call += s->calls; + this_reap += s->reaps; + if (this_call - calls) 
{ + rpc = (this_done - done) / (this_call - calls); + ipc = (this_reap - reap) / (this_call - calls); + } else + rpc = ipc = -1; + file_depths(fdepths); + printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n", + this_done - done, rpc, ipc, s->inflight, + fdepths); + done = this_done; + calls = this_call; + reap = this_reap; + } while (!finish); + + pthread_join(s->thread, &ret); + close(s->ring_fd); + free(fdepths); + return 0; +} diff -Nru fio-2.1.3/t/jobs/readonly-r.fio fio-3.16/t/jobs/readonly-r.fio --- fio-2.1.3/t/jobs/readonly-r.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/jobs/readonly-r.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,5 @@ +[test] +filename=${DUT} +rw=randread +time_based +runtime=1s diff -Nru fio-2.1.3/t/jobs/readonly-t.fio fio-3.16/t/jobs/readonly-t.fio --- fio-2.1.3/t/jobs/readonly-t.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/jobs/readonly-t.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,5 @@ +[test] +filename=${DUT} +rw=randtrim +time_based +runtime=1s diff -Nru fio-2.1.3/t/jobs/readonly-w.fio fio-3.16/t/jobs/readonly-w.fio --- fio-2.1.3/t/jobs/readonly-w.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/jobs/readonly-w.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,5 @@ +[test] +filename=${DUT} +rw=randwrite +time_based +runtime=1s diff -Nru fio-2.1.3/t/jobs/t0009-f8b0bd10.fio fio-3.16/t/jobs/t0009-f8b0bd10.fio --- fio-2.1.3/t/jobs/t0009-f8b0bd10.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/jobs/t0009-f8b0bd10.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,40 @@ +# Expected result: fio verifies and runs for 1m +# Buggy result: fio crashes with: +# __get_io_u: Assertion `io_u->flags & IO_U_F_FREE' failed + +[global] +direct=1 +ioengine=null +size=20g +norandommap +randrepeat=0 +bs=4096 +iodepth=170 +#iodepth=96 +#numjobs=1 +numjobs=1 +#numjobs=24 +# number_ios=1 +# runtime=216000 +runtime=3600 +time_based=1 +group_reporting=1 +thread +gtod_reduce=1 +iodepth_batch=4 
+iodepth_batch_complete=4 +cpus_allowed=0-5 +cpus_allowed_policy=split +rw=randwrite +verify=crc32c-intel +verify_backlog=1m +do_verify=1 +verify_async=6 +verify_async_cpus=0-5 +runtime=1m + +[4_KiB_RR_drive_r] + +[4_KiB_RR_drive_s] + + diff -Nru fio-2.1.3/t/jobs/t0010-b7aae4ba.fio fio-3.16/t/jobs/t0010-b7aae4ba.fio --- fio-2.1.3/t/jobs/t0010-b7aae4ba.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/jobs/t0010-b7aae4ba.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,8 @@ +# Expected result: fio runs and completes the job +# Buggy result: fio segfaults +# +[test] +ioengine=null +size=10g +io_submit_mode=offload +iodepth=16 diff -Nru fio-2.1.3/t/jobs/t0011-5d2788d5.fio fio-3.16/t/jobs/t0011-5d2788d5.fio --- fio-2.1.3/t/jobs/t0011-5d2788d5.fio 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/jobs/t0011-5d2788d5.fio 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,18 @@ +# Expected results: no parse warnings, runs and with roughly 1/8 iops between +# the two jobs. +# Buggy result: parse warning on flow value overflow, no 1/8 division between + jobs. 
+# +[global] +bs=4k +ioengine=null +size=100g +runtime=3 +flow_id=1 + +[flow1] +flow=-8 +rate_iops=1000 + +[flow2] +flow=1 diff -Nru fio-2.1.3/t/lfsr-test.c fio-3.16/t/lfsr-test.c --- fio-2.1.3/t/lfsr-test.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/t/lfsr-test.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,15 +1,13 @@ #include #include -#include #include #include -#include -#include -#include #include "../lib/lfsr.h" +#include "../gettime.h" +#include "../fio_time.h" -void usage() +static void usage(void) { printf("Usage: lfsr-test 0x [seed] [spin] [verify]\n"); printf("-------------------------------------------------------------\n"); @@ -36,12 +34,17 @@ void *v = NULL, *v_start; double total, mean; + arch_init(argv); + /* Read arguments */ switch (argc) { case 5: if (strncmp(argv[4], "verify", 7) == 0) - verify = 1; + verify = 1; + /* fall through */ case 4: spin = atoi(argv[3]); + /* fall through */ case 3: seed = atol(argv[2]); + /* fall through */ case 2: numbers = strtol(argv[1], NULL, 16); break; default: usage(); @@ -65,18 +68,18 @@ printf("LFSR specs\n"); printf("==========================\n"); printf("Size is %u\n", 64 - __builtin_clzl(fl->cached_bit)); - printf("Max val is %lu\n", fl->max_val); - printf("XOR-mask is 0x%lX\n", fl->xormask); - printf("Seed is %lu\n", fl->last_val); + printf("Max val is %lu\n", (unsigned long) fl->max_val); + printf("XOR-mask is 0x%lX\n", (unsigned long) fl->xormask); + printf("Seed is %lu\n", (unsigned long) fl->last_val); printf("Spin is %u\n", fl->spin); - printf("Cycle length is %lu\n", fl->cycle_length); + printf("Cycle length is %lu\n", (unsigned long) fl->cycle_length); /* Create verification table */ if (verify) { v_size = numbers * sizeof(uint8_t); v = malloc(v_size); memset(v, 0, v_size); - printf("\nVerification table is %lf KBs\n", (double)(v_size) / 1024); + printf("\nVerification table is %lf KiB\n", (double)(v_size) / 1024); } v_start = v; @@ -86,12 +89,12 @@ * negligible overhead. 
*/ fprintf(stderr, "\nTest initiated... "); - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); - while (!lfsr_next(fl, &i, fl->max_val)) { + fio_gettime(&start, NULL); + while (!lfsr_next(fl, &i)) { if (verify) *(uint8_t *)(v + i) += 1; } - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); + fio_gettime(&end, NULL); fprintf(stderr, "finished.\n"); @@ -102,7 +105,8 @@ for (i = 0; i < numbers; i++) { if (*(uint8_t *)(v + i) != 1) { fprintf(stderr, "failed (%lu = %d).\n", - i, *(uint8_t *)(v + i)); + (unsigned long) i, + *(uint8_t *)(v + i)); r = 1; break; } @@ -112,16 +116,15 @@ } /* Calculate elapsed time and mean time per number */ - total = (end.tv_sec - start.tv_sec) * pow(10,9) + - end.tv_nsec - start.tv_nsec; + total = utime_since(&start, &end); mean = total / fl->num_vals; printf("\nTime results "); if (verify) printf("(slower due to verification)"); printf("\n==============================\n"); - printf("Elapsed: %lf s\n", total / pow(10,9)); - printf("Mean: %lf ns\n", mean); + printf("Elapsed: %lf s\n", total / pow(10,6)); + printf("Mean: %lf us\n", mean); free(v_start); free(fl); diff -Nru fio-2.1.3/t/log.c fio-3.16/t/log.c --- fio-2.1.3/t/log.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/t/log.c 2019-09-20 01:01:52.000000000 +0000 @@ -2,7 +2,7 @@ #include #include "../minmax.h" -int log_err(const char *format, ...) +size_t log_err(const char *format, ...) { char buffer[1024]; va_list args; @@ -16,7 +16,7 @@ return fwrite(buffer, len, 1, stderr); } -int log_info(const char *format, ...) +size_t log_info(const char *format, ...) 
{ char buffer[1024]; va_list args; diff -Nru fio-2.1.3/t/memlock.c fio-3.16/t/memlock.c --- fio-2.1.3/t/memlock.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/memlock.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,58 @@ +#include +#include +#include +#include + +static struct thread_data { + unsigned long mib; +} td; + +static void *worker(void *data) +{ + struct thread_data *td = data; + unsigned long index; + size_t size; + char *buf; + int i, first = 1; + + size = td->mib * 1024UL * 1024UL; + buf = malloc(size); + + for (i = 0; i < 100000; i++) { + for (index = 0; index + 4096 < size; index += 4096) + memset(&buf[index+512], 0x89, 512); + if (first) { + printf("loop%d: did %lu MiB\n", i+1, size/(1024UL*1024UL)); + first = 0; + } + } + return NULL; +} + +int main(int argc, char *argv[]) +{ + unsigned long mib, threads; + pthread_t *pthreads; + int i; + + if (argc < 3) { + printf("%s: \n", argv[0]); + return 1; + } + + mib = strtoul(argv[1], NULL, 10); + threads = strtoul(argv[2], NULL, 10); + + pthreads = calloc(threads, sizeof(pthread_t)); + td.mib = mib; + + for (i = 0; i < threads; i++) + pthread_create(&pthreads[i], NULL, worker, &td); + + for (i = 0; i < threads; i++) { + void *ret; + + pthread_join(pthreads[i], &ret); + } + return 0; +} diff -Nru fio-2.1.3/t/readonly.sh fio-3.16/t/readonly.sh --- fio-2.1.3/t/readonly.sh 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/readonly.sh 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,84 @@ +#!/bin/bash +# +# Do some basic test of the --readonly parameter +# +# DUT should be a device that accepts read, write, and trim operations +# +# Example usage: +# +# DUT=/dev/fioa t/readonly.sh +# +TESTNUM=1 + +# +# The first parameter is the return code +# The second parameter is 0 if the return code should be 0 +# positive if the return code should be positive +# +check () { + echo "********************" + + if [ $2 -gt 0 ]; then + if [ $1 -eq 0 ]; then + echo "Test $TESTNUM failed" + echo "********************" + 
exit 1 + else + echo "Test $TESTNUM passed" + fi + else + if [ $1 -gt 0 ]; then + echo "Test $TESTNUM failed" + echo "********************" + exit 1 + else + echo "Test $TESTNUM passed" + fi + fi + + echo "********************" + echo + TESTNUM=$((TESTNUM+1)) +} + +./fio --name=test --filename=$DUT --rw=randread --readonly --time_based --runtime=1s &> /dev/null +check $? 0 +./fio --name=test --filename=$DUT --rw=randwrite --readonly --time_based --runtime=1s &> /dev/null +check $? 1 +./fio --name=test --filename=$DUT --rw=randtrim --readonly --time_based --runtime=1s &> /dev/null +check $? 1 + +./fio --name=test --filename=$DUT --readonly --rw=randread --time_based --runtime=1s &> /dev/null +check $? 0 +./fio --name=test --filename=$DUT --readonly --rw=randwrite --time_based --runtime=1s &> /dev/null +check $? 1 +./fio --name=test --filename=$DUT --readonly --rw=randtrim --time_based --runtime=1s &> /dev/null +check $? 1 + +./fio --name=test --filename=$DUT --rw=randread --time_based --runtime=1s &> /dev/null +check $? 0 +./fio --name=test --filename=$DUT --rw=randwrite --time_based --runtime=1s &> /dev/null +check $? 0 +./fio --name=test --filename=$DUT --rw=randtrim --time_based --runtime=1s &> /dev/null +check $? 0 + +./fio t/jobs/readonly-r.fio --readonly &> /dev/null +check $? 0 +./fio t/jobs/readonly-w.fio --readonly &> /dev/null +check $? 1 +./fio t/jobs/readonly-t.fio --readonly &> /dev/null +check $? 1 + +./fio --readonly t/jobs/readonly-r.fio &> /dev/null +check $? 0 +./fio --readonly t/jobs/readonly-w.fio &> /dev/null +check $? 1 +./fio --readonly t/jobs/readonly-t.fio &> /dev/null +check $? 1 + +./fio t/jobs/readonly-r.fio &> /dev/null +check $? 0 +./fio t/jobs/readonly-w.fio &> /dev/null +check $? 0 +./fio t/jobs/readonly-t.fio &> /dev/null +check $? 
0 diff -Nru fio-2.1.3/t/read-to-pipe-async.c fio-3.16/t/read-to-pipe-async.c --- fio-2.1.3/t/read-to-pipe-async.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/read-to-pipe-async.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,670 @@ +/* + * Read a file and write the contents to stdout. If a given read takes + * longer than 'max_us' time, then we schedule a new thread to handle + * the next read. This avoids the coordinated omission problem, where + * one request appears to take a long time, but in reality a lot of + * requests would have been slow, but we don't notice since new submissions + * are not being issued if just 1 is held up. + * + * One test case: + * + * $ time (./read-to-pipe-async -f randfile.gz | gzip -dc > outfile; sync) + * + * This will read randfile.gz and log the latencies of doing so, while + * piping the output to gzip to decompress it. Any latencies over max_us + * are logged when they happen, and latency buckets are displayed at the + * end of the run + * + * gcc -Wall -g -O2 -o read-to-pipe-async read-to-pipe-async.c -lpthread + * + * Copyright (C) 2016 Jens Axboe + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../flist.h" + +static int bs = 4096; +static int max_us = 10000; +static char *file; +static int separate_writer = 1; + +#define PLAT_BITS 8 +#define PLAT_VAL (1 << PLAT_BITS) +#define PLAT_GROUP_NR 19 +#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL) +#define PLAT_LIST_MAX 20 + +struct stats { + unsigned int plat[PLAT_NR]; + unsigned int nr_samples; + unsigned int max; + unsigned int min; + unsigned int over; +}; + +static double plist[PLAT_LIST_MAX] = { 50.0, 75.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.99, 99.999, 99.9999, }; + +struct thread_data { + int exit; + int done; + pthread_mutex_t lock; + pthread_cond_t cond; + pthread_mutex_t done_lock; + pthread_cond_t done_cond; + pthread_t thread; +}; + +struct writer_thread { + struct flist_head 
list; + struct flist_head done_list; + struct stats s; + struct thread_data thread; +}; + +struct reader_thread { + struct flist_head list; + struct flist_head done_list; + int started; + int busy; + int write_seq; + struct stats s; + struct thread_data thread; +}; + +struct work_item { + struct flist_head list; + void *buf; + size_t buf_size; + off_t off; + int fd; + int seq; + struct writer_thread *writer; + struct reader_thread *reader; + pthread_mutex_t lock; + pthread_cond_t cond; + pthread_t thread; +}; + +static struct reader_thread reader_thread; +static struct writer_thread writer_thread; + +uint64_t utime_since(const struct timeval *s, const struct timeval *e) +{ + long sec, usec; + uint64_t ret; + + sec = e->tv_sec - s->tv_sec; + usec = e->tv_usec - s->tv_usec; + if (sec > 0 && usec < 0) { + sec--; + usec += 1000000; + } + + if (sec < 0 || (sec == 0 && usec < 0)) + return 0; + + ret = sec * 1000000ULL + usec; + + return ret; +} + +static struct work_item *find_seq(struct writer_thread *w, unsigned int seq) +{ + struct work_item *work; + struct flist_head *entry; + + if (flist_empty(&w->list)) + return NULL; + + flist_for_each(entry, &w->list) { + work = flist_entry(entry, struct work_item, list); + if (work->seq == seq) + return work; + } + + return NULL; +} + +static unsigned int plat_val_to_idx(unsigned int val) +{ + unsigned int msb, error_bits, base, offset; + + /* Find MSB starting from bit 0 */ + if (val == 0) + msb = 0; + else + msb = sizeof(val)*8 - __builtin_clz(val) - 1; + + /* + * MSB <= (PLAT_BITS-1), cannot be rounded off. 
Use + * all bits of the sample as index + */ + if (msb <= PLAT_BITS) + return val; + + /* Compute the number of error bits to discard*/ + error_bits = msb - PLAT_BITS; + + /* Compute the number of buckets before the group */ + base = (error_bits + 1) << PLAT_BITS; + + /* + * Discard the error bits and apply the mask to find the + * index for the buckets in the group + */ + offset = (PLAT_VAL - 1) & (val >> error_bits); + + /* Make sure the index does not exceed (array size - 1) */ + return (base + offset) < (PLAT_NR - 1) ? + (base + offset) : (PLAT_NR - 1); +} + +/* + * Convert the given index of the bucket array to the value + * represented by the bucket + */ +static unsigned int plat_idx_to_val(unsigned int idx) +{ + unsigned int error_bits, k, base; + + assert(idx < PLAT_NR); + + /* MSB <= (PLAT_BITS-1), cannot be rounded off. Use + * all bits of the sample as index */ + if (idx < (PLAT_VAL << 1)) + return idx; + + /* Find the group and compute the minimum value of that group */ + error_bits = (idx >> PLAT_BITS) - 1; + base = 1 << (error_bits + PLAT_BITS); + + /* Find its bucket number of the group */ + k = idx % PLAT_VAL; + + /* Return the mean of the range of the bucket */ + return base + ((k + 0.5) * (1 << error_bits)); +} + +static void add_lat(struct stats *s, unsigned int us, const char *name) +{ + int lat_index = 0; + + if (us > s->max) + s->max = us; + if (us < s->min) + s->min = us; + + if (us > max_us) { + fprintf(stderr, "%s latency=%u usec\n", name, us); + s->over++; + } + + lat_index = plat_val_to_idx(us); + __sync_fetch_and_add(&s->plat[lat_index], 1); + __sync_fetch_and_add(&s->nr_samples, 1); +} + +static int write_work(struct work_item *work) +{ + struct timeval s, e; + ssize_t ret; + + gettimeofday(&s, NULL); + ret = write(STDOUT_FILENO, work->buf, work->buf_size); + gettimeofday(&e, NULL); + assert(ret == work->buf_size); + + add_lat(&work->writer->s, utime_since(&s, &e), "write"); + return work->seq + 1; +} + +static void 
thread_exiting(struct thread_data *thread) +{ + __sync_fetch_and_add(&thread->done, 1); + pthread_cond_signal(&thread->done_cond); +} + +static void *writer_fn(void *data) +{ + struct writer_thread *wt = data; + struct work_item *work; + unsigned int seq = 1; + + work = NULL; + while (!wt->thread.exit || !flist_empty(&wt->list)) { + pthread_mutex_lock(&wt->thread.lock); + + if (work) { + flist_add_tail(&work->list, &wt->done_list); + work = NULL; + } + + work = find_seq(wt, seq); + if (work) + flist_del_init(&work->list); + else + pthread_cond_wait(&wt->thread.cond, &wt->thread.lock); + + pthread_mutex_unlock(&wt->thread.lock); + + if (work) + seq = write_work(work); + } + + thread_exiting(&wt->thread); + return NULL; +} + +static void reader_work(struct work_item *work) +{ + struct timeval s, e; + ssize_t ret; + size_t left; + void *buf; + off_t off; + + gettimeofday(&s, NULL); + + left = work->buf_size; + buf = work->buf; + off = work->off; + while (left) { + ret = pread(work->fd, buf, left, off); + if (!ret) { + fprintf(stderr, "zero read\n"); + break; + } else if (ret < 0) { + fprintf(stderr, "errno=%d\n", errno); + break; + } + left -= ret; + off += ret; + buf += ret; + } + + gettimeofday(&e, NULL); + + add_lat(&work->reader->s, utime_since(&s, &e), "read"); + + pthread_cond_signal(&work->cond); + + if (separate_writer) { + pthread_mutex_lock(&work->writer->thread.lock); + flist_add_tail(&work->list, &work->writer->list); + pthread_mutex_unlock(&work->writer->thread.lock); + pthread_cond_signal(&work->writer->thread.cond); + } else { + struct reader_thread *rt = work->reader; + struct work_item *next = NULL; + struct flist_head *entry; + + /* + * Write current work if it matches in sequence. 
+ */ + if (work->seq == rt->write_seq) + goto write_it; + + pthread_mutex_lock(&rt->thread.lock); + + flist_add_tail(&work->list, &rt->done_list); + + /* + * See if the next work item is here, if so, write it + */ + work = NULL; + flist_for_each(entry, &rt->done_list) { + next = flist_entry(entry, struct work_item, list); + if (next->seq == rt->write_seq) { + work = next; + flist_del(&work->list); + break; + } + } + + pthread_mutex_unlock(&rt->thread.lock); + + if (work) { +write_it: + write_work(work); + __sync_fetch_and_add(&rt->write_seq, 1); + } + } +} + +static void *reader_one_off(void *data) +{ + reader_work(data); + return NULL; +} + +static void *reader_fn(void *data) +{ + struct reader_thread *rt = data; + struct work_item *work; + + while (!rt->thread.exit || !flist_empty(&rt->list)) { + work = NULL; + pthread_mutex_lock(&rt->thread.lock); + if (!flist_empty(&rt->list)) { + work = flist_first_entry(&rt->list, struct work_item, list); + flist_del_init(&work->list); + } else + pthread_cond_wait(&rt->thread.cond, &rt->thread.lock); + pthread_mutex_unlock(&rt->thread.lock); + + if (work) { + __sync_fetch_and_add(&rt->busy, 1); + reader_work(work); + __sync_fetch_and_sub(&rt->busy, 1); + } + } + + thread_exiting(&rt->thread); + return NULL; +} + +static void queue_work(struct reader_thread *rt, struct work_item *work) +{ + if (!rt->started) { + pthread_mutex_lock(&rt->thread.lock); + flist_add_tail(&work->list, &rt->list); + pthread_mutex_unlock(&rt->thread.lock); + + rt->started = 1; + pthread_create(&rt->thread.thread, NULL, reader_fn, rt); + } else if (!rt->busy && !pthread_mutex_trylock(&rt->thread.lock)) { + flist_add_tail(&work->list, &rt->list); + pthread_mutex_unlock(&rt->thread.lock); + + pthread_cond_signal(&rt->thread.cond); + } else { + int ret = pthread_create(&work->thread, NULL, reader_one_off, work); + if (ret) + fprintf(stderr, "pthread_create=%d\n", ret); + else + pthread_detach(work->thread); + } +} + +static unsigned int 
calc_percentiles(unsigned int *io_u_plat, unsigned long nr, + unsigned int **output) +{ + unsigned long sum = 0; + unsigned int len, i, j = 0; + unsigned int oval_len = 0; + unsigned int *ovals = NULL; + int is_last; + + len = 0; + while (len < PLAT_LIST_MAX && plist[len] != 0.0) + len++; + + if (!len) + return 0; + + /* + * Calculate bucket values, note down max and min values + */ + is_last = 0; + for (i = 0; i < PLAT_NR && !is_last; i++) { + sum += io_u_plat[i]; + while (sum >= (plist[j] / 100.0 * nr)) { + assert(plist[j] <= 100.0); + + if (j == oval_len) { + oval_len += 100; + ovals = realloc(ovals, oval_len * sizeof(unsigned int)); + } + + ovals[j] = plat_idx_to_val(i); + is_last = (j == len - 1); + if (is_last) + break; + + j++; + } + } + + *output = ovals; + return len; +} + +static void show_latencies(struct stats *s, const char *msg) +{ + unsigned int *ovals = NULL; + unsigned int len, i; + + len = calc_percentiles(s->plat, s->nr_samples, &ovals); + if (len) { + fprintf(stderr, "Latency percentiles (usec) (%s)\n", msg); + for (i = 0; i < len; i++) + fprintf(stderr, "\t%2.4fth: %u\n", plist[i], ovals[i]); + } + + if (ovals) + free(ovals); + + fprintf(stderr, "\tOver=%u, min=%u, max=%u\n", s->over, s->min, s->max); +} + +static void init_thread(struct thread_data *thread) +{ + pthread_cond_init(&thread->cond, NULL); + pthread_cond_init(&thread->done_cond, NULL); + pthread_mutex_init(&thread->lock, NULL); + pthread_mutex_init(&thread->done_lock, NULL); + thread->exit = 0; +} + +static void exit_thread(struct thread_data *thread, + void fn(struct writer_thread *), + struct writer_thread *wt) +{ + __sync_fetch_and_add(&thread->exit, 1); + pthread_cond_signal(&thread->cond); + + while (!thread->done) { + pthread_mutex_lock(&thread->done_lock); + + if (fn) { + struct timeval tv; + struct timespec ts; + + gettimeofday(&tv, NULL); + ts.tv_sec = tv.tv_sec + 1; + ts.tv_nsec = tv.tv_usec * 1000ULL; + + pthread_cond_timedwait(&thread->done_cond, &thread->done_lock, 
&ts); + fn(wt); + } else + pthread_cond_wait(&thread->done_cond, &thread->done_lock); + + pthread_mutex_unlock(&thread->done_lock); + } +} + +static int usage(char *argv[]) +{ + fprintf(stderr, "%s: [-b blocksize] [-t max usec] [-w separate writer] -f file\n", argv[0]); + return 1; +} + +static int parse_options(int argc, char *argv[]) +{ + int c; + + while ((c = getopt(argc, argv, "f:b:t:w:")) != -1) { + switch (c) { + case 'f': + file = strdup(optarg); + break; + case 'b': + bs = atoi(optarg); + break; + case 't': + max_us = atoi(optarg); + break; + case 'w': + separate_writer = atoi(optarg); + if (!separate_writer) + fprintf(stderr, "inline writing is broken\n"); + break; + case '?': + default: + return usage(argv); + } + } + + if (!file) + return usage(argv); + + return 0; +} + +static void prune_done_entries(struct writer_thread *wt) +{ + FLIST_HEAD(list); + + if (flist_empty(&wt->done_list)) + return; + + if (pthread_mutex_trylock(&wt->thread.lock)) + return; + + if (!flist_empty(&wt->done_list)) + flist_splice_init(&wt->done_list, &list); + pthread_mutex_unlock(&wt->thread.lock); + + while (!flist_empty(&list)) { + struct work_item *work; + + work = flist_first_entry(&list, struct work_item, list); + flist_del(&work->list); + + pthread_cond_destroy(&work->cond); + pthread_mutex_destroy(&work->lock); + free(work->buf); + free(work); + } +} + +int main(int argc, char *argv[]) +{ + struct timeval s, re, we; + struct reader_thread *rt; + struct writer_thread *wt; + unsigned long rate; + struct stat sb; + size_t bytes; + off_t off; + int fd, seq; + + if (parse_options(argc, argv)) + return 1; + + fd = open(file, O_RDONLY); + if (fd < 0) { + perror("open"); + return 2; + } + + if (fstat(fd, &sb) < 0) { + perror("stat"); + return 3; + } + + wt = &writer_thread; + init_thread(&wt->thread); + INIT_FLIST_HEAD(&wt->list); + INIT_FLIST_HEAD(&wt->done_list); + wt->s.max = 0; + wt->s.min = -1U; + pthread_create(&wt->thread.thread, NULL, writer_fn, wt); + + rt = 
&reader_thread; + init_thread(&rt->thread); + INIT_FLIST_HEAD(&rt->list); + INIT_FLIST_HEAD(&rt->done_list); + rt->s.max = 0; + rt->s.min = -1U; + rt->write_seq = 1; + + off = 0; + seq = 0; + bytes = 0; + + gettimeofday(&s, NULL); + + while (sb.st_size) { + struct work_item *work; + size_t this_len; + struct timespec ts; + struct timeval tv; + + prune_done_entries(wt); + + this_len = sb.st_size; + if (this_len > bs) + this_len = bs; + + work = calloc(1, sizeof(*work)); + work->buf = malloc(this_len); + work->buf_size = this_len; + work->off = off; + work->fd = fd; + work->seq = ++seq; + work->writer = wt; + work->reader = rt; + pthread_cond_init(&work->cond, NULL); + pthread_mutex_init(&work->lock, NULL); + + queue_work(rt, work); + + gettimeofday(&tv, NULL); + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000ULL; + ts.tv_nsec += max_us * 1000ULL; + if (ts.tv_nsec >= 1000000000ULL) { + ts.tv_nsec -= 1000000000ULL; + ts.tv_sec++; + } + + pthread_mutex_lock(&work->lock); + pthread_cond_timedwait(&work->cond, &work->lock, &ts); + pthread_mutex_unlock(&work->lock); + + off += this_len; + sb.st_size -= this_len; + bytes += this_len; + } + + exit_thread(&rt->thread, NULL, NULL); + gettimeofday(&re, NULL); + + exit_thread(&wt->thread, prune_done_entries, wt); + gettimeofday(&we, NULL); + + show_latencies(&rt->s, "READERS"); + show_latencies(&wt->s, "WRITERS"); + + bytes /= 1024; + rate = (bytes * 1000UL * 1000UL) / utime_since(&s, &re); + fprintf(stderr, "Read rate (KiB/sec) : %lu\n", rate); + rate = (bytes * 1000UL * 1000UL) / utime_since(&s, &we); + fprintf(stderr, "Write rate (KiB/sec): %lu\n", rate); + + close(fd); + return 0; +} diff -Nru fio-2.1.3/t/sgunmap-perf.py fio-3.16/t/sgunmap-perf.py --- fio-2.1.3/t/sgunmap-perf.py 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/sgunmap-perf.py 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,115 @@ +#!/usr/bin/python2.7 +# +# sgunmap-test.py +# +# Basic performance testing using fio's sg ioengine +# +# USAGE +# 
sgunmap-perf.py char-device block-device fio-executable +# +# EXAMPLE +# t/sgunmap-perf.py /dev/sg1 /dev/sdb ./fio +# +# REQUIREMENTS +# Python 2.6+ +# +# + +from __future__ import absolute_import +from __future__ import print_function +import sys +import json +import argparse +import subprocess +from six.moves import range + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('cdev', + help='character device target (e.g., /dev/sg0)') + parser.add_argument('bdev', + help='block device target (e.g., /dev/sda)') + parser.add_argument('fioc', + help='path to candidate fio executable (e.g., ./fio)') + parser.add_argument('fior', + help='path to reference fio executable (e.g., ./fio)') + args = parser.parse_args() + + return args + + +def fulldevice(fio, dev, ioengine='psync', rw='trim', bs='1M'): + parameters = ["--name=test", + "--output-format=json", + "--random_generator=lfsr", + "--bs={0}".format(bs), + "--rw={0}".format(rw), + "--ioengine={0}".format(ioengine), + "--filename={0}".format(dev)] + + output = subprocess.check_output([fio] + parameters) + jsondata = json.loads(output) + jobdata = jsondata['jobs'][0] + return jobdata + + +def runtest(fio, dev, rw, qd, batch, bs='512', runtime='30s'): + parameters = ["--name=test", + "--random_generator=tausworthe64", + "--time_based", + "--runtime={0}".format(runtime), + "--output-format=json", + "--ioengine=sg", + "--blocksize={0}".format(bs), + "--rw={0}".format(rw), + "--filename={0}".format(dev), + "--iodepth={0}".format(qd), + "--iodepth_batch={0}".format(batch)] + + output = subprocess.check_output([fio] + parameters) + jsondata = json.loads(output) + jobdata = jsondata['jobs'][0] +# print(parameters) + + return jobdata + + +def runtests(fio, dev, qd, batch, rw, bs='512', trials=5): + iops = [] + for x in range(trials): + jd = runtest(fio, dev, rw, qd, batch, bs=bs) + total = jd['read']['iops'] + jd['write']['iops'] + jd['trim']['iops'] +# print(total) + iops.extend([total]) + return 
iops, (sum(iops) / trials) + +if __name__ == '__main__': + args = parse_args() + + print("Trimming full device {0}".format(args.cdev)) + fulldevice(args.fior, args.cdev, ioengine='sg') + + print("Running rand read tests on {0}" + " with fio candidate build {1}".format(args.cdev, args.fioc)) + randread, rrmean = runtests(args.fioc, args.cdev, 16, 1, 'randread', + trials=5) + print("IOPS mean {0}, trials {1}".format(rrmean, randread)) + + print("Running rand read tests on {0}" + " with fio reference build {1}".format(args.cdev, args.fior)) + randread, rrmean = runtests(args.fior, args.cdev, 16, 1, 'randread', + trials=5) + print("IOPS mean {0}, trials {1}".format(rrmean, randread)) + + print("Running rand write tests on {0}" + " with fio candidate build {1}".format(args.cdev, args.fioc)) + randwrite, rwmean = runtests(args.fioc, args.cdev, 16, 1, 'randwrite', + trials=5) + print("IOPS mean {0}, trials {1}".format(rwmean, randwrite)) + + print("Running rand write tests on {0}" + " with fio reference build {1}".format(args.cdev, args.fior)) + randwrite, rwmean = runtests(args.fior, args.cdev, 16, 1, 'randwrite', + trials=5) + print("IOPS mean {0}, trials {1}".format(rwmean, randwrite)) diff -Nru fio-2.1.3/t/sgunmap-test.py fio-3.16/t/sgunmap-test.py --- fio-2.1.3/t/sgunmap-test.py 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/sgunmap-test.py 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,173 @@ +#!/usr/bin/python2.7 +# Note: this script is python2 and python 3 compatible. +# +# sgunmap-test.py +# +# Limited functonality test for trim workloads using fio's sg ioengine +# This checks only the three sets of reported iodepths +# +# !!!WARNING!!! +# This script carries out destructive tests. Be sure that +# there is no data you want to keep on the supplied devices. 
+# +# USAGE +# sgunmap-test.py char-device block-device fio-executable +# +# EXAMPLE +# t/sgunmap-test.py /dev/sg1 /dev/sdb ./fio +# +# REQUIREMENTS +# Python 2.6+ +# +# TEST MATRIX +# For both char-dev and block-dev these are the expected +# submit/complete IO depths +# +# blockdev chardev +# iodepth iodepth +# R QD1 sub/comp: 1-4=100% sub/comp: 1-4=100% +# W QD1 sub/comp: 1-4=100% sub/comp: 1-4=100% +# T QD1 sub/comp: 1-4=100% sub/comp: 1-4=100% +# +# R QD16, batch8 sub/comp: 1-4=100% sub/comp: 1-4=100% +# W QD16, batch8 sub/comp: 1-4=100% sub/comp: 1-4=100% +# T QD16, batch8 sub/comp: 1-4=100% sub/comp: 5-8=100% +# +# R QD16, batch16 sub/comp: 1-4=100% sub/comp: 1-4=100% +# W QD16, batch16 sub/comp: 1-4=100% sub/comp: 1-4=100% +# T QD16, batch16 sub/comp: 1-4=100% sub/comp: 9-16=100% +# + +from __future__ import absolute_import +from __future__ import print_function +import sys +import json +import argparse +import traceback +import subprocess +from six.moves import range + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('chardev', + help='character device target (e.g., /dev/sg0)') + parser.add_argument('blockdev', + help='block device target (e.g., /dev/sda)') + parser.add_argument('fio', + help='path to fio executable (e.g., ./fio)') + args = parser.parse_args() + + return args + +# +# With block devices, +# iodepth = 1 always +# submit = complete = 1-4 always +# With character devices, +# RW +# iodepth = qd +# submit = 1-4 +# complete = 1-4 except for the IOs in flight +# when the job is ending +# T +# iodepth = qd +# submit = qdbatch +# complete = qdbatch except for the IOs in flight +# when the job is ending +# + + +def check(jsondata, parameters, block, qd, qdbatch, rw): + iodepth = jsondata['iodepth_level'] + submit = jsondata['iodepth_submit'] + complete = jsondata['iodepth_complete'] + + try: + if block: + assert iodepth['1'] == 100.0 + assert submit['4'] == 100.0 + assert complete['4'] == 100.0 + elif 'read' in rw or 
'write' in rw: + assert iodepth[str(qd)] > 99.9 + assert submit['4'] == 100.0 + assert complete['4'] > 99.9 + else: + if qdbatch <= 4: + batchkey = '4' + elif qdbatch > 64: + batchkey = '>=64' + else: + batchkey = str(qdbatch) + if qd >= 64: + qdkey = ">=64" + else: + qdkey = str(qd) + assert iodepth[qdkey] > 99 + assert submit[batchkey] == 100.0 + assert complete[batchkey] > 99 + except AssertionError: + print("Assertion failed") + traceback.print_exc() + print(jsondata) + return + + print("**********passed*********") + + +def runalltests(args, qd, batch): + block = False + for dev in [args.chardev, args.blockdev]: + for rw in ["randread", "randwrite", "randtrim"]: + parameters = ["--name=test", + "--time_based", + "--runtime=30s", + "--output-format=json", + "--ioengine=sg", + "--rw={0}".format(rw), + "--filename={0}".format(dev), + "--iodepth={0}".format(qd), + "--iodepth_batch={0}".format(batch)] + + print(parameters) + output = subprocess.check_output([args.fio] + parameters) + jsondata = json.loads(output) + jobdata = jsondata['jobs'][0] + check(jobdata, parameters, block, qd, batch, rw) + block = True + + +def runcdevtrimtest(args, qd, batch): + parameters = ["--name=test", + "--time_based", + "--runtime=30s", + "--output-format=json", + "--ioengine=sg", + "--rw=randtrim", + "--filename={0}".format(args.chardev), + "--iodepth={0}".format(qd), + "--iodepth_batch={0}".format(batch)] + + print(parameters) + output = subprocess.check_output([args.fio] + parameters) + jsondata = json.loads(output) + jobdata = jsondata['jobs'][0] + check(jobdata, parameters, False, qd, batch, "randtrim") + + +if __name__ == '__main__': + args = parse_args() + + runcdevtrimtest(args, 32, 2) + runcdevtrimtest(args, 32, 4) + runcdevtrimtest(args, 32, 8) + runcdevtrimtest(args, 64, 4) + runcdevtrimtest(args, 64, 8) + runcdevtrimtest(args, 64, 16) + runcdevtrimtest(args, 128, 8) + runcdevtrimtest(args, 128, 16) + runcdevtrimtest(args, 128, 32) + + runalltests(args, 1, 1) + 
runalltests(args, 16, 2) + runalltests(args, 16, 16) diff -Nru fio-2.1.3/t/steadystate_tests.py fio-3.16/t/steadystate_tests.py --- fio-2.1.3/t/steadystate_tests.py 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/steadystate_tests.py 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,226 @@ +#!/usr/bin/python2.7 +# Note: this script is python2 and python 3 compatible. +# +# steadystate_tests.py +# +# Test option parsing and functonality for fio's steady state detection feature. +# +# steadystate_tests.py --read file-for-read-testing --write file-for-write-testing ./fio +# +# REQUIREMENTS +# Python 2.6+ +# SciPy +# +# KNOWN ISSUES +# only option parsing and read tests are carried out +# On Windows this script works under Cygwin but not from cmd.exe +# On Windows I encounter frequent fio problems generating JSON output (nothing to decode) +# min runtime: +# if ss attained: min runtime = ss_dur + ss_ramp +# if not attained: runtime = timeout + +from __future__ import absolute_import +from __future__ import print_function +import os +import sys +import json +import uuid +import pprint +import argparse +import subprocess +from scipy import stats +from six.moves import range + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('fio', + help='path to fio executable') + parser.add_argument('--read', + help='target for read testing') + parser.add_argument('--write', + help='target for write testing') + args = parser.parse_args() + + return args + + +def check(data, iops, slope, pct, limit, dur, criterion): + measurement = 'iops' if iops else 'bw' + data = data[measurement] + mean = sum(data) / len(data) + if slope: + x = list(range(len(data))) + m, intercept, r_value, p_value, std_err = stats.linregress(x,data) + m = abs(m) + if pct: + target = m / mean * 100 + criterion = criterion[:-1] + else: + target = m + else: + maxdev = 0 + for x in data: + maxdev = max(abs(mean-x), maxdev) + if pct: + target = maxdev / mean * 100 + criterion = 
criterion[:-1] + else: + target = maxdev + + criterion = float(criterion) + return (abs(target - criterion) / criterion < 0.005), target < limit, mean, target + + +if __name__ == '__main__': + args = parse_args() + + pp = pprint.PrettyPrinter(indent=4) + +# +# test option parsing +# + parsing = [ { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=iops:10", "--ss_ramp=5"], + 'output': "set steady state IOPS threshold to 10.000000" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=iops:10%", "--ss_ramp=5"], + 'output': "set steady state threshold to 10.000000%" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=iops:.1%", "--ss_ramp=5"], + 'output': "set steady state threshold to 0.100000%" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=bw:10%", "--ss_ramp=5"], + 'output': "set steady state threshold to 10.000000%" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=bw:.1%", "--ss_ramp=5"], + 'output': "set steady state threshold to 0.100000%" }, + { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=bw:12", "--ss_ramp=5"], + 'output': "set steady state BW threshold to 12" }, + ] + for test in parsing: + output = subprocess.check_output([args.fio] + test['args']) + if test['output'] in output.decode(): + print("PASSED '{0}' found with arguments {1}".format(test['output'], test['args'])) + else: + print("FAILED '{0}' NOT found with arguments {1}".format(test['output'], test['args'])) + +# +# test some read workloads +# +# if ss active and attained, +# check that runtime is less than job time +# check criteria +# how to check ramp time? 
+# +# if ss inactive +# check that runtime is what was specified +# + reads = [ {'s': True, 'timeout': 100, 'numjobs': 1, 'ss_dur': 5, 'ss_ramp': 3, 'iops': True, 'slope': True, 'ss_limit': 0.1, 'pct': True}, + {'s': False, 'timeout': 20, 'numjobs': 2}, + {'s': True, 'timeout': 100, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 5, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True}, + {'s': True, 'timeout': 10, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 500, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True}, + ] + + if args.read == None: + if os.name == 'posix': + args.read = '/dev/zero' + extra = [ "--size=134217728" ] # 128 MiB + else: + print("ERROR: file for read testing must be specified on non-posix systems") + sys.exit(1) + else: + extra = [] + + jobnum = 0 + for job in reads: + + tf = uuid.uuid4().hex + parameters = [ "--name=job{0}".format(jobnum) ] + parameters.extend(extra) + parameters.extend([ "--thread", + "--output-format=json", + "--output={0}".format(tf), + "--filename={0}".format(args.read), + "--rw=randrw", + "--rwmixread=100", + "--stonewall", + "--group_reporting", + "--numjobs={0}".format(job['numjobs']), + "--time_based", + "--runtime={0}".format(job['timeout']) ]) + if job['s']: + if job['iops']: + ss = 'iops' + else: + ss = 'bw' + if job['slope']: + ss += "_slope" + ss += ":" + str(job['ss_limit']) + if job['pct']: + ss += '%' + parameters.extend([ '--ss_dur={0}'.format(job['ss_dur']), + '--ss={0}'.format(ss), + '--ss_ramp={0}'.format(job['ss_ramp']) ]) + + output = subprocess.call([args.fio] + parameters) + with open(tf, 'r') as source: + jsondata = json.loads(source.read()) + os.remove(tf) + + for jsonjob in jsondata['jobs']: + line = "job {0}".format(jsonjob['job options']['name']) + if job['s']: + if jsonjob['steadystate']['attained'] == 1: + # check runtime >= ss_dur + ss_ramp, check criterion, check criterion < limit + mintime = (job['ss_dur'] + job['ss_ramp']) * 1000 + actual = jsonjob['read']['runtime'] + if mintime > actual: 
+ line = 'FAILED ' + line + ' ss attained, runtime {0} < ss_dur {1} + ss_ramp {2}'.format(actual, job['ss_dur'], job['ss_ramp']) + else: + line = line + ' ss attained, runtime {0} > ss_dur {1} + ss_ramp {2},'.format(actual, job['ss_dur'], job['ss_ramp']) + objsame, met, mean, target = check(data=jsonjob['steadystate']['data'], + iops=job['iops'], + slope=job['slope'], + pct=job['pct'], + limit=job['ss_limit'], + dur=job['ss_dur'], + criterion=jsonjob['steadystate']['criterion']) + if not objsame: + line = 'FAILED ' + line + ' fio criterion {0} != calculated criterion {1} '.format(jsonjob['steadystate']['criterion'], target) + else: + if met: + line = 'PASSED ' + line + ' target {0} < limit {1}'.format(target, job['ss_limit']) + else: + line = 'FAILED ' + line + ' target {0} < limit {1} but fio reports ss not attained '.format(target, job['ss_limit']) + else: + # check runtime, confirm criterion calculation, and confirm that criterion was not met + expected = job['timeout'] * 1000 + actual = jsonjob['read']['runtime'] + if abs(expected - actual) > 10: + line = 'FAILED ' + line + ' ss not attained, expected runtime {0} != actual runtime {1}'.format(expected, actual) + else: + line = line + ' ss not attained, runtime {0} != ss_dur {1} + ss_ramp {2},'.format(actual, job['ss_dur'], job['ss_ramp']) + objsame, met, mean, target = check(data=jsonjob['steadystate']['data'], + iops=job['iops'], + slope=job['slope'], + pct=job['pct'], + limit=job['ss_limit'], + dur=job['ss_dur'], + criterion=jsonjob['steadystate']['criterion']) + if not objsame: + if actual > (job['ss_dur'] + job['ss_ramp'])*1000: + line = 'FAILED ' + line + ' fio criterion {0} != calculated criterion {1} '.format(jsonjob['steadystate']['criterion'], target) + else: + line = 'PASSED ' + line + ' fio criterion {0} == 0.0 since ss_dur + ss_ramp has not elapsed '.format(jsonjob['steadystate']['criterion']) + else: + if met: + line = 'FAILED ' + line + ' target {0} < threshold {1} but fio reports ss not attained 
'.format(target, job['ss_limit']) + else: + line = 'PASSED ' + line + ' criterion {0} > threshold {1}'.format(target, job['ss_limit']) + else: + expected = job['timeout'] * 1000 + actual = jsonjob['read']['runtime'] + if abs(expected - actual) < 10: + result = 'PASSED ' + else: + result = 'FAILED ' + line = result + line + ' no ss, expected runtime {0} ~= actual runtime {1}'.format(expected, actual) + print(line) + if 'steadystate' in jsonjob: + pp.pprint(jsonjob['steadystate']) + jobnum += 1 diff -Nru fio-2.1.3/t/stest.c fio-3.16/t/stest.c --- fio-2.1.3/t/stest.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/t/stest.c 2019-09-20 01:01:52.000000000 +0000 @@ -4,36 +4,39 @@ #include "../smalloc.h" #include "../flist.h" - -FILE *f_err; -struct timeval *fio_tv = NULL; -unsigned int fio_debug = 0; +#include "../arch/arch.h" +#include "debug.h" #define MAGIC1 0xa9b1c8d2 #define MAGIC2 0xf0a1e9b3 #define LOOPS 32 +#define MAXSMALLOC 120*1024*1024UL +#define LARGESMALLOC 128*1024U struct elem { unsigned int magic1; struct flist_head list; unsigned int magic2; + unsigned int size; }; -FLIST_HEAD(list); +static FLIST_HEAD(list); static int do_rand_allocs(void) { unsigned int size, nr, rounds = 0; unsigned long total; struct elem *e; + bool error; while (rounds++ < LOOPS) { #ifdef STEST_SEED srand(MAGIC1); #endif + error = false; nr = total = 0; - while (total < 128*1024*1024UL) { + while (total < MAXSMALLOC) { size = 8 * sizeof(struct elem) + (int) (999.0 * (rand() / (RAND_MAX + 1.0))); e = smalloc(size); if (!e) { @@ -42,6 +45,7 @@ } e->magic1 = MAGIC1; e->magic2 = MAGIC2; + e->size = size; total += size; flist_add_tail(&e->list, &list); nr++; @@ -53,38 +57,35 @@ e = flist_entry(list.next, struct elem, list); assert(e->magic1 == MAGIC1); assert(e->magic2 == MAGIC2); + total -= e->size; flist_del(&e->list); sfree(e); + + if (!error) { + e = smalloc(LARGESMALLOC); + if (!e) { + error = true; + printf("failure allocating %u bytes at %lu allocated during sfree phase\n", + 
LARGESMALLOC, total); + } + else + sfree(e); + } } } return 0; } -static int do_specific_alloc(unsigned long size) -{ - void *ptr; - - ptr = smalloc(size); - sfree(ptr); - return 0; -} - int main(int argc, char *argv[]) { - f_err = stderr; - + arch_init(argv); sinit(); + debug_init(); do_rand_allocs(); - - /* smalloc bug, commit 271067a6 */ - do_specific_alloc(671386584); + smalloc_debug(0); /* free and total blocks should match */ scleanup(); return 0; } - -void __dprint(int type, const char *str, ...) -{ -} diff -Nru fio-2.1.3/t/time-test.c fio-3.16/t/time-test.c --- fio-2.1.3/t/time-test.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/time-test.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,544 @@ +/* + * Carry out arithmetic to explore conversion of CPU clock ticks to nsec + * + * When we use the CPU clock for timing, we do the following: + * + * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks + * to actual time. + * + * Using gettimeofday() or clock_gettime(), count how many CPU clock + * ticks occur per usec + * + * 2) Calculate conversion factors so that we can ultimately convert + * from clocks ticks to nsec with + * nsec = (ticks * clock_mult) >> clock_shift + * + * This is equivalent to + * nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER + * where + * clock_mult = MULTIPLIER / cycles_per_nsec + * MULTIPLIER = 2^clock_shift + * + * It would be simpler to just calculate nsec = ticks / cycles_per_nsec, + * but all of this is necessary because of rounding when calculating + * cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply + * be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional + * portion is lost with integer arithmetic. + * + * This multiply and shift calculation also has a performance benefit + * as multiplication and bit shift operations are faster than integer + * division. 
+ * + * 3) Dynamically determine clock_shift and clock_mult at run time based + * on MAX_CLOCK_SEC and cycles_per_usec. MAX_CLOCK_SEC is the maximum + * duration for which the conversion will be valid. + * + * The primary constraint is that (ticks * clock_mult) must not overflow + * when ticks is at its maximum value. + * + * So we have + * max_ticks = MAX_CLOCK_SEC * 1000000000 * cycles_per_nsec + * max_ticks * clock_mult <= ULLONG_MAX + * max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX + * MULTIPLIER <= ULLONG_MAX * cycles_per_nsec / max_ticks + * + * Then choose the largest clock_shift that satisfies + * 2^clock_shift <= ULLONG_MAX * cycles_per_nsec / max_ticks + * + * Finally calculate the appropriate clock_mult associated with clock_shift + * clock_mult = 2^clock_shift / cycles_per_nsec + * + * 4) In the code below we have cycles_per_usec and use + * cycles_per_nsec = cycles_per_usec / 1000 + * + * + * The code below implements 4 clock tick to nsec conversion strategies + * + * i) 64-bit arithmetic for the (ticks * clock_mult) product with the + * conversion valid for at most MAX_CLOCK_SEC + * + * ii) NOT IMPLEMENTED Use 64-bit integers to emulate 128-bit multiplication + * for the (ticks * clock_mult) product + * + * iii) 64-bit arithmetic with clock ticks to nsec conversion occurring in + * two stages. The first stage counts the number of discrete, large chunks + * of time that have elapsed. To this is added the time represented by + * the remaining clock ticks. The advantage of this strategy is better + * accuracy because the (ticks * clock_mult) product used for final + * fractional chunk + * + * iv) 64-bit arithmetic with the clock ticks to nsec conversion occuring in + * two stages. This is carried out using locks to update the number of + * large time chunks (MAX_CLOCK_SEC_2STAGE) that have elapsed. + * + * v) 128-bit arithmetic used for the clock ticks to nsec conversion. 
+ * + */ + +#include +#include +#include +#include +#include +#include "lib/seqlock.h" + +#define DEBUG 0 +#define MAX_CLOCK_SEC 365*24*60*60ULL +#define MAX_CLOCK_SEC_2STAGE 60*60ULL +#define dprintf(...) if (DEBUG) { printf(__VA_ARGS__); } + +enum { + __CLOCK64_BIT = 1 << 0, + __CLOCK128_BIT = 1 << 1, + __CLOCK_MULT_SHIFT = 1 << 2, + __CLOCK_EMULATE_128 = 1 << 3, + __CLOCK_2STAGE = 1 << 4, + __CLOCK_LOCK = 1 << 5, + + CLOCK64_MULT_SHIFT = __CLOCK64_BIT | __CLOCK_MULT_SHIFT, + CLOCK64_EMULATE_128 = __CLOCK64_BIT | __CLOCK_EMULATE_128, + CLOCK64_2STAGE = __CLOCK64_BIT | __CLOCK_2STAGE, + CLOCK64_LOCK = __CLOCK64_BIT | __CLOCK_LOCK, + CLOCK128_MULT_SHIFT = __CLOCK128_BIT | __CLOCK_MULT_SHIFT, +}; + +static struct seqlock clock_seqlock; +static unsigned long long cycles_start; +static unsigned long long elapsed_nsec; + +static unsigned int max_cycles_shift; +static unsigned long long max_cycles_mask; +static unsigned long long nsecs_for_max_cycles; + +static unsigned int clock_shift; +static unsigned long long clock_mult; + +static unsigned long long *nsecs; +static unsigned long long clock_mult64_128[2]; +static __uint128_t clock_mult128; + +/* + * Functions for carrying out 128-bit + * arithmetic using 64-bit integers + * + * 128-bit integers are stored as + * arrays of two 64-bit integers + * + * Ordering is little endian + * + * a[0] has the less significant bits + * a[1] has the more significant bits + * + * NOT FULLY IMPLEMENTED + */ +static void do_mult(unsigned long long a[2], unsigned long long b, + unsigned long long product[2]) +{ + product[0] = product[1] = 0; + return; +} + +static void do_div(unsigned long long a[2], unsigned long long b, + unsigned long long c[2]) +{ + return; +} + +static void do_shift64(unsigned long long a[2], unsigned int count) +{ + a[0] = a[1] >> (count-64); + a[1] = 0; +} + +static void do_shift(unsigned long long a[2], unsigned int count) +{ + if (count > 64) + do_shift64(a, count); + else { + while (count--) { + a[0] >>= 1; + 
a[0] |= a[1] << 63; + a[1] >>= 1; + } + } +} + +static void update_clock(unsigned long long t) +{ + write_seqlock_begin(&clock_seqlock); + elapsed_nsec = (t >> max_cycles_shift) * nsecs_for_max_cycles; + cycles_start = t & ~max_cycles_mask; + write_seqlock_end(&clock_seqlock); +} + +static unsigned long long _get_nsec(int mode, unsigned long long t) +{ + switch(mode) { + case CLOCK64_MULT_SHIFT: + return (t * clock_mult) >> clock_shift; + case CLOCK64_EMULATE_128: { + unsigned long long product[2] = { }; + + do_mult(clock_mult64_128, t, product); + do_shift(product, clock_shift); + return product[0]; + } + case CLOCK64_2STAGE: { + unsigned long long multiples, nsec; + + multiples = t >> max_cycles_shift; + dprintf("multiples=%llu\n", multiples); + nsec = multiples * nsecs_for_max_cycles; + nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift; + return nsec; + } + case CLOCK64_LOCK: { + unsigned int seq; + unsigned long long nsec; + + do { + seq = read_seqlock_begin(&clock_seqlock); + nsec = elapsed_nsec; + nsec += ((t - cycles_start) * clock_mult) >> clock_shift; + } while (read_seqlock_retry(&clock_seqlock, seq)); + return nsec; + } + case CLOCK128_MULT_SHIFT: + return (unsigned long long)((t * clock_mult128) >> clock_shift); + default: + assert(0); + } +} + +static unsigned long long get_nsec(int mode, unsigned long long t) +{ + if (mode == CLOCK64_LOCK) { + update_clock(t); + } + + return _get_nsec(mode, t); +} + +static void calc_mult_shift(int mode, void *mult, unsigned int *shift, + unsigned long long max_sec, + unsigned long long cycles_per_usec) +{ + unsigned long long max_ticks; + max_ticks = max_sec * cycles_per_usec * 1000000ULL; + + switch (mode) { + case CLOCK64_MULT_SHIFT: { + unsigned long long max_mult, tmp; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 64-bit overflow in the multiplication + * step of the clock ticks to nsec conversion + */ + max_mult = ULLONG_MAX / max_ticks; + 
dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + tmp = max_mult * cycles_per_usec / 1000; + while (tmp > 1) { + tmp >>= 1; + sft++; + dprintf("tmp=%llu, sft=%u\n", tmp, sft); + } + + *shift = sft; + *((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec); + break; + } + case CLOCK64_EMULATE_128: { + unsigned long long max_mult[2], tmp[2] = { }; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 128-bit overflow in the multiplication + * step of the clock ticks to nsec conversion, + * but use only 64-bit integers in the process + */ + max_mult[0] = max_mult[1] = ULLONG_MAX; + do_div(max_mult, max_ticks, max_mult); + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", + max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + do_div(max_mult, cycles_per_usec, tmp); + do_div(tmp, 1000ULL, tmp); + while (tmp[0] > 1 || tmp[1] > 1) { + do_shift(tmp, 1); + sft++; + dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft); + } + + *shift = sft; +// *((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); + break; + } + case CLOCK64_2STAGE: { + unsigned long long tmp; +/* + * This clock tick to nsec conversion requires two stages. + * + * Stage 1: Determine how many ~MAX_CLOCK_SEC_2STAGE periods worth of clock ticks + * have elapsed and set nsecs to the appropriate value for those + * ~MAX_CLOCK_SEC_2STAGE periods. + * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC_2STAGE periods from + * Stage 1. Convert remaining clock ticks to nsecs and add to previously + * set nsec value. 
+ * + * To optimize the arithmetic operations, use the greatest power of 2 ticks + * less than the number of ticks in MAX_CLOCK_SEC_2STAGE seconds. + * + */ + // Use a period shorter than MAX_CLOCK_SEC here for better accuracy + calc_mult_shift(CLOCK64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC_2STAGE, cycles_per_usec); + + // Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC_2STAGE + max_cycles_shift = max_cycles_mask = 0; + tmp = MAX_CLOCK_SEC_2STAGE * 1000000ULL * cycles_per_usec; + dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + while (tmp > 1) { + tmp >>= 1; + max_cycles_shift++; + dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + } + // if use use (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we will + // have a discontinuity every (1ULL << max_cycles_shift) cycles + nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift; + + // Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) + for (tmp = 0; tmp < max_cycles_shift; tmp++) + max_cycles_mask |= 1ULL << tmp; + + dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n", + max_cycles_shift, (1ULL << max_cycles_shift), + nsecs_for_max_cycles, max_cycles_mask); + + + break; + } + case CLOCK64_LOCK: { +/* + * This clock tick to nsec conversion also requires two stages. + * + * Stage 1: Add to nsec the current running total of elapsed long periods + * Stage 2: Subtract from clock ticks the tick count corresponding to the + * most recently elapsed long period. Convert the remaining ticks to + * nsec and add to the previous nsec value. + * + * In practice the elapsed nsec from Stage 1 and the tick count subtracted + * in Stage 2 will be maintained in a separate thread. 
+ * + */ + calc_mult_shift(CLOCK64_2STAGE, mult, shift, MAX_CLOCK_SEC, cycles_per_usec); + cycles_start = 0; + break; + } + case CLOCK128_MULT_SHIFT: { + __uint128_t max_mult, tmp; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 128-bit overflow in the multiplication + * step of the clock ticks to nsec conversion + */ + max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX; + max_mult /= max_ticks; + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", + max_ticks, __builtin_clzll(max_ticks), + (unsigned long long) (max_mult >> 64), + (unsigned long long) max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + tmp = max_mult * cycles_per_usec / 1000; + while (tmp > 1) { + tmp >>= 1; + sft++; + dprintf("tmp=0x%016llx%016llx, sft=%u\n", + (unsigned long long) (tmp >> 64), + (unsigned long long) tmp, sft); + } + + *shift = sft; + *((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); + break; + } + } +} + +static int discontinuity(int mode, int delta_ticks, int delta_nsec, + unsigned long long start, unsigned long len) +{ + int i; + unsigned long mismatches = 0, bad_mismatches = 0; + unsigned long long delta, max_mismatch = 0; + unsigned long long *ns = nsecs; + + for (i = 0; i < len; ns++, i++) { + *ns = get_nsec(mode, start + i); + if (i - delta_ticks >= 0) { + if (*ns > *(ns - delta_ticks)) + delta = *ns - *(ns - delta_ticks); + else + delta = *(ns - delta_ticks) - *ns; + if (delta > delta_nsec) + delta -= delta_nsec; + else + delta = delta_nsec - delta; + if (delta) { + mismatches++; + if (delta > 1) + bad_mismatches++; + if (delta > max_mismatch) + max_mismatch = delta; + } + } + if (!bad_mismatches) + assert(max_mismatch == 0 || max_mismatch == 1); + if (!mismatches) + assert(max_mismatch == 0); + } + + printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = 
%llu...%llu\n", + mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start, + start + len - 1); + return mismatches; +} + +#define MIN_TICKS 1ULL +#define LEN 1000000000ULL +#define NSEC_ONE_SEC 1000000000ULL +#define TESTLEN 9 + +static long long test_clock(int mode, int cycles_per_usec, int fast_test, + int quiet, int delta_ticks, int delta_nsec) +{ + int i; + long long delta; + unsigned long long max_ticks; + unsigned long long nsecs; + void *mult; + unsigned long long test_ns[TESTLEN] = + {NSEC_ONE_SEC, NSEC_ONE_SEC, + NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60, + NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4, + NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24}; + unsigned long long test_ticks[TESTLEN]; + + max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL; + + switch(mode) { + case CLOCK64_MULT_SHIFT: + mult = &clock_mult; + break; + case CLOCK64_EMULATE_128: + mult = clock_mult64_128; + break; + case CLOCK64_2STAGE: + mult = &clock_mult; + break; + case CLOCK64_LOCK: + mult = &clock_mult; + break; + case CLOCK128_MULT_SHIFT: + mult = &clock_mult128; + break; + default: + assert(0); + } + calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec); + nsecs = get_nsec(mode, max_ticks); + delta = nsecs/1000000 - MAX_CLOCK_SEC*1000; + + if (mode == CLOCK64_2STAGE) { + test_ns[0] = nsecs_for_max_cycles - 1; + test_ns[1] = nsecs_for_max_cycles; + test_ticks[0] = (1ULL << max_cycles_shift) - 1; + test_ticks[1] = (1ULL << max_cycles_shift); + + for (i = 2; i < TESTLEN; i++) + test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; + } + else { + for (i = 0; i < TESTLEN; i++) + test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; + } + + if (!quiet) { + printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n", + cycles_per_usec, delta_ticks, delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift)); + switch(mode) { + case CLOCK64_LOCK: + case CLOCK64_2STAGE: + case 
CLOCK64_MULT_SHIFT: { + printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n", + clock_mult, (double) clock_mult / (1ULL << clock_shift)); + break; + } + case CLOCK64_EMULATE_128: { + printf("clock_mult=0x%016llx%016llx\n", + clock_mult64_128[1], clock_mult64_128[0]); + break; + } + case CLOCK128_MULT_SHIFT: { + printf("clock_mult=0x%016llx%016llx\n", + (unsigned long long) (clock_mult128 >> 64), + (unsigned long long) clock_mult128); + break; + } + } + printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n", + nsecs, MAX_CLOCK_SEC*1000000000ULL, delta); + } + + for (i = 0; i < TESTLEN; i++) + { + nsecs = get_nsec(mode, test_ticks[i]); + delta = nsecs > test_ns[i] ? nsecs - test_ns[i] : test_ns[i] - nsecs; + if (!quiet || delta > 0) + printf("get_nsec(%llu)=%llu, expected %llu, delta=%llu\n", + test_ticks[i], nsecs, test_ns[i], delta); + } + + if (!fast_test) { + discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN); + discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN); + } + + if (!quiet) + printf("\n\n"); + + return delta; +} + +int main(int argc, char *argv[]) +{ + nsecs = malloc(LEN * sizeof(unsigned long long)); + + test_clock(CLOCK64_LOCK, 3333, 1, 0, 0, 0); + test_clock(CLOCK64_LOCK, 1000, 1, 0, 1, 1); + test_clock(CLOCK64_LOCK, 1100, 1, 0, 11, 10); + test_clock(CLOCK64_LOCK, 3000, 1, 0, 3, 1); + test_clock(CLOCK64_LOCK, 3333, 1, 0, 3333, 1000); + test_clock(CLOCK64_LOCK, 3392, 1, 0, 424, 125); + test_clock(CLOCK64_LOCK, 4500, 1, 0, 9, 2); + test_clock(CLOCK64_LOCK, 5000, 1, 0, 5, 1); + + free(nsecs); + return 0; +} diff -Nru fio-2.1.3/t/verify-state.c fio-3.16/t/verify-state.c --- fio-2.1.3/t/verify-state.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/verify-state.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,157 @@ +/* + * Dump the contents of a verify state file in plain text + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../log.h" +#include 
"../os/os.h" +#include "../verify-state.h" +#include "../crc/crc32c.h" +#include "debug.h" + +static void show_s(struct thread_io_list *s, unsigned int no_s) +{ + int i; + + printf("Thread:\t\t%u\n", no_s); + printf("Name:\t\t%s\n", s->name); + printf("Completions:\t%llu\n", (unsigned long long) s->no_comps); + printf("Depth:\t\t%llu\n", (unsigned long long) s->depth); + printf("Number IOs:\t%llu\n", (unsigned long long) s->numberio); + printf("Index:\t\t%llu\n", (unsigned long long) s->index); + + printf("Completions:\n"); + if (!s->no_comps) + return; + for (i = s->no_comps - 1; i >= 0; i--) { + printf("\t(file=%2llu) %llu\n", + (unsigned long long) s->comps[i].fileno, + (unsigned long long) s->comps[i].offset); + } +} + +static void show(struct thread_io_list *s, size_t size) +{ + int no_s; + + no_s = 0; + do { + int i; + + s->no_comps = le64_to_cpu(s->no_comps); + s->depth = le32_to_cpu(s->depth); + s->nofiles = le32_to_cpu(s->nofiles); + s->numberio = le64_to_cpu(s->numberio); + s->index = le64_to_cpu(s->index); + + for (i = 0; i < s->no_comps; i++) { + s->comps[i].fileno = le64_to_cpu(s->comps[i].fileno); + s->comps[i].offset = le64_to_cpu(s->comps[i].offset); + } + + show_s(s, no_s); + no_s++; + size -= __thread_io_list_sz(s->depth, s->nofiles); + s = (struct thread_io_list *)((char *) s + + __thread_io_list_sz(s->depth, s->nofiles)); + } while (size != 0); +} + +static void show_verify_state(void *buf, size_t size) +{ + struct verify_state_hdr *hdr = buf; + struct thread_io_list *s; + uint32_t crc; + + hdr->version = le64_to_cpu(hdr->version); + hdr->size = le64_to_cpu(hdr->size); + hdr->crc = le64_to_cpu(hdr->crc); + + printf("Version:\t0x%x\n", (unsigned int) hdr->version); + printf("Size:\t\t%u\n", (unsigned int) hdr->size); + printf("CRC:\t\t0x%x\n", (unsigned int) hdr->crc); + + size -= sizeof(*hdr); + if (hdr->size != size) { + log_err("Size mismatch\n"); + return; + } + + s = buf + sizeof(*hdr); + crc = fio_crc32c((unsigned char *) s, hdr->size); + 
if (crc != hdr->crc) { + log_err("crc mismatch %x != %x\n", crc, (unsigned int) hdr->crc); + return; + } + + if (hdr->version == 0x03) + show(s, size); + else + log_err("Unsupported version %d\n", (int) hdr->version); +} + +static int show_file(const char *file) +{ + struct stat sb; + void *buf; + int ret, fd; + + fd = open(file, O_RDONLY); + if (fd < 0) { + log_err("open %s: %s\n", file, strerror(errno)); + return 1; + } + + if (fstat(fd, &sb) < 0) { + log_err("stat: %s\n", strerror(errno)); + close(fd); + return 1; + } + + buf = malloc(sb.st_size); + ret = read(fd, buf, sb.st_size); + if (ret < 0) { + log_err("read: %s\n", strerror(errno)); + close(fd); + free(buf); + return 1; + } else if (ret != sb.st_size) { + log_err("Short read\n"); + close(fd); + free(buf); + return 1; + } + + close(fd); + show_verify_state(buf, sb.st_size); + + free(buf); + return 0; +} + +int main(int argc, char *argv[]) +{ + int i, ret; + + debug_init(); + + if (argc < 2) { + log_err("Usage: %s \n", argv[0]); + return 1; + } + + ret = 0; + for (i = 1; i < argc; i++) { + ret = show_file(argv[i]); + if (ret) + break; + } + + return ret; +} diff -Nru fio-2.1.3/t/zbd/functions fio-3.16/t/zbd/functions --- fio-2.1.3/t/zbd/functions 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/zbd/functions 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,122 @@ +#!/bin/bash + +blkzone=$(type -p blkzone 2>/dev/null) +sg_inq=$(type -p sg_inq 2>/dev/null) +zbc_report_zones=$(type -p zbc_report_zones 2>/dev/null) +zbc_reset_zone=$(type -p zbc_reset_zone 2>/dev/null) +if [ -z "${blkzone}" ] && + { [ -z "${zbc_report_zones}" ] || [ -z "${zbc_reset_zone}" ]; }; then + echo "Error: neither blkzone nor zbc_report_zones is available" + exit 1 +fi + +# Reports the starting sector and length of the first sequential zone of device +# $1. 
+first_sequential_zone() { + local dev=$1 + + if [ -n "${blkzone}" ]; then + ${blkzone} report "$dev" | + sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*type:[[:blank:]]2(.*/\1 \2/p' | + { + read -r starting_sector length && + # Convert from hex to decimal + echo $((starting_sector)) $((length)) + } + else + ${zbc_report_zones} "$dev" | + sed -n 's/^Zone [0-9]*: type 0x2 .*, sector \([0-9]*\), \([0-9]*\) sectors,.*$/\1 \2/p' | + head -n1 + fi +} + +max_open_zones() { + local dev=$1 + + if [ -n "${sg_inq}" ]; then + if ! ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" 2> /dev/null; then + # Non scsi device such as null_blk can not return max open zones. + # Use default value. + echo 128 + else + ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" | tail -1 | + { + read -r offset b0 b1 b2 b3 trailer || return $? + # Convert from hex to decimal + max_nr_open_zones=$((0x${b0})) + max_nr_open_zones=$((max_nr_open_zones * 256 + 0x${b1})) + max_nr_open_zones=$((max_nr_open_zones * 256 + 0x${b2})) + max_nr_open_zones=$((max_nr_open_zones * 256 + 0x${b3})) + echo ${max_nr_open_zones} + } + fi + else + ${zbc_report_zones} "$dev" | + sed -n 's/^[[:blank:]]*Maximum number of open sequential write required zones:[[:blank:]]*//p' + fi +} + +# Reset the write pointer of one zone on device $1 at offset $2. The offset +# must be specified in units of 512 byte sectors. Offset -1 means reset all +# zones. 
+reset_zone() { + local dev=$1 offset=$2 sectors + + if [ -n "${blkzone}" ]; then + if [ "$offset" -lt 0 ]; then + sectors=$(<"/sys/class/block/${dev#/dev/}/size") + ${blkzone} reset -o "${offset}" -l "$sectors" "$dev" + else + ${blkzone} reset -o "${offset}" -c 1 "$dev" + fi + else + if [ "$offset" -lt 0 ]; then + ${zbc_reset_zone} -all "$dev" "${offset}" >/dev/null + else + ${zbc_reset_zone} -sector "$dev" "${offset}" >/dev/null + fi + fi +} + +# Extract the number of bytes that have been transferred from a line like +# READ: bw=6847KiB/s (7011kB/s), 6847KiB/s-6847KiB/s (7011kB/s-7011kB/s), io=257MiB (269MB), run=38406-38406msec +fio_io() { + sed -n 's/^[[:blank:]]*'"$1"'.*, io=\([^[:blank:]]*\).*/\1/p' | + tail -n 1 | + ( + read -r io; + # Parse . into n1, n2 and s. See also + # num2str(). + shopt -s extglob + n1=${io%${io##*([0-9])}} + s=${io#${io%%*([a-zA-Z])}} + n2=${io#${n1}} + n2=${n2#.} + n2=${n2%$s}000 + n2=${n2:0:3} + case "$s" in + KiB) m=10;; + MiB) m=20;; + GiB) m=30;; + B) m=0;; + *) return 1;; + esac + [ -n "$n1" ] || return 1 + echo $(((n1 << m) + (n2 << m) / 1000)) + ) +} + +fio_read() { + fio_io 'READ:' +} + +fio_written() { + fio_io 'WRITE:' +} + +fio_reset_count() { + local count + + count=$(sed -n 's/^.*write:[^;]*; \([0-9]*\) zone resets$/\1/p') + echo "${count:-0}" +} diff -Nru fio-2.1.3/t/zbd/run-tests-against-regular-nullb fio-3.16/t/zbd/run-tests-against-regular-nullb --- fio-2.1.3/t/zbd/run-tests-against-regular-nullb 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/zbd/run-tests-against-regular-nullb 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,27 @@ +#!/bin/bash +# +# Copyright (C) 2018 Western Digital Corporation or its affiliates. +# +# This file is released under the GPL. + +scriptdir="$(cd "$(dirname "$0")" && pwd)" + +for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done +modprobe -r null_blk +modprobe null_blk nr_devices=0 || return $? 
+for d in /sys/kernel/config/nullb/*; do + [ -d "$d" ] && rmdir "$d" +done +modprobe -r null_blk +[ -e /sys/module/null_blk ] && exit $? +modprobe null_blk nr_devices=0 && + cd /sys/kernel/config/nullb && + mkdir nullb0 && + cd nullb0 && + echo 0 > completion_nsec && + echo 4096 > blocksize && + echo 1024 > size && + echo 1 > memory_backed && + echo 1 > power + +"${scriptdir}"/test-zbd-support "$@" /dev/nullb0 diff -Nru fio-2.1.3/t/zbd/run-tests-against-zoned-nullb fio-3.16/t/zbd/run-tests-against-zoned-nullb --- fio-2.1.3/t/zbd/run-tests-against-zoned-nullb 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/zbd/run-tests-against-zoned-nullb 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Copyright (C) 2018 Western Digital Corporation or its affiliates. +# +# This file is released under the GPL. + +scriptdir="$(cd "$(dirname "$0")" && pwd)" + +for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done +modprobe -r null_blk +modprobe null_blk nr_devices=0 || return $? +for d in /sys/kernel/config/nullb/*; do + [ -d "$d" ] && rmdir "$d" +done +modprobe -r null_blk +[ -e /sys/module/null_blk ] && exit $? +modprobe null_blk nr_devices=0 && + cd /sys/kernel/config/nullb && + mkdir nullb0 && + cd nullb0 && + echo 1 > zoned && + echo 1 > zone_size && + echo 0 > completion_nsec && + echo 4096 > blocksize && + echo 1024 > size && + echo 1 > memory_backed && + echo 1 > power || exit $? + +"${scriptdir}"/test-zbd-support "$@" /dev/nullb0 diff -Nru fio-2.1.3/t/zbd/test-zbd-support fio-3.16/t/zbd/test-zbd-support --- fio-2.1.3/t/zbd/test-zbd-support 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/t/zbd/test-zbd-support 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,858 @@ +#!/bin/bash +# +# Copyright (C) 2018 Western Digital Corporation or its affiliates. +# +# This file is released under the GPL. 
+ +usage() { + echo "Usage: $(basename "$0") [-d] [-e] [-r] [-v] [-t ] " +} + +max() { + if [ "$1" -gt "$2" ]; then + echo "$1" + else + echo "$2" + fi +} + +min() { + if [ "$1" -lt "$2" ]; then + echo "$1" + else + echo "$2" + fi +} + +set_io_scheduler() { + local dev=$1 sched=$2 + + [ -e "/sys/block/$dev" ] || return $? + if [ -e "/sys/block/$dev/mq" ]; then + case "$sched" in + noop) sched=none;; + deadline) sched=mq-deadline;; + esac + else + case "$sched" in + none) sched=noop;; + mq-deadline) sched=deadline;; + esac + fi + + echo "$sched" >"/sys/block/$dev/queue/scheduler" +} + +check_read() { + local read + + read=$(fio_read <"${logfile}.${test_number}") + echo "read: $read <> $1" >> "${logfile}.${test_number}" + [ "$read" = "$1" ] +} + +check_written() { + local written + + written=$(fio_written <"${logfile}.${test_number}") + echo "written: $written <> $1" >> "${logfile}.${test_number}" + [ "$written" = "$1" ] +} + +# Compare the reset count from the log file with reset count $2 using operator +# $1 (=, -ge, -gt, -le, -lt). +check_reset_count() { + local reset_count + + reset_count=$(fio_reset_count <"${logfile}.${test_number}") + echo "reset_count: test $reset_count $1 $2" >> "${logfile}.${test_number}" + eval "[ '$reset_count' '$1' '$2' ]" +} + +# Whether or not $1 (/dev/...) is a SCSI device. +is_scsi_device() { + local d f + + d=$(basename "$dev") + for f in /sys/class/scsi_device/*/device/block/"$d"; do + [ -e "$f" ] && return 0 + done + return 1 +} + +run_fio() { + local fio opts + + fio=$(dirname "$0")/../../fio + + opts=("--aux-path=/tmp" "--allow_file_create=0" \ + "--significant_figures=10" "$@") + { echo; echo "fio ${opts[*]}"; echo; } >>"${logfile}.${test_number}" + + "${dynamic_analyzer[@]}" "$fio" "${opts[@]}" +} + +run_one_fio_job() { + local r + + r=$(((RANDOM << 16) | RANDOM)) + run_fio --name="$dev" --filename="$dev" "$@" --randseed="$r" \ + --thread=1 --direct=1 +} + +# Run fio on the first four sequential zones of the disk. 
+run_fio_on_seq() { + local opts=() + + opts+=("--offset=$((first_sequential_zone_sector * 512))") + opts+=("--size=$((4 * zone_size))" "--zonemode=zbd") + if [ -z "$is_zbd" ]; then + opts+=("--zonesize=${zone_size}") + fi + run_one_fio_job "${opts[@]}" "$@" +} + +# Check whether buffered writes are refused. +test1() { + run_fio --name=job1 --filename="$dev" --rw=write --direct=0 --bs=4K \ + --size="${zone_size}" --thread=1 \ + --zonemode=zbd --zonesize="${zone_size}" 2>&1 | + tee -a "${logfile}.${test_number}" | + grep -q 'Using direct I/O is mandatory for writing to ZBD drives' + local fio_rc=${PIPESTATUS[0]} grep_rc=${PIPESTATUS[2]} + case "$fio_rc" in + 0|1) ;; + *) return "$fio_rc" + esac + if [ -n "$is_zbd" ]; then + [ "$grep_rc" = 0 ] + else + [ "$grep_rc" != 0 ] + fi +} + +# Block size exceeds zone size. +test2() { + local bs off opts=() rc + + off=$(((first_sequential_zone_sector + 2 * sectors_per_zone) * 512)) + bs=$((2 * zone_size)) + opts+=("--name=job1" "--filename=$dev" "--rw=write" "--direct=1") + opts+=("--zonemode=zbd" "--offset=$off" "--bs=$bs" "--size=$bs") + if [ -z "$is_zbd" ]; then + opts+=("--zonesize=${zone_size}") + fi + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? + ! grep -q 'WRITE:' "${logfile}.${test_number}" +} + +# Run fio against an empty zone. This causes fio to report "No I/O performed". +test3() { + local off opts=() rc + + off=$((first_sequential_zone_sector * 512 + 128 * zone_size)) + size=$((zone_size)) + [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512)) + opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--bs=4K") + opts+=("--size=$size" "--zonemode=zbd") + opts+=("--ioengine=psync" "--rw=read" "--direct=1" "--thread=1") + if [ -z "$is_zbd" ]; then + opts+=("--zonesize=${zone_size}") + fi + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? + grep -q 'READ:' "${logfile}.${test_number}" + rc=$? 
+ if [ -n "$is_zbd" ]; then + [ $rc != 0 ] + else + [ $rc = 0 ] + fi +} + +# Run fio with --read_beyond_wp=1 against an empty zone. +test4() { + local off opts=() + + off=$((first_sequential_zone_sector * 512 + 129 * zone_size)) + size=$((zone_size)) + [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512)) + opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--bs=$size") + opts+=("--size=$size" "--thread=1" "--read_beyond_wp=1") + opts+=("--ioengine=psync" "--rw=read" "--direct=1" "--disable_lat=1") + opts+=("--zonemode=zbd" "--zonesize=${zone_size}") + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? + check_read $size || return $? +} + +# Sequential write to sequential zones. +test5() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=psync --iodepth=1 --rw=write \ + --bs="$(max $((zone_size / 64)) "$logical_block_size")"\ + --do_verify=1 --verify=md5 \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Sequential read from sequential zones. Must be run after test5. +test6() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=psync --iodepth=1 --rw=read \ + --bs="$(max $((zone_size / 64)) "$logical_block_size")"\ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_read $size || return $? +} + +# Random write to sequential zones, libaio, queue depth 1. +test7() { + local size=$((zone_size)) + + run_fio_on_seq --ioengine=libaio --iodepth=1 --rw=randwrite \ + --bs="$(min 16384 "${zone_size}")" \ + --do_verify=1 --verify=md5 --size="$size" \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, libaio, queue depth 64. 
+test8() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite \ + --bs="$(min 16384 "${zone_size}")" \ + --do_verify=1 --verify=md5 \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, sg, queue depth 1. +test9() { + local size + + if ! is_scsi_device "$dev"; then + echo "$dev is not a SCSI device" >>"${logfile}.${test_number}" + return 0 + fi + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=sg --iodepth=1 --rw=randwrite --bs=16K \ + --do_verify=1 --verify=md5 \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, sg, queue depth 64. +test10() { + local size + + if ! is_scsi_device "$dev"; then + echo "$dev is not a SCSI device" >>"${logfile}.${test_number}" + return 0 + fi + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=sg --iodepth=64 --rw=randwrite --bs=16K \ + --do_verify=1 --verify=md5 \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, libaio, queue depth 64, random block size. +test11() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite \ + --bsrange=4K-64K --do_verify=1 --verify=md5 \ + --debug=zbd >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to sequential zones, libaio, queue depth 64, max 1 open zone. +test12() { + local size + + size=$((8 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite --bs=16K \ + --max_open_zones=1 --size=$size --do_verify=1 --verify=md5 \ + --debug=zbd >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? 
+} + +# Random write to sequential zones, libaio, queue depth 64, max 4 open zones. +test13() { + local size + + size=$((8 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite --bs=16K \ + --max_open_zones=4 --size=$size --do_verify=1 --verify=md5 \ + --debug=zbd \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $size || return $? + check_read $size || return $? +} + +# Random write to conventional zones. +test14() { + local size + + size=$((16 * 2**20)) # 20 MB + if [ $size -gt $((first_sequential_zone_sector * 512)) ]; then + echo "$dev does not have enough sequential zones" \ + >>"${logfile}.${test_number}" + return 0 + fi + run_one_fio_job --ioengine=libaio --iodepth=64 --rw=randwrite --bs=16K \ + --zonemode=zbd --zonesize="${zone_size}" --do_verify=1 \ + --verify=md5 --size=$size \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written $((size)) || return $? + check_read $((size)) || return $? +} + +# Sequential read on a mix of empty and full zones. +test15() { + local i off size + + for ((i=0;i<4;i++)); do + [ -n "$is_zbd" ] && + reset_zone "$dev" $((first_sequential_zone_sector + + i*sectors_per_zone)) + done + off=$(((first_sequential_zone_sector + 2 * sectors_per_zone) * 512)) + size=$((2 * zone_size)) + run_one_fio_job --ioengine=psync --rw=write --bs=$((zone_size / 16))\ + --zonemode=zbd --zonesize="${zone_size}" --offset=$off \ + --size=$size >>"${logfile}.${test_number}" 2>&1 || + return $? + check_written $size || return $? + off=$((first_sequential_zone_sector * 512)) + size=$((4 * zone_size)) + run_one_fio_job --ioengine=psync --rw=read --bs=$((zone_size / 16)) \ + --zonemode=zbd --zonesize="${zone_size}" --offset=$off \ + --size=$((size)) >>"${logfile}.${test_number}" 2>&1 || + return $? + if [ -n "$is_zbd" ]; then + check_read $((size / 2)) + else + check_read $size + fi +} + +# Random read on a mix of empty and full zones. Must be run after test15. 
+test16() { + local off size + + off=$((first_sequential_zone_sector * 512)) + size=$((4 * zone_size)) + run_one_fio_job --ioengine=libaio --iodepth=64 --rw=randread --bs=16K \ + --zonemode=zbd --zonesize="${zone_size}" --offset=$off \ + --size=$size >>"${logfile}.${test_number}" 2>&1 || return $? + check_read $size || return $? +} + +# Random reads and writes in the last zone. +test17() { + local io off read size written + + off=$(((disk_size / zone_size - 1) * zone_size)) + size=$((disk_size - off)) + # Overwrite the last zone to avoid that reading from that zone fails. + if [ -n "$is_zbd" ]; then + reset_zone "$dev" $((off / 512)) || return $? + fi + run_one_fio_job --ioengine=psync --rw=write --offset="$off" \ + --zonemode=zbd --zonesize="${zone_size}" \ + --bs="$zone_size" --size="$zone_size" \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written "$zone_size" || return $? + run_one_fio_job --ioengine=libaio --iodepth=8 --rw=randrw --bs=4K \ + --zonemode=zbd --zonesize="${zone_size}" \ + --offset=$off --loops=2 --norandommap=1\ + >>"${logfile}.${test_number}" 2>&1 || return $? + written=$(fio_written <"${logfile}.${test_number}") + read=$(fio_read <"${logfile}.${test_number}") + io=$((written + read)) + echo "Total number of bytes read and written: $io <> $size" \ + >>"${logfile}.${test_number}" + [ $io = $((size * 2)) ]; +} + +# Out-of-range zone reset threshold and frequency parameters. +test18() { + run_fio_on_seq --zone_reset_threshold=-1 |& + tee -a "${logfile}.${test_number}" | + grep -q 'value out of range' || return $? +} + +test19() { + run_fio_on_seq --zone_reset_threshold=2 |& + tee -a "${logfile}.${test_number}" | + grep -q 'value out of range' || return $? +} + +test20() { + run_fio_on_seq --zone_reset_threshold=.4:.6 |& + tee -a "${logfile}.${test_number}" | + grep -q 'the list exceeding max length' || return $? 
+} + +test21() { + run_fio_on_seq --zone_reset_frequency=-1 |& + tee -a "${logfile}.${test_number}" | + grep -q 'value out of range' || return $? +} + +test22() { + run_fio_on_seq --zone_reset_frequency=2 |& + tee -a "${logfile}.${test_number}" | + grep -q 'value out of range' || return $? +} + +test23() { + run_fio_on_seq --zone_reset_frequency=.4:.6 |& + tee -a "${logfile}.${test_number}" | + grep -q 'the list exceeding max length' || return $? +} + +test24() { + local bs loops=9 size=$((zone_size)) + + bs=$(min $((256*1024)) "$zone_size") + run_fio_on_seq --ioengine=psync --rw=write --bs="$bs" --size=$size \ + --loops=$loops \ + --zone_reset_frequency=.01 --zone_reset_threshold=.90 \ + >> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((size * loops)) || return $? + check_reset_count -eq 8 || + check_reset_count -eq 9 || + check_reset_count -eq 10 || return $? +} + +# Multiple non-overlapping sequential write jobs for the same drive. +test25() { + local i opts=() + + for ((i=0;i<16;i++)); do + [ -n "$is_zbd" ] && + reset_zone "$dev" $((first_sequential_zone_sector + i*sectors_per_zone)) + done + for ((i=0;i<16;i++)); do + opts+=("--name=job$i" "--filename=$dev" "--thread=1" "--direct=1") + opts+=("--offset=$((first_sequential_zone_sector*512 + zone_size*i))") + opts+=("--size=$zone_size" "--ioengine=psync" "--rw=write" "--bs=16K") + opts+=("--zonemode=zbd" "--zonesize=${zone_size}" "--group_reporting=1") + done + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? +} + +write_to_first_seq_zone() { + local loops=4 r + + r=$(((RANDOM << 16) | RANDOM)) + run_fio --name="$dev" --filename="$dev" --ioengine=psync --rw="$1" \ + --thread=1 --do_verify=1 --verify=md5 --direct=1 --bs=4K \ + --offset=$((first_sequential_zone_sector * 512)) \ + "--size=$zone_size" --loops=$loops --randseed="$r" \ + --zonemode=zbd --zonesize="${zone_size}" --group_reporting=1 \ + --gtod_reduce=1 >> "${logfile}.${test_number}" 2>&1 || return $? 
+ check_written $((loops * zone_size)) || return $? +} + +# Overwrite the first sequential zone four times sequentially. +test26() { + write_to_first_seq_zone write +} + +# Overwrite the first sequential zone four times using random writes. +test27() { + write_to_first_seq_zone randwrite +} + +# Multiple overlapping random write jobs for the same drive. +test28() { + local i jobs=16 off opts + + off=$((first_sequential_zone_sector * 512 + 64 * zone_size)) + [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512)) + opts=("--debug=zbd") + for ((i=0;i> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((jobs * zone_size)) || return $? + check_reset_count -eq $jobs || + check_reset_count -eq $((jobs - 1)) || + return $? +} + +# Multiple overlapping random write jobs for the same drive and with a limited +# number of open zones. +test29() { + local i jobs=16 off opts=() + + off=$((first_sequential_zone_sector * 512 + 64 * zone_size)) + size=$((16*zone_size)) + [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512)) + opts=("--debug=zbd") + for ((i=0;i> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((jobs * zone_size)) || return $? +} + +# Random reads and writes across the entire disk for 30s. +test30() { + local off + + off=$((first_sequential_zone_sector * 512)) + run_one_fio_job --ioengine=libaio --iodepth=8 --rw=randrw \ + --bs="$(max $((zone_size / 128)) "$logical_block_size")"\ + --zonemode=zbd --zonesize="${zone_size}" --offset=$off\ + --loops=2 --time_based --runtime=30s --norandommap=1\ + >>"${logfile}.${test_number}" 2>&1 +} + +# Random reads across all sequential zones for 30s. This is not only a fio +# test but also allows to verify the performance of a drive. +test31() { + local bs inc nz off opts size + + # Start with writing 128 KB to 128 sequential zones. 
+ bs=128K + nz=128 + # shellcheck disable=SC2017 + inc=$(((disk_size - (first_sequential_zone_sector * 512)) / (nz * zone_size) + * zone_size)) + opts=() + for ((off = first_sequential_zone_sector * 512; off < disk_size; + off += inc)); do + opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--io_size=$bs") + opts+=("--bs=$bs" "--size=$zone_size" "--ioengine=libaio") + opts+=("--rw=write" "--direct=1" "--thread=1" "--stats=0") + opts+=("--zonemode=zbd" "--zonesize=${zone_size}") + done + "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 + # Next, run the test. + off=$((first_sequential_zone_sector * 512)) + size=$((disk_size - off)) + opts=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size") + opts+=("--bs=$bs" "--ioengine=psync" "--rw=randread" "--direct=1") + opts+=("--thread=1" "--time_based" "--runtime=30" "--zonemode=zbd") + opts+=("--zonesize=${zone_size}") + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? +} + +# Random writes across all sequential zones. This is not only a fio test but +# also allows to verify the performance of a drive. +test32() { + local off opts=() size + + off=$((first_sequential_zone_sector * 512)) + size=$((disk_size - off)) + opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size") + opts+=("--bs=128K" "--ioengine=psync" "--rw=randwrite" "--direct=1") + opts+=("--thread=1" "--time_based" "--runtime=30") + opts+=("--max_open_zones=$max_open_zones" "--zonemode=zbd") + opts+=("--zonesize=${zone_size}") + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? +} + +# Write to sequential zones with a block size that is not a divisor of the +# zone size. +test33() { + local bs io_size size + + size=$((2 * zone_size)) + io_size=$((5 * zone_size)) + bs=$((3 * zone_size / 4)) + run_fio_on_seq --ioengine=psync --iodepth=1 --rw=write --size=$size \ + --io_size=$io_size --bs=$bs \ + >> "${logfile}.${test_number}" 2>&1 || return $? 
+ check_written $(((io_size + bs - 1) / bs * bs)) || return $? +} + +# Write to sequential zones with a block size that is not a divisor of the +# zone size and with data verification enabled. +test34() { + local size + + size=$((2 * zone_size)) + run_fio_on_seq --ioengine=psync --iodepth=1 --rw=write --size=$size \ + --do_verify=1 --verify=md5 --bs=$((3 * zone_size / 4)) \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'not a divisor of' "${logfile}.${test_number}" +} + +# Test 1/4 for the I/O boundary rounding code: $size < $zone_size. +test35() { + local bs off io_size size + + off=$(((first_sequential_zone_sector + 1) * 512)) + size=$((zone_size - 2 * 512)) + bs=$((zone_size / 4)) + run_one_fio_job --offset=$off --size=$size --ioengine=psync --iodepth=1 \ + --rw=write --do_verify=1 --verify=md5 --bs=$bs \ + --zonemode=zbd --zonesize="${zone_size}" \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'io_size must be at least one zone' "${logfile}.${test_number}" +} + +# Test 2/4 for the I/O boundary rounding code: $size < $zone_size. +test36() { + local bs off io_size size + + off=$(((first_sequential_zone_sector) * 512)) + size=$((zone_size - 512)) + bs=$((zone_size / 4)) + run_one_fio_job --offset=$off --size=$size --ioengine=psync --iodepth=1 \ + --rw=write --do_verify=1 --verify=md5 --bs=$bs \ + --zonemode=zbd --zonesize="${zone_size}" \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'io_size must be at least one zone' "${logfile}.${test_number}" +} + +# Test 3/4 for the I/O boundary rounding code: $size > $zone_size. 
+test37() { + local bs off size + + if [ "$first_sequential_zone_sector" = 0 ]; then + off=0 + else + off=$(((first_sequential_zone_sector - 1) * 512)) + fi + size=$((zone_size + 2 * 512)) + bs=$((zone_size / 4)) + run_one_fio_job --offset=$off --size=$size --ioengine=psync --iodepth=1 \ + --rw=write --do_verify=1 --verify=md5 --bs=$bs \ + --zonemode=zbd --zonesize="${zone_size}" \ + >> "${logfile}.${test_number}" 2>&1 + check_written $((zone_size)) || return $? +} + +# Test 4/4 for the I/O boundary rounding code: $offset > $disk_size - $zone_size +test38() { + local bs off size + + size=$((logical_block_size)) + off=$((disk_size - logical_block_size)) + bs=$((logical_block_size)) + run_one_fio_job --offset=$off --size=$size --ioengine=psync --iodepth=1 \ + --rw=write --do_verify=1 --verify=md5 --bs=$bs \ + --zonemode=zbd --zonesize="${zone_size}" \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'io_size must be at least one zone' "${logfile}.${test_number}" +} + +# Read one block from a block device. +read_one_block() { + local bs + + bs=$((logical_block_size)) + run_one_fio_job --rw=read --ioengine=psync --bs=$bs --size=$bs "$@" 2>&1 | + tee -a "${logfile}.${test_number}" +} + +# Check whether fio accepts --zonemode=none for zoned block devices. +test39() { + [ -n "$is_zbd" ] || return 0 + read_one_block --zonemode=none >/dev/null || return $? + check_read $((logical_block_size)) || return $? +} + +# Check whether fio accepts --zonemode=strided for zoned block devices. +test40() { + local bs + + bs=$((logical_block_size)) + [ -n "$is_zbd" ] || return 0 + read_one_block --zonemode=strided | + grep -q 'fio: --zonesize must be specified when using --zonemode=strided' || + return $? + read_one_block --zonemode=strided --zonesize=$bs >/dev/null || return $? + check_read $bs || return $? +} + +# Check whether fio checks the zone size for zoned block devices. 
+test41() { + [ -n "$is_zbd" ] || return 0 + read_one_block --zonemode=zbd --zonesize=$((2 * zone_size)) | + grep -q 'job parameter zonesize.*does not match disk zone size' +} + +# Check whether fio handles --zonesize=0 correctly for regular block devices. +test42() { + [ -n "$is_zbd" ] && return 0 + read_one_block --zonemode=zbd --zonesize=0 | + grep -q 'Specifying the zone size is mandatory for regular block devices with --zonemode=zbd' +} + +# Check whether fio handles --zonesize=1 correctly for regular block devices. +test43() { + [ -n "$is_zbd" ] && return 0 + read_one_block --zonemode=zbd --zonesize=1 | + grep -q 'zone size must be at least 512 bytes for --zonemode=zbd' +} + +# Check whether fio handles --zonemode=none --zonesize=1 correctly. +test44() { + read_one_block --zonemode=none --zonesize=1 | + grep -q 'fio: --zonemode=none and --zonesize are not compatible' +} + +test45() { + local bs i + + [ -z "$is_zbd" ] && return 0 + bs=$((logical_block_size)) + run_one_fio_job --ioengine=psync --iodepth=1 --rw=randwrite --bs=$bs\ + --offset=$((first_sequential_zone_sector * 512)) \ + --size="$zone_size" --do_verify=1 --verify=md5 2>&1 | + tee -a "${logfile}.${test_number}" | + grep -q "fio: first I/O failed. If .* is a zoned block device, consider --zonemode=zbd" +} + +# Random write to sequential zones, libaio, 8 jobs, queue depth 64 per job +test46() { + local size + + size=$((4 * zone_size)) + run_fio_on_seq --ioengine=libaio --iodepth=64 --rw=randwrite --bs=4K \ + --group_reporting=1 --numjobs=8 \ + >> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((size * 8)) || return $? +} + +# Check whether fio handles --zonemode=zbd --zoneskip=1 correctly. 
+test47() { + local bs + + [ -z "$is_zbd" ] && return 0 + bs=$((logical_block_size)) + run_one_fio_job --ioengine=psync --rw=write --bs=$bs \ + --zonemode=zbd --zoneskip=1 \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'zoneskip 1 is not a multiple of the device zone size' "${logfile}.${test_number}" +} + +tests=() +dynamic_analyzer=() +reset_all_zones= + +while [ "${1#-}" != "$1" ]; do + case "$1" in + -d) dynamic_analyzer=(valgrind "--read-var-info=yes" "--tool=drd" + "--show-confl-seg=no"); + shift;; + -e) dynamic_analyzer=(valgrind "--read-var-info=yes" "--tool=helgrind"); + shift;; + -r) reset_all_zones=1; shift;; + -t) tests+=("$2"); shift; shift;; + -v) dynamic_analyzer=(valgrind "--read-var-info=yes"); + shift;; + --) shift; break;; + esac +done + +if [ $# != 1 ]; then + usage + exit 1 +fi + +# shellcheck source=functions +source "$(dirname "$0")/functions" || exit $? + +dev=$1 +realdev=$(readlink -f "$dev") +basename=$(basename "$realdev") +major=$((0x$(stat -L -c '%t' "$realdev"))) || exit $? +minor=$((0x$(stat -L -c '%T' "$realdev"))) || exit $? +disk_size=$(($(<"/sys/dev/block/$major:$minor/size")*512)) +# When the target is a partition device, get basename of its holder device to +# access sysfs path of the holder device +if [[ -r "/sys/dev/block/$major:$minor/partition" ]]; then + realsysfs=$(readlink "/sys/dev/block/$major:$minor") + basename=$(basename "${realsysfs%/*}") +fi +logical_block_size=$(<"/sys/block/$basename/queue/logical_block_size") +case "$(<"/sys/class/block/$basename/queue/zoned")" in + host-managed|host-aware) + is_zbd=true + if ! result=($(first_sequential_zone "$dev")); then + echo "Failed to determine first sequential zone" + exit 1 + fi + first_sequential_zone_sector=${result[0]} + sectors_per_zone=${result[1]} + zone_size=$((sectors_per_zone * 512)) + if ! 
max_open_zones=$(max_open_zones "$dev"); then + echo "Failed to determine maximum number of open zones" + exit 1 + fi + echo "First sequential zone starts at sector $first_sequential_zone_sector; zone size: $((zone_size >> 20)) MB" + set_io_scheduler "$basename" deadline || exit $? + if [ -n "$reset_all_zones" ]; then + reset_zone "$dev" -1 + fi + ;; + *) + first_sequential_zone_sector=$(((disk_size / 2) & + (logical_block_size - 1))) + zone_size=$(max 65536 "$logical_block_size") + sectors_per_zone=$((zone_size / 512)) + max_open_zones=128 + set_io_scheduler "$basename" none || exit $? + ;; +esac + +if [ "${#tests[@]}" = 0 ]; then + for ((i=1;i<=46;i++)); do + tests+=("$i") + done +fi + +logfile=$0.log + +passed=0 +failed=0 +rc=0 +for test_number in "${tests[@]}"; do + rm -f "${logfile}.${test_number}" + echo -n "Running test $test_number ... " + if eval "test$test_number"; then + status="PASS" + ((passed++)) + else + status="FAIL" + ((failed++)) + rc=1 + fi + echo "$status" + echo "$status" >> "${logfile}.${test_number}" +done + +echo "$passed tests passed" +if [ $failed -gt 0 ]; then + echo " and $failed tests failed" +fi +exit $rc diff -Nru fio-2.1.3/td_error.c fio-3.16/td_error.c --- fio-2.1.3/td_error.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/td_error.c 2019-09-20 01:01:52.000000000 +0000 @@ -20,8 +20,7 @@ if (!td->o.ignore_error[etype]) { td->o.ignore_error[etype] = __NON_FATAL_ERR; - td->o.ignore_error_nr[etype] = sizeof(__NON_FATAL_ERR) - / sizeof(int); + td->o.ignore_error_nr[etype] = ARRAY_SIZE(__NON_FATAL_ERR); } if (!(td->o.continue_on_error & (1 << etype))) diff -Nru fio-2.1.3/td_error.h fio-3.16/td_error.h --- fio-2.1.3/td_error.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/td_error.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,8 +1,11 @@ #ifndef FIO_TD_ERROR_H #define FIO_TD_ERROR_H +#include "io_ddir.h" + /* - * What type of errors to continue on when continue_on_error is used + * What type of errors to continue on when 
continue_on_error is used, + * and what type of errors to ignore when ignore_error is used. */ enum error_type_bit { ERROR_TYPE_READ_BIT = 0, diff -Nru fio-2.1.3/thread_options.h fio-3.16/thread_options.h --- fio-2.1.3/thread_options.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/thread_options.h 2019-09-20 01:01:52.000000000 +0000 @@ -3,11 +3,21 @@ #include "arch/arch.h" #include "os/os.h" +#include "options.h" #include "stat.h" #include "gettime.h" #include "lib/ieee754.h" +#include "lib/pattern.h" #include "td_error.h" +enum fio_zone_mode { + ZONE_MODE_NOT_SPECIFIED = 0, + ZONE_MODE_NONE = 1, + ZONE_MODE_STRIDED = 2, /* perform I/O in one zone at a time */ + /* perform I/O across multiple zones simultaneously */ + ZONE_MODE_ZBD = 3, +}; + /* * What type of allocation to use for io buffers */ @@ -17,49 +27,73 @@ MEM_SHMHUGE, /* use shared memory segments with huge pages */ MEM_MMAP, /* use anonynomous mmap */ MEM_MMAPHUGE, /* memory mapped huge file */ + MEM_MMAPSHARED, /* use mmap with shared flag */ + MEM_CUDA_MALLOC,/* use GPU memory */ }; #define ERROR_STR_MAX 128 #define BSSPLIT_MAX 64 +#define ZONESPLIT_MAX 256 struct bssplit { - uint32_t bs; + uint64_t bs; uint32_t perc; }; +struct zone_split { + uint8_t access_perc; + uint8_t size_perc; + uint8_t pad[6]; + uint64_t size; +}; + +#define NR_OPTS_SZ (FIO_MAX_OPTS / (8 * sizeof(uint64_t))) + +#define OPT_MAGIC 0x4f50544e + struct thread_options { - int pad; + int magic; + uint64_t set_options[NR_OPTS_SZ]; char *description; char *name; + char *wait_for; char *directory; char *filename; char *filename_format; char *opendir; char *ioengine; + char *ioengine_so_path; char *mmapfile; enum td_ddir td_ddir; unsigned int rw_seq; unsigned int kb_base; unsigned int unit_base; unsigned int ddir_seq_nr; - long ddir_seq_add; + long long ddir_seq_add; unsigned int iodepth; unsigned int iodepth_low; unsigned int iodepth_batch; - unsigned int iodepth_batch_complete; + unsigned int iodepth_batch_complete_min; + unsigned 
int iodepth_batch_complete_max; + unsigned int serialize_overlap; + + unsigned int unique_filename; unsigned long long size; + unsigned long long io_size; unsigned int size_percent; unsigned int fill_device; + unsigned int file_append; unsigned long long file_size_low; unsigned long long file_size_high; unsigned long long start_offset; + unsigned long long start_offset_align; - unsigned int bs[DDIR_RWDIR_CNT]; - unsigned int ba[DDIR_RWDIR_CNT]; - unsigned int min_bs[DDIR_RWDIR_CNT]; - unsigned int max_bs[DDIR_RWDIR_CNT]; + unsigned long long bs[DDIR_RWDIR_CNT]; + unsigned long long ba[DDIR_RWDIR_CNT]; + unsigned long long min_bs[DDIR_RWDIR_CNT]; + unsigned long long max_bs[DDIR_RWDIR_CNT]; struct bssplit *bssplit[DDIR_RWDIR_CNT]; unsigned int bssplit_nr[DDIR_RWDIR_CNT]; @@ -72,6 +106,7 @@ enum file_lock_mode file_lock_mode; unsigned int odirect; + unsigned int oatomic; unsigned int invalidate_cache; unsigned int create_serialize; unsigned int create_fsync; @@ -80,44 +115,63 @@ unsigned int end_fsync; unsigned int pre_read; unsigned int sync_io; + unsigned int write_hint; unsigned int verify; unsigned int do_verify; - unsigned int verifysort; - unsigned int verifysort_nr; unsigned int verify_interval; unsigned int verify_offset; char verify_pattern[MAX_PATTERN_SIZE]; unsigned int verify_pattern_bytes; + struct pattern_fmt verify_fmt[8]; + unsigned int verify_fmt_sz; unsigned int verify_fatal; unsigned int verify_dump; unsigned int verify_async; unsigned long long verify_backlog; unsigned int verify_batch; unsigned int experimental_verify; + unsigned int verify_state; + unsigned int verify_state_save; unsigned int use_thread; unsigned int unlink; + unsigned int unlink_each_loop; unsigned int do_disk_util; unsigned int override_sync; unsigned int rand_repeatable; - unsigned int use_os_rand; + unsigned int allrand_repeatable; + unsigned long long rand_seed; unsigned int log_avg_msec; + unsigned int log_hist_msec; + unsigned int log_hist_coarseness; + unsigned int 
log_max; + unsigned int log_offset; + unsigned int log_gz; + unsigned int log_gz_store; + unsigned int log_unix_epoch; unsigned int norandommap; unsigned int softrandommap; unsigned int bs_unaligned; unsigned int fsync_on_close; unsigned int bs_is_seq_rand; + unsigned int verify_only; + unsigned int random_distribution; + unsigned int exitall_error; + + struct zone_split *zone_split[DDIR_RWDIR_CNT]; + unsigned int zone_split_nr[DDIR_RWDIR_CNT]; fio_fp64_t zipf_theta; fio_fp64_t pareto_h; + fio_fp64_t gauss_dev; unsigned int random_generator; unsigned int perc_rand[DDIR_RWDIR_CNT]; unsigned int hugepage_size; - unsigned int rw_min_bs; + unsigned long long rw_min_bs; unsigned int thinktime; unsigned int thinktime_spin; unsigned int thinktime_blocks; @@ -125,8 +179,14 @@ unsigned int fdatasync_blocks; unsigned int barrier_blocks; unsigned long long start_delay; + unsigned long long start_delay_orig; + unsigned long long start_delay_high; unsigned long long timeout; unsigned long long ramp_time; + unsigned int ss_state; + fio_fp64_t ss_limit; + unsigned long long ss_dur; + unsigned long long ss_ramp_time; unsigned int overwrite; unsigned int bw_avg_time; unsigned int iops_avg_time; @@ -134,42 +194,46 @@ unsigned long long zone_range; unsigned long long zone_size; unsigned long long zone_skip; + enum fio_zone_mode zone_mode; unsigned long long lockmem; enum fio_memtype mem_type; unsigned int mem_align; - unsigned int max_latency; + unsigned long long max_latency; unsigned int stonewall; unsigned int new_group; unsigned int numjobs; os_cpu_mask_t cpumask; - unsigned int cpumask_set; os_cpu_mask_t verify_cpumask; - unsigned int verify_cpumask_set; -#ifdef CONFIG_LIBNUMA - struct bitmask *numa_cpunodesmask; - unsigned int numa_cpumask_set; + os_cpu_mask_t log_gz_cpumask; + unsigned int cpus_allowed_policy; + char *numa_cpunodes; unsigned short numa_mem_mode; unsigned int numa_mem_prefer_node; - struct bitmask *numa_memnodesmask; - unsigned int numa_memmask_set; -#endif + 
char *numa_memnodes; + unsigned int gpu_dev_id; + unsigned int start_offset_percent; + unsigned int iolog; unsigned int rwmixcycle; - unsigned int rwmix[2]; + unsigned int rwmix[DDIR_RWDIR_CNT]; unsigned int nice; unsigned int ioprio; unsigned int ioprio_class; unsigned int file_service_type; unsigned int group_reporting; + unsigned int stats; unsigned int fadvise_hint; enum fio_fallocate_mode fallocate_mode; unsigned int zero_buffers; unsigned int refill_buffers; unsigned int scramble_buffers; + char buffer_pattern[MAX_PATTERN_SIZE]; + unsigned int buffer_pattern_bytes; unsigned int compress_percentage; unsigned int compress_chunk; + unsigned int dedupe_percentage; unsigned int time_based; unsigned int disable_lat; unsigned int disable_clat; @@ -178,7 +242,6 @@ unsigned int unified_rw_rep; unsigned int gtod_reduce; unsigned int gtod_cpu; - unsigned int gtod_offload; enum fio_cs clocksource; unsigned int no_stall; unsigned int trim_percentage; @@ -186,14 +249,26 @@ unsigned int trim_zero; unsigned long long trim_backlog; unsigned int clat_percentiles; + unsigned int lat_percentiles; unsigned int percentile_precision; /* digits after decimal for percentiles */ fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN]; char *read_iolog_file; + bool read_iolog_chunked; char *write_iolog_file; + char *merge_blktrace_file; + fio_fp64_t merge_blktrace_scalars[FIO_IO_U_LIST_MAX_LEN]; + fio_fp64_t merge_blktrace_iters[FIO_IO_U_LIST_MAX_LEN]; + + unsigned int write_bw_log; + unsigned int write_lat_log; + unsigned int write_iops_log; + unsigned int write_hist_log; + char *bw_log_file; char *lat_log_file; char *iops_log_file; + char *hist_log_file; char *replay_redirect; /* @@ -202,11 +277,14 @@ char *exec_prerun; char *exec_postrun; - unsigned int rate[DDIR_RWDIR_CNT]; - unsigned int ratemin[DDIR_RWDIR_CNT]; + uint64_t rate[DDIR_RWDIR_CNT]; + uint64_t ratemin[DDIR_RWDIR_CNT]; unsigned int ratecycle; + unsigned int io_submit_mode; unsigned int rate_iops[DDIR_RWDIR_CNT]; unsigned int 
rate_iops_min[DDIR_RWDIR_CNT]; + unsigned int rate_process; + unsigned int rate_ign_think; char *ioscheduler; @@ -235,17 +313,44 @@ int flow_watermark; unsigned int flow_sleep; + unsigned int offset_increment_percent; unsigned long long offset_increment; unsigned long long number_ios; unsigned int sync_file_range; + + unsigned long long latency_target; + unsigned long long latency_window; + fio_fp64_t latency_percentile; + + unsigned int sig_figs; + + unsigned block_error_hist; + + unsigned int replay_align; + unsigned int replay_scale; + unsigned int replay_time_scale; + unsigned int replay_skip; + + unsigned int per_job_logs; + + unsigned int allow_create; + unsigned int allow_mounted_write; + + /* Parameters that affect zonemode=zbd */ + unsigned int read_beyond_wp; + int max_open_zones; + fio_fp64_t zrt; + fio_fp64_t zrf; }; #define FIO_TOP_STR_MAX 256 struct thread_options_pack { + uint64_t set_options[NR_OPTS_SZ]; uint8_t description[FIO_TOP_STR_MAX]; uint8_t name[FIO_TOP_STR_MAX]; + uint8_t wait_for[FIO_TOP_STR_MAX]; uint8_t directory[FIO_TOP_STR_MAX]; uint8_t filename[FIO_TOP_STR_MAX]; uint8_t filename_format[FIO_TOP_STR_MAX]; @@ -261,19 +366,26 @@ uint32_t iodepth; uint32_t iodepth_low; uint32_t iodepth_batch; - uint32_t iodepth_batch_complete; + uint32_t iodepth_batch_complete_min; + uint32_t iodepth_batch_complete_max; + uint32_t serialize_overlap; + uint32_t lat_percentiles; uint64_t size; + uint64_t io_size; uint32_t size_percent; uint32_t fill_device; + uint32_t file_append; + uint32_t unique_filename; uint64_t file_size_low; uint64_t file_size_high; uint64_t start_offset; + uint64_t start_offset_align; - uint32_t bs[DDIR_RWDIR_CNT]; - uint32_t ba[DDIR_RWDIR_CNT]; - uint32_t min_bs[DDIR_RWDIR_CNT]; - uint32_t max_bs[DDIR_RWDIR_CNT]; + uint64_t bs[DDIR_RWDIR_CNT]; + uint64_t ba[DDIR_RWDIR_CNT]; + uint64_t min_bs[DDIR_RWDIR_CNT]; + uint64_t max_bs[DDIR_RWDIR_CNT]; struct bssplit bssplit[DDIR_RWDIR_CNT][BSSPLIT_MAX]; uint32_t bssplit_nr[DDIR_RWDIR_CNT]; 
@@ -286,6 +398,7 @@ uint32_t file_lock_mode; uint32_t odirect; + uint32_t oatomic; uint32_t invalidate_cache; uint32_t create_serialize; uint32_t create_fsync; @@ -294,10 +407,9 @@ uint32_t end_fsync; uint32_t pre_read; uint32_t sync_io; + uint32_t write_hint; uint32_t verify; uint32_t do_verify; - uint32_t verifysort; - uint32_t verifysort_nr; uint32_t verify_interval; uint32_t verify_offset; uint8_t verify_pattern[MAX_PATTERN_SIZE]; @@ -308,13 +420,25 @@ uint64_t verify_backlog; uint32_t verify_batch; uint32_t experimental_verify; + uint32_t verify_state; + uint32_t verify_state_save; uint32_t use_thread; uint32_t unlink; + uint32_t unlink_each_loop; uint32_t do_disk_util; uint32_t override_sync; uint32_t rand_repeatable; - uint32_t use_os_rand; + uint32_t allrand_repeatable; + uint32_t pad; + uint64_t rand_seed; uint32_t log_avg_msec; + uint32_t log_hist_msec; + uint32_t log_hist_coarseness; + uint32_t log_max; + uint32_t log_offset; + uint32_t log_gz; + uint32_t log_gz_store; + uint32_t log_unix_epoch; uint32_t norandommap; uint32_t softrandommap; uint32_t bs_unaligned; @@ -322,15 +446,24 @@ uint32_t bs_is_seq_rand; uint32_t random_distribution; + uint32_t exitall_error; + + uint32_t sync_file_range; + + struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX]; + uint32_t zone_split_nr[DDIR_RWDIR_CNT]; + fio_fp64_t zipf_theta; fio_fp64_t pareto_h; + fio_fp64_t gauss_dev; uint32_t random_generator; uint32_t perc_rand[DDIR_RWDIR_CNT]; uint32_t hugepage_size; - uint32_t rw_min_bs; + uint64_t rw_min_bs; + uint32_t pad2; uint32_t thinktime; uint32_t thinktime_spin; uint32_t thinktime_blocks; @@ -338,8 +471,13 @@ uint32_t fdatasync_blocks; uint32_t barrier_blocks; uint64_t start_delay; + uint64_t start_delay_high; uint64_t timeout; uint64_t ramp_time; + uint64_t ss_dur; + uint64_t ss_ramp_time; + uint32_t ss_state; + fio_fp64_t ss_limit; uint32_t overwrite; uint32_t bw_avg_time; uint32_t iops_avg_time; @@ -351,30 +489,39 @@ uint32_t mem_type; uint32_t mem_align; - 
uint32_t max_latency; - uint32_t stonewall; uint32_t new_group; uint32_t numjobs; + /* + * We currently can't convert these, so don't enable them + */ +#if 0 uint8_t cpumask[FIO_TOP_STR_MAX]; - uint32_t cpumask_set; uint8_t verify_cpumask[FIO_TOP_STR_MAX]; - uint32_t verify_cpumask_set; + uint8_t log_gz_cpumask[FIO_TOP_STR_MAX]; +#endif + uint32_t gpu_dev_id; + uint32_t start_offset_percent; + uint32_t cpus_allowed_policy; uint32_t iolog; uint32_t rwmixcycle; - uint32_t rwmix[2]; + uint32_t rwmix[DDIR_RWDIR_CNT]; uint32_t nice; uint32_t ioprio; uint32_t ioprio_class; uint32_t file_service_type; uint32_t group_reporting; + uint32_t stats; uint32_t fadvise_hint; uint32_t fallocate_mode; uint32_t zero_buffers; uint32_t refill_buffers; uint32_t scramble_buffers; - unsigned int compress_percentage; - unsigned int compress_chunk; + uint8_t buffer_pattern[MAX_PATTERN_SIZE]; + uint32_t buffer_pattern_bytes; + uint32_t compress_percentage; + uint32_t compress_chunk; + uint32_t dedupe_percentage; uint32_t time_based; uint32_t disable_lat; uint32_t disable_clat; @@ -383,7 +530,6 @@ uint32_t unified_rw_rep; uint32_t gtod_reduce; uint32_t gtod_cpu; - uint32_t gtod_offload; uint32_t clocksource; uint32_t no_stall; uint32_t trim_percentage; @@ -396,9 +542,19 @@ uint8_t read_iolog_file[FIO_TOP_STR_MAX]; uint8_t write_iolog_file[FIO_TOP_STR_MAX]; + uint8_t merge_blktrace_file[FIO_TOP_STR_MAX]; + fio_fp64_t merge_blktrace_scalars[FIO_IO_U_LIST_MAX_LEN]; + fio_fp64_t merge_blktrace_iters[FIO_IO_U_LIST_MAX_LEN]; + + uint32_t write_bw_log; + uint32_t write_lat_log; + uint32_t write_iops_log; + uint32_t write_hist_log; + uint8_t bw_log_file[FIO_TOP_STR_MAX]; uint8_t lat_log_file[FIO_TOP_STR_MAX]; uint8_t iops_log_file[FIO_TOP_STR_MAX]; + uint8_t hist_log_file[FIO_TOP_STR_MAX]; uint8_t replay_redirect[FIO_TOP_STR_MAX]; /* @@ -407,11 +563,15 @@ uint8_t exec_prerun[FIO_TOP_STR_MAX]; uint8_t exec_postrun[FIO_TOP_STR_MAX]; - uint32_t rate[DDIR_RWDIR_CNT]; - uint32_t ratemin[DDIR_RWDIR_CNT]; 
+ uint64_t rate[DDIR_RWDIR_CNT]; + uint64_t ratemin[DDIR_RWDIR_CNT]; uint32_t ratecycle; + uint32_t io_submit_mode; uint32_t rate_iops[DDIR_RWDIR_CNT]; uint32_t rate_iops_min[DDIR_RWDIR_CNT]; + uint32_t rate_process; + uint32_t rate_ign_think; + uint32_t pad3; uint8_t ioscheduler[FIO_TOP_STR_MAX]; @@ -440,10 +600,31 @@ int32_t flow_watermark; uint32_t flow_sleep; + uint32_t offset_increment_percent; + uint32_t pad4; uint64_t offset_increment; uint64_t number_ios; - uint32_t sync_file_range; + uint64_t latency_target; + uint64_t latency_window; + uint64_t max_latency; + fio_fp64_t latency_percentile; + + uint32_t sig_figs; + + uint32_t block_error_hist; + + uint32_t replay_align; + uint32_t replay_scale; + uint32_t replay_time_scale; + uint32_t replay_skip; + + uint32_t per_job_logs; + + uint32_t allow_create; + uint32_t allow_mounted_write; + + uint32_t zone_mode; } __attribute__((packed)); extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top); diff -Nru fio-2.1.3/tickmarks.c fio-3.16/tickmarks.c --- fio-2.1.3/tickmarks.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/tickmarks.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,6 +1,6 @@ #include #include -#include +#include #include /* diff -Nru fio-2.1.3/time.c fio-3.16/time.c --- fio-2.1.3/time.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/time.c 2019-09-20 01:01:52.000000000 +0000 @@ -3,31 +3,52 @@ #include "fio.h" -static struct timeval genesis; +static struct timespec genesis; static unsigned long ns_granularity; +void timespec_add_msec(struct timespec *ts, unsigned int msec) +{ + uint64_t adj_nsec = 1000000ULL * msec; + + ts->tv_nsec += adj_nsec; + if (adj_nsec >= 1000000000) { + uint64_t adj_sec = adj_nsec / 1000000000; + + ts->tv_nsec -= adj_sec * 1000000000; + ts->tv_sec += adj_sec; + } + if (ts->tv_nsec >= 1000000000){ + ts->tv_nsec -= 1000000000; + ts->tv_sec++; + } +} + /* * busy looping version for the last few usec */ -void usec_spin(unsigned int usec) 
+uint64_t usec_spin(unsigned int usec) { - struct timeval start; + struct timespec start; + uint64_t t; fio_gettime(&start, NULL); - while (utime_since_now(&start) < usec) + while ((t = utime_since_now(&start)) < usec) nop; + + return t; } -void usec_sleep(struct thread_data *td, unsigned long usec) +uint64_t usec_sleep(struct thread_data *td, unsigned long usec) { struct timespec req; - struct timeval tv; + struct timespec tv; + uint64_t t = 0; do { unsigned long ts = usec; if (usec < ns_granularity) { - usec_spin(usec); + t += usec_spin(usec); break; } @@ -46,11 +67,19 @@ break; ts = utime_since_now(&tv); + t += ts; if (ts >= usec) break; usec -= ts; } while (!td->terminate); + + return t; +} + +uint64_t time_since_genesis(void) +{ + return time_since_now(&genesis); } uint64_t mtime_since_genesis(void) @@ -58,27 +87,52 @@ return mtime_since_now(&genesis); } -int in_ramp_time(struct thread_data *td) +uint64_t utime_since_genesis(void) +{ + return utime_since_now(&genesis); +} + +bool in_ramp_time(struct thread_data *td) { return td->o.ramp_time && !td->ramp_time_over; } -int ramp_time_over(struct thread_data *td) +static bool parent_update_ramp(struct thread_data *td) { - struct timeval tv; + struct thread_data *parent = td->parent; + + if (!parent || parent->ramp_time_over) + return false; + + reset_all_stats(parent); + parent->ramp_time_over = true; + td_set_runstate(parent, TD_RAMP); + return true; +} +bool ramp_time_over(struct thread_data *td) +{ if (!td->o.ramp_time || td->ramp_time_over) - return 1; + return true; - fio_gettime(&tv, NULL); - if (mtime_since(&td->epoch, &tv) >= td->o.ramp_time * 1000) { - td->ramp_time_over = 1; + if (utime_since_now(&td->epoch) >= td->o.ramp_time) { + td->ramp_time_over = true; reset_all_stats(td); + reset_io_stats(td); td_set_runstate(td, TD_RAMP); - return 1; + + /* + * If we have a parent, the parent isn't doing IO. Hence + * the parent never enters do_io(), which will switch us + * from RAMP -> RUNNING. 
Do this manually here. + */ + if (parent_update_ramp(td)) + td_set_runstate(td, TD_RUNNING); + + return true; } - return 0; + return false; } void fio_time_init(void) @@ -91,8 +145,7 @@ * Check the granularity of the nanosleep function */ for (i = 0; i < 10; i++) { - struct timeval tv; - struct timespec ts; + struct timespec tv, ts; unsigned long elapsed; fio_gettime(&tv, NULL); @@ -112,7 +165,18 @@ fio_gettime(&genesis, NULL); } -void fill_start_time(struct timeval *t) +void set_epoch_time(struct thread_data *td, int log_unix_epoch) +{ + fio_gettime(&td->epoch, NULL); + if (log_unix_epoch) { + struct timeval tv; + gettimeofday(&tv, NULL); + td->unix_epoch = (unsigned long long)(tv.tv_sec) * 1000 + + (unsigned long long)(tv.tv_usec) / 1000; + } +} + +void fill_start_time(struct timespec *t) { memcpy(t, &genesis, sizeof(genesis)); } diff -Nru fio-2.1.3/time.h fio-3.16/time.h --- fio-2.1.3/time.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/time.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -#ifndef FIO_TIME_H -#define FIO_TIME_H - -extern uint64_t utime_since(struct timeval *, struct timeval *); -extern uint64_t utime_since_now(struct timeval *); -extern uint64_t mtime_since(struct timeval *, struct timeval *); -extern uint64_t mtime_since_now(struct timeval *); -extern uint64_t time_since_now(struct timeval *); -extern uint64_t mtime_since_genesis(void); -extern void usec_spin(unsigned int); -extern void usec_sleep(struct thread_data *, unsigned long); -extern void fill_start_time(struct timeval *); -extern void set_genesis_time(void); -extern int ramp_time_over(struct thread_data *); -extern int in_ramp_time(struct thread_data *); -extern void fio_time_init(void); - -#endif diff -Nru fio-2.1.3/tools/fio_generate_plots fio-3.16/tools/fio_generate_plots --- fio-2.1.3/tools/fio_generate_plots 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/tools/fio_generate_plots 2019-09-20 01:01:52.000000000 +0000 @@ -7,7 +7,7 @@ # # The script uses the files 
generated by FIO to create nice graphs in the # SVG format. This output format is supported by most modern browsers and -# allows resolution independant graphs to be generated. +# allows resolution independent graphs to be generated. # # This script supports GNUPLOT 4.4 and higher. # @@ -93,20 +93,26 @@ i=0 - for x in *_"$FILETYPE".log + for x in *_"$FILETYPE".log *_"$FILETYPE".*.log do - i=$((i+1)) - PT=$(echo $x | sed s/_"$FILETYPE".log//g) - if [ ! -z "$PLOT_LINE" ] - then - PLOT_LINE=$PLOT_LINE", " - fi + if [ -e "$x" ]; then + i=$((i+1)) + PT=$(echo $x | sed 's/\(.*\)_'$FILETYPE'\(.*\).log$/\1\2/') + if [ ! -z "$PLOT_LINE" ] + then + PLOT_LINE=$PLOT_LINE", " + fi - DEPTH=$(echo $PT | cut -d "-" -f 4) - PLOT_LINE=$PLOT_LINE"'$x' using (\$1/1000):(\$2/$SCALE) title \"Queue depth $DEPTH\" with lines ls $i" - + DEPTH=$(echo $PT | cut -d "-" -f 4) + PLOT_LINE=$PLOT_LINE"'$x' using (\$1/1000):(\$2/$SCALE) title \"Queue depth $DEPTH\" with lines ls $i" + fi done + if [ $i -eq 0 ]; then + echo "No log files found" + exit 1 + fi + OUTPUT="set output \"$TITLE-$FILETYPE.svg\" " echo " $PLOT_TITLE ; $YAXIS ; $DEFAULT_OPTS ; show style lines ; $OUTPUT ; plot " $PLOT_LINE | $GNUPLOT - diff -Nru fio-2.1.3/tools/fio_generate_plots.1 fio-3.16/tools/fio_generate_plots.1 --- fio-2.1.3/tools/fio_generate_plots.1 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/tools/fio_generate_plots.1 2019-09-20 01:01:52.000000000 +0000 @@ -38,8 +38,7 @@ The script takes the title of the plot as only argument. It does not offer any additional options. .SH AUTHOR -fio_generate_plots was written by Jens Axboe , -now Jens Axboe . +fio_generate_plots was written by Jens Axboe .PP This manual page was written by Martin Steigerwald , for the Debian project (but may be used by others). 
diff -Nru fio-2.1.3/tools/fio_jsonplus_clat2csv fio-3.16/tools/fio_jsonplus_clat2csv --- fio-2.1.3/tools/fio_jsonplus_clat2csv 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/tools/fio_jsonplus_clat2csv 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,177 @@ +#!/usr/bin/python2.7 +# Note: this script is python2 and python3 compatible. +# +# fio_jsonplus_clat2csv +# +# This script converts fio's json+ completion latency data to CSV format. +# +# For example: +# +# Run the following fio jobs: +# ../fio --output=fio-jsonplus.output --output-format=json+ --name=test1 +# --ioengine=null --time_based --runtime=5s --size=1G --rw=randrw +# --name=test2 --ioengine=null --time_based --runtime=3s --size=1G +# --rw=read --name=test3 --ioengine=null --time_based --runtime=4s +# --size=8G --rw=write +# +# Then run: +# fio_jsonplus_clat2csv fio-jsonplus.output fio-latency.csv +# +# You will end up with the following 3 files +# +# -rw-r--r-- 1 root root 6467 Jun 27 14:57 fio-latency_job0.csv +# -rw-r--r-- 1 root root 3985 Jun 27 14:57 fio-latency_job1.csv +# -rw-r--r-- 1 root root 4490 Jun 27 14:57 fio-latency_job2.csv +# +# fio-latency_job0.csv will look something like: +# +# clat_nsec, read_count, read_cumulative, read_percentile, write_count, +# write_cumulative, write_percentile, trim_count, trim_cumulative, +# trim_percentile, +# 25, 1, 1, 1.50870705013e-07, , , , , , , +# 26, 12, 13, 1.96131916517e-06, 947, 947, 0.000142955890032, , , , +# 27, 843677, 843690, 0.127288105112, 838347, 839294, 0.126696959629, , , , +# 28, 1877982, 2721672, 0.410620573454, 1870189, 2709483, 0.409014312345, , , , +# 29, 4471, 2726143, 0.411295116376, 7718, 2717201, 0.410179395301, , , , +# 30, 2142885, 4869028, 0.734593687087, 2138164, 4855365, 0.732949340025, , , , +# ... 
+# 2544, , , , 2, 6624404, 0.999997433738, , , , +# 2576, 3, 6628178, 0.99999788781, 4, 6624408, 0.999998037564, , , , +# 2608, 4, 6628182, 0.999998491293, 4, 6624412, 0.999998641391, , , , +# 2640, 3, 6628185, 0.999998943905, 2, 6624414, 0.999998943304, , , , +# 2672, 1, 6628186, 0.999999094776, 3, 6624417, 0.999999396174, , , , +# 2736, 1, 6628187, 0.999999245646, 1, 6624418, 0.99999954713, , , , +# 2768, 2, 6628189, 0.999999547388, 1, 6624419, 0.999999698087, , , , +# 2800, , , , 1, 6624420, 0.999999849043, , , , +# 2832, 1, 6628190, 0.999999698259, , , , , , , +# 4192, 1, 6628191, 0.999999849129, , , , , , , +# 5792, , , , 1, 6624421, 1.0, , , , +# 10304, 1, 6628192, 1.0, , , , , , , +# +# The first line says that you had one read IO with 25ns clat, +# the cumulative number of read IOs at or below 25ns is 1, and +# 25ns is the 0.00001509th percentile for read latency +# +# The job had 2 write IOs complete in 2544ns, +# 6624404 write IOs completed in 2544ns or less, +# and this represents the 99.99974th percentile for write latency +# +# The last line says that one read IO had 10304ns clat, +# 6628192 read IOs had 10304ns or shorter clat, and +# 10304ns is the 100th percentile for read latency +# + +from __future__ import absolute_import +from __future__ import print_function +import os +import json +import argparse +import six +from six.moves import range + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('source', + help='fio json+ output file containing completion ' + 'latency data') + parser.add_argument('dest', + help='destination file stub for latency data in CSV ' + 'format. 
job number will be appended to filename') + args = parser.parse_args() + + return args + + +def percentile(idx, run_total): + total = run_total[len(run_total)-1] + if total == 0: + return 0 + + return float(run_total[idx]) / total + + +def more_lines(indices, bins): + for key, value in six.iteritems(indices): + if value < len(bins[key]): + return True + + return False + + +def main(): + args = parse_args() + + with open(args.source, 'r') as source: + jsondata = json.loads(source.read()) + + for jobnum in range(0, len(jsondata['jobs'])): + bins = {} + run_total = {} + ddir_set = set(['read', 'write', 'trim']) + + prev_ddir = None + for ddir in ddir_set: + if 'bins' in jsondata['jobs'][jobnum][ddir]['clat_ns']: + bins_loc = 'clat_ns' + elif 'bins' in jsondata['jobs'][jobnum][ddir]['lat_ns']: + bins_loc = 'lat_ns' + else: + raise RuntimeError("Latency bins not found. " + "Are you sure you are using json+ output?") + + bins[ddir] = [[int(key), value] for key, value in + six.iteritems(jsondata['jobs'][jobnum][ddir][bins_loc] + ['bins'])] + bins[ddir] = sorted(bins[ddir], key=lambda bin: bin[0]) + + run_total[ddir] = [0 for x in range(0, len(bins[ddir]))] + if len(bins[ddir]) > 0: + run_total[ddir][0] = bins[ddir][0][1] + for x in range(1, len(bins[ddir])): + run_total[ddir][x] = run_total[ddir][x-1] + \ + bins[ddir][x][1] + + stub, ext = os.path.splitext(args.dest) + outfile = stub + '_job' + str(jobnum) + ext + + with open(outfile, 'w') as output: + output.write("{0}ec, ".format(bins_loc)) + ddir_list = list(ddir_set) + for ddir in ddir_list: + output.write("{0}_count, {0}_cumulative, {0}_percentile, ". 
+ format(ddir)) + output.write("\n") + +# +# Have a counter for each ddir +# In each round, pick the shortest remaining duration +# and output a line with any values for that duration +# + indices = {x: 0 for x in ddir_list} + while more_lines(indices, bins): + min_lat = 17112760320 + for ddir in ddir_list: + if indices[ddir] < len(bins[ddir]): + min_lat = min(bins[ddir][indices[ddir]][0], min_lat) + + output.write("{0}, ".format(min_lat)) + + for ddir in ddir_list: + if indices[ddir] < len(bins[ddir]) and \ + min_lat == bins[ddir][indices[ddir]][0]: + count = bins[ddir][indices[ddir]][1] + cumulative = run_total[ddir][indices[ddir]] + ptile = percentile(indices[ddir], run_total[ddir]) + output.write("{0}, {1}, {2}, ".format(count, + cumulative, ptile)) + indices[ddir] += 1 + else: + output.write(", , , ") + output.write("\n") + + print("{0} generated".format(outfile)) + + +if __name__ == '__main__': + main() diff -Nru fio-2.1.3/tools/fiologparser.py fio-3.16/tools/fiologparser.py --- fio-2.1.3/tools/fiologparser.py 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/tools/fiologparser.py 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,223 @@ +#!/usr/bin/python2.7 +# Note: this script is python2 and python 3 compatible. +# +# fiologparser.py +# +# This tool lets you parse multiple fio log files and look at interaval +# statistics even when samples are non-uniform. For instance: +# +# fiologparser.py -s *bw* +# +# to see per-interval sums for all bandwidth logs or: +# +# fiologparser.py -a *clat* +# +# to see per-interval average completion latency. 
+ +from __future__ import absolute_import +from __future__ import print_function +import argparse +import math + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--interval', required=False, type=int, default=1000, help='interval of time in seconds.') + parser.add_argument('-d', '--divisor', required=False, type=int, default=1, help='divide the results by this value.') + parser.add_argument('-f', '--full', dest='full', action='store_true', default=False, help='print full output.') + parser.add_argument('-A', '--all', dest='allstats', action='store_true', default=False, + help='print all stats for each interval.') + parser.add_argument('-a', '--average', dest='average', action='store_true', default=False, help='print the average for each interval.') + parser.add_argument('-s', '--sum', dest='sum', action='store_true', default=False, help='print the sum for each interval.') + parser.add_argument("FILE", help="collectl log output files to parse", nargs="+") + args = parser.parse_args() + + return args + +def get_ftime(series): + ftime = 0 + for ts in series: + if ftime == 0 or ts.last.end < ftime: + ftime = ts.last.end + return ftime + +def print_full(ctx, series): + ftime = get_ftime(series) + start = 0 + end = ctx.interval + + while (start < ftime): + end = ftime if ftime < end else end + results = [ts.get_value(start, end) for ts in series] + print("%s, %s" % (end, ', '.join(["%0.3f" % i for i in results]))) + start += ctx.interval + end += ctx.interval + +def print_sums(ctx, series): + ftime = get_ftime(series) + start = 0 + end = ctx.interval + + while (start < ftime): + end = ftime if ftime < end else end + results = [ts.get_value(start, end) for ts in series] + print("%s, %0.3f" % (end, sum(results))) + start += ctx.interval + end += ctx.interval + +def print_averages(ctx, series): + ftime = get_ftime(series) + start = 0 + end = ctx.interval + + while (start < ftime): + end = ftime if ftime < end else end + results = 
[ts.get_value(start, end) for ts in series] + print("%s, %0.3f" % (end, float(sum(results))/len(results))) + start += ctx.interval + end += ctx.interval + +# FIXME: this routine is computationally inefficient +# and has O(N^2) behavior +# it would be better to make one pass through samples +# to segment them into a series of time intervals, and +# then compute stats on each time interval instead. +# to debug this routine, use +# # sort -n -t ',' -k 2 small.log +# on your input. + +def my_extend( vlist, val ): + vlist.extend(val) + return vlist + +array_collapser = lambda vlist, val: my_extend(vlist, val) + +def print_all_stats(ctx, series): + ftime = get_ftime(series) + start = 0 + end = ctx.interval + print('start-time, samples, min, avg, median, 90%, 95%, 99%, max') + while (start < ftime): # for each time interval + end = ftime if ftime < end else end + sample_arrays = [ s.get_samples(start, end) for s in series ] + samplevalue_arrays = [] + for sample_array in sample_arrays: + samplevalue_arrays.append( + [ sample.value for sample in sample_array ] ) + # collapse list of lists of sample values into list of sample values + samplevalues = reduce( array_collapser, samplevalue_arrays, [] ) + # compute all stats and print them + mymin = min(samplevalues) + myavg = sum(samplevalues) / float(len(samplevalues)) + mymedian = median(samplevalues) + my90th = percentile(samplevalues, 0.90) + my95th = percentile(samplevalues, 0.95) + my99th = percentile(samplevalues, 0.99) + mymax = max(samplevalues) + print( '%f, %d, %f, %f, %f, %f, %f, %f, %f' % ( + start, len(samplevalues), + mymin, myavg, mymedian, my90th, my95th, my99th, mymax)) + + # advance to next interval + start += ctx.interval + end += ctx.interval + +def median(values): + s=sorted(values) + return float(s[(len(s)-1)/2]+s[(len(s)/2)])/2 + +def percentile(values, p): + s = sorted(values) + k = (len(s)-1) * p + f = math.floor(k) + c = math.ceil(k) + if f == c: + return s[int(k)] + return (s[int(f)] * (c-k)) + 
(s[int(c)] * (k-f)) + +def print_default(ctx, series): + ftime = get_ftime(series) + start = 0 + end = ctx.interval + averages = [] + weights = [] + + while (start < ftime): + end = ftime if ftime < end else end + results = [ts.get_value(start, end) for ts in series] + averages.append(sum(results)) + weights.append(end-start) + start += ctx.interval + end += ctx.interval + + total = 0 + for i in range(0, len(averages)): + total += averages[i]*weights[i] + print('%0.3f' % (total/sum(weights))) + +class TimeSeries(object): + def __init__(self, ctx, fn): + self.ctx = ctx + self.last = None + self.samples = [] + self.read_data(fn) + + def read_data(self, fn): + f = open(fn, 'r') + p_time = 0 + for line in f: + (time, value, foo, bar) = line.rstrip('\r\n').rsplit(', ') + self.add_sample(p_time, int(time), int(value)) + p_time = int(time) + + def add_sample(self, start, end, value): + sample = Sample(ctx, start, end, value) + if not self.last or self.last.end < end: + self.last = sample + self.samples.append(sample) + + def get_samples(self, start, end): + sample_list = [] + for s in self.samples: + if s.start >= start and s.end <= end: + sample_list.append(s) + return sample_list + + def get_value(self, start, end): + value = 0 + for sample in self.samples: + value += sample.get_contribution(start, end) + return value + +class Sample(object): + def __init__(self, ctx, start, end, value): + self.ctx = ctx + self.start = start + self.end = end + self.value = value + + def get_contribution(self, start, end): + # short circuit if not within the bound + if (end < self.start or start > self.end): + return 0 + + sbound = self.start if start < self.start else start + ebound = self.end if end > self.end else end + ratio = float(ebound-sbound) / (end-start) + return self.value*ratio/ctx.divisor + + +if __name__ == '__main__': + ctx = parse_args() + series = [] + for fn in ctx.FILE: + series.append(TimeSeries(ctx, fn)) + if ctx.sum: + print_sums(ctx, series) + elif ctx.average: + 
print_averages(ctx, series) + elif ctx.full: + print_full(ctx, series) + elif ctx.allstats: + print_all_stats(ctx, series) + else: + print_default(ctx, series) diff -Nru fio-2.1.3/tools/fio.service fio-3.16/tools/fio.service --- fio-2.1.3/tools/fio.service 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/tools/fio.service 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,10 @@ +[Unit] +Description=Flexible I/O tester server +After=network.target + +[Service] +Type=simple +ExecStart=/usr/bin/fio --server + +[Install] +WantedBy=multi-user.target diff -Nru fio-2.1.3/tools/genfio fio-3.16/tools/genfio --- fio-2.1.3/tools/genfio 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/tools/genfio 2019-09-20 01:01:52.000000000 +0000 @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/bash # # Copyright (C) 2013 eNovance SAS # Author: Erwan Velu @@ -17,7 +17,7 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. BLK_SIZE= BLOCK_SIZE=4k @@ -54,6 +54,8 @@ Default is $IODEPTH -d disk1[,disk2,disk3,..] : Run the tests on the selected disks Separated each disk with a comma +-z filesize : Specify the working file size, if you are passing filepaths to -d + Disabled by default -r seconds : Time in seconds per benchmark 0 means till the end of the device Default is $RUNTIME seconds @@ -203,7 +205,7 @@ } parse_cmdline() { -while getopts "hacpsd:b:r:m:x:D:A:B:" opt; do +while getopts "hacpsd:b:r:m:x:z:D:A:B:" opt; do case $opt in h) show_help @@ -260,6 +262,10 @@ A) echo "exec_postrun=$OPTARG" >> $TEMPLATE ;; + z) + FSIZE=$OPTARG + echo "size=$FSIZE" >> $TEMPLATE + ;; \?) 
echo "Invalid option: -$OPTARG" >&2 ;; diff -Nru fio-2.1.3/tools/hist/fio-histo-log-pctiles.py fio-3.16/tools/hist/fio-histo-log-pctiles.py --- fio-2.1.3/tools/hist/fio-histo-log-pctiles.py 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/tools/hist/fio-histo-log-pctiles.py 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,774 @@ +#!/usr/bin/env python + +# module to parse fio histogram log files, not using pandas +# runs in python v2 or v3 +# to get help with the CLI: $ python fio-histo-log-pctiles.py -h +# this can be run standalone as a script but is callable +# assumes all threads run for same time duration +# assumes all threads are doing the same thing for the entire run + +# percentiles: +# 0 - min latency +# 50 - median +# 100 - max latency + +# TO-DO: +# separate read and write stats for randrw mixed workload +# report average latency if needed +# prove that it works (partially done with unit tests) + +# to run unit tests, set UNITTEST environment variable to anything +# if you do this, don't pass normal CLI parameters to it +# otherwise it runs the CLI + +import sys, os, math, copy, time +from copy import deepcopy +import argparse + +unittest2_imported = True +try: + import unittest2 +except ImportError: + unittest2_imported = False + +msec_per_sec = 1000 +nsec_per_usec = 1000 +direction_read = 0 +direction_write = 1 + +class FioHistoLogExc(Exception): + pass + +# if there is an error, print message, and exit with error status + +def myabort(msg): + print('ERROR: ' + msg) + sys.exit(1) + +# convert histogram log file into a list of +# (time_ms, direction, bsz, buckets) tuples where +# - time_ms is the time in msec at which the log record was written +# - direction is 0 (read) or 1 (write) +# - bsz is block size (not used) +# - buckets is a CSV list of counters that make up the histogram +# caller decides if the expected number of counters are present + + +def exception_suffix( record_num, pathname ): + return 'in histogram record %d file %s' % (record_num+1, 
pathname) + +# log file parser raises FioHistoLogExc exceptions +# it returns histogram buckets in whatever unit fio uses +# inputs: +# logfn: pathname to histogram log file +# buckets_per_interval - how many histogram buckets to expect +# log_hist_msec - if not None, expected time interval between histogram records + +def parse_hist_file(logfn, buckets_per_interval, log_hist_msec): + previous_ts_ms_read = -1 + previous_ts_ms_write = -1 + + with open(logfn, 'r') as f: + records = [ l.strip() for l in f.readlines() ] + intervals = [] + last_time_ms = -1 + last_direction = -1 + for k, r in enumerate(records): + if r == '': + continue + tokens = r.split(',') + try: + int_tokens = [ int(t) for t in tokens ] + except ValueError as e: + raise FioHistoLogExc('non-integer value %s' % exception_suffix(k+1, logfn)) + + neg_ints = list(filter( lambda tk : tk < 0, int_tokens )) + if len(neg_ints) > 0: + raise FioHistoLogExc('negative integer value %s' % exception_suffix(k+1, logfn)) + + if len(int_tokens) < 3: + raise FioHistoLogExc('too few numbers %s' % exception_suffix(k+1, logfn)) + + direction = int_tokens[1] + if direction != direction_read and direction != direction_write: + raise FioHistoLogExc('invalid I/O direction %s' % exception_suffix(k+1, logfn)) + + time_ms = int_tokens[0] + if direction == direction_read: + if time_ms < previous_ts_ms_read: + raise FioHistoLogExc('read timestamp in column 1 decreased %s' % exception_suffix(k+1, logfn)) + previous_ts_ms_read = time_ms + elif direction == direction_write: + if time_ms < previous_ts_ms_write: + raise FioHistoLogExc('write timestamp in column 1 decreased %s' % exception_suffix(k+1, logfn)) + previous_ts_ms_write = time_ms + + bsz = int_tokens[2] + if bsz > (1 << 24): + raise FioHistoLogExc('block size too large %s' % exception_suffix(k+1, logfn)) + + buckets = int_tokens[3:] + if len(buckets) != buckets_per_interval: + raise FioHistoLogExc('%d buckets per interval but %d expected in %s' % + (len(buckets), 
buckets_per_interval, exception_suffix(k+1, logfn))) + + # hack to filter out records with the same timestamp + # we should not have to do this if fio logs histogram records correctly + + if time_ms == last_time_ms and direction == last_direction: + continue + last_time_ms = time_ms + last_direction = direction + + intervals.append((time_ms, direction, bsz, buckets)) + if len(intervals) == 0: + raise FioHistoLogExc('no records in %s' % logfn) + (first_timestamp, _, _, _) = intervals[0] + if first_timestamp < 1000000: + start_time = 0 # assume log_unix_epoch = 0 + elif log_hist_msec != None: + start_time = first_timestamp - log_hist_msec + elif len(intervals) > 1: + (second_timestamp, _, _, _) = intervals[1] + start_time = first_timestamp - (second_timestamp - first_timestamp) + else: + raise FioHistoLogExc('no way to estimate test start time') + (end_timestamp, _, _, _) = intervals[-1] + + return (intervals, start_time, end_timestamp) + + +# compute time range for each bucket index in histogram record +# see comments in https://github.com/axboe/fio/blob/master/stat.h +# for description of bucket groups and buckets +# fio v3 bucket ranges are in nanosec (since response times are measured in nanosec) +# but we convert fio v3 nanosecs to floating-point microseconds + +def time_ranges(groups, counters_per_group, fio_version=3): + bucket_width = 1 + bucket_base = 0 + bucket_intervals = [] + for g in range(0, groups): + for b in range(0, counters_per_group): + rmin = float(bucket_base) + rmax = rmin + bucket_width + if fio_version == 3: + rmin /= nsec_per_usec + rmax /= nsec_per_usec + bucket_intervals.append( [rmin, rmax] ) + bucket_base += bucket_width + if g != 0: + bucket_width *= 2 + return bucket_intervals + + +# compute number of time quantum intervals in the test + +def get_time_intervals(time_quantum, min_timestamp_ms, max_timestamp_ms): + # round down to nearest second + max_timestamp = max_timestamp_ms // msec_per_sec + min_timestamp = min_timestamp_ms // 
msec_per_sec + # round up to nearest whole multiple of time_quantum + time_interval_count = ((max_timestamp - min_timestamp) + time_quantum) // time_quantum + end_time = min_timestamp + (time_interval_count * time_quantum) + return (end_time, time_interval_count) + +# align raw histogram log data to time quantum so +# we can then combine histograms from different threads with addition +# for randrw workload we count both reads and writes in same output bucket +# but we separate reads and writes for purposes of calculating +# end time for histogram record. +# this requires us to weight a raw histogram bucket by the +# fraction of time quantum that the bucket overlaps the current +# time quantum interval +# for example, if we have a bucket with 515 samples for time interval +# [ 1010, 2014 ] msec since start of test, and time quantum is 1 sec, then +# for time quantum interval [ 1000, 2000 ] msec, the overlap is +# (2000 - 1010) / (2000 - 1000) = 0.99 +# so the contribution of this bucket to this time quantum is +# 515 x 0.99 = 509.85 + +def align_histo_log(raw_histogram_log, time_quantum, bucket_count, min_timestamp_ms, max_timestamp_ms): + + # slice up test time int intervals of time_quantum seconds + + (end_time, time_interval_count) = get_time_intervals(time_quantum, min_timestamp_ms, max_timestamp_ms) + time_qtm_ms = time_quantum * msec_per_sec + end_time_ms = end_time * msec_per_sec + aligned_intervals = [] + for j in range(0, time_interval_count): + aligned_intervals.append(( + min_timestamp_ms + (j * time_qtm_ms), + [ 0.0 for j in range(0, bucket_count) ] )) + + log_record_count = len(raw_histogram_log) + for k, record in enumerate(raw_histogram_log): + + # find next record with same direction to get end-time + # have to avoid going past end of array + # for fio randrw workload, + # we have read and write records on same time interval + # sometimes read and write records are in opposite order + # assertion checks that next read/write record + # can be 
separated by at most 2 other records + + (time_msec, direction, sz, interval_buckets) = record + if k+1 < log_record_count: + (time_msec_end, direction2, _, _) = raw_histogram_log[k+1] + if direction2 != direction: + if k+2 < log_record_count: + (time_msec_end, direction2, _, _) = raw_histogram_log[k+2] + if direction2 != direction: + if k+3 < log_record_count: + (time_msec_end, direction2, _, _) = raw_histogram_log[k+3] + assert direction2 == direction + else: + time_msec_end = end_time_ms + else: + time_msec_end = end_time_ms + else: + time_msec_end = end_time_ms + + # calculate first quantum that overlaps this histogram record + + offset_from_min_ts = time_msec - min_timestamp_ms + qtm_start_ms = min_timestamp_ms + (offset_from_min_ts // time_qtm_ms) * time_qtm_ms + qtm_end_ms = min_timestamp_ms + ((offset_from_min_ts + time_qtm_ms) // time_qtm_ms) * time_qtm_ms + qtm_index = offset_from_min_ts // time_qtm_ms + + # for each quantum that overlaps this histogram record's time interval + + while qtm_start_ms < time_msec_end: # while quantum overlaps record + + # some histogram logs may be longer than others + + if len(aligned_intervals) <= qtm_index: + break + + # calculate fraction of time that this quantum + # overlaps histogram record's time interval + + overlap_start = max(qtm_start_ms, time_msec) + overlap_end = min(qtm_end_ms, time_msec_end) + weight = float(overlap_end - overlap_start) + weight /= (time_msec_end - time_msec) + (_,aligned_histogram) = aligned_intervals[qtm_index] + for bx, b in enumerate(interval_buckets): + weighted_bucket = weight * b + aligned_histogram[bx] += weighted_bucket + + # advance to the next time quantum + + qtm_start_ms += time_qtm_ms + qtm_end_ms += time_qtm_ms + qtm_index += 1 + + return aligned_intervals + +# add histogram in "source" to histogram in "target" +# it is assumed that the 2 histograms are precisely time-aligned + +def add_to_histo_from( target, source ): + for b in range(0, len(source)): + target[b] += source[b] 
+ + +# calculate total samples in the histogram buckets + +def get_samples(buckets): + return reduce( lambda x,y: x + y, buckets) + + +# compute percentiles +# inputs: +# buckets: histogram bucket array +# wanted: list of floating-pt percentiles to calculate +# time_ranges: [tmin,tmax) time interval for each bucket +# returns None if no I/O reported. +# otherwise we would be dividing by zero +# think of buckets as probability distribution function +# and this loop is integrating to get cumulative distribution function + +def get_pctiles(buckets, wanted, time_ranges): + + # get total of IO requests done + total_ios = 0 + for io_count in buckets: + total_ios += io_count + + # don't return percentiles if no I/O was done during interval + if total_ios == 0.0: + return None + + pctile_count = len(wanted) + + # results returned as dictionary keyed by percentile + pctile_result = {} + + # index of next percentile in list + pctile_index = 0 + + # next percentile + next_pctile = wanted[pctile_index] + + # no one is interested in percentiles bigger than this but not 100.0 + # this prevents floating-point error from preventing loop exit + almost_100 = 99.9999 + + # pct is the percentile corresponding to + # all I/O requests up through bucket b + pct = 0.0 + total_so_far = 0 + for b, io_count in enumerate(buckets): + if io_count == 0: + continue + total_so_far += io_count + # last_pct_lt is the percentile corresponding to + # all I/O requests up to, but not including, bucket b + last_pct = pct + pct = 100.0 * float(total_so_far) / total_ios + # a single bucket could satisfy multiple pctiles + # so this must be a while loop + # for 100-percentile (max latency) case, no bucket exceeds it + # so we must stop there. 
+ while ((next_pctile == 100.0 and pct >= almost_100) or + (next_pctile < 100.0 and pct > next_pctile)): + # interpolate between min and max time for bucket time interval + # we keep the time_ranges access inside this loop, + # even though it could be above the loop, + # because in many cases we will not be even entering + # the loop so we optimize out these accesses + range_max_time = time_ranges[b][1] + range_min_time = time_ranges[b][0] + offset_frac = (next_pctile - last_pct)/(pct - last_pct) + interpolation = range_min_time + (offset_frac*(range_max_time - range_min_time)) + pctile_result[next_pctile] = interpolation + pctile_index += 1 + if pctile_index == pctile_count: + break + next_pctile = wanted[pctile_index] + if pctile_index == pctile_count: + break + assert pctile_index == pctile_count + return pctile_result + + +# this is really the main program + +def compute_percentiles_from_logs(): + parser = argparse.ArgumentParser() + parser.add_argument("--fio-version", dest="fio_version", + default="3", choices=[2,3], type=int, + help="fio version (default=3)") + parser.add_argument("--bucket-groups", dest="bucket_groups", default="29", type=int, + help="fio histogram bucket groups (default=29)") + parser.add_argument("--bucket-bits", dest="bucket_bits", + default="6", type=int, + help="fio histogram buckets-per-group bits (default=6 means 64 buckets/group)") + parser.add_argument("--percentiles", dest="pctiles_wanted", + default=[ 0., 50., 95., 99., 100.], type=float, nargs='+', + help="fio histogram buckets-per-group bits (default=6 means 64 buckets/group)") + parser.add_argument("--time-quantum", dest="time_quantum", + default="1", type=int, + help="time quantum in seconds (default=1)") + parser.add_argument("--log-hist-msec", dest="log_hist_msec", + type=int, default=None, + help="log_hist_msec value in fio job file") + parser.add_argument("--output-unit", dest="output_unit", + default="usec", type=str, + help="Latency percentile output unit: 
msec|usec|nsec (default usec)") + parser.add_argument("file_list", nargs='+', + help='list of files, preceded by " -- " if necessary') + args = parser.parse_args() + + # default changes based on fio version + if args.fio_version == 2: + args.bucket_groups = 19 + + # print parameters + + print('fio version = %d' % args.fio_version) + print('bucket groups = %d' % args.bucket_groups) + print('bucket bits = %d' % args.bucket_bits) + print('time quantum = %d sec' % args.time_quantum) + print('percentiles = %s' % ','.join([ str(p) for p in args.pctiles_wanted ])) + buckets_per_group = 1 << args.bucket_bits + print('buckets per group = %d' % buckets_per_group) + buckets_per_interval = buckets_per_group * args.bucket_groups + print('buckets per interval = %d ' % buckets_per_interval) + bucket_index_range = range(0, buckets_per_interval) + if args.log_hist_msec != None: + print('log_hist_msec = %d' % args.log_hist_msec) + if args.time_quantum == 0: + print('ERROR: time-quantum must be a positive number of seconds') + print('output unit = ' + args.output_unit) + if args.output_unit == 'msec': + time_divisor = float(msec_per_sec) + elif args.output_unit == 'usec': + time_divisor = 1.0 + + # construct template for each histogram bucket array with buckets all zeroes + # we just copy this for each new histogram + + zeroed_buckets = [ 0.0 for r in bucket_index_range ] + + # calculate response time interval associated with each histogram bucket + + bucket_times = time_ranges(args.bucket_groups, buckets_per_group, fio_version=args.fio_version) + + # parse the histogram logs + # assumption: each bucket has a monotonically increasing time + # assumption: time ranges do not overlap for a single thread's records + # (exception: if randrw workload, then there is a read and a write + # record for the same time interval) + + test_start_time = 0 + test_end_time = 1.0e18 + hist_files = {} + for fn in args.file_list: + try: + (hist_files[fn], log_start_time, log_end_time) = 
parse_hist_file(fn, buckets_per_interval, args.log_hist_msec) + except FioHistoLogExc as e: + myabort(str(e)) + # we consider the test started when all threads have started logging + test_start_time = max(test_start_time, log_start_time) + # we consider the test over when one of the logs has ended + test_end_time = min(test_end_time, log_end_time) + + if test_start_time >= test_end_time: + raise FioHistoLogExc('no time interval when all threads logs overlapped') + if test_start_time > 0: + print('all threads running as of unix epoch time %d = %s' % ( + test_start_time/float(msec_per_sec), + time.ctime(test_start_time/1000.0))) + + (end_time, time_interval_count) = get_time_intervals(args.time_quantum, test_start_time, test_end_time) + all_threads_histograms = [ ((j*args.time_quantum*msec_per_sec), deepcopy(zeroed_buckets)) + for j in range(0, time_interval_count) ] + + for logfn in hist_files.keys(): + aligned_per_thread = align_histo_log(hist_files[logfn], + args.time_quantum, + buckets_per_interval, + test_start_time, + test_end_time) + for t in range(0, time_interval_count): + (_, all_threads_histo_t) = all_threads_histograms[t] + (_, log_histo_t) = aligned_per_thread[t] + add_to_histo_from( all_threads_histo_t, log_histo_t ) + + # calculate percentiles across aggregate histogram for all threads + # print CSV header just like fiologparser_hist does + + header = 'msec-since-start, samples, ' + for p in args.pctiles_wanted: + if p == 0.: + next_pctile_header = 'min' + elif p == 100.: + next_pctile_header = 'max' + elif p == 50.: + next_pctile_header = 'median' + else: + next_pctile_header = '%3.1f' % p + header += '%s, ' % next_pctile_header + + print('time (millisec), percentiles in increasing order with values in ' + args.output_unit) + print(header) + + for (t_msec, all_threads_histo_t) in all_threads_histograms: + samples = get_samples(all_threads_histo_t) + record = '%8d, %8d, ' % (t_msec, samples) + pct = get_pctiles(all_threads_histo_t, args.pctiles_wanted, 
bucket_times) + if not pct: + for w in args.pctiles_wanted: + record += ', ' + else: + pct_keys = [ k for k in pct.keys() ] + pct_values = [ str(pct[wanted]/time_divisor) for wanted in sorted(pct_keys) ] + record += ', '.join(pct_values) + print(record) + + + +#end of MAIN PROGRAM + + +##### below are unit tests ############## + +if unittest2_imported: + import tempfile, shutil + from os.path import join + should_not_get_here = False + + class Test(unittest2.TestCase): + tempdir = None + + # a little less typing please + def A(self, boolean_val): + self.assertTrue(boolean_val) + + # initialize unit test environment + + @classmethod + def setUpClass(cls): + d = tempfile.mkdtemp() + Test.tempdir = d + + # remove anything left by unit test environment + # unless user sets UNITTEST_LEAVE_FILES environment variable + + @classmethod + def tearDownClass(cls): + if not os.getenv("UNITTEST_LEAVE_FILES"): + shutil.rmtree(cls.tempdir) + + def setUp(self): + self.fn = join(Test.tempdir, self.id()) + + def test_a_add_histos(self): + a = [ 1.0, 2.0 ] + b = [ 1.5, 2.5 ] + add_to_histo_from( a, b ) + self.A(a == [2.5, 4.5]) + self.A(b == [1.5, 2.5]) + + def test_b1_parse_log(self): + with open(self.fn, 'w') as f: + f.write('1234, 0, 4096, 1, 2, 3, 4\n') + f.write('5678,1,16384,5,6,7,8 \n') + (raw_histo_log, min_timestamp, max_timestamp) = parse_hist_file(self.fn, 4, None) # 4 buckets per interval + # if not log_unix_epoch=1, then min_timestamp will always be set to zero + self.A(len(raw_histo_log) == 2 and min_timestamp == 0 and max_timestamp == 5678) + (time_ms, direction, bsz, histo) = raw_histo_log[0] + self.A(time_ms == 1234 and direction == 0 and bsz == 4096 and histo == [ 1, 2, 3, 4 ]) + (time_ms, direction, bsz, histo) = raw_histo_log[1] + self.A(time_ms == 5678 and direction == 1 and bsz == 16384 and histo == [ 5, 6, 7, 8 ]) + + def test_b2_parse_empty_log(self): + with open(self.fn, 'w') as f: + pass + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + 
self.A(should_not_get_here) + except FioHistoLogExc as e: + self.A(str(e).startswith('no records')) + + def test_b3_parse_empty_records(self): + with open(self.fn, 'w') as f: + f.write('\n') + f.write('1234, 0, 4096, 1, 2, 3, 4\n') + f.write('5678,1,16384,5,6,7,8 \n') + f.write('\n') + (raw_histo_log, _, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(len(raw_histo_log) == 2 and max_timestamp_ms == 5678) + (time_ms, direction, bsz, histo) = raw_histo_log[0] + self.A(time_ms == 1234 and direction == 0 and bsz == 4096 and histo == [ 1, 2, 3, 4 ]) + (time_ms, direction, bsz, histo) = raw_histo_log[1] + self.A(time_ms == 5678 and direction == 1 and bsz == 16384 and histo == [ 5, 6, 7, 8 ]) + + def test_b4_parse_non_int(self): + with open(self.fn, 'w') as f: + f.write('12, 0, 4096, 1a, 2, 3, 4\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('non-integer')) + + def test_b5_parse_neg_int(self): + with open(self.fn, 'w') as f: + f.write('-12, 0, 4096, 1, 2, 3, 4\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('negative integer')) + + def test_b6_parse_too_few_int(self): + with open(self.fn, 'w') as f: + f.write('0, 0\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('too few numbers')) + + def test_b7_parse_invalid_direction(self): + with open(self.fn, 'w') as f: + f.write('100, 2, 4096, 1, 2, 3, 4\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('invalid I/O direction')) + + def test_b8_parse_bsz_too_big(self): + with open(self.fn+'_good', 'w') as f: + f.write('100, 1, %d, 1, 2, 3, 4\n' % (1<<24)) + (raw_histo_log, _, _) = parse_hist_file(self.fn+'_good', 4, None) + with 
open(self.fn+'_bad', 'w') as f: + f.write('100, 1, 20000000, 1, 2, 3, 4\n') + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn+'_bad', 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).startswith('block size too large')) + + def test_b9_parse_wrong_bucket_count(self): + with open(self.fn, 'w') as f: + f.write('100, 1, %d, 1, 2, 3, 4, 5\n' % (1<<24)) + try: + (raw_histo_log, _, _) = parse_hist_file(self.fn, 4, None) + self.A(False) + except FioHistoLogExc as e: + self.A(str(e).__contains__('buckets per interval')) + + def test_c1_time_ranges(self): + ranges = time_ranges(3, 2) # fio_version defaults to 3 + expected_ranges = [ # fio_version 3 is in nanoseconds + [0.000, 0.001], [0.001, 0.002], # first group + [0.002, 0.003], [0.003, 0.004], # second group same width + [0.004, 0.006], [0.006, 0.008]] # subsequent groups double width + self.A(ranges == expected_ranges) + ranges = time_ranges(3, 2, fio_version=3) + self.A(ranges == expected_ranges) + ranges = time_ranges(3, 2, fio_version=2) + expected_ranges_v2 = [ [ 1000.0 * min_or_max for min_or_max in time_range ] + for time_range in expected_ranges ] + self.A(ranges == expected_ranges_v2) + # see fio V3 stat.h for why 29 groups and 2^6 buckets/group + normal_ranges_v3 = time_ranges(29, 64) + # for v3, bucket time intervals are measured in nanoseconds + self.A(len(normal_ranges_v3) == 29 * 64 and normal_ranges_v3[-1][1] == 64*(1<<(29-1))/1000.0) + normal_ranges_v2 = time_ranges(19, 64, fio_version=2) + # for v2, bucket time intervals are measured in microseconds so we have fewer buckets + self.A(len(normal_ranges_v2) == 19 * 64 and normal_ranges_v2[-1][1] == 64*(1<<(19-1))) + + def test_d1_align_histo_log_1_quantum(self): + with open(self.fn, 'w') as f: + f.write('100, 1, 4096, 1, 2, 3, 4') + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(min_timestamp_ms == 0 and max_timestamp_ms == 100) + aligned_log = align_histo_log(raw_histo_log, 5, 4, 
min_timestamp_ms, max_timestamp_ms) + self.A(len(aligned_log) == 1) + (time_ms0, h) = aligned_log[0] + self.A(time_ms0 == 0 and h == [1., 2., 3., 4.]) + + # handle case with log_unix_epoch=1 timestamps, 1-second time quantum + # here both records will be separated into 2 aligned intervals + + def test_d1a_align_2rec_histo_log_epoch_1_quantum_1sec(self): + with open(self.fn, 'w') as f: + f.write('1536504002123, 1, 4096, 1, 2, 3, 4\n') + f.write('1536504003123, 1, 4096, 4, 3, 2, 1\n') + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(min_timestamp_ms == 1536504001123 and max_timestamp_ms == 1536504003123) + aligned_log = align_histo_log(raw_histo_log, 1, 4, min_timestamp_ms, max_timestamp_ms) + self.A(len(aligned_log) == 3) + (time_ms0, h) = aligned_log[0] + self.A(time_ms0 == 1536504001123 and h == [0., 0., 0., 0.]) + (time_ms1, h) = aligned_log[1] + self.A(time_ms1 == 1536504002123 and h == [1., 2., 3., 4.]) + (time_ms2, h) = aligned_log[2] + self.A(time_ms2 == 1536504003123 and h == [4., 3., 2., 1.]) + + # handle case with log_unix_epoch=1 timestamps, 5-second time quantum + # here both records will be merged into a single aligned time interval + + def test_d1b_align_2rec_histo_log_epoch_1_quantum_5sec(self): + with open(self.fn, 'w') as f: + f.write('1536504002123, 1, 4096, 1, 2, 3, 4\n') + f.write('1536504003123, 1, 4096, 4, 3, 2, 1\n') + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(min_timestamp_ms == 1536504001123 and max_timestamp_ms == 1536504003123) + aligned_log = align_histo_log(raw_histo_log, 5, 4, min_timestamp_ms, max_timestamp_ms) + self.A(len(aligned_log) == 1) + (time_ms0, h) = aligned_log[0] + self.A(time_ms0 == 1536504001123 and h == [5., 5., 5., 5.]) + + # we need this to compare 2 lists of floating point numbers for equality + # because of floating-point imprecision + + def compare_2_floats(self, x, y): + if x == 0.0 or y == 0.0: + return (x+y) < 
0.0000001 + else: + return (math.fabs(x-y)/x) < 0.00001 + + def is_close(self, buckets, buckets_expected): + if len(buckets) != len(buckets_expected): + return False + compare_buckets = lambda k: self.compare_2_floats(buckets[k], buckets_expected[k]) + indices_close = list(filter(compare_buckets, range(0, len(buckets)))) + return len(indices_close) == len(buckets) + + def test_d2_align_histo_log_2_quantum(self): + with open(self.fn, 'w') as f: + f.write('2000, 1, 4096, 1, 2, 3, 4\n') + f.write('7000, 1, 4096, 1, 2, 3, 4\n') + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 4, None) + self.A(min_timestamp_ms == 0 and max_timestamp_ms == 7000) + (_, _, _, raw_buckets1) = raw_histo_log[0] + (_, _, _, raw_buckets2) = raw_histo_log[1] + aligned_log = align_histo_log(raw_histo_log, 5, 4, min_timestamp_ms, max_timestamp_ms) + self.A(len(aligned_log) == 2) + (time_ms1, h1) = aligned_log[0] + (time_ms2, h2) = aligned_log[1] + # because first record is from time interval [2000, 7000] + # we weight it according + expect1 = [float(b) * 0.6 for b in raw_buckets1] + expect2 = [float(b) * 0.4 for b in raw_buckets1] + for e in range(0, len(expect2)): + expect2[e] += raw_buckets2[e] + self.A(time_ms1 == 0 and self.is_close(h1, expect1)) + self.A(time_ms2 == 5000 and self.is_close(h2, expect2)) + + # what to expect if histogram buckets are all equal + def test_e1_get_pctiles_flat_histo(self): + with open(self.fn, 'w') as f: + buckets = [ 100 for j in range(0, 128) ] + f.write('9000, 1, 4096, %s\n' % ', '.join([str(b) for b in buckets])) + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, 128, None) + self.A(min_timestamp_ms == 0 and max_timestamp_ms == 9000) + aligned_log = align_histo_log(raw_histo_log, 5, 128, min_timestamp_ms, max_timestamp_ms) + time_intervals = time_ranges(4, 32) + # since buckets are all equal, then median is halfway through time_intervals + # and max latency interval is at end of time_intervals + 
self.A(time_intervals[64][1] == 0.066 and time_intervals[127][1] == 0.256) + pctiles_wanted = [ 0, 50, 100 ] + pct_vs_time = [] + for (time_ms, histo) in aligned_log: + pct_vs_time.append(get_pctiles(histo, pctiles_wanted, time_intervals)) + self.A(pct_vs_time[0] == None) # no I/O in this time interval + expected_pctiles = { 0:0.000, 50:0.064, 100:0.256 } + self.A(pct_vs_time[1] == expected_pctiles) + + # what to expect if just the highest histogram bucket is used + def test_e2_get_pctiles_highest_pct(self): + fio_v3_bucket_count = 29 * 64 + with open(self.fn, 'w') as f: + # make a empty fio v3 histogram + buckets = [ 0 for j in range(0, fio_v3_bucket_count) ] + # add one I/O request to last bucket + buckets[-1] = 1 + f.write('9000, 1, 4096, %s\n' % ', '.join([str(b) for b in buckets])) + (raw_histo_log, min_timestamp_ms, max_timestamp_ms) = parse_hist_file(self.fn, fio_v3_bucket_count, None) + self.A(min_timestamp_ms == 0 and max_timestamp_ms == 9000) + aligned_log = align_histo_log(raw_histo_log, 5, fio_v3_bucket_count, min_timestamp_ms, max_timestamp_ms) + (time_ms, histo) = aligned_log[1] + time_intervals = time_ranges(29, 64) + expected_pctiles = { 100.0:(64*(1<<28))/1000.0 } + pct = get_pctiles( histo, [ 100.0 ], time_intervals ) + self.A(pct == expected_pctiles) + +# we are using this module as a standalone program + +if __name__ == '__main__': + if os.getenv('UNITTEST'): + if unittest2_imported: + sys.exit(unittest2.main()) + else: + raise Exception('you must install unittest2 module to run unit test') + else: + compute_percentiles_from_logs() + diff -Nru fio-2.1.3/tools/hist/fiologparser_hist.py fio-3.16/tools/hist/fiologparser_hist.py --- fio-2.1.3/tools/hist/fiologparser_hist.py 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/tools/hist/fiologparser_hist.py 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,612 @@ +#!/usr/bin/python2.7 +""" + Utility for converting *_clat_hist* files generated by fio into latency statistics. 
+ + Example usage: + + $ fiologparser_hist.py *_clat_hist* + end-time, samples, min, avg, median, 90%, 95%, 99%, max + 1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000 + 2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000 + 4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744 + ... + + @author Karl Cronburg +""" +import os +import sys +import pandas +import re +import numpy as np + +runascmd = False + +err = sys.stderr.write + +class HistFileRdr(): + """ Class to read a hist file line by line, buffering + a value array for the latest line, and allowing a preview + of the next timestamp in next line + Note: this does not follow a generator pattern, but must explicitly + get next bin array. + """ + def __init__(self, file): + self.fp = open(file, 'r') + self.data = self.nextData() + + def close(self): + self.fp.close() + self.fp = None + + def nextData(self): + self.data = None + if self.fp: + line = self.fp.readline() + if line == "": + self.close() + else: + self.data = [int(x) for x in line.replace(' ', '').rstrip().split(',')] + + return self.data + + @property + def curTS(self): + ts = None + if self.data: + ts = self.data[0] + return ts + + @property + def curDir(self): + d = None + if self.data: + d = self.data[1] + return d + + @property + def curBins(self): + return self.data[3:] + +def weighted_percentile(percs, vs, ws): + """ Use linear interpolation to calculate the weighted percentile. + + Value and weight arrays are first sorted by value. The cumulative + distribution function (cdf) is then computed, after which np.interp + finds the two values closest to our desired weighted percentile(s) + and linearly interpolates them. 
+ + percs :: List of percentiles we want to calculate + vs :: Array of values we are computing the percentile of + ws :: Array of weights for our corresponding values + return :: Array of percentiles + """ + idx = np.argsort(vs) + vs, ws = vs[idx], ws[idx] # weights and values sorted by value + cdf = 100 * (ws.cumsum() - ws / 2.0) / ws.sum() + return np.interp(percs, cdf, vs) # linear interpolation + +def weights(start_ts, end_ts, start, end): + """ Calculate weights based on fraction of sample falling in the + given interval [start,end]. Weights computed using vector / array + computation instead of for-loops. + + Note that samples with zero time length are effectively ignored + (we set their weight to zero). + + start_ts :: Array of start times for a set of samples + end_ts :: Array of end times for a set of samples + start :: int + end :: int + return :: Array of weights + """ + sbounds = np.maximum(start_ts, start).astype(float) + ebounds = np.minimum(end_ts, end).astype(float) + ws = (ebounds - sbounds) / (end_ts - start_ts) + if np.any(np.isnan(ws)): + err("WARNING: zero-length sample(s) detected. Log file corrupt" + " / bad time values? Ignoring these samples.\n") + ws[np.where(np.isnan(ws))] = 0.0; + return ws + +def weighted_average(vs, ws): + return np.sum(vs * ws) / np.sum(ws) + + +percs = None +columns = None + +def gen_output_columns(ctx): + global percs,columns + strpercs = re.split('[,:]', ctx.percentiles) + percs = [50.0] # always print 50% in 'median' column + percs.extend(list(map(float,strpercs))) + if ctx.directions: + columns = ["end-time", "dir", "samples", "min", "avg", "median"] + else: + columns = ["end-time", "samples", "min", "avg", "median"] + columns.extend(list(map(lambda x: x+'%', strpercs))) + columns.append("max") + +def fmt_float_list(ctx, num=1): + """ Return a comma separated list of float formatters to the required number + of decimal places. 
For instance: + + fmt_float_list(ctx.decimals=4, num=3) == "%.4f, %.4f, %.4f" + """ + return ', '.join(["%%.%df" % ctx.decimals] * num) + +# Default values - see beginning of main() for how we detect number columns in +# the input files: +__HIST_COLUMNS = 1216 +__NON_HIST_COLUMNS = 3 +__TOTAL_COLUMNS = __HIST_COLUMNS + __NON_HIST_COLUMNS + +def read_chunk(rdr, sz): + """ Read the next chunk of size sz from the given reader. """ + try: + """ StopIteration occurs when the pandas reader is empty, and AttributeError + occurs if rdr is None due to the file being empty. """ + new_arr = rdr.read().values + except (StopIteration, AttributeError): + return None + + # Let's leave the array as is, and let later code ignore the block size + return new_arr + + #""" Extract array of the times, directions wo times, and histograms matrix without times column. """ + #times, rws, szs = new_arr[:,0], new_arr[:,1], new_arr[:,2] + #hists = new_arr[:,__NON_HIST_COLUMNS:] + #times = times.reshape((len(times),1)) + #dirs = rws.reshape((len(rws),1)) + #arr = np.append(times, hists, axis=1) + #return arr + +def get_min(fps, arrs): + """ Find the file with the current first row with the smallest start time """ + return min([fp for fp in fps if not arrs[fp] is None], key=lambda fp: arrs.get(fp)[0][0]) + +def histogram_generator(ctx, fps, sz): + + # Create a chunked pandas reader for each of the files: + rdrs = {} + for fp in fps: + try: + rdrs[fp] = pandas.read_csv(fp, dtype=int, header=None, chunksize=sz) + except ValueError as e: + if e.message == 'No columns to parse from file': + if ctx.warn: sys.stderr.write("WARNING: Empty input file encountered.\n") + rdrs[fp] = None + else: + raise(e) + + # Initial histograms from disk: + arrs = {fp: read_chunk(rdr, sz) for fp,rdr in rdrs.items()} + while True: + + try: + """ ValueError occurs when nothing more to read """ + fp = get_min(fps, arrs) + except ValueError: + return + arr = arrs[fp] + arri = np.insert(arr[0], 1, fps.index(fp)) + yield arri 
+ arrs[fp] = arr[1:] + + if arrs[fp].shape[0] == 0: + arrs[fp] = read_chunk(rdrs[fp], sz) + +def _plat_idx_to_val(idx, edge=0.5, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64): + """ Taken from fio's stat.c for calculating the latency value of a bin + from that bin's index. + + idx : the value of the index into the histogram bins + edge : fractional value in the range [0,1]** indicating how far into + the bin we wish to compute the latency value of. + + ** edge = 0.0 and 1.0 computes the lower and upper latency bounds + respectively of the given bin index. """ + + # MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use + # all bits of the sample as index + if (idx < (FIO_IO_U_PLAT_VAL << 1)): + return idx + + # Find the group and compute the minimum value of that group + error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1 + base = 1 << (error_bits + FIO_IO_U_PLAT_BITS) + + # Find its bucket number of the group + k = idx % FIO_IO_U_PLAT_VAL + + # Return the mean (if edge=0.5) of the range of the bucket + return base + ((k + edge) * (1 << error_bits)) + +def plat_idx_to_val_coarse(idx, coarseness, edge=0.5): + """ Converts the given *coarse* index into a non-coarse index as used by fio + in stat.h:plat_idx_to_val(), subsequently computing the appropriate + latency value for that bin. 
+ """ + + # Multiply the index by the power of 2 coarseness to get the bin + # bin index with a max of 1536 bins (FIO_IO_U_PLAT_GROUP_NR = 24 in stat.h) + stride = 1 << coarseness + idx = idx * stride + lower = _plat_idx_to_val(idx, edge=0.0) + upper = _plat_idx_to_val(idx + stride, edge=1.0) + return lower + (upper - lower) * edge + +def print_all_stats(ctx, end, mn, ss_cnt, vs, ws, mx, dir=dir): + ps = weighted_percentile(percs, vs, ws) + + avg = weighted_average(vs, ws) + values = [mn, avg] + list(ps) + [mx] + if ctx.directions: + row = [end, dir, ss_cnt] + fmt = "%d, %s, %d, " + else: + row = [end, ss_cnt] + fmt = "%d, %d, " + row = row + [float(x) / ctx.divisor for x in values] + if ctx.divisor > 1: + fmt = fmt + fmt_float_list(ctx, len(percs)+3) + else: + # max and min are decimal values if no divisor + fmt = fmt + "%d, " + fmt_float_list(ctx, len(percs)+1) + ", %d" + + print (fmt % tuple(row)) + +def update_extreme(val, fncn, new_val): + """ Calculate min / max in the presence of None values """ + if val is None: return new_val + else: return fncn(val, new_val) + +# See beginning of main() for how bin_vals are computed +bin_vals = [] +lower_bin_vals = [] # lower edge of each bin +upper_bin_vals = [] # upper edge of each bin + +def process_interval(ctx, iHist, iEnd, dir): + """ print estimated percentages for the given merged sample + """ + ss_cnt = 0 # number of samples affecting this interval + mn_bin_val, mx_bin_val = None, None + + # Update total number of samples affecting current interval histogram: + ss_cnt += np.sum(iHist) + + # Update min and max bin values + idxs = np.nonzero(iHist != 0)[0] + if idxs.size > 0: + mn_bin_val = bin_vals[idxs[0]] + mx_bin_val = bin_vals[idxs[-1]] + + if ss_cnt > 0: print_all_stats(ctx, iEnd, mn_bin_val, ss_cnt, bin_vals, iHist, mx_bin_val, dir=dir) + + +dir_map = ['r', 'w', 't'] # map of directional value in log to textual representation +def process_weighted_interval(ctx, samples, iStart, iEnd, printdirs): + """ 
Construct the weighted histogram for the given interval by scanning + through all the histograms and figuring out which of their bins have + samples with latencies which overlap with the given interval + [iStart,iEnd]. + """ + + times, files, dirs, sizes, hists = samples[:,0], samples[:,1], samples[:,2], samples[:,3], samples[:,4:] + iHist={}; ss_cnt = {}; mn_bin_val={}; mx_bin_val={} + for dir in printdirs: + iHist[dir] = np.zeros(__HIST_COLUMNS, dtype=float) + ss_cnt[dir] = 0 # number of samples affecting this interval + mn_bin_val[dir] = None + mx_bin_val[dir] = None + + for end_time,file,dir,hist in zip(times,files,dirs,hists): + + # Only look at bins of the current histogram sample which + # started before the end of the current time interval [start,end] + start_times = (end_time - 0.5 * ctx.interval) - bin_vals / ctx.time_divisor + idx = np.where(start_times < iEnd) + s_ts, l_bvs, u_bvs, hs = start_times[idx], lower_bin_vals[idx], upper_bin_vals[idx], hist[idx] + + # Increment current interval histogram by weighted values of future histogram + # total number of samples + # and min and max values as necessary + textdir = dir_map[dir] + ws = hs * weights(s_ts, end_time, iStart, iEnd) + mmidx = np.where(hs != 0)[0] + if 'm' in printdirs: + iHist['m'][idx] += ws + ss_cnt['m'] += np.sum(hs) + if mmidx.size > 0: + mn_bin_val['m'] = update_extreme(mn_bin_val['m'], min, l_bvs[max(0, mmidx[0] - 1)]) + mx_bin_val['m'] = update_extreme(mx_bin_val['m'], max, u_bvs[min(len(hs) - 1, mmidx[-1] + 1)]) + if textdir in printdirs: + iHist[textdir][idx] += ws + ss_cnt[textdir] += np.sum(hs) # Update total number of samples affecting current interval histogram: + if mmidx.size > 0: + mn_bin_val[textdir] = update_extreme(mn_bin_val[textdir], min, l_bvs[max(0, mmidx[0] - 1)]) + mx_bin_val[textdir] = update_extreme(mx_bin_val[textdir], max, u_bvs[min(len(hs) - 1, mmidx[-1] + 1)]) + + for textdir in sorted(printdirs): + if ss_cnt[textdir] > 0: print_all_stats(ctx, iEnd, 
mn_bin_val[textdir], ss_cnt[textdir], bin_vals, iHist[textdir], mx_bin_val[textdir], dir=textdir) + +def guess_max_from_bins(ctx, hist_cols): + """ Try to guess the GROUP_NR from given # of histogram + columns seen in an input file """ + max_coarse = 8 + if ctx.group_nr < 19 or ctx.group_nr > 26: + bins = [ctx.group_nr * (1 << 6)] + else: + bins = [1216,1280,1344,1408,1472,1536,1600,1664] + coarses = range(max_coarse + 1) + fncn = lambda z: list(map(lambda x: z/2**x if z % 2**x == 0 else -10, coarses)) + + arr = np.transpose(list(map(fncn, bins))) + idx = np.where(arr == hist_cols) + if len(idx[1]) == 0: + table = repr(arr.astype(int)).replace('-10', 'N/A').replace('array',' ') + errmsg = ("Unable to determine bin values from input clat_hist files. Namely \n" + "the first line of file '%s' " % ctx.FILE[0] + "has %d \n" % (__TOTAL_COLUMNS,) + + "columns of which we assume %d " % (hist_cols,) + "correspond to histogram bins. \n" + "This number needs to be equal to one of the following numbers:\n\n" + + table + "\n\n" + "Possible reasons and corresponding solutions:\n" + " - Input file(s) does not contain histograms.\n" + " - You recompiled fio with a different GROUP_NR. If so please specify this\n" + " new GROUP_NR on the command line with --group_nr\n") + if runascmd: + err(errmsg) + exit(1) + else: + raise RuntimeError(errmsg) + + return bins[idx[1][0]] + +def output_weighted_interval_data(ctx,printdirs): + + fps = [open(f, 'r') for f in ctx.FILE] + gen = histogram_generator(ctx, fps, ctx.buff_size) + + print(', '.join(columns)) + + try: + start, end = 0, ctx.interval + arr = np.empty(shape=(0,__TOTAL_COLUMNS + 1),dtype=int) + more_data = True + while more_data or len(arr) > 0: + + # Read up to ctx.max_latency (default 20 seconds) of data from end of current interval. 
+ while len(arr) == 0 or arr[-1][0] < ctx.max_latency * 1000 + end: + try: + new_arr = next(gen) + except StopIteration: + more_data = False + break + nashape = new_arr.reshape((1,__TOTAL_COLUMNS + 1)) + arr = np.append(arr, nashape, axis=0) + #arr = arr.astype(int) + + if arr.size > 0: + # Jump immediately to the start of the input, rounding + # down to the nearest multiple of the interval (useful when --log_unix_epoch + # was used to create these histograms): + if start == 0 and arr[0][0] - ctx.max_latency > end: + start = arr[0][0] - ctx.max_latency + start = start - (start % ctx.interval) + end = start + ctx.interval + + process_weighted_interval(ctx, arr, start, end, printdirs) + + # Update arr to throw away samples we no longer need - samples which + # end before the start of the next interval, i.e. the end of the + # current interval: + idx = np.where(arr[:,0] > end) + arr = arr[idx] + + start += ctx.interval + end = start + ctx.interval + finally: + for fp in fps: + fp.close() + +def output_interval_data(ctx,directions): + fps = [HistFileRdr(f) for f in ctx.FILE] + + print(', '.join(columns)) + + start = 0 + end = ctx.interval + while True: + + more_data = False + + # add bins from all files in target intervals + arr = None + numSamples = 0 + while True: + foundSamples = False + for fp in fps: + ts = fp.curTS + if ts and ts+10 < end: # shift sample time when very close to an end time + curdirect = fp.curDir + numSamples += 1 + foundSamples = True + if arr is None: + arr = {} + for d in directions: + arr[d] = np.zeros(shape=(__HIST_COLUMNS), dtype=int) + if 'm' in arr: + arr['m'] = np.add(arr['m'], fp.curBins) + if 'r' in arr and curdirect == 0: + arr['r'] = np.add(arr['r'], fp.curBins) + if 'w' in arr and curdirect == 1: + arr['w'] = np.add(arr['w'], fp.curBins) + if 't' in arr and curdirect == 2: + arr['t'] = np.add(arr['t'], fp.curBins) + + more_data = True + fp.nextData() + elif ts: + more_data = True + + # reached end of all files + # or gone through 
all files without finding sample in interval + if not more_data or not foundSamples: + break + + if arr is not None: + #print("{} size({}) samples({}) nonzero({}):".format(end, arr.size, numSamples, np.count_nonzero(arr)), str(arr), ) + for d in sorted(arr.keys()): + aval = arr[d] + process_interval(ctx, aval, end, d) + + # reach end of all files + if not more_data: + break + + start += ctx.interval + end = start + ctx.interval + +def main(ctx): + + if ctx.job_file: + try: + from configparser import SafeConfigParser, NoOptionError + except ImportError: + from ConfigParser import SafeConfigParser, NoOptionError + + cp = SafeConfigParser(allow_no_value=True) + with open(ctx.job_file, 'r') as fp: + cp.readfp(fp) + + if ctx.interval is None: + # Auto detect --interval value + for s in cp.sections(): + try: + hist_msec = cp.get(s, 'log_hist_msec') + if hist_msec is not None: + ctx.interval = int(hist_msec) + except NoOptionError: + pass + + if not hasattr(ctx, 'percentiles'): + ctx.percentiles = "90,95,99" + + if ctx.directions: + ctx.directions = ctx.directions.lower() + + if ctx.interval is None: + ctx.interval = 1000 + + if ctx.usbin: + ctx.time_divisor = 1000.0 # bins are in us + else: + ctx.time_divisor = 1000000.0 # bins are in ns + + gen_output_columns(ctx) + + + # Automatically detect how many columns are in the input files, + # calculate the corresponding 'coarseness' parameter used to generate + # those files, and calculate the appropriate bin latency values: + with open(ctx.FILE[0], 'r') as fp: + global bin_vals,lower_bin_vals,upper_bin_vals,__HIST_COLUMNS,__TOTAL_COLUMNS + __TOTAL_COLUMNS = len(fp.readline().split(',')) + __HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS + + max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS) + coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS)) + bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness) for x in np.arange(__HIST_COLUMNS)], dtype=float) + lower_bin_vals = np.array([plat_idx_to_val_coarse(x, 
coarseness, 0.0) for x in np.arange(__HIST_COLUMNS)], dtype=float) + upper_bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness, 1.0) for x in np.arange(__HIST_COLUMNS)], dtype=float) + + # indicate which directions to output (read(0), write(1), trim(2), mixed(3)) + directions = set() + if not ctx.directions or 'm' in ctx.directions: directions.add('m') + if ctx.directions and 'r' in ctx.directions: directions.add('r') + if ctx.directions and 'w' in ctx.directions: directions.add('w') + if ctx.directions and 't' in ctx.directions: directions.add('t') + + if ctx.noweight: + output_interval_data(ctx, directions) + else: + output_weighted_interval_data(ctx, directions) + + +if __name__ == '__main__': + import argparse + runascmd = True + p = argparse.ArgumentParser() + arg = p.add_argument + arg("FILE", help='space separated list of latency log filenames', nargs='+') + arg('--buff_size', + default=10000, + type=int, + help='number of samples to buffer into numpy at a time') + + arg('--max_latency', + default=20, + type=float, + help='number of seconds of data to process at a time') + + arg('-i', '--interval', + type=int, + help='interval width (ms), default 1000 ms') + + arg('--noweight', + action='store_true', + default=False, + help='do not perform weighting of samples between output intervals') + + arg('-d', '--divisor', + required=False, + type=int, + default=1, + help='divide the results by this value.') + + arg('--decimals', + default=3, + type=int, + help='number of decimal places to print floats to') + + arg('--warn', + dest='warn', + action='store_true', + default=False, + help='print warning messages to stderr') + + arg('--group_nr', + default=29, + type=int, + help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h') + + arg('--job-file', + default=None, + type=str, + help='Optional argument pointing to the job file used to create the ' + 'given histogram files. 
Useful for auto-detecting --log_hist_msec and ' + '--log_unix_epoch (in fio) values.') + + arg('--percentiles', + default="90:95:99", + type=str, + help='Optional argument of comma or colon separated percentiles to print. ' + 'The default is "90.0:95.0:99.0". min, median(50%%) and max percentiles are always printed') + + arg('--usbin', + default=False, + action='store_true', + help='histogram bin latencies are in us (fio versions < 2.99. fio uses ns for version >= 2.99') + + arg('--directions', + default=None, + type=str, + help='Optionally split results output by reads, writes, trims or mixed. ' + 'Value may be any combination of "rwtm" characters. ' + 'By default, only "mixed" results are output without a "dir" field. ' + 'But, specifying the --directions option ' + 'adds a "dir" field to the output content, and separate rows for each of the indicated ' + 'directions.') + + main(p.parse_args()) + diff -Nru fio-2.1.3/tools/hist/fiologparser_hist.py.1 fio-3.16/tools/hist/fiologparser_hist.py.1 --- fio-2.1.3/tools/hist/fiologparser_hist.py.1 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/tools/hist/fiologparser_hist.py.1 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,220 @@ +.TH fiologparser_hist.py 1 "August 18, 2016" +.SH NAME +fiologparser_hist.py \- Calculate statistics from fio histograms +.SH SYNOPSIS +.B fiologparser_hist.py +[\fIoptions\fR] [clat_hist_files]... +.SH DESCRIPTION +.B fiologparser_hist.py +is a utility for converting *_clat_hist* files +generated by fio into a CSV of latency statistics including minimum, +average, maximum latency, and selectable percentiles. +.SH EXAMPLES +.PP +.nf +$ fiologparser_hist.py *_clat_hist* +end-time, samples, min, avg, median, 90%, 95%, 99%, max +1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000 +2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000 +4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744 +\[char46].. 
+.fi +.PP + +.SH OPTIONS +.TP +.BR \-\-help +Print these options. +.TP +.BR \-\-buff_size \fR=\fPint +Number of samples to buffer into numpy at a time. Default is 10,000. +This can be adjusted to help performance. +.TP +.BR \-\-max_latency \fR=\fPint +Number of seconds of data to process at a time. Defaults to 20 seconds, +in order to handle the 17 second upper bound on latency in histograms +reported by fio. This should be increased if fio has been +run with a larger maximum latency. Lowering this when a lower maximum +latency is known can improve performance. See NOTES for more details. +.TP +.BR \-i ", " \-\-interval \fR=\fPint +Interval at which statistics are reported. Defaults to 1000 ms. This +should be set a minimum of the value for \fBlog_hist_msec\fR as given +to fio. +.TP +.BR \-\-noweight +Do not perform weighting of samples between output intervals. Default is False. +.TP +.BR \-d ", " \-\-divisor \fR=\fPint +Divide statistics by this value. Defaults to 1. Useful if you want to +convert latencies from milliseconds to seconds (\fBdivisor\fR=\fP1000\fR). +.TP +.BR \-\-warn +Enables warning messages printed to stderr, useful for debugging. +.TP +.BR \-\-group_nr \fR=\fPint +Set this to the value of \fIFIO_IO_U_PLAT_GROUP_NR\fR as defined in +\fPstat.h\fR if fio has been recompiled. Defaults to 19, the +current value used in fio. See NOTES for more details. +.TP +.BR \-\-percentiles \fR=\fPstr +Pass desired list of comma or colon separated percentiles to print. +The default is "90.0:95.0:99.0", but min, median(50%) and max percentiles are always printed +.TP +.BR \-\-usbin +Use to indicate to parser that histogram bin latencies values are in microseconds. +The default is to use nanoseconds, but histogram logs from fio versions <= 2.99 are in microseconds. +.TP +.BR \-\-directions \fR=\fPstr +By default, all directions (e.g read and write) histogram bins are combined +producing one 'mixed' result. 
+To produce independent directional results, pass some combination of +\'rwtm\' characters with the \-\-directions\fR=\fPrwtm option. +A \'dir\' column is added indicating the result direction for a row. + +.SH NOTES +end-times are calculated to be uniform increments of the \fB\-\-interval\fR value given, +regardless of when histogram samples are reported. Of note: + +.RS +Intervals with no samples are omitted. In the example above this means +"no statistics from 2 to 3 seconds" and "39 samples influenced the statistics +of the interval from 3 to 4 seconds". +.LP +Intervals with a single sample will have the same value for all statistics +.RE + +.PP +The number of samples is unweighted, corresponding to the total number of samples +which have any effect whatsoever on the interval. + +Min statistics are computed using value of the lower boundary of the first bin +(in increasing bin order) with non-zero samples in it. Similarly for max, +we take the upper boundary of the last bin with non-zero samples in it. +This is semantically identical to taking the 0th and 100th percentiles with a +50% bin-width buffer (because percentiles are computed using mid-points of +the bins). This enforces the following nice properties: + +.RS +min <= 50th <= 90th <= 95th <= 99th <= max +.LP +min and max are strict lower and upper bounds on the actual +min / max seen by fio (and reported in *_clat.* with averaging turned off). +.RE + +.PP +Average statistics use a standard weighted arithmetic mean. + +When --noweights option is false (the default) +percentile statistics are computed using the weighted percentile method as +described here: \fIhttps://en.wikipedia.org/wiki/Percentile#Weighted_percentile\fR. +See weights() method for details on how weights are computed for individual +samples. In process_interval() we further multiply by the height of each bin +to get weighted histograms. 
+We convert files given on the command line, assumed to be fio histogram files.
+An individual histogram file can contain the
In particular if +you use the \fB\-\-log_hist_coarseness\fR parameter of fio, you get output files with +a number of bins according to the following table (note that the first +row is identical to the table above): + +.RS +.PP +.nf +coarse \\ GROUP_NR + 19 20 21 22 23 24 25 26 + ------------------------------------------------------- + 0 [[ 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664], + 1 [ 608, 640, 672, 704, 736, 768, 800, 832], + 2 [ 304, 320, 336, 352, 368, 384, 400, 416], + 3 [ 152, 160, 168, 176, 184, 192, 200, 208], + 4 [ 76, 80, 84, 88, 92, 96, 100, 104], + 5 [ 38, 40, 42, 44, 46, 48, 50, 52], + 6 [ 19, 20, 21, 22, 23, 24, 25, 26], + 7 [ N/A, 10, N/A, 11, N/A, 12, N/A, 13], + 8 [ N/A, 5, N/A, N/A, N/A, 6, N/A, N/A]] +.fi +.PP +.RE + +.PP +For other values of GROUP_NR and coarseness, this table can be computed like this: + +.RS +.PP +.nf +bins = [1216,1280,1344,1408,1472,1536,1600,1664] +max_coarse = 8 +fncn = lambda z: list(map(lambda x: z/2**x if z % 2**x == 0 else nan, range(max_coarse + 1))) +np.transpose(list(map(fncn, bins))) +.fi +.PP +.RE + +.PP +If you have not adjusted GROUP_NR for your (high latency) application, then you +will see the percentiles computed by this tool max out at the max latency bin +value as in the first table above, and in this plot (where GROUP_NR=19 and thus we see +a max latency of ~16.7 seconds in the red line): + +.RS +\fIhttps://www.cronburg.com/fio/max_latency_bin_value_bug.png +.RE + +.PP +Motivation for, design decisions, and the implementation process are +described in further detail here: + +.RS +\fIhttps://www.cronburg.com/fio/cloud-latency-problem-measurement/ +.RE + +.SH AUTHOR +.B fiologparser_hist.py +and this manual page were written by Karl Cronburg . +.SH "REPORTING BUGS" +Report bugs to the \fBfio\fR mailing list . 
diff -Nru fio-2.1.3/tools/hist/.gitignore fio-3.16/tools/hist/.gitignore --- fio-2.1.3/tools/hist/.gitignore 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/tools/hist/.gitignore 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,3 @@ +*.pyc +*.ipynb +.ipynb_checkpoints diff -Nru fio-2.1.3/tools/hist/half-bins.py fio-3.16/tools/hist/half-bins.py --- fio-2.1.3/tools/hist/half-bins.py 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/tools/hist/half-bins.py 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,37 @@ +#!/usr/bin/python2.7 +""" Cut the number bins in half in fio histogram output. Example usage: + + $ half-bins.py -c 2 output_clat_hist.1.log > smaller_clat_hist.1.log + + Which merges e.g. bins [0 .. 3], [4 .. 7], ..., [1212 .. 1215] resulting in + 304 = 1216 / (2**2) merged bins per histogram sample. + + @author Karl Cronburg +""" +import sys + +def main(ctx): + stride = 1 << ctx.coarseness + with open(ctx.FILENAME, 'r') as fp: + for line in fp.readlines(): + vals = line.split(', ') + sys.stdout.write("%s, %s, %s, " % tuple(vals[:3])) + + hist = list(map(int, vals[3:])) + for i in range(0, len(hist) - stride, stride): + sys.stdout.write("%d, " % sum(hist[i : i + stride],)) + sys.stdout.write("%d\n" % sum(hist[len(hist) - stride:])) + +if __name__ == '__main__': + import argparse + p = argparse.ArgumentParser() + arg = p.add_argument + arg( 'FILENAME', help='clat_hist file for which we will reduce' + ' (by half or more) the number of bins.') + arg('-c', '--coarseness', + default=1, + type=int, + help='number of times to reduce number of bins by half, ' + 'e.g. 
coarseness of 4 merges each 2^4 = 16 consecutive ' + 'bins.') + main(p.parse_args()) diff -Nru fio-2.1.3/tools/plot/fio2gnuplot fio-3.16/tools/plot/fio2gnuplot --- fio-2.1.3/tools/plot/fio2gnuplot 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/tools/plot/fio2gnuplot 2019-09-20 01:01:52.000000000 +0000 @@ -1,4 +1,5 @@ -#!/usr/bin/python +#!/usr/bin/python2.7 +# Note: this script is python2 and python3 compatible. # # Copyright (C) 2013 eNovance SAS # Author: Erwan Velu @@ -17,8 +18,10 @@ # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +from __future__ import absolute_import +from __future__ import print_function import os import fnmatch import sys @@ -26,27 +29,29 @@ import re import math import shutil +from six.moves import map +from six.moves import range def find_file(path, pattern): fio_data_file=[] # For all the local files for file in os.listdir(path): - # If the file math the regexp - if fnmatch.fnmatch(file, pattern): - # Let's consider this file - fio_data_file.append(file) + # If the file matches the glob + if fnmatch.fnmatch(file, pattern): + # Let's consider this file + fio_data_file.append(file) return fio_data_file def generate_gnuplot_script(fio_data_file,title,gnuplot_output_filename,gnuplot_output_dir,mode,disk_perf,gpm_dir): - if verbose: print "Generating rendering scripts" + if verbose: print("Generating rendering scripts") filename=gnuplot_output_dir+'mygraph' temporary_files.append(filename) f=open(filename,'w') # Plotting 3D or comparing graphs doesn't have a meaning unless if there is at least 2 traces if len(fio_data_file) > 1: - f.write("call \'%s/graph3D.gpm\' \'%s' \'%s\' \'\' \'%s\' \'%s\'\n" % (gpm_dir,title,gnuplot_output_filename,gnuplot_output_filename,mode)) + f.write("call \'%s/graph3D.gpm\' 
\'%s' \'%s\' \'\' \'%s\' \'%s\'\n" % (gpm_dir,title,gnuplot_output_filename,gnuplot_output_filename,mode)) # Setting up the compare files that will be plot later compare=open(gnuplot_output_dir + 'compare.gnuplot','w') @@ -88,10 +93,10 @@ compare_smooth.write("plot %s w l ls 1 ti 'Global average value (%.2f)'" % (global_avg,global_avg)); compare_trend.write("plot %s w l ls 1 ti 'Global average value (%.2f)'" % (global_avg,global_avg)); - pos=0 - # Let's create a temporary file for each selected fio file - for file in fio_data_file: - tmp_filename = "gnuplot_temp_file.%d" % pos + pos=0 + # Let's create a temporary file for each selected fio file + for file in fio_data_file: + tmp_filename = "gnuplot_temp_file.%d" % pos # Plotting comparing graphs doesn't have a meaning unless if there is at least 2 traces if len(fio_data_file) > 1: @@ -101,12 +106,12 @@ compare_trend.write(",\\\n'%s' using 2:3 smooth bezier title '%s'" % (tmp_filename,fio_data_file[pos])) png_file=file.replace('.log','') - raw_filename = "%s-2Draw" % (png_file) - smooth_filename = "%s-2Dsmooth" % (png_file) - trend_filename = "%s-2Dtrend" % (png_file) - avg = average(disk_perf[pos]) - f.write("call \'%s/graph2D.gpm\' \'%s' \'%s\' \'%s\' \'%s\' \'%s\' \'%s\' \'%s\' \'%f\'\n" % (gpm_dir,title,tmp_filename,fio_data_file[pos],raw_filename,mode,smooth_filename,trend_filename,avg)) - pos = pos +1 + raw_filename = "%s-2Draw" % (png_file) + smooth_filename = "%s-2Dsmooth" % (png_file) + trend_filename = "%s-2Dtrend" % (png_file) + avg = average(disk_perf[pos]) + f.write("call \'%s/graph2D.gpm\' \'%s' \'%s\' \'%s\' \'%s\' \'%s\' \'%s\' \'%s\' \'%f\'\n" % (gpm_dir,title,tmp_filename,fio_data_file[pos],raw_filename,mode,smooth_filename,trend_filename,avg)) + pos = pos +1 # Plotting comparing graphs doesn't have a meaning unless if there is at least 2 traces if len(fio_data_file) > 1: @@ -120,11 +125,11 @@ filename=gnuplot_output_dir+'mymath'; temporary_files.append(filename) f=open(filename,'a') - 
f.write("call \'%s/math.gpm\' \'%s' \'%s\' \'\' \'%s\' \'%s\' %s\n" % (gpm_dir,title,gnuplot_output_filename,gnuplot_output_filename,mode,average)) + f.write("call \'%s/math.gpm\' \'%s' \'%s\' \'\' \'%s\' \'%s\' %s\n" % (gpm_dir,title,gnuplot_output_filename,gnuplot_output_filename,mode,average)) f.close() def compute_aggregated_file(fio_data_file, gnuplot_output_filename, gnuplot_output_dir): - if verbose: print "Processing data file 2/2" + if verbose: print("Processing data file 2/2") temp_files=[] pos=0 @@ -152,7 +157,7 @@ end_time=max_time if end_time == -1: end_time="infinite" - if verbose: print "Processing data file 1/2 with %s 1: - if verbose: print " |-> Rendering comparing traces" + if verbose: print(" |-> Rendering comparing traces") os.system("cd %s; for i in *.gnuplot; do gnuplot $i; done" % gnuplot_output_dir) - if verbose: print " |-> Rendering math traces" + if verbose: print(" |-> Rendering math traces") os.system("cd %s; gnuplot mymath" % gnuplot_output_dir) - if verbose: print " |-> Rendering 2D & 3D traces" + if verbose: print(" |-> Rendering 2D & 3D traces") os.system("cd %s; gnuplot mygraph" % gnuplot_output_dir) name_of_directory="the current" if gnuplot_output_dir != "./": name_of_directory=gnuplot_output_dir - print "\nRendering traces are available in %s directory" % name_of_directory + print("\nRendering traces are available in %s directory" % name_of_directory) global keep_temp_files keep_temp_files=False except: - print "Could not run gnuplot on mymath or mygraph !\n" + print("Could not run gnuplot on mymath or mygraph !\n") sys.exit(1); def print_help(): - print 'fio2gnuplot -ghbiodvk -t -o <outputfile> -p <pattern> -G <type> -m <time> -M <time>' - print - print '-h --help : Print this help' - print '-p <pattern> or --pattern <pattern> : A pattern in regexp to select fio input files' - print '-b or --bandwidth : A predefined pattern for selecting *_bw.log files' - print '-i or --iops : A predefined pattern for selecting *_iops.log 
files' - print '-g or --gnuplot : Render gnuplot traces before exiting' - print '-o or --outputfile <file> : The basename for gnuplot traces' - print ' - Basename is set with the pattern if defined' - print '-d or --outputdir <dir> : The directory where gnuplot shall render files' - print '-t or --title <title> : The title of the gnuplot traces' - print ' - Title is set with the block size detected in fio traces' - print '-G or --Global <type> : Search for <type> in .global files match by a pattern' - print ' - Available types are : min, max, avg, stddev' - print ' - The .global extension is added automatically to the pattern' - print '-m or --min_time <time> : Only consider data starting from <time> seconds (default is 0)' - print '-M or --max_time <time> : Only consider data ending before <time> seconds (default is -1 aka nolimit)' - print '-v or --verbose : Increasing verbosity' - print '-k or --keep : Keep all temporary files from gnuplot\'s output dir' + print('fio2gnuplot -ghbiodvk -t <title> -o <outputfile> -p <pattern> -G <type> -m <time> -M <time>') + print() + print('-h --help : Print this help') + print('-p <pattern> or --pattern <pattern> : A glob pattern to select fio input files') + print('-b or --bandwidth : A predefined pattern for selecting *_bw.log files') + print('-i or --iops : A predefined pattern for selecting *_iops.log files') + print('-g or --gnuplot : Render gnuplot traces before exiting') + print('-o or --outputfile <file> : The basename for gnuplot traces') + print(' - Basename is set with the pattern if defined') + print('-d or --outputdir <dir> : The directory where gnuplot shall render files') + print('-t or --title <title> : The title of the gnuplot traces') + print(' - Title is set with the block size detected in fio traces') + print('-G or --Global <type> : Search for <type> in .global files match by a pattern') + print(' - Available types are : min, max, avg, stddev') + print(' - The .global extension is added automatically to the 
pattern') + print('-m or --min_time <time> : Only consider data starting from <time> seconds (default is 0)') + print('-M or --max_time <time> : Only consider data ending before <time> seconds (default is -1 aka nolimit)') + print('-v or --verbose : Increasing verbosity') + print('-k or --keep : Keep all temporary files from gnuplot\'s output dir') def main(argv): mode='unknown' @@ -401,116 +406,126 @@ force_keep_temp_files=False if not os.path.isfile(gpm_dir+'math.gpm'): - gpm_dir="/usr/local/share/fio/" - if not os.path.isfile(gpm_dir+'math.gpm'): - print "Looks like fio didn't got installed properly as no gpm files found in '/usr/share/fio' or '/usr/local/share/fio'\n" - sys.exit(3) + gpm_dir="/usr/local/share/fio/" + if not os.path.isfile(gpm_dir+'math.gpm'): + print("Looks like fio didn't get installed properly as no gpm files found in '/usr/share/fio' or '/usr/local/share/fio'\n") + sys.exit(3) try: - opts, args = getopt.getopt(argv[1:],"ghkbivo:d:t:p:G:m:M:",['bandwidth', 'iops', 'pattern', 'outputfile', 'outputdir', 'title', 'min_time', 'max_time', 'gnuplot', 'Global', 'help', 'verbose','keep']) + opts, args = getopt.getopt(argv[1:],"ghkbivo:d:t:p:G:m:M:",['bandwidth', 'iops', 'pattern', 'outputfile', 'outputdir', 'title', 'min_time', 'max_time', 'gnuplot', 'Global', 'help', 'verbose','keep']) except getopt.GetoptError: - print "Error: One of the option passed to the cmdline was not supported" - print "Please fix your command line or read the help (-h option)" - sys.exit(2) + print("Error: One of the options passed to the cmdline was not supported") + print("Please fix your command line or read the help (-h option)") + sys.exit(2) for opt, arg in opts: - if opt in ("-b", "--bandwidth"): - pattern='*_bw.log' - elif opt in ("-i", "--iops"): - pattern='*_iops.log' - elif opt in ("-v", "--verbose"): - verbose=True - elif opt in ("-k", "--keep"): - #User really wants to keep the temporary files - force_keep_temp_files=True - elif opt in ("-p", "--pattern"): - 
pattern_set_by_user=True - pattern=arg - pattern=pattern.replace('\\','') - elif opt in ("-o", "--outputfile"): - gnuplot_output_filename=arg - elif opt in ("-d", "--outputdir"): - gnuplot_output_dir=arg - if not gnuplot_output_dir.endswith('/'): - gnuplot_output_dir=gnuplot_output_dir+'/' - if not os.path.exists(gnuplot_output_dir): - os.makedirs(gnuplot_output_dir) - elif opt in ("-t", "--title"): - title=arg - elif opt in ("-m", "--min_time"): - min_time=arg - elif opt in ("-M", "--max_time"): - max_time=arg - elif opt in ("-g", "--gnuplot"): - run_gnuplot=True - elif opt in ("-G", "--Global"): - parse_global=True - global_search=arg - elif opt in ("-h", "--help"): - print_help() - sys.exit(1) + if opt in ("-b", "--bandwidth"): + pattern='*_bw.log' + elif opt in ("-i", "--iops"): + pattern='*_iops.log' + elif opt in ("-v", "--verbose"): + verbose=True + elif opt in ("-k", "--keep"): + #User really wants to keep the temporary files + force_keep_temp_files=True + elif opt in ("-p", "--pattern"): + pattern_set_by_user=True + pattern=arg + pattern=pattern.replace('\\','') + elif opt in ("-o", "--outputfile"): + gnuplot_output_filename=arg + elif opt in ("-d", "--outputdir"): + gnuplot_output_dir=arg + if not gnuplot_output_dir.endswith('/'): + gnuplot_output_dir=gnuplot_output_dir+'/' + if not os.path.exists(gnuplot_output_dir): + os.makedirs(gnuplot_output_dir) + elif opt in ("-t", "--title"): + title=arg + elif opt in ("-m", "--min_time"): + min_time=arg + elif opt in ("-M", "--max_time"): + max_time=arg + elif opt in ("-g", "--gnuplot"): + run_gnuplot=True + elif opt in ("-G", "--Global"): + parse_global=True + global_search=arg + elif opt in ("-h", "--help"): + print_help() + sys.exit(1) # Adding .global extension to the file if parse_global==True: - if not gnuplot_output_filename.endswith('.global'): - pattern = pattern+'.global' + if not gnuplot_output_filename.endswith('.global'): + pattern = pattern+'.global' fio_data_file=find_file('.',pattern) if 
len(fio_data_file) == 0: - print "No log file found with pattern %s!" % pattern - sys.exit(1) + print("No log file found with pattern %s!" % pattern) + # Try numjob log file format if per_numjob_logs=1 + if (pattern == '*_bw.log'): + fio_data_file=find_file('.','*_bw.*.log') + if (pattern == '*_iops.log'): + fio_data_file=find_file('.','*_iops.*.log') + if len(fio_data_file) == 0: + sys.exit(1) + else: + print("Using log file per job format instead") else: - print "%d files Selected with pattern '%s'" % (len(fio_data_file), pattern) + print("%d files Selected with pattern '%s'" % (len(fio_data_file), pattern)) fio_data_file=sorted(fio_data_file, key=str.lower) for file in fio_data_file: - print ' |-> %s' % file - if "_bw.log" in file : - mode="Bandwidth (KB/sec)" - if "_iops.log" in file : - mode="IO per Seconds (IO/sec)" + print(' |-> %s' % file) + if "_bw.log" in file : + mode="Bandwidth (KB/sec)" + if "_iops.log" in file : + mode="IO per Seconds (IO/sec)" if (title == 'No title') and (mode != 'unknown'): - if "Bandwidth" in mode: - title='Bandwidth benchmark with %d fio results' % len(fio_data_file) - if "IO" in mode: - title='IO benchmark with %d fio results' % len(fio_data_file) + if "Bandwidth" in mode: + title='Bandwidth benchmark with %d fio results' % len(fio_data_file) + if "IO" in mode: + title='IO benchmark with %d fio results' % len(fio_data_file) - print + print() #We need to adjust the output filename regarding the pattern required by the user if (pattern_set_by_user == True): - gnuplot_output_filename=pattern - # As we do have some regexp in the pattern, let's make this simpliest - # We do remove the simpliest parts of the expression to get a clear file name - gnuplot_output_filename=gnuplot_output_filename.replace('-*-','-') - gnuplot_output_filename=gnuplot_output_filename.replace('*','-') - gnuplot_output_filename=gnuplot_output_filename.replace('--','-') - gnuplot_output_filename=gnuplot_output_filename.replace('.log','') - # Insure that we 
don't have any starting or trailing dash to the filename - gnuplot_output_filename = gnuplot_output_filename[:-1] if gnuplot_output_filename.endswith('-') else gnuplot_output_filename - gnuplot_output_filename = gnuplot_output_filename[1:] if gnuplot_output_filename.startswith('-') else gnuplot_output_filename + gnuplot_output_filename=pattern + # As we do have some glob in the pattern, let's make this simpliest + # We do remove the simpliest parts of the expression to get a clear file name + gnuplot_output_filename=gnuplot_output_filename.replace('-*-','-') + gnuplot_output_filename=gnuplot_output_filename.replace('*','-') + gnuplot_output_filename=gnuplot_output_filename.replace('--','-') + gnuplot_output_filename=gnuplot_output_filename.replace('.log','') + # Insure that we don't have any starting or trailing dash to the filename + gnuplot_output_filename = gnuplot_output_filename[:-1] if gnuplot_output_filename.endswith('-') else gnuplot_output_filename + gnuplot_output_filename = gnuplot_output_filename[1:] if gnuplot_output_filename.startswith('-') else gnuplot_output_filename + if (gnuplot_output_filename == ''): + gnuplot_output_filename='default' if parse_global==True: - parse_global_files(fio_data_file, global_search) + parse_global_files(fio_data_file, global_search) else: - blk_size=compute_temp_file(fio_data_file,disk_perf,gnuplot_output_dir,min_time,max_time) - title="%s @ Blocksize = %dK" % (title,blk_size/1024) - compute_aggregated_file(fio_data_file, gnuplot_output_filename, gnuplot_output_dir) - compute_math(fio_data_file,title,gnuplot_output_filename,gnuplot_output_dir,mode,disk_perf,gpm_dir) - generate_gnuplot_script(fio_data_file,title,gnuplot_output_filename,gnuplot_output_dir,mode,disk_perf,gpm_dir) - - if (run_gnuplot==True): - render_gnuplot(fio_data_file, gnuplot_output_dir) - - # Shall we clean the temporary files ? 
- if keep_temp_files==False and force_keep_temp_files==False: - # Cleaning temporary files - if verbose: print "Cleaning temporary files" - for f in enumerate(temporary_files): - if verbose: print " -> %s"%f[1] - try: - os.remove(f[1]) - except: - True + blk_size=compute_temp_file(fio_data_file,disk_perf,gnuplot_output_dir,min_time,max_time) + title="%s @ Blocksize = %dK" % (title,blk_size/1024) + compute_aggregated_file(fio_data_file, gnuplot_output_filename, gnuplot_output_dir) + compute_math(fio_data_file,title,gnuplot_output_filename,gnuplot_output_dir,mode,disk_perf,gpm_dir) + generate_gnuplot_script(fio_data_file,title,gnuplot_output_filename,gnuplot_output_dir,mode,disk_perf,gpm_dir) + + if (run_gnuplot==True): + render_gnuplot(fio_data_file, gnuplot_output_dir) + + # Shall we clean the temporary files ? + if keep_temp_files==False and force_keep_temp_files==False: + # Cleaning temporary files + if verbose: print("Cleaning temporary files") + for f in enumerate(temporary_files): + if verbose: print(" -> %s"%f[1]) + try: + os.remove(f[1]) + except: + True #Main if __name__ == "__main__": diff -Nru fio-2.1.3/tools/plot/fio2gnuplot.1 fio-3.16/tools/plot/fio2gnuplot.1 --- fio-2.1.3/tools/plot/fio2gnuplot.1 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/tools/plot/fio2gnuplot.1 2019-09-20 01:01:52.000000000 +0000 @@ -1,5 +1,5 @@ .\" Text automatically generated by txt2man -.TH fio2gnuplot "07 août 2013" "" "" +.TH fio2gnuplot 1 "August 2013" .SH NAME \fBfio2gnuplot \fP- Render fio's output files with gnuplot .SH SYNOPSIS diff -Nru fio-2.1.3/tools/plot/graph2D.gpm fio-3.16/tools/plot/graph2D.gpm --- fio-2.1.3/tools/plot/graph2D.gpm 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/tools/plot/graph2D.gpm 2019-09-20 01:01:52.000000000 +0000 @@ -1,9 +1,30 @@ # This Gnuplot file has been generated by eNovance -set title '$0' +needed_args = 8 +if (exists("ARGC") && ARGC >= needed_args) \ + found_args = 1; \ +else if (strlen("$$#") < 3 && "$#" >= needed_args) \ + 
found_args = 1; \ + ARG1 = "$0"; \ + ARG2 = "$1"; \ + ARG3 = "$2"; \ + ARG4 = "$3"; \ + ARG5 = "$4"; \ + ARG6 = "$5"; \ + ARG7 = "$6"; \ + ARG8 = "$7"; \ +else \ + found_args = 0; \ + print "Aborting: could not find all arguments"; \ + exit + +avg_num = ARG8 + 0 +avg_str = sprintf("%g", avg_num) + +set title ARG1 set terminal png size 1280,1024 -set output '$3.png' +set output ARG4 . '.png' #set terminal x11 #Preparing Axes @@ -12,7 +33,7 @@ #set data style lines set key top left reverse set xlabel "Time (Seconds)" -set ylabel '$4' +set ylabel ARG5 set xrange [0:] set yrange [0:] @@ -22,13 +43,13 @@ set style line 100 lt 7 lw 0.5 set style line 1 lt 1 lw 3 pt 3 linecolor rgb "green" -plot '$1' using 2:3 with linespoints title '$2', $7 w l ls 1 ti 'Global average value ($7)' +plot ARG2 using 2:3 with linespoints title ARG3, avg_num w l ls 1 ti 'Global average value (' . avg_str . ')' -set output '$5.png' -plot '$1' using 2:3 smooth csplines title '$2', $7 w l ls 1 ti 'Global average value ($7)' +set output ARG6 . '.png' +plot ARG2 using 2:3 smooth csplines title ARG3, avg_num w l ls 1 ti 'Global average value (' . avg_str . ')' -set output '$6.png' -plot '$1' using 2:3 smooth bezier title '$2', $7 w l ls 1 ti 'Global average value ($7)' +set output ARG7 . '.png' +plot ARG2 using 2:3 smooth bezier title ARG3, avg_num w l ls 1 ti 'Global average value (' . 
avg_str .')' #pause -1 #The End diff -Nru fio-2.1.3/tools/plot/graph3D.gpm fio-3.16/tools/plot/graph3D.gpm --- fio-2.1.3/tools/plot/graph3D.gpm 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/tools/plot/graph3D.gpm 2019-09-20 01:01:52.000000000 +0000 @@ -1,9 +1,24 @@ # This Gnuplot file has been generated by eNovance -set title '$0' +needed_args = 5 +if (exists("ARGC") && ARGC >= needed_args) \ + found_args = 1; \ +else if (strlen("$$#") < 3 && "$#" >= needed_args) \ + found_args = 1; \ + ARG1 = "$0"; \ + ARG2 = "$1"; \ + ARG3 = "$2"; \ + ARG4 = "$3"; \ + ARG5 = "$4"; \ +else \ + found_args = 0; \ + print "Aborting: could not find all arguments"; \ + exit + +set title ARG1 set terminal png size 1280,1024 -set output '$3.png' +set output ARG4 . '.png' #set terminal x11 #3D Config set isosamples 30 @@ -19,7 +34,7 @@ set key top left reverse set ylabel "Disk" set xlabel "Time (Seconds)" -set zlabel '$4' +set zlabel ARG5 set cbrange [0:] set zrange [0:] @@ -35,7 +50,7 @@ set size 0.5,0.5 set view 64,216 set origin 0,0.5 -splot '$1' using 2:1:3 with linespoints title '$2' +splot ARG2 using 2:1:3 with linespoints title ARG3 #Top Right View set size 0.5,0.5 @@ -43,7 +58,7 @@ set view 90,0 set pm3d at s solid hidden3d 100 scansbackward set pm3d depthorder -splot '$1' using 2:1:3 with linespoints title '$2' +splot ARG2 using 2:1:3 with linespoints title ARG3 #Bottom Right View set size 0.5,0.5 @@ -51,13 +66,13 @@ set view 63,161 set pm3d at s solid hidden3d 100 scansbackward set pm3d depthorder -splot '$1' using 2:1:3 with linespoints title '$2' +splot ARG2 using 2:1:3 with linespoints title ARG3 #Bottom Left View set size 0.5,0.5 set origin 0,0 set pm3d map -splot '$1' using 2:1:3 with linespoints title '$2' +splot ARG2 using 2:1:3 with linespoints title ARG3 #Unsetting multiplotting unset multiplot @@ -66,7 +81,7 @@ #Preparing 3D Interactive view set mouse set terminal png size 1024,768 -set output '$3-3D.png' +set output ARG4 . 
'-3D.png' #set term x11 set view 64,216 @@ -74,7 +89,7 @@ set size 1,1 set pm3d at bs solid hidden3d 100 scansbackward set pm3d depthorder -splot '$1' using 2:1:3 with linespoints title '$2' +splot ARG2 using 2:1:3 with linespoints title ARG3 #pause -1 #The End diff -Nru fio-2.1.3/tools/plot/math.gpm fio-3.16/tools/plot/math.gpm --- fio-2.1.3/tools/plot/math.gpm 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/tools/plot/math.gpm 2019-09-20 01:01:52.000000000 +0000 @@ -1,15 +1,32 @@ # This Gnuplot file has been generated by eNovance +if (exists("ARGC") && ARGC > 5) \ + found_args = 1; \ +else if (strlen("$$#") < 3 && "$#" > 5) \ + found_args = 1; \ + ARG1 = "$0"; \ + ARG2 = "$1"; \ + ARG3 = "$2"; \ + ARG4 = "$3"; \ + ARG5 = "$4"; \ + ARG6 = "$5"; \ +else \ + found_args = 0; \ + print "Aborting: could not find all arguments"; \ + exit -set title '$0' +avg_num = ARG6 + 0 +avg_str = sprintf("%g", avg_num) + +set title ARG1 set terminal png size 1280,1024 -set output '$3.png' +set output ARG4 . '.png' set palette rgbformulae 7,5,15 set style line 100 lt 7 lw 0.5 set style fill transparent solid 0.9 noborder set auto x -set ylabel '$4' +set ylabel ARG5 set xlabel "Disk" set yrange [0:] set style data histogram @@ -22,4 +39,4 @@ set xtic rotate by 45 scale 0 font ",8" autojustify set xtics offset 0,-1 border -5,1,5 set style line 1 lt 1 lw 3 pt 3 linecolor rgb "green" -plot '$1' using 2:xtic(1) ti col, $5 w l ls 1 ti 'Global average value ($5)' +plot ARG2 using 2:xtic(1) ti col, avg_num w l ls 1 ti 'Global average value (' . avg_str . 
')' diff -Nru fio-2.1.3/.travis.yml fio-3.16/.travis.yml --- fio-2.1.3/.travis.yml 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/.travis.yml 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,55 @@ +language: c +os: + - linux +compiler: + - clang + - gcc +env: + matrix: + - BUILD_ARCH="x86" + - BUILD_ARCH="x86_64" + global: + - MAKEFLAGS="-j 2" +matrix: + include: + - os: osx + compiler: clang # Workaround travis setting CC=["clang", "gcc"] + env: BUILD_ARCH="x86_64" + # Build using the 10.12 SDK but target and run on OSX 10.11 +# - os: osx +# compiler: clang +# osx_image: xcode8 +# env: SDKROOT=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk MACOSX_DEPLOYMENT_TARGET=10.11 + # Build on the latest OSX version (will eventually become obsolete) + - os: osx + compiler: clang + osx_image: xcode8.3 + env: BUILD_ARCH="x86_64" + - os: osx + compiler: clang + osx_image: xcode9.4 + env: BUILD_ARCH="x86_64" + exclude: + - os: osx + compiler: gcc + exclude: + - os: linux + compiler: clang + env: BUILD_ARCH="x86" # Only do the gcc x86 build to reduce clutter +before_install: + - EXTRA_CFLAGS="-Werror" + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + pkgs=(libaio-dev libnuma-dev libz-dev librbd-dev libibverbs-dev librdmacm-dev); + if [[ "$BUILD_ARCH" == "x86" ]]; then + pkgs=("${pkgs[@]/%/:i386}"); + pkgs+=(gcc-multilib); + EXTRA_CFLAGS="${EXTRA_CFLAGS} -m32"; + else + pkgs+=(glusterfs-common); + fi; + sudo apt-get -qq update; + sudo apt-get install --no-install-recommends -qq -y "${pkgs[@]}"; + fi +script: + - ./configure --extra-cflags="${EXTRA_CFLAGS}" && make + - make test diff -Nru fio-2.1.3/trim.c fio-3.16/trim.c --- fio-2.1.3/trim.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/trim.c 2019-09-20 01:01:52.000000000 +0000 @@ -1,17 +1,14 @@ /* * TRIM/DISCARD support */ -#include <unistd.h> -#include <fcntl.h> #include <string.h> #include <assert.h> -#include <pthread.h> #include "fio.h" #include "trim.h" #ifdef FIO_HAVE_TRIM 
-int get_next_trim(struct thread_data *td, struct io_u *io_u) +bool get_next_trim(struct thread_data *td, struct io_u *io_u) { struct io_piece *ipo; @@ -19,12 +16,12 @@ * this io_u is from a requeue, we already filled the offsets */ if (io_u->file) - return 0; + return true; if (flist_empty(&td->trim_list)) - return 1; + return false; assert(td->trim_entries); - ipo = flist_entry(td->trim_list.next, struct io_piece, trim_list); + ipo = flist_first_entry(&td->trim_list, struct io_piece, trim_list); remove_trim_entry(td, ipo); io_u->offset = ipo->offset; @@ -53,7 +50,7 @@ if (r) { dprint(FD_VERIFY, "failed file %s open\n", io_u->file->file_name); - return 1; + return false; } } @@ -64,24 +61,21 @@ io_u->xfer_buflen = io_u->buflen; dprint(FD_VERIFY, "get_next_trim: ret io_u %p\n", io_u); - return 0; + return true; } -int io_u_should_trim(struct thread_data *td, struct io_u *io_u) +bool io_u_should_trim(struct thread_data *td, struct io_u *io_u) { unsigned long long val; + uint64_t frand_max; unsigned long r; if (!td->o.trim_percentage) - return 0; + return false; - if (td->o.use_os_rand) { - r = os_random_long(&td->trim_state); - val = (OS_RAND_MAX / 100ULL); - } else { - r = __rand(&td->__trim_state); - val = (FRAND_MAX / 100ULL); - } + frand_max = rand_max(&td->trim_state); + r = __rand(&td->trim_state); + val = (frand_max / 100ULL); val *= (unsigned long long) td->o.trim_percentage; return r <= val; diff -Nru fio-2.1.3/trim.h fio-3.16/trim.h --- fio-2.1.3/trim.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/trim.h 2019-09-20 01:01:52.000000000 +0000 @@ -1,11 +1,15 @@ #ifndef FIO_TRIM_H #define FIO_TRIM_H -#include "fio.h" - #ifdef FIO_HAVE_TRIM -extern int __must_check get_next_trim(struct thread_data *td, struct io_u *io_u); -extern int io_u_should_trim(struct thread_data *td, struct io_u *io_u); +#include "flist.h" +#include "iolog.h" +#include "compiler/compiler.h" +#include "lib/types.h" +#include "os/os.h" + +extern bool __must_check get_next_trim(struct 
thread_data *td, struct io_u *io_u); +extern bool io_u_should_trim(struct thread_data *td, struct io_u *io_u); /* * Determine whether a given io_u should be logged for verify or @@ -20,13 +24,13 @@ } #else -static inline int get_next_trim(struct thread_data *td, struct io_u *io_u) +static inline bool get_next_trim(struct thread_data *td, struct io_u *io_u) { - return 1; + return false; } -static inline int io_u_should_trim(struct thread_data *td, struct io_u *io_u) +static inline bool io_u_should_trim(struct thread_data *td, struct io_u *io_u) { - return 0; + return false; } static inline void remove_trim_entry(struct thread_data *td, struct io_piece *ipo) { diff -Nru fio-2.1.3/unittests/lib/memalign.c fio-3.16/unittests/lib/memalign.c --- fio-2.1.3/unittests/lib/memalign.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/unittests/lib/memalign.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,27 @@ +#include "../unittest.h" + +#include "../../lib/memalign.h" + +static void test_memalign_1(void) +{ + size_t align = 4096; + void *p = fio_memalign(align, 1234, false); + + if (p) + CU_ASSERT_EQUAL(((int)(uintptr_t)p) & (align - 1), 0); +} + +static struct fio_unittest_entry tests[] = { + { + .name = "memalign/1", + .fn = test_memalign_1, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_lib_memalign(void) +{ + return fio_unittest_add_suite("lib/memalign.c", NULL, NULL, tests); +} diff -Nru fio-2.1.3/unittests/lib/strntol.c fio-3.16/unittests/lib/strntol.c --- fio-2.1.3/unittests/lib/strntol.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/unittests/lib/strntol.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,59 @@ +#include "../unittest.h" + +#include "../../lib/strntol.h" + +static void test_strntol_1(void) +{ + char s[] = "12345"; + char *endp = NULL; + long ret = strntol(s, strlen(s), &endp, 10); + + CU_ASSERT_EQUAL(ret, 12345); + CU_ASSERT_NOT_EQUAL(endp, NULL); + CU_ASSERT_EQUAL(*endp, '\0'); +} + +static void test_strntol_2(void) +{ + char s[] = " 
12345"; + char *endp = NULL; + long ret = strntol(s, strlen(s), &endp, 10); + + CU_ASSERT_EQUAL(ret, 12345); + CU_ASSERT_NOT_EQUAL(endp, NULL); + CU_ASSERT_EQUAL(*endp, '\0'); +} + +static void test_strntol_3(void) +{ + char s[] = "0x12345"; + char *endp = NULL; + long ret = strntol(s, strlen(s), &endp, 16); + + CU_ASSERT_EQUAL(ret, 0x12345); + CU_ASSERT_NOT_EQUAL(endp, NULL); + CU_ASSERT_EQUAL(*endp, '\0'); +} + +static struct fio_unittest_entry tests[] = { + { + .name = "strntol/1", + .fn = test_strntol_1, + }, + { + .name = "strntol/2", + .fn = test_strntol_2, + }, + { + .name = "strntol/3", + .fn = test_strntol_3, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_lib_strntol(void) +{ + return fio_unittest_add_suite("lib/strntol.c", NULL, NULL, tests); +} diff -Nru fio-2.1.3/unittests/oslib/strlcat.c fio-3.16/unittests/oslib/strlcat.c --- fio-2.1.3/unittests/oslib/strlcat.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/unittests/oslib/strlcat.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,52 @@ +#include "../unittest.h" + +#ifndef CONFIG_STRLCAT +#include "../../oslib/strlcat.h" +#else +#include <string.h> +#endif + +static void test_strlcat_1(void) +{ + char dst[32]; + char src[] = "test"; + size_t ret; + + dst[0] = '\0'; + ret = strlcat(dst, src, sizeof(dst)); + + CU_ASSERT_EQUAL(strcmp(dst, "test"), 0); + CU_ASSERT_EQUAL(ret, 4); /* total length it tried to create */ +} + +static void test_strlcat_2(void) +{ + char dst[32]; + char src[] = "test"; + size_t ret; + + dst[0] = '\0'; + ret = strlcat(dst, src, strlen(dst)); + + CU_ASSERT_EQUAL(strcmp(dst, ""), 0); + CU_ASSERT_EQUAL(ret, 4); /* total length it tried to create */ +} + +static struct fio_unittest_entry tests[] = { + { + .name = "strlcat/1", + .fn = test_strlcat_1, + }, + { + .name = "strlcat/2", + .fn = test_strlcat_2, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_oslib_strlcat(void) +{ + return fio_unittest_add_suite("oslib/strlcat.c", NULL, NULL, tests); +} diff 
-Nru fio-2.1.3/unittests/oslib/strndup.c fio-3.16/unittests/oslib/strndup.c --- fio-2.1.3/unittests/oslib/strndup.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/unittests/oslib/strndup.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,63 @@ +#include "../unittest.h" + +#ifndef CONFIG_HAVE_STRNDUP +#include "../../oslib/strndup.h" +#else +#include <string.h> +#endif + +static void test_strndup_1(void) +{ + char s[] = "test"; + char *p = strndup(s, 3); + + if (p) { + CU_ASSERT_EQUAL(strcmp(p, "tes"), 0); + CU_ASSERT_EQUAL(strlen(p), 3); + } +} + +static void test_strndup_2(void) +{ + char s[] = "test"; + char *p = strndup(s, 4); + + if (p) { + CU_ASSERT_EQUAL(strcmp(p, s), 0); + CU_ASSERT_EQUAL(strlen(p), 4); + } +} + +static void test_strndup_3(void) +{ + char s[] = "test"; + char *p = strndup(s, 5); + + if (p) { + CU_ASSERT_EQUAL(strcmp(p, s), 0); + CU_ASSERT_EQUAL(strlen(p), 4); + } +} + +static struct fio_unittest_entry tests[] = { + { + .name = "strndup/1", + .fn = test_strndup_1, + }, + { + .name = "strndup/2", + .fn = test_strndup_2, + }, + { + .name = "strndup/3", + .fn = test_strndup_3, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_oslib_strndup(void) +{ + return fio_unittest_add_suite("oslib/strndup.c", NULL, NULL, tests); +} diff -Nru fio-2.1.3/unittests/unittest.c fio-3.16/unittests/unittest.c --- fio-2.1.3/unittests/unittest.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/unittests/unittest.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,71 @@ +/* + * fio unittest + * Copyright (C) 2018 Tomohiro Kusumi <kusumi.tomohiro@osnexus.com> + */ + +#include <stdio.h> +#include <stdlib.h> + +#include "./unittest.h" + +/* XXX workaround lib/memalign.c's dependency on smalloc.c */ +void *smalloc(size_t size) +{ + return malloc(size); +} + +void sfree(void *ptr) +{ + free(ptr); +} + +CU_ErrorCode fio_unittest_add_suite(const char *name, CU_InitializeFunc initfn, + CU_CleanupFunc cleanfn, struct fio_unittest_entry *tvec) +{ + CU_pSuite pSuite; 
+ struct fio_unittest_entry *t; + + pSuite = CU_add_suite(name, initfn, cleanfn); + if (!pSuite) { + CU_cleanup_registry(); + return CU_get_error(); + } + + t = tvec; + while (t && t->name) { + if (!CU_add_test(pSuite, t->name, t->fn)) { + CU_cleanup_registry(); + return CU_get_error(); + } + t++; + } + + return CUE_SUCCESS; +} + +static void fio_unittest_register(CU_ErrorCode (*fn)(void)) +{ + if (fn && fn() != CUE_SUCCESS) { + fprintf(stderr, "%s\n", CU_get_error_msg()); + exit(1); + } +} + +int main(void) +{ + if (CU_initialize_registry() != CUE_SUCCESS) { + fprintf(stderr, "%s\n", CU_get_error_msg()); + exit(1); + } + + fio_unittest_register(fio_unittest_lib_memalign); + fio_unittest_register(fio_unittest_lib_strntol); + fio_unittest_register(fio_unittest_oslib_strlcat); + fio_unittest_register(fio_unittest_oslib_strndup); + + CU_basic_set_mode(CU_BRM_VERBOSE); + CU_basic_run_tests(); + CU_cleanup_registry(); + + return CU_get_error(); +} diff -Nru fio-2.1.3/unittests/unittest.h fio-3.16/unittests/unittest.h --- fio-2.1.3/unittests/unittest.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/unittests/unittest.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,26 @@ +#ifndef FIO_UNITTEST_H +#define FIO_UNITTEST_H + +#include <sys/types.h> + +#include <CUnit/CUnit.h> +#include <CUnit/Basic.h> + +struct fio_unittest_entry { + const char *name; + CU_TestFunc fn; +}; + +/* XXX workaround lib/memalign.c's dependency on smalloc.c */ +void *smalloc(size_t); +void sfree(void*); + +CU_ErrorCode fio_unittest_add_suite(const char*, CU_InitializeFunc, + CU_CleanupFunc, struct fio_unittest_entry*); + +CU_ErrorCode fio_unittest_lib_memalign(void); +CU_ErrorCode fio_unittest_lib_strntol(void); +CU_ErrorCode fio_unittest_oslib_strlcat(void); +CU_ErrorCode fio_unittest_oslib_strndup(void); + +#endif diff -Nru fio-2.1.3/verify.c fio-3.16/verify.c --- fio-2.1.3/verify.c 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/verify.c 2019-09-20 01:01:52.000000000 +0000 @@ -13,6 +13,7 @@ 
#include "trim.h" #include "lib/rand.h" #include "lib/hweight.h" +#include "lib/pattern.h" #include "crc/md5.h" #include "crc/crc64.h" @@ -23,75 +24,89 @@ #include "crc/sha256.h" #include "crc/sha512.h" #include "crc/sha1.h" +#include "crc/xxhash.h" +#include "crc/sha3.h" static void populate_hdr(struct thread_data *td, struct io_u *io_u, struct verify_header *hdr, unsigned int header_num, unsigned int header_len); +static void __fill_hdr(struct thread_data *td, struct io_u *io_u, + struct verify_header *hdr, unsigned int header_num, + unsigned int header_len, uint64_t rand_seed); -void fill_pattern(struct thread_data *td, void *p, unsigned int len, struct io_u *io_u, unsigned long seed, int use_seed) +void fill_buffer_pattern(struct thread_data *td, void *p, unsigned int len) { - switch (td->o.verify_pattern_bytes) { - case 0: + (void)cpy_pattern(td->o.buffer_pattern, td->o.buffer_pattern_bytes, p, len); +} + +static void __fill_buffer(struct thread_options *o, uint64_t seed, void *p, + unsigned int len) +{ + __fill_random_buf_percentage(seed, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes); +} + +static uint64_t fill_buffer(struct thread_data *td, void *p, + unsigned int len) +{ + struct frand_state *fs = &td->verify_state; + struct thread_options *o = &td->o; + + return fill_random_buf_percentage(fs, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes); +} + +void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len, + struct io_u *io_u, uint64_t seed, int use_seed) +{ + struct thread_options *o = &td->o; + + if (!o->verify_pattern_bytes) { dprint(FD_VERIFY, "fill random bytes len=%u\n", len); + if (use_seed) - __fill_random_buf(p, len, seed); + __fill_buffer(o, seed, p, len); else - io_u->rand_seed = fill_random_buf(&td->buf_state, p, len); - break; - case 1: - if (io_u->buf_filled_len >= len) { - dprint(FD_VERIFY, "using already filled verify pattern b=0 len=%u\n", len); - return; - 
} - dprint(FD_VERIFY, "fill verify pattern b=0 len=%u\n", len); - memset(p, td->o.verify_pattern[0], len); - io_u->buf_filled_len = len; - break; - default: { - unsigned int i = 0, size = 0; - unsigned char *b = p; - - if (io_u->buf_filled_len >= len) { - dprint(FD_VERIFY, "using already filled verify pattern b=%d len=%u\n", - td->o.verify_pattern_bytes, len); - return; - } - - dprint(FD_VERIFY, "fill verify pattern b=%d len=%u\n", - td->o.verify_pattern_bytes, len); + io_u->rand_seed = fill_buffer(td, p, len); + return; + } - while (i < len) { - size = td->o.verify_pattern_bytes; - if (size > (len - i)) - size = len - i; - memcpy(b+i, td->o.verify_pattern, size); - i += size; - } - io_u->buf_filled_len = len; - break; - } + /* Skip if we were here and we do not need to patch pattern + * with format */ + if (!td->o.verify_fmt_sz && io_u->buf_filled_len >= len) { + dprint(FD_VERIFY, "using already filled verify pattern b=%d len=%u\n", + o->verify_pattern_bytes, len); + return; } + + (void)paste_format(td->o.verify_pattern, td->o.verify_pattern_bytes, + td->o.verify_fmt, td->o.verify_fmt_sz, + p, len, io_u); + io_u->buf_filled_len = len; } static unsigned int get_hdr_inc(struct thread_data *td, struct io_u *io_u) { unsigned int hdr_inc; + /* + * If we use bs_unaligned, buflen can be larger than the verify + * interval (which just defaults to the smallest blocksize possible). 
+ */ hdr_inc = io_u->buflen; - if (td->o.verify_interval && td->o.verify_interval <= io_u->buflen) + if (td->o.verify_interval && td->o.verify_interval <= io_u->buflen && + !td->o.bs_unaligned) hdr_inc = td->o.verify_interval; return hdr_inc; } static void fill_pattern_headers(struct thread_data *td, struct io_u *io_u, - unsigned long seed, int use_seed) + uint64_t seed, int use_seed) { unsigned int hdr_inc, header_num; struct verify_header *hdr; void *p = io_u->buf; - fill_pattern(td, p, io_u->buflen, io_u, seed, use_seed); + fill_verify_pattern(td, p, io_u->buflen, io_u, seed, use_seed); hdr_inc = get_hdr_inc(td, io_u); header_num = 0; @@ -124,7 +139,7 @@ } /* - * Prepare for seperation of verify_header and checksum header + * Prepare for separation of verify_header and checksum header */ static inline unsigned int __hdr_size(int verify_type) { @@ -132,7 +147,9 @@ switch (verify_type) { case VERIFY_NONE: + case VERIFY_HDR_ONLY: case VERIFY_NULL: + case VERIFY_PATTERN: len = 0; break; case VERIFY_MD5: @@ -158,15 +175,26 @@ case VERIFY_SHA512: len = sizeof(struct vhdr_sha512); break; - case VERIFY_META: - len = sizeof(struct vhdr_meta); + case VERIFY_SHA3_224: + len = sizeof(struct vhdr_sha3_224); + break; + case VERIFY_SHA3_256: + len = sizeof(struct vhdr_sha3_256); + break; + case VERIFY_SHA3_384: + len = sizeof(struct vhdr_sha3_384); + break; + case VERIFY_SHA3_512: + len = sizeof(struct vhdr_sha3_512); + break; + case VERIFY_XXHASH: + len = sizeof(struct vhdr_xxhash); break; case VERIFY_SHA1: len = sizeof(struct vhdr_sha1); break; - case VERIFY_PATTERN: - len = 0; - break; + case VERIFY_PATTERN_NO_HDR: + return 0; default: log_err("fio: unknown verify header!\n"); assert(0); @@ -175,8 +203,12 @@ return len + sizeof(struct verify_header); } -static inline unsigned int hdr_size(struct verify_header *hdr) +static inline unsigned int hdr_size(struct thread_data *td, + struct verify_header *hdr) { + if (td->o.verify == VERIFY_PATTERN_NO_HDR) + return 0; + return 
__hdr_size(hdr->verify_type); } @@ -208,21 +240,28 @@ unsigned int crc_len; }; +#define DUMP_BUF_SZ 255 + static void dump_buf(char *buf, unsigned int len, unsigned long long offset, const char *type, struct fio_file *f) { - char *ptr, fname[256]; + char *ptr, *fname; + char sep[2] = { FIO_OS_PATH_SEPARATOR, 0 }; int ret, fd; ptr = strdup(f->file_name); - strcpy(fname, basename(ptr)); - sprintf(fname + strlen(fname), ".%llu.%s", offset, type); + if (asprintf(&fname, "%s%s%s.%llu.%s", aux_path ? : "", + aux_path ? sep : "", basename(ptr), offset, type) < 0) { + if (!fio_did_warn(FIO_WARN_VERIFY_BUF)) + log_err("fio: not enough memory for dump buffer filename\n"); + goto free_ptr; + } fd = open(fname, O_CREAT | O_TRUNC | O_WRONLY, 0644); if (fd < 0) { perror("open verify buf file"); - return; + goto free_fname; } while (len) { @@ -239,6 +278,11 @@ close(fd); log_err(" %s data dumped as %s\n", type, fname); + +free_fname: + free(fname); + +free_ptr: free(ptr); } @@ -246,7 +290,7 @@ * Dump the contents of the read block and re-generate the correct data * and dump that too. 
*/ -static void dump_verify_buffers(struct verify_header *hdr, struct vcont *vc) +static void __dump_verify_buffers(struct verify_header *hdr, struct vcont *vc) { struct thread_data *td = vc->td; struct io_u *io_u = vc->io_u; @@ -282,14 +326,29 @@ free(buf); } +static void dump_verify_buffers(struct verify_header *hdr, struct vcont *vc) +{ + struct thread_data *td = vc->td; + struct verify_header shdr; + + if (td->o.verify == VERIFY_PATTERN_NO_HDR) { + __fill_hdr(td, vc->io_u, &shdr, 0, vc->io_u->buflen, 0); + hdr = &shdr; + } + + __dump_verify_buffers(hdr, vc); +} + static void log_verify_failure(struct verify_header *hdr, struct vcont *vc) { unsigned long long offset; offset = vc->io_u->offset; offset += vc->hdr_num * hdr->len; - log_err("%.8s: verify failed at file %s offset %llu, length %u\n", - vc->name, vc->io_u->file->file_name, offset, hdr->len); + log_err("%.8s: verify failed at file %s offset %llu, length %u" + " (requested block: offset=%llu, length=%llu)\n", + vc->name, vc->io_u->file->file_name, offset, hdr->len, + vc->io_u->offset, vc->io_u->buflen); if (vc->good_crc && vc->bad_crc) { log_err(" Expected CRC: "); @@ -306,7 +365,7 @@ */ static inline void *io_u_verify_off(struct verify_header *hdr, struct vcont *vc) { - return vc->io_u->buf + vc->hdr_num * hdr->len + hdr_size(hdr); + return vc->io_u->buf + vc->hdr_num * hdr->len + hdr_size(vc->td, hdr); } static int verify_io_u_pattern(struct verify_header *hdr, struct vcont *vc) @@ -315,35 +374,37 @@ struct io_u *io_u = vc->io_u; char *buf, *pattern; unsigned int header_size = __hdr_size(td->o.verify); - unsigned int len, mod, i, size, pattern_size; + unsigned int len, mod, i, pattern_size; + int rc; pattern = td->o.verify_pattern; pattern_size = td->o.verify_pattern_bytes; - if (pattern_size <= 1) - pattern_size = MAX_PATTERN_SIZE; - buf = (void *) hdr + header_size; + assert(pattern_size != 0); + + (void)paste_format_inplace(pattern, pattern_size, + td->o.verify_fmt, td->o.verify_fmt_sz, io_u); + + 
buf = (char *) hdr + header_size; len = get_hdr_inc(td, io_u) - header_size; - mod = header_size % pattern_size; + mod = (get_hdr_inc(td, io_u) * vc->hdr_num + header_size) % pattern_size; - for (i = 0; i < len; i += size) { - size = pattern_size - mod; - if (size > (len - i)) - size = len - i; - if (memcmp(buf + i, pattern + mod, size)) - /* Let the slow compare find the first mismatch byte. */ - break; - mod = 0; - } + rc = cmp_pattern(pattern, pattern_size, mod, buf, len); + if (!rc) + return 0; - for (; i < len; i++) { + /* Slow path, compare each byte */ + for (i = 0; i < len; i++) { if (buf[i] != pattern[mod]) { unsigned int bits; bits = hweight8(buf[i] ^ pattern[mod]); - log_err("fio: got pattern %x, wanted %x. Bad bits %d\n", - buf[i], pattern[mod], bits); + log_err("fio: got pattern '%02x', wanted '%02x'. Bad bits %d\n", + (unsigned char)buf[i], + (unsigned char)pattern[mod], + bits); log_err("fio: bad pattern block offset %u\n", i); - dump_verify_buffers(hdr, vc); + vc->name = "pattern"; + log_verify_failure(hdr, vc); return EILSEQ; } mod++; @@ -351,30 +412,111 @@ mod = 0; } - return 0; + /* Unreachable line */ + assert(0); + return EILSEQ; } -static int verify_io_u_meta(struct verify_header *hdr, struct vcont *vc) +static int verify_io_u_xxhash(struct verify_header *hdr, struct vcont *vc) { - struct thread_data *td = vc->td; - struct vhdr_meta *vh = hdr_priv(hdr); - struct io_u *io_u = vc->io_u; - int ret = EILSEQ; + void *p = io_u_verify_off(hdr, vc); + struct vhdr_xxhash *vh = hdr_priv(hdr); + uint32_t hash; + void *state; - dprint(FD_VERIFY, "meta verify io_u %p, len %u\n", io_u, hdr->len); + dprint(FD_VERIFY, "xxhash verify io_u %p, len %u\n", vc->io_u, hdr->len); - if (vh->offset == io_u->offset + vc->hdr_num * td->o.verify_interval) - ret = 0; + state = XXH32_init(1); + XXH32_update(state, p, hdr->len - hdr_size(vc->td, hdr)); + hash = XXH32_digest(state); + + if (vh->hash == hash) + return 0; + + vc->name = "xxhash"; + vc->good_crc = &vh->hash; + 
vc->bad_crc = &hash; + vc->crc_len = sizeof(hash); + log_verify_failure(hdr, vc); + return EILSEQ; +} + +static int verify_io_u_sha3(struct verify_header *hdr, struct vcont *vc, + struct fio_sha3_ctx *sha3_ctx, uint8_t *sha, + unsigned int sha_size, const char *name) +{ + void *p = io_u_verify_off(hdr, vc); + + dprint(FD_VERIFY, "%s verify io_u %p, len %u\n", name, vc->io_u, hdr->len); - if (td->o.verify_pattern_bytes) - ret |= verify_io_u_pattern(hdr, vc); + fio_sha3_update(sha3_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_sha3_final(sha3_ctx); - if (!ret) + if (!memcmp(sha, sha3_ctx->sha, sha_size)) return 0; - vc->name = "meta"; + vc->name = name; + vc->good_crc = sha; + vc->bad_crc = sha3_ctx->sha; + vc->crc_len = sha_size; log_verify_failure(hdr, vc); - return ret; + return EILSEQ; +} + +static int verify_io_u_sha3_224(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_224 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_224_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_224_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_224_DIGEST_SIZE, "sha3-224"); +} + +static int verify_io_u_sha3_256(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_256 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_256_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_256_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_256_DIGEST_SIZE, "sha3-256"); +} + +static int verify_io_u_sha3_384(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_384 *vh = hdr_priv(hdr); + uint8_t sha[SHA3_384_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_384_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_384_DIGEST_SIZE, "sha3-384"); +} + +static int verify_io_u_sha3_512(struct verify_header *hdr, struct vcont *vc) +{ + struct vhdr_sha3_512 *vh = hdr_priv(hdr); + uint8_t 
sha[SHA3_512_DIGEST_SIZE]; + struct fio_sha3_ctx sha3_ctx = { + .sha = sha, + }; + + fio_sha3_512_init(&sha3_ctx); + + return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha, + SHA3_512_DIGEST_SIZE, "sha3-512"); } static int verify_io_u_sha512(struct verify_header *hdr, struct vcont *vc) @@ -389,7 +531,7 @@ dprint(FD_VERIFY, "sha512 verify io_u %p, len %u\n", vc->io_u, hdr->len); fio_sha512_init(&sha512_ctx); - fio_sha512_update(&sha512_ctx, p, hdr->len - hdr_size(hdr)); + fio_sha512_update(&sha512_ctx, p, hdr->len - hdr_size(vc->td, hdr)); if (!memcmp(vh->sha512, sha512_ctx.buf, sizeof(sha512))) return 0; @@ -414,7 +556,8 @@ dprint(FD_VERIFY, "sha256 verify io_u %p, len %u\n", vc->io_u, hdr->len); fio_sha256_init(&sha256_ctx); - fio_sha256_update(&sha256_ctx, p, hdr->len - hdr_size(hdr)); + fio_sha256_update(&sha256_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_sha256_final(&sha256_ctx); if (!memcmp(vh->sha256, sha256_ctx.buf, sizeof(sha256))) return 0; @@ -439,7 +582,8 @@ dprint(FD_VERIFY, "sha1 verify io_u %p, len %u\n", vc->io_u, hdr->len); fio_sha1_init(&sha1_ctx); - fio_sha1_update(&sha1_ctx, p, hdr->len - hdr_size(hdr)); + fio_sha1_update(&sha1_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_sha1_final(&sha1_ctx); if (!memcmp(vh->sha1, sha1_ctx.H, sizeof(sha1))) return 0; @@ -460,7 +604,7 @@ dprint(FD_VERIFY, "crc7 verify io_u %p, len %u\n", vc->io_u, hdr->len); - c = fio_crc7(p, hdr->len - hdr_size(hdr)); + c = fio_crc7(p, hdr->len - hdr_size(vc->td, hdr)); if (c == vh->crc7) return 0; @@ -481,7 +625,7 @@ dprint(FD_VERIFY, "crc16 verify io_u %p, len %u\n", vc->io_u, hdr->len); - c = fio_crc16(p, hdr->len - hdr_size(hdr)); + c = fio_crc16(p, hdr->len - hdr_size(vc->td, hdr)); if (c == vh->crc16) return 0; @@ -502,7 +646,7 @@ dprint(FD_VERIFY, "crc64 verify io_u %p, len %u\n", vc->io_u, hdr->len); - c = fio_crc64(p, hdr->len - hdr_size(hdr)); + c = fio_crc64(p, hdr->len - hdr_size(vc->td, hdr)); if (c == vh->crc64) return 0; @@ -523,7 +667,7 @@ 
dprint(FD_VERIFY, "crc32 verify io_u %p, len %u\n", vc->io_u, hdr->len); - c = fio_crc32(p, hdr->len - hdr_size(hdr)); + c = fio_crc32(p, hdr->len - hdr_size(vc->td, hdr)); if (c == vh->crc32) return 0; @@ -544,7 +688,7 @@ dprint(FD_VERIFY, "crc32c verify io_u %p, len %u\n", vc->io_u, hdr->len); - c = fio_crc32c(p, hdr->len - hdr_size(hdr)); + c = fio_crc32c(p, hdr->len - hdr_size(vc->td, hdr)); if (c == vh->crc32) return 0; @@ -569,7 +713,8 @@ dprint(FD_VERIFY, "md5 verify io_u %p, len %u\n", vc->io_u, hdr->len); fio_md5_init(&md5_ctx); - fio_md5_update(&md5_ctx, p, hdr->len - hdr_size(hdr)); + fio_md5_update(&md5_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_md5_final(&md5_ctx); if (!memcmp(vh->md5_digest, md5_ctx.hash, sizeof(hash))) return 0; @@ -585,88 +730,173 @@ /* * Push IO verification to a separate thread */ -int verify_io_u_async(struct thread_data *td, struct io_u *io_u) +int verify_io_u_async(struct thread_data *td, struct io_u **io_u_ptr) { - if (io_u->file) - put_file_log(td, io_u->file); + struct io_u *io_u = *io_u_ptr; pthread_mutex_lock(&td->io_u_lock); + if (io_u->file) + put_file_log(td, io_u->file); + if (io_u->flags & IO_U_F_IN_CUR_DEPTH) { td->cur_depth--; - io_u->flags &= ~IO_U_F_IN_CUR_DEPTH; + io_u_clear(td, io_u, IO_U_F_IN_CUR_DEPTH); } flist_add_tail(&io_u->verify_list, &td->verify_list); - io_u->flags |= IO_U_F_FREE_DEF; - pthread_mutex_unlock(&td->io_u_lock); + *io_u_ptr = NULL; pthread_cond_signal(&td->verify_cond); + pthread_mutex_unlock(&td->io_u_lock); return 0; } +/* + * Thanks Rusty, for spending the time so I don't have to. + * + * http://rusty.ozlabs.org/?p=560 + */ +static int mem_is_zero(const void *data, size_t length) +{ + const unsigned char *p = data; + size_t len; + + /* Check first 16 bytes manually */ + for (len = 0; len < 16; len++) { + if (!length) + return 1; + if (*p) + return 0; + p++; + length--; + } + + /* Now we know that's zero, memcmp with self. 
*/ + return memcmp(data, p, length) == 0; +} + +static int mem_is_zero_slow(const void *data, size_t length, size_t *offset) +{ + const unsigned char *p = data; + + *offset = 0; + while (length) { + if (*p) + break; + (*offset)++; + length--; + p++; + } + + return !length; +} + static int verify_trimmed_io_u(struct thread_data *td, struct io_u *io_u) { - static char zero_buf[1024]; - unsigned int this_len, len; - int ret = 0; - void *p; + size_t offset; if (!td->o.trim_zero) return 0; - len = io_u->buflen; - p = io_u->buf; - do { - this_len = sizeof(zero_buf); - if (this_len > len) - this_len = len; - if (memcmp(p, zero_buf, this_len)) { - ret = EILSEQ; - break; - } - len -= this_len; - p += this_len; - } while (len); - - if (!ret) + if (mem_is_zero(io_u->buf, io_u->buflen)) return 0; - log_err("trim: verify failed at file %s offset %llu, length %lu" + mem_is_zero_slow(io_u->buf, io_u->buflen, &offset); + + log_err("trim: verify failed at file %s offset %llu, length %llu" ", block offset %lu\n", io_u->file->file_name, io_u->offset, io_u->buflen, - (unsigned long) (p - io_u->buf)); - return ret; + (unsigned long) offset); + return EILSEQ; } -static int verify_header(struct io_u *io_u, struct verify_header *hdr) +static int verify_header(struct io_u *io_u, struct thread_data *td, + struct verify_header *hdr, unsigned int hdr_num, + unsigned int hdr_len) { void *p = hdr; uint32_t crc; - if (hdr->magic != FIO_HDR_MAGIC) - return 0; - if (hdr->len > io_u->buflen) { - log_err("fio: verify header exceeds buffer length (%u > %lu)\n", hdr->len, io_u->buflen); - return 0; + if (hdr->magic != FIO_HDR_MAGIC) { + log_err("verify: bad magic header %x, wanted %x", + hdr->magic, FIO_HDR_MAGIC); + goto err; + } + if (hdr->len != hdr_len) { + log_err("verify: bad header length %u, wanted %u", + hdr->len, hdr_len); + goto err; + } + if (hdr->rand_seed != io_u->rand_seed) { + log_err("verify: bad header rand_seed %"PRIu64 + ", wanted %"PRIu64, + hdr->rand_seed, io_u->rand_seed); + 
goto err; + } + if (hdr->offset != io_u->offset + hdr_num * td->o.verify_interval) { + log_err("verify: bad header offset %"PRIu64 + ", wanted %llu", + hdr->offset, io_u->offset); + goto err; } - crc = fio_crc32c(p, offsetof(struct verify_header, crc32)); - if (crc == hdr->crc32) - return 1; + /* + * For read-only workloads, the program cannot be certain of the + * last numberio written to a block. Checking of numberio will be + * done only for workloads that write data. For verify_only, + * numberio will be checked in the last iteration when the correct + * state of numberio, that would have been written to each block + * in a previous run of fio, has been reached. + */ + if (td_write(td) && (td_min_bs(td) == td_max_bs(td)) && + !td->o.time_based) + if (!td->o.verify_only || td->o.loops == 0) + if (hdr->numberio != io_u->numberio) { + log_err("verify: bad header numberio %"PRIu16 + ", wanted %"PRIu16, + hdr->numberio, io_u->numberio); + goto err; + } - log_err("fio: verify header crc %x, calculated %x\n", hdr->crc32, crc); + crc = fio_crc32c(p, offsetof(struct verify_header, crc32)); + if (crc != hdr->crc32) { + log_err("verify: bad header crc %x, calculated %x", + hdr->crc32, crc); + goto err; + } return 0; + +err: + log_err(" at file %s offset %llu, length %u" + " (requested block: offset=%llu, length=%llu)\n", + io_u->file->file_name, + io_u->offset + hdr_num * hdr_len, hdr_len, + io_u->offset, io_u->buflen); + + if (td->o.verify_dump) + dump_buf(p, hdr_len, io_u->offset + hdr_num * hdr_len, + "hdr_fail", io_u->file); + + return EILSEQ; } -int verify_io_u(struct thread_data *td, struct io_u *io_u) +int verify_io_u(struct thread_data *td, struct io_u **io_u_ptr) { struct verify_header *hdr; + struct io_u *io_u = *io_u_ptr; unsigned int header_size, hdr_inc, hdr_num = 0; void *p; int ret; if (td->o.verify == VERIFY_NULL || io_u->ddir != DDIR_READ) return 0; + /* + * If the IO engine is faking IO (like null), then just pretend + * we verified everything. 
+ */ + if (td_ioengine_flagged(td, FIO_FAKEIO)) + return 0; + if (io_u->flags & IO_U_F_TRIMMED) { ret = verify_trimmed_io_u(td, io_u); goto done; @@ -692,13 +922,16 @@ memswp(p, p + td->o.verify_offset, header_size); hdr = p; - if (!verify_header(io_u, hdr)) { - log_err("verify: bad magic header %x, wanted %x at " - "file %s offset %llu, length %u\n", - hdr->magic, FIO_HDR_MAGIC, - io_u->file->file_name, - io_u->offset + hdr_num * hdr->len, hdr->len); - return EILSEQ; + /* + * Make rand_seed check pass when have verify_backlog. + */ + if (!td_rw(td) || (td->flags & TD_F_VER_BACKLOG)) + io_u->rand_seed = hdr->rand_seed; + + if (td->o.verify != VERIFY_PATTERN_NO_HDR) { + ret = verify_header(io_u, td, hdr, hdr_num, hdr_inc); + if (ret) + return ret; } if (td->o.verify != VERIFY_NONE) @@ -707,6 +940,12 @@ verify_type = hdr->verify_type; switch (verify_type) { + case VERIFY_HDR_ONLY: + /* Header is always verified, check if pattern is left + * for verification. */ + if (td->o.verify_pattern_bytes) + ret = verify_io_u_pattern(hdr, &vc); + break; case VERIFY_MD5: ret = verify_io_u_md5(hdr, &vc); break; @@ -732,13 +971,26 @@ case VERIFY_SHA512: ret = verify_io_u_sha512(hdr, &vc); break; - case VERIFY_META: - ret = verify_io_u_meta(hdr, &vc); + case VERIFY_SHA3_224: + ret = verify_io_u_sha3_224(hdr, &vc); + break; + case VERIFY_SHA3_256: + ret = verify_io_u_sha3_256(hdr, &vc); + break; + case VERIFY_SHA3_384: + ret = verify_io_u_sha3_384(hdr, &vc); + break; + case VERIFY_SHA3_512: + ret = verify_io_u_sha3_512(hdr, &vc); + break; + case VERIFY_XXHASH: + ret = verify_io_u_xxhash(hdr, &vc); break; case VERIFY_SHA1: ret = verify_io_u_sha1(hdr, &vc); break; case VERIFY_PATTERN: + case VERIFY_PATTERN_NO_HDR: ret = verify_io_u_pattern(hdr, &vc); break; default: @@ -753,24 +1005,69 @@ done: if (ret && td->o.verify_fatal) - td->terminate = 1; + fio_mark_td_terminate(td); return ret; } -static void fill_meta(struct verify_header *hdr, struct thread_data *td, - struct io_u *io_u, 
unsigned int header_num) +static void fill_xxhash(struct verify_header *hdr, void *p, unsigned int len) { - struct vhdr_meta *vh = hdr_priv(hdr); + struct vhdr_xxhash *vh = hdr_priv(hdr); + void *state; - vh->thread = td->thread_number; + state = XXH32_init(1); + XXH32_update(state, p, len); + vh->hash = XXH32_digest(state); +} + +static void fill_sha3(struct fio_sha3_ctx *sha3_ctx, void *p, unsigned int len) +{ + fio_sha3_update(sha3_ctx, p, len); + fio_sha3_final(sha3_ctx); +} + +static void fill_sha3_224(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_224 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + fio_sha3_224_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + +static void fill_sha3_256(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_256 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; - vh->time_sec = io_u->start_time.tv_sec; - vh->time_usec = io_u->start_time.tv_usec; + fio_sha3_256_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} - vh->numberio = td->io_issues[DDIR_WRITE]; +static void fill_sha3_384(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_384 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; - vh->offset = io_u->offset + header_num * td->o.verify_interval; + fio_sha3_384_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); +} + +static void fill_sha3_512(struct verify_header *hdr, void *p, unsigned int len) +{ + struct vhdr_sha3_512 *vh = hdr_priv(hdr); + struct fio_sha3_ctx sha3_ctx = { + .sha = vh->sha, + }; + + fio_sha3_512_init(&sha3_ctx); + fill_sha3(&sha3_ctx, p, len); } static void fill_sha512(struct verify_header *hdr, void *p, unsigned int len) @@ -793,6 +1090,7 @@ fio_sha256_init(&sha256_ctx); fio_sha256_update(&sha256_ctx, p, len); + fio_sha256_final(&sha256_ctx); } static void fill_sha1(struct verify_header *hdr, void *p, unsigned int len) @@ -804,6 +1102,7 
@@ fio_sha1_init(&sha1_ctx); fio_sha1_update(&sha1_ctx, p, len); + fio_sha1_final(&sha1_ctx); } static void fill_crc7(struct verify_header *hdr, void *p, unsigned int len) @@ -850,6 +1149,34 @@ fio_md5_init(&md5_ctx); fio_md5_update(&md5_ctx, p, len); + fio_md5_final(&md5_ctx); +} + +static void __fill_hdr(struct thread_data *td, struct io_u *io_u, + struct verify_header *hdr, unsigned int header_num, + unsigned int header_len, uint64_t rand_seed) +{ + void *p = hdr; + + hdr->magic = FIO_HDR_MAGIC; + hdr->verify_type = td->o.verify; + hdr->len = header_len; + hdr->rand_seed = rand_seed; + hdr->offset = io_u->offset + header_num * td->o.verify_interval; + hdr->time_sec = io_u->start_time.tv_sec; + hdr->time_nsec = io_u->start_time.tv_nsec; + hdr->thread = td->thread_number; + hdr->numberio = io_u->numberio; + hdr->crc32 = fio_crc32c(p, offsetof(struct verify_header, crc32)); +} + + +static void fill_hdr(struct thread_data *td, struct io_u *io_u, + struct verify_header *hdr, unsigned int header_num, + unsigned int header_len, uint64_t rand_seed) +{ + if (td->o.verify != VERIFY_PATTERN_NO_HDR) + __fill_hdr(td, io_u, hdr, header_num, header_len, rand_seed); } static void populate_hdr(struct thread_data *td, struct io_u *io_u, @@ -857,19 +1184,16 @@ unsigned int header_len) { unsigned int data_len; - void *data, *p; + void *data; + char *p; - p = (void *) hdr; + p = (char *) hdr; - hdr->magic = FIO_HDR_MAGIC; - hdr->verify_type = td->o.verify; - hdr->len = header_len; - hdr->rand_seed = io_u->rand_seed; - hdr->crc32 = fio_crc32c(p, offsetof(struct verify_header, crc32)); + fill_hdr(td, io_u, hdr, header_num, header_len, io_u->rand_seed); - data_len = header_len - hdr_size(hdr); + data_len = header_len - hdr_size(td, hdr); - data = p + hdr_size(hdr); + data = p + hdr_size(td, hdr); switch (td->o.verify) { case VERIFY_MD5: dprint(FD_VERIFY, "fill md5 io_u %p, len %u\n", @@ -912,25 +1236,48 @@ io_u, hdr->len); fill_sha512(hdr, data, data_len); break; - case VERIFY_META: - 
dprint(FD_VERIFY, "fill meta io_u %p, len %u\n", + case VERIFY_SHA3_224: + dprint(FD_VERIFY, "fill sha3-224 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_224(hdr, data, data_len); + break; + case VERIFY_SHA3_256: + dprint(FD_VERIFY, "fill sha3-256 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_256(hdr, data, data_len); + break; + case VERIFY_SHA3_384: + dprint(FD_VERIFY, "fill sha3-384 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_384(hdr, data, data_len); + break; + case VERIFY_SHA3_512: + dprint(FD_VERIFY, "fill sha3-512 io_u %p, len %u\n", + io_u, hdr->len); + fill_sha3_512(hdr, data, data_len); + break; + case VERIFY_XXHASH: + dprint(FD_VERIFY, "fill xxhash io_u %p, len %u\n", io_u, hdr->len); - fill_meta(hdr, td, io_u, header_num); + fill_xxhash(hdr, data, data_len); break; case VERIFY_SHA1: dprint(FD_VERIFY, "fill sha1 io_u %p, len %u\n", io_u, hdr->len); fill_sha1(hdr, data, data_len); break; + case VERIFY_HDR_ONLY: case VERIFY_PATTERN: + case VERIFY_PATTERN_NO_HDR: /* nothing to do here */ break; default: log_err("fio: bad verify type: %d\n", td->o.verify); assert(0); } - if (td->o.verify_offset) - memswp(p, p + td->o.verify_offset, hdr_size(hdr)); + + if (td->o.verify_offset && hdr_size(td, hdr)) + memswp(p, p + td->o.verify_offset, hdr_size(td, hdr)); } /* @@ -942,6 +1289,8 @@ if (td->o.verify == VERIFY_NULL) return; + io_u->numberio = td->io_issues[io_u->ddir]; + fill_pattern_headers(td, io_u, 0, 0); } @@ -956,14 +1305,30 @@ return 0; if (!RB_EMPTY_ROOT(&td->io_hist_tree)) { - struct rb_node *n = rb_first(&td->io_hist_tree); + struct fio_rb_node *n = rb_first(&td->io_hist_tree); ipo = rb_entry(n, struct io_piece, rb_node); + + /* + * Ensure that the associated IO has completed + */ + read_barrier(); + if (ipo->flags & IP_F_IN_FLIGHT) + goto nothing; + rb_erase(n, &td->io_hist_tree); assert(ipo->flags & IP_F_ONRB); ipo->flags &= ~IP_F_ONRB; } else if (!flist_empty(&td->io_hist_list)) { - ipo = flist_entry(td->io_hist_list.next, struct 
io_piece, list); + ipo = flist_first_entry(&td->io_hist_list, struct io_piece, list); + + /* + * Ensure that the associated IO has completed + */ + read_barrier(); + if (ipo->flags & IP_F_IN_FLIGHT) + goto nothing; + flist_del(&ipo->list); assert(ipo->flags & IP_F_ONLIST); ipo->flags &= ~IP_F_ONLIST; @@ -974,11 +1339,12 @@ io_u->offset = ipo->offset; io_u->buflen = ipo->len; + io_u->numberio = ipo->numberio; io_u->file = ipo->file; - io_u->flags |= IO_U_F_VER_LIST; + io_u_set(td, io_u, IO_U_F_VER_LIST); if (ipo->flags & IP_F_TRIMMED) - io_u->flags |= IO_U_F_TRIMMED; + io_u_set(td, io_u, IO_U_F_TRIMMED); if (!fio_file_open(io_u->file)) { int r = td_io_open_file(td, io_u->file); @@ -999,9 +1365,16 @@ remove_trim_entry(td, ipo); free(ipo); dprint(FD_VERIFY, "get_next_verify: ret io_u %p\n", io_u); + + if (!td->o.verify_pattern_bytes) { + io_u->rand_seed = __rand(&td->verify_state); + if (sizeof(int) != sizeof(long *)) + io_u->rand_seed *= __rand(&td->verify_state); + } return 0; } +nothing: dprint(FD_VERIFY, "get_next_verify: empty\n"); return 1; } @@ -1010,6 +1383,7 @@ { if (td->o.verify == VERIFY_CRC32C_INTEL || td->o.verify == VERIFY_CRC32C) { + crc32c_arm64_probe(); crc32c_intel_probe(); } } @@ -1020,7 +1394,7 @@ struct io_u *io_u; int ret = 0; - if (td->o.verify_cpumask_set && + if (fio_option_is_set(&td->o, verify_cpumask) && fio_setaffinity(td->pid, td->o.verify_cpumask)) { log_err("fio: failed setting verify thread affinity\n"); goto done; @@ -1052,10 +1426,12 @@ continue; while (!flist_empty(&list)) { - io_u = flist_entry(list.next, struct io_u, verify_list); - flist_del(&io_u->verify_list); + io_u = flist_first_entry(&list, struct io_u, verify_list); + flist_del_init(&io_u->verify_list); + + io_u_set(td, io_u, IO_U_F_NO_FILE_PUT); + ret = verify_io_u(td, &io_u); - ret = verify_io_u(td, io_u); put_io_u(td, io_u); if (!ret) continue; @@ -1070,15 +1446,15 @@ if (ret) { td_verror(td, ret, "async_verify"); if (td->o.verify_fatal) - td->terminate = 1; + 
fio_mark_td_terminate(td); } done: pthread_mutex_lock(&td->io_u_lock); td->nr_verify_threads--; + pthread_cond_signal(&td->free_cond); pthread_mutex_unlock(&td->io_u_lock); - pthread_cond_signal(&td->free_cond); return NULL; } @@ -1088,7 +1464,7 @@ pthread_attr_t attr; pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN); + pthread_attr_setstacksize(&attr, 2 * PTHREAD_STACK_MIN); td->verify_thread_exit = 0; @@ -1114,9 +1490,12 @@ if (i != td->o.verify_async) { log_err("fio: only %d verify threads started, exiting\n", i); + + pthread_mutex_lock(&td->io_u_lock); td->verify_thread_exit = 1; - write_barrier(); pthread_cond_broadcast(&td->verify_cond); + pthread_mutex_unlock(&td->io_u_lock); + return 1; } @@ -1125,12 +1504,10 @@ void verify_async_exit(struct thread_data *td) { + pthread_mutex_lock(&td->io_u_lock); td->verify_thread_exit = 1; - write_barrier(); pthread_cond_broadcast(&td->verify_cond); - pthread_mutex_lock(&td->io_u_lock); - while (td->nr_verify_threads) pthread_cond_wait(&td->free_cond, &td->io_u_lock); @@ -1138,3 +1515,367 @@ free(td->verify_threads); td->verify_threads = NULL; } + +int paste_blockoff(char *buf, unsigned int len, void *priv) +{ + struct io_u *io = priv; + unsigned long long off; + + typecheck(__typeof__(off), io->offset); + off = cpu_to_le64((uint64_t)io->offset); + len = min(len, (unsigned int)sizeof(off)); + memcpy(buf, &off, len); + return 0; +} + +static int __fill_file_completions(struct thread_data *td, + struct thread_io_list *s, + struct fio_file *f, unsigned int *index) +{ + unsigned int comps; + int i, j; + + if (!f->last_write_comp) + return 0; + + if (td->io_blocks[DDIR_WRITE] < td->o.iodepth) + comps = td->io_blocks[DDIR_WRITE]; + else + comps = td->o.iodepth; + + j = f->last_write_idx - 1; + for (i = 0; i < comps; i++) { + if (j == -1) + j = td->o.iodepth - 1; + s->comps[*index].fileno = __cpu_to_le64(f->fileno); + s->comps[*index].offset = cpu_to_le64(f->last_write_comp[j]); + (*index)++; + 
j--; + } + + return comps; +} + +static int fill_file_completions(struct thread_data *td, + struct thread_io_list *s, unsigned int *index) +{ + struct fio_file *f; + unsigned int i; + int comps = 0; + + for_each_file(td, f, i) + comps += __fill_file_completions(td, s, f, index); + + return comps; +} + +struct all_io_list *get_all_io_list(int save_mask, size_t *sz) +{ + struct all_io_list *rep; + struct thread_data *td; + size_t depth; + void *next; + int i, nr; + + compiletime_assert(sizeof(struct all_io_list) == 8, "all_io_list"); + + /* + * Calculate reply space needed. We need one 'io_state' per thread, + * and the size will vary depending on depth. + */ + depth = 0; + nr = 0; + for_each_td(td, i) { + if (save_mask != IO_LIST_ALL && (i + 1) != save_mask) + continue; + td->stop_io = 1; + td->flags |= TD_F_VSTATE_SAVED; + depth += (td->o.iodepth * td->o.nr_files); + nr++; + } + + if (!nr) + return NULL; + + *sz = sizeof(*rep); + *sz += nr * sizeof(struct thread_io_list); + *sz += depth * sizeof(struct file_comp); + rep = malloc(*sz); + memset(rep, 0, *sz); + + rep->threads = cpu_to_le64((uint64_t) nr); + + next = &rep->state[0]; + for_each_td(td, i) { + struct thread_io_list *s = next; + unsigned int comps, index = 0; + + if (save_mask != IO_LIST_ALL && (i + 1) != save_mask) + continue; + + comps = fill_file_completions(td, s, &index); + + s->no_comps = cpu_to_le64((uint64_t) comps); + s->depth = cpu_to_le64((uint64_t) td->o.iodepth); + s->nofiles = cpu_to_le64((uint64_t) td->o.nr_files); + s->numberio = cpu_to_le64((uint64_t) td->io_issues[DDIR_WRITE]); + s->index = cpu_to_le64((uint64_t) i); + if (td->random_state.use64) { + s->rand.state64.s[0] = cpu_to_le64(td->random_state.state64.s1); + s->rand.state64.s[1] = cpu_to_le64(td->random_state.state64.s2); + s->rand.state64.s[2] = cpu_to_le64(td->random_state.state64.s3); + s->rand.state64.s[3] = cpu_to_le64(td->random_state.state64.s4); + s->rand.state64.s[4] = cpu_to_le64(td->random_state.state64.s5); + 
s->rand.state64.s[5] = 0; + s->rand.use64 = cpu_to_le64((uint64_t)1); + } else { + s->rand.state32.s[0] = cpu_to_le32(td->random_state.state32.s1); + s->rand.state32.s[1] = cpu_to_le32(td->random_state.state32.s2); + s->rand.state32.s[2] = cpu_to_le32(td->random_state.state32.s3); + s->rand.state32.s[3] = 0; + s->rand.use64 = 0; + } + snprintf((char *) s->name, sizeof(s->name), "%s", td->o.name); + next = io_list_next(s); + } + + return rep; +} + +static int open_state_file(const char *name, const char *prefix, int num, + int for_write) +{ + char out[PATH_MAX]; + int flags; + int fd; + + if (for_write) + flags = O_CREAT | O_TRUNC | O_WRONLY | O_SYNC; + else + flags = O_RDONLY; + + verify_state_gen_name(out, sizeof(out), name, prefix, num); + + fd = open(out, flags, 0644); + if (fd == -1) { + perror("fio: open state file"); + log_err("fio: state file: %s (for_write=%d)\n", out, for_write); + return -1; + } + + return fd; +} + +static int write_thread_list_state(struct thread_io_list *s, + const char *prefix) +{ + struct verify_state_hdr hdr; + uint64_t crc; + ssize_t ret; + int fd; + + fd = open_state_file((const char *) s->name, prefix, s->index, 1); + if (fd == -1) + return 1; + + crc = fio_crc32c((void *)s, thread_io_list_sz(s)); + + hdr.version = cpu_to_le64((uint64_t) VSTATE_HDR_VERSION); + hdr.size = cpu_to_le64((uint64_t) thread_io_list_sz(s)); + hdr.crc = cpu_to_le64(crc); + ret = write(fd, &hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) + goto write_fail; + + ret = write(fd, s, thread_io_list_sz(s)); + if (ret != thread_io_list_sz(s)) { +write_fail: + if (ret < 0) + perror("fio: write state file"); + log_err("fio: failed to write state file\n"); + ret = 1; + } else + ret = 0; + + close(fd); + return ret; +} + +void __verify_save_state(struct all_io_list *state, const char *prefix) +{ + struct thread_io_list *s = &state->state[0]; + unsigned int i; + + for (i = 0; i < le64_to_cpu(state->threads); i++) { + write_thread_list_state(s, prefix); + s = 
io_list_next(s); + } +} + +void verify_save_state(int mask) +{ + struct all_io_list *state; + size_t sz; + + state = get_all_io_list(mask, &sz); + if (state) { + char prefix[PATH_MAX]; + + if (aux_path) + sprintf(prefix, "%s%clocal", aux_path, FIO_OS_PATH_SEPARATOR); + else + strcpy(prefix, "local"); + + __verify_save_state(state, prefix); + free(state); + } +} + +void verify_free_state(struct thread_data *td) +{ + if (td->vstate) + free(td->vstate); +} + +void verify_assign_state(struct thread_data *td, void *p) +{ + struct thread_io_list *s = p; + int i; + + s->no_comps = le64_to_cpu(s->no_comps); + s->depth = le32_to_cpu(s->depth); + s->nofiles = le32_to_cpu(s->nofiles); + s->numberio = le64_to_cpu(s->numberio); + s->rand.use64 = le64_to_cpu(s->rand.use64); + + if (s->rand.use64) { + for (i = 0; i < 6; i++) + s->rand.state64.s[i] = le64_to_cpu(s->rand.state64.s[i]); + } else { + for (i = 0; i < 4; i++) + s->rand.state32.s[i] = le32_to_cpu(s->rand.state32.s[i]); + } + + for (i = 0; i < s->no_comps; i++) { + s->comps[i].fileno = le64_to_cpu(s->comps[i].fileno); + s->comps[i].offset = le64_to_cpu(s->comps[i].offset); + } + + td->vstate = p; +} + +int verify_state_hdr(struct verify_state_hdr *hdr, struct thread_io_list *s) +{ + uint64_t crc; + + hdr->version = le64_to_cpu(hdr->version); + hdr->size = le64_to_cpu(hdr->size); + hdr->crc = le64_to_cpu(hdr->crc); + + if (hdr->version != VSTATE_HDR_VERSION) + return 1; + + crc = fio_crc32c((void *)s, hdr->size); + if (crc != hdr->crc) + return 1; + + return 0; +} + +int verify_load_state(struct thread_data *td, const char *prefix) +{ + struct verify_state_hdr hdr; + void *s = NULL; + uint64_t crc; + ssize_t ret; + int fd; + + if (!td->o.verify_state) + return 0; + + fd = open_state_file(td->o.name, prefix, td->thread_number - 1, 0); + if (fd == -1) + return 1; + + ret = read(fd, &hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) { + if (ret < 0) + td_verror(td, errno, "read verify state hdr"); + log_err("fio: failed reading 
verify state header\n"); + goto err; + } + + hdr.version = le64_to_cpu(hdr.version); + hdr.size = le64_to_cpu(hdr.size); + hdr.crc = le64_to_cpu(hdr.crc); + + if (hdr.version != VSTATE_HDR_VERSION) { + log_err("fio: unsupported (%d) version in verify state header\n", + (unsigned int) hdr.version); + goto err; + } + + s = malloc(hdr.size); + ret = read(fd, s, hdr.size); + if (ret != hdr.size) { + if (ret < 0) + td_verror(td, errno, "read verify state"); + log_err("fio: failed reading verity state\n"); + goto err; + } + + crc = fio_crc32c(s, hdr.size); + if (crc != hdr.crc) { + log_err("fio: verify state is corrupt\n"); + goto err; + } + + close(fd); + + verify_assign_state(td, s); + return 0; +err: + if (s) + free(s); + close(fd); + return 1; +} + +/* + * Use the loaded verify state to know when to stop doing verification + */ +int verify_state_should_stop(struct thread_data *td, struct io_u *io_u) +{ + struct thread_io_list *s = td->vstate; + struct fio_file *f = io_u->file; + int i; + + if (!s || !f) + return 0; + + /* + * If we're not into the window of issues - depth yet, continue. If + * issue is shorter than depth, do check. + */ + if ((td->io_blocks[DDIR_READ] < s->depth || + s->numberio - td->io_blocks[DDIR_READ] > s->depth) && + s->numberio > s->depth) + return 0; + + /* + * We're in the window of having to check if this io was + * completed or not. If the IO was seen as completed, then + * lets verify it. 
+ */ + for (i = 0; i < s->no_comps; i++) { + if (s->comps[i].fileno != f->fileno) + continue; + if (io_u->offset == s->comps[i].offset) + return 0; + } + + /* + * Not found, we have to stop + */ + return 1; +} diff -Nru fio-2.1.3/verify.h fio-3.16/verify.h --- fio-2.1.3/verify.h 2013-09-24 14:42:24.000000000 +0000 +++ fio-3.16/verify.h 2019-09-20 01:01:52.000000000 +0000 @@ -2,11 +2,16 @@ #define FIO_VERIFY_H #include <stdint.h> +#include "compiler/compiler.h" +#include "verify-state.h" #define FIO_HDR_MAGIC 0xacca enum { VERIFY_NONE = 0, /* no verification */ + VERIFY_HDR_ONLY, /* verify header only, kept for sake of + * compatibility with old configurations + * which use 'verify=meta' */ VERIFY_MD5, /* md5 sum data blocks */ VERIFY_CRC64, /* crc64 sum data blocks */ VERIFY_CRC32, /* crc32 sum data blocks */ @@ -16,9 +21,14 @@ VERIFY_CRC7, /* crc7 sum data blocks */ VERIFY_SHA256, /* sha256 sum data blocks */ VERIFY_SHA512, /* sha512 sum data blocks */ - VERIFY_META, /* block_num, timestamp etc. 
*/ + VERIFY_SHA3_224, /* sha3-224 sum data blocks */ + VERIFY_SHA3_256, /* sha3-256 sum data blocks */ + VERIFY_SHA3_384, /* sha3-384 sum data blocks */ + VERIFY_SHA3_512, /* sha3-512 sum data blocks */ + VERIFY_XXHASH, /* xxhash sum data blocks */ VERIFY_SHA1, /* sha1 sum data blocks */ VERIFY_PATTERN, /* verify specific patterns */ + VERIFY_PATTERN_NO_HDR, /* verify specific patterns, no hdr */ VERIFY_NULL, /* pretend to verify */ }; @@ -32,12 +42,29 @@ uint16_t verify_type; uint32_t len; uint64_t rand_seed; + uint64_t offset; + uint32_t time_sec; + uint32_t time_nsec; + uint16_t thread; + uint16_t numberio; uint32_t crc32; }; struct vhdr_md5 { uint32_t md5_digest[4]; }; +struct vhdr_sha3_224 { + uint8_t sha[224 / 8]; +}; +struct vhdr_sha3_256 { + uint8_t sha[256 / 8]; +}; +struct vhdr_sha3_384 { + uint8_t sha[384 / 8]; +}; +struct vhdr_sha3_512 { + uint8_t sha[512 / 8]; +}; struct vhdr_sha512 { uint8_t sha512[128]; }; @@ -59,12 +86,8 @@ struct vhdr_crc7 { uint8_t crc7; }; -struct vhdr_meta { - uint64_t offset; - unsigned char thread; - unsigned short numberio; - unsigned long time_sec; - unsigned long time_usec; +struct vhdr_xxhash { + uint32_t hash; }; /* @@ -72,9 +95,10 @@ */ extern void populate_verify_io_u(struct thread_data *, struct io_u *); extern int __must_check get_next_verify(struct thread_data *td, struct io_u *); -extern int __must_check verify_io_u(struct thread_data *, struct io_u *); -extern int verify_io_u_async(struct thread_data *, struct io_u *); -extern void fill_pattern(struct thread_data *td, void *p, unsigned int len, struct io_u *io_u, unsigned long seed, int use_seed); +extern int __must_check verify_io_u(struct thread_data *, struct io_u **); +extern int verify_io_u_async(struct thread_data *, struct io_u **); +extern void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len, struct io_u *io_u, uint64_t seed, int use_seed); +extern void fill_buffer_pattern(struct thread_data *td, void *p, unsigned int len); extern void 
fio_verify_init(struct thread_data *td); /* @@ -83,4 +107,9 @@ extern int verify_async_init(struct thread_data *); extern void verify_async_exit(struct thread_data *); +/* + * Callbacks for pasting formats in the pattern buffer + */ +extern int paste_blockoff(char *buf, unsigned int len, void *priv); + #endif diff -Nru fio-2.1.3/verify-state.h fio-3.16/verify-state.h --- fio-2.1.3/verify-state.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/verify-state.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,109 @@ +#ifndef FIO_VERIFY_STATE_H +#define FIO_VERIFY_STATE_H + +#include <stdint.h> +#include <string.h> +#include <limits.h> +#include "lib/nowarn_snprintf.h" + +struct thread_rand32_state { + uint32_t s[4]; +}; + +struct thread_rand64_state { + uint64_t s[6]; +}; + +struct thread_rand_state { + uint64_t use64; + union { + struct thread_rand32_state state32; + struct thread_rand64_state state64; + }; +}; + +/* + * For dumping current write state + */ +struct file_comp { + uint64_t fileno; + uint64_t offset; +}; + +struct thread_io_list { + uint64_t no_comps; + uint32_t depth; + uint32_t nofiles; + uint64_t numberio; + uint64_t index; + struct thread_rand_state rand; + uint8_t name[64]; + struct file_comp comps[0]; +}; + +struct all_io_list { + uint64_t threads; + struct thread_io_list state[0]; +}; + +#define VSTATE_HDR_VERSION 0x03 + +struct verify_state_hdr { + uint64_t version; + uint64_t size; + uint64_t crc; +}; + +#define IO_LIST_ALL 0xffffffff + +struct io_u; +extern struct all_io_list *get_all_io_list(int, size_t *); +extern void __verify_save_state(struct all_io_list *, const char *); +extern void verify_save_state(int mask); +extern int verify_load_state(struct thread_data *, const char *); +extern void verify_free_state(struct thread_data *); +extern int verify_state_should_stop(struct thread_data *, struct io_u *); +extern void verify_assign_state(struct thread_data *, void *); +extern int verify_state_hdr(struct verify_state_hdr *, struct 
thread_io_list *); + +static inline size_t __thread_io_list_sz(uint32_t depth, uint32_t nofiles) +{ + return sizeof(struct thread_io_list) + depth * nofiles * sizeof(struct file_comp); +} + +static inline size_t thread_io_list_sz(struct thread_io_list *s) +{ + return __thread_io_list_sz(le32_to_cpu(s->depth), le32_to_cpu(s->nofiles)); +} + +static inline struct thread_io_list *io_list_next(struct thread_io_list *s) +{ + return (struct thread_io_list *)((char *) s + thread_io_list_sz(s)); +} + +static inline void verify_state_gen_name(char *out, size_t size, + const char *name, const char *prefix, + int num) +{ + char ename[PATH_MAX]; + char *ptr; + + /* + * Escape '/', just turn them into '.' + */ + ptr = ename; + do { + *ptr = *name; + if (*ptr == '\0') + break; + else if (*ptr == '/') + *ptr = '.'; + ptr++; + name++; + } while (1); + + nowarn_snprintf(out, size, "%s-%s-%d-verify.state", prefix, ename, num); + out[size - 1] = '\0'; +} + +#endif diff -Nru fio-2.1.3/workqueue.c fio-3.16/workqueue.c --- fio-2.1.3/workqueue.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/workqueue.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,371 @@ +/* + * Generic workqueue offload mechanism + * + * Copyright (C) 2015 Jens Axboe <axboe@kernel.dk> + * + */ +#include <unistd.h> + +#include "fio.h" +#include "flist.h" +#include "workqueue.h" +#include "smalloc.h" +#include "pshared.h" + +enum { + SW_F_IDLE = 1 << 0, + SW_F_RUNNING = 1 << 1, + SW_F_EXIT = 1 << 2, + SW_F_ACCOUNTED = 1 << 3, + SW_F_ERROR = 1 << 4, +}; + +static struct submit_worker *__get_submit_worker(struct workqueue *wq, + unsigned int start, + unsigned int end, + struct submit_worker **best) +{ + struct submit_worker *sw = NULL; + + while (start <= end) { + sw = &wq->workers[start]; + if (sw->flags & SW_F_IDLE) + return sw; + if (!(*best) || sw->seq < (*best)->seq) + *best = sw; + start++; + } + + return NULL; +} + +static struct submit_worker *get_submit_worker(struct workqueue *wq) +{ + unsigned int next = 
wq->next_free_worker; + struct submit_worker *sw, *best = NULL; + + assert(next < wq->max_workers); + + sw = __get_submit_worker(wq, next, wq->max_workers - 1, &best); + if (!sw && next) + sw = __get_submit_worker(wq, 0, next - 1, &best); + + /* + * No truly idle found, use best match + */ + if (!sw) + sw = best; + + if (sw->index == wq->next_free_worker) { + if (sw->index + 1 < wq->max_workers) + wq->next_free_worker = sw->index + 1; + else + wq->next_free_worker = 0; + } + + return sw; +} + +static bool all_sw_idle(struct workqueue *wq) +{ + int i; + + for (i = 0; i < wq->max_workers; i++) { + struct submit_worker *sw = &wq->workers[i]; + + if (!(sw->flags & SW_F_IDLE)) + return false; + } + + return true; +} + +/* + * Must be serialized wrt workqueue_enqueue() by caller + */ +void workqueue_flush(struct workqueue *wq) +{ + wq->wake_idle = 1; + + while (!all_sw_idle(wq)) { + pthread_mutex_lock(&wq->flush_lock); + pthread_cond_wait(&wq->flush_cond, &wq->flush_lock); + pthread_mutex_unlock(&wq->flush_lock); + } + + wq->wake_idle = 0; +} + +/* + * Must be serialized by caller. 
+ */ +void workqueue_enqueue(struct workqueue *wq, struct workqueue_work *work) +{ + struct submit_worker *sw; + + sw = get_submit_worker(wq); + assert(sw); + + pthread_mutex_lock(&sw->lock); + flist_add_tail(&work->list, &sw->work_list); + sw->seq = ++wq->work_seq; + sw->flags &= ~SW_F_IDLE; + + pthread_cond_signal(&sw->cond); + pthread_mutex_unlock(&sw->lock); +} + +static void handle_list(struct submit_worker *sw, struct flist_head *list) +{ + struct workqueue *wq = sw->wq; + struct workqueue_work *work; + + while (!flist_empty(list)) { + work = flist_first_entry(list, struct workqueue_work, list); + flist_del_init(&work->list); + wq->ops.fn(sw, work); + } +} + +static void *worker_thread(void *data) +{ + struct submit_worker *sw = data; + struct workqueue *wq = sw->wq; + unsigned int ret = 0; + FLIST_HEAD(local_list); + + sk_out_assign(sw->sk_out); + + if (wq->ops.nice) { + if (nice(wq->ops.nice) < 0) { + log_err("workqueue: nice %s\n", strerror(errno)); + ret = 1; + } + } + + if (!ret) + ret = workqueue_init_worker(sw); + + pthread_mutex_lock(&sw->lock); + sw->flags |= SW_F_RUNNING; + if (ret) + sw->flags |= SW_F_ERROR; + pthread_mutex_unlock(&sw->lock); + + pthread_mutex_lock(&wq->flush_lock); + pthread_cond_signal(&wq->flush_cond); + pthread_mutex_unlock(&wq->flush_lock); + + if (sw->flags & SW_F_ERROR) + goto done; + + while (1) { + pthread_mutex_lock(&sw->lock); + + if (flist_empty(&sw->work_list)) { + if (sw->flags & SW_F_EXIT) { + pthread_mutex_unlock(&sw->lock); + break; + } + + if (workqueue_pre_sleep_check(sw)) { + pthread_mutex_unlock(&sw->lock); + workqueue_pre_sleep(sw); + pthread_mutex_lock(&sw->lock); + } + + /* + * We dropped and reaquired the lock, check + * state again. 
+ */ + if (!flist_empty(&sw->work_list)) + goto handle_work; + + if (sw->flags & SW_F_EXIT) { + pthread_mutex_unlock(&sw->lock); + break; + } else if (!(sw->flags & SW_F_IDLE)) { + sw->flags |= SW_F_IDLE; + wq->next_free_worker = sw->index; + if (wq->wake_idle) + pthread_cond_signal(&wq->flush_cond); + } + + pthread_cond_wait(&sw->cond, &sw->lock); + } else { +handle_work: + flist_splice_init(&sw->work_list, &local_list); + } + pthread_mutex_unlock(&sw->lock); + handle_list(sw, &local_list); + if (wq->ops.update_acct_fn) + wq->ops.update_acct_fn(sw); + } + +done: + sk_out_drop(); + return NULL; +} + +static void free_worker(struct submit_worker *sw, unsigned int *sum_cnt) +{ + struct workqueue *wq = sw->wq; + + workqueue_exit_worker(sw, sum_cnt); + + pthread_cond_destroy(&sw->cond); + pthread_mutex_destroy(&sw->lock); + + if (wq->ops.free_worker_fn) + wq->ops.free_worker_fn(sw); +} + +static void shutdown_worker(struct submit_worker *sw, unsigned int *sum_cnt) +{ + pthread_join(sw->thread, NULL); + free_worker(sw, sum_cnt); +} + +void workqueue_exit(struct workqueue *wq) +{ + unsigned int shutdown, sum_cnt = 0; + struct submit_worker *sw; + int i; + + if (!wq->workers) + return; + + for (i = 0; i < wq->max_workers; i++) { + sw = &wq->workers[i]; + + pthread_mutex_lock(&sw->lock); + sw->flags |= SW_F_EXIT; + pthread_cond_signal(&sw->cond); + pthread_mutex_unlock(&sw->lock); + } + + do { + shutdown = 0; + for (i = 0; i < wq->max_workers; i++) { + sw = &wq->workers[i]; + if (sw->flags & SW_F_ACCOUNTED) + continue; + pthread_mutex_lock(&sw->lock); + sw->flags |= SW_F_ACCOUNTED; + pthread_mutex_unlock(&sw->lock); + shutdown_worker(sw, &sum_cnt); + shutdown++; + } + } while (shutdown && shutdown != wq->max_workers); + + sfree(wq->workers); + wq->workers = NULL; + pthread_mutex_destroy(&wq->flush_lock); + pthread_cond_destroy(&wq->flush_cond); + pthread_mutex_destroy(&wq->stat_lock); +} + +static int start_worker(struct workqueue *wq, unsigned int index, + struct sk_out 
*sk_out) +{ + struct submit_worker *sw = &wq->workers[index]; + int ret; + + INIT_FLIST_HEAD(&sw->work_list); + + ret = mutex_cond_init_pshared(&sw->lock, &sw->cond); + if (ret) + return ret; + + sw->wq = wq; + sw->index = index; + sw->sk_out = sk_out; + + if (wq->ops.alloc_worker_fn) { + ret = wq->ops.alloc_worker_fn(sw); + if (ret) + return ret; + } + + ret = pthread_create(&sw->thread, NULL, worker_thread, sw); + if (!ret) { + pthread_mutex_lock(&sw->lock); + sw->flags = SW_F_IDLE; + pthread_mutex_unlock(&sw->lock); + return 0; + } + + free_worker(sw, NULL); + return 1; +} + +int workqueue_init(struct thread_data *td, struct workqueue *wq, + struct workqueue_ops *ops, unsigned int max_workers, + struct sk_out *sk_out) +{ + unsigned int running; + int i, error; + int ret; + + wq->max_workers = max_workers; + wq->td = td; + wq->ops = *ops; + wq->work_seq = 0; + wq->next_free_worker = 0; + + ret = mutex_cond_init_pshared(&wq->flush_lock, &wq->flush_cond); + if (ret) + goto err; + ret = mutex_init_pshared(&wq->stat_lock); + if (ret) + goto err; + + wq->workers = smalloc(wq->max_workers * sizeof(struct submit_worker)); + if (!wq->workers) + goto err; + + for (i = 0; i < wq->max_workers; i++) + if (start_worker(wq, i, sk_out)) + break; + + wq->max_workers = i; + if (!wq->max_workers) + goto err; + + /* + * Wait for them all to be started and initialized + */ + error = 0; + do { + struct submit_worker *sw; + + running = 0; + pthread_mutex_lock(&wq->flush_lock); + for (i = 0; i < wq->max_workers; i++) { + sw = &wq->workers[i]; + pthread_mutex_lock(&sw->lock); + if (sw->flags & SW_F_RUNNING) + running++; + if (sw->flags & SW_F_ERROR) + error++; + pthread_mutex_unlock(&sw->lock); + } + + if (error || running == wq->max_workers) { + pthread_mutex_unlock(&wq->flush_lock); + break; + } + + pthread_cond_wait(&wq->flush_cond, &wq->flush_lock); + pthread_mutex_unlock(&wq->flush_lock); + } while (1); + + if (!error) + return 0; + +err: + log_err("Can't create rate workqueue\n"); 
+ td_verror(td, ESRCH, "workqueue_init"); + workqueue_exit(wq); + return 1; +} diff -Nru fio-2.1.3/workqueue.h fio-3.16/workqueue.h --- fio-2.1.3/workqueue.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/workqueue.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,119 @@ +#ifndef FIO_RATE_H +#define FIO_RATE_H + +#include <inttypes.h> +#include <pthread.h> + +#include "flist.h" +#include "lib/types.h" + +struct sk_out; +struct thread_data; + +struct workqueue_work { + struct flist_head list; +}; + +struct submit_worker { + pthread_t thread; + pthread_mutex_t lock; + pthread_cond_t cond; + struct flist_head work_list; + unsigned int flags; + unsigned int index; + uint64_t seq; + struct workqueue *wq; + void *priv; + struct sk_out *sk_out; +}; + +typedef int (workqueue_work_fn)(struct submit_worker *, struct workqueue_work *); +typedef bool (workqueue_pre_sleep_flush_fn)(struct submit_worker *); +typedef void (workqueue_pre_sleep_fn)(struct submit_worker *); +typedef int (workqueue_alloc_worker_fn)(struct submit_worker *); +typedef void (workqueue_free_worker_fn)(struct submit_worker *); +typedef int (workqueue_init_worker_fn)(struct submit_worker *); +typedef void (workqueue_exit_worker_fn)(struct submit_worker *, unsigned int *); +typedef void (workqueue_update_acct_fn)(struct submit_worker *); + +struct workqueue_ops { + workqueue_work_fn *fn; + workqueue_pre_sleep_flush_fn *pre_sleep_flush_fn; + workqueue_pre_sleep_fn *pre_sleep_fn; + + workqueue_update_acct_fn *update_acct_fn; + + workqueue_alloc_worker_fn *alloc_worker_fn; + workqueue_free_worker_fn *free_worker_fn; + + workqueue_init_worker_fn *init_worker_fn; + workqueue_exit_worker_fn *exit_worker_fn; + + unsigned int nice; +}; + +struct workqueue { + unsigned int max_workers; + + struct thread_data *td; + struct workqueue_ops ops; + + uint64_t work_seq; + struct submit_worker *workers; + unsigned int next_free_worker; + + pthread_cond_t flush_cond; + pthread_mutex_t flush_lock; + pthread_mutex_t stat_lock; 
+ volatile int wake_idle; +}; + +int workqueue_init(struct thread_data *td, struct workqueue *wq, struct workqueue_ops *ops, unsigned int max_workers, struct sk_out *sk_out); +void workqueue_exit(struct workqueue *wq); + +void workqueue_enqueue(struct workqueue *wq, struct workqueue_work *work); +void workqueue_flush(struct workqueue *wq); + +static inline bool workqueue_pre_sleep_check(struct submit_worker *sw) +{ + struct workqueue *wq = sw->wq; + + if (!wq->ops.pre_sleep_flush_fn) + return false; + + return wq->ops.pre_sleep_flush_fn(sw); +} + +static inline void workqueue_pre_sleep(struct submit_worker *sw) +{ + struct workqueue *wq = sw->wq; + + if (wq->ops.pre_sleep_fn) + wq->ops.pre_sleep_fn(sw); +} + +static inline int workqueue_init_worker(struct submit_worker *sw) +{ + struct workqueue *wq = sw->wq; + + if (!wq->ops.init_worker_fn) + return 0; + + return wq->ops.init_worker_fn(sw); +} + +static inline void workqueue_exit_worker(struct submit_worker *sw, + unsigned int *sum_cnt) +{ + struct workqueue *wq = sw->wq; + unsigned int tmp = 1; + + if (!wq->ops.exit_worker_fn) + return; + + if (!sum_cnt) + sum_cnt = &tmp; + + wq->ops.exit_worker_fn(sw, sum_cnt); +} +#endif diff -Nru fio-2.1.3/zbd.c fio-3.16/zbd.c --- fio-2.1.3/zbd.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/zbd.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,1519 @@ +/* + * Copyright (C) 2018 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. + */ + +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <dirent.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <unistd.h> +#include <linux/blkzoned.h> +#include "file.h" +#include "fio.h" +#include "lib/pow2.h" +#include "log.h" +#include "smalloc.h" +#include "verify.h" +#include "zbd.h" + +/** + * zbd_zone_idx - convert an offset into a zone number + * @f: file pointer. + * @offset: offset in bytes. 
If this offset is in the first zone_size bytes + * past the disk size then the index of the sentinel is returned. + */ +static uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset) +{ + uint32_t zone_idx; + + if (f->zbd_info->zone_size_log2 > 0) + zone_idx = offset >> f->zbd_info->zone_size_log2; + else + zone_idx = offset / f->zbd_info->zone_size; + + return min(zone_idx, f->zbd_info->nr_zones); +} + +/** + * zbd_zone_full - verify whether a minimum number of bytes remain in a zone + * @f: file pointer. + * @z: zone info pointer. + * @required: minimum number of bytes that must remain in a zone. + * + * The caller must hold z->mutex. + */ +static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z, + uint64_t required) +{ + assert((required & 511) == 0); + + return z->type == BLK_ZONE_TYPE_SEQWRITE_REQ && + z->wp + required > z->start + f->zbd_info->zone_size; +} + +static bool is_valid_offset(const struct fio_file *f, uint64_t offset) +{ + return (uint64_t)(offset - f->file_offset) < f->io_size; +} + +/* Verify whether direct I/O is used for all host-managed zoned drives. 
*/ +static bool zbd_using_direct_io(void) +{ + struct thread_data *td; + struct fio_file *f; + int i, j; + + for_each_td(td, i) { + if (td->o.odirect || !(td->o.td_ddir & TD_DDIR_WRITE)) + continue; + for_each_file(td, f, j) { + if (f->zbd_info && + f->zbd_info->model == ZBD_DM_HOST_MANAGED) + return false; + } + } + + return true; +} + +/* Whether or not the I/O range for f includes one or more sequential zones */ +static bool zbd_is_seq_job(struct fio_file *f) +{ + uint32_t zone_idx, zone_idx_b, zone_idx_e; + + assert(f->zbd_info); + if (f->io_size == 0) + return false; + zone_idx_b = zbd_zone_idx(f, f->file_offset); + zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size - 1); + for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++) + if (f->zbd_info->zone_info[zone_idx].type == + BLK_ZONE_TYPE_SEQWRITE_REQ) + return true; + + return false; +} + +/* + * Verify whether offset and size parameters are aligned with zone boundaries. + */ +static bool zbd_verify_sizes(void) +{ + const struct fio_zone_info *z; + struct thread_data *td; + struct fio_file *f; + uint64_t new_offset, new_end; + uint32_t zone_idx; + int i, j; + + for_each_td(td, i) { + for_each_file(td, f, j) { + if (!f->zbd_info) + continue; + if (f->file_offset >= f->real_file_size) + continue; + if (!zbd_is_seq_job(f)) + continue; + + if (!td->o.zone_size) { + td->o.zone_size = f->zbd_info->zone_size; + if (!td->o.zone_size) { + log_err("%s: invalid 0 zone size\n", + f->file_name); + return false; + } + } else if (td->o.zone_size != f->zbd_info->zone_size) { + log_err("%s: job parameter zonesize %llu does not match disk zone size %llu.\n", + f->file_name, (unsigned long long) td->o.zone_size, + (unsigned long long) f->zbd_info->zone_size); + return false; + } + + if (td->o.zone_skip && + (td->o.zone_skip < td->o.zone_size || + td->o.zone_skip % td->o.zone_size)) { + log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n", + f->file_name, (unsigned long long) 
td->o.zone_skip, + (unsigned long long) td->o.zone_size); + return false; + } + + zone_idx = zbd_zone_idx(f, f->file_offset); + z = &f->zbd_info->zone_info[zone_idx]; + if (f->file_offset != z->start) { + new_offset = (z+1)->start; + if (new_offset >= f->file_offset + f->io_size) { + log_info("%s: io_size must be at least one zone\n", + f->file_name); + return false; + } + log_info("%s: rounded up offset from %llu to %llu\n", + f->file_name, (unsigned long long) f->file_offset, + (unsigned long long) new_offset); + f->io_size -= (new_offset - f->file_offset); + f->file_offset = new_offset; + } + zone_idx = zbd_zone_idx(f, f->file_offset + f->io_size); + z = &f->zbd_info->zone_info[zone_idx]; + new_end = z->start; + if (f->file_offset + f->io_size != new_end) { + if (new_end <= f->file_offset) { + log_info("%s: io_size must be at least one zone\n", + f->file_name); + return false; + } + log_info("%s: rounded down io_size from %llu to %llu\n", + f->file_name, (unsigned long long) f->io_size, + (unsigned long long) new_end - f->file_offset); + f->io_size = new_end - f->file_offset; + } + } + } + + return true; +} + +static bool zbd_verify_bs(void) +{ + struct thread_data *td; + struct fio_file *f; + uint32_t zone_size; + int i, j, k; + + for_each_td(td, i) { + for_each_file(td, f, j) { + if (!f->zbd_info) + continue; + zone_size = f->zbd_info->zone_size; + for (k = 0; k < ARRAY_SIZE(td->o.bs); k++) { + if (td->o.verify != VERIFY_NONE && + zone_size % td->o.bs[k] != 0) { + log_info("%s: block size %llu is not a divisor of the zone size %d\n", + f->file_name, td->o.bs[k], + zone_size); + return false; + } + } + } + } + return true; +} + +/* + * Read zone information into @buf starting from sector @start_sector. + * @fd is a file descriptor that refers to a block device and @bufsz is the + * size of @buf. + * + * Returns 0 upon success and a negative error code upon failure. + * If the zone report is empty, always assume an error (device problem) and + * return -EIO. 
+ */ +static int read_zone_info(int fd, uint64_t start_sector, + void *buf, unsigned int bufsz) +{ + struct blk_zone_report *hdr = buf; + int ret; + + if (bufsz < sizeof(*hdr)) + return -EINVAL; + + memset(hdr, 0, sizeof(*hdr)); + + hdr->nr_zones = (bufsz - sizeof(*hdr)) / sizeof(struct blk_zone); + hdr->sector = start_sector; + ret = ioctl(fd, BLKREPORTZONE, hdr); + if (ret) + return -errno; + if (!hdr->nr_zones) + return -EIO; + return 0; +} + +/* + * Read up to 255 characters from the first line of a file. Strip the trailing + * newline. + */ +static char *read_file(const char *path) +{ + char line[256], *p = line; + FILE *f; + + f = fopen(path, "rb"); + if (!f) + return NULL; + if (!fgets(line, sizeof(line), f)) + line[0] = '\0'; + strsep(&p, "\n"); + fclose(f); + + return strdup(line); +} + +static enum blk_zoned_model get_zbd_model(const char *file_name) +{ + enum blk_zoned_model model = ZBD_DM_NONE; + char *zoned_attr_path = NULL; + char *model_str = NULL; + struct stat statbuf; + char *sys_devno_path = NULL; + char *part_attr_path = NULL; + char *part_str = NULL; + char sys_path[PATH_MAX]; + ssize_t sz; + char *delim = NULL; + + if (stat(file_name, &statbuf) < 0) + goto out; + + if (asprintf(&sys_devno_path, "/sys/dev/block/%d:%d", + major(statbuf.st_rdev), minor(statbuf.st_rdev)) < 0) + goto out; + + sz = readlink(sys_devno_path, sys_path, sizeof(sys_path) - 1); + if (sz < 0) + goto out; + sys_path[sz] = '\0'; + + /* + * If the device is a partition device, cut the device name in the + * canonical sysfs path to obtain the sysfs path of the holder device. 
+ * e.g.: /sys/devices/.../sda/sda1 -> /sys/devices/.../sda + */ + if (asprintf(&part_attr_path, "/sys/dev/block/%s/partition", + sys_path) < 0) + goto out; + part_str = read_file(part_attr_path); + if (part_str && *part_str == '1') { + delim = strrchr(sys_path, '/'); + if (!delim) + goto out; + *delim = '\0'; + } + + if (asprintf(&zoned_attr_path, + "/sys/dev/block/%s/queue/zoned", sys_path) < 0) + goto out; + + model_str = read_file(zoned_attr_path); + if (!model_str) + goto out; + dprint(FD_ZBD, "%s: zbd model string: %s\n", file_name, model_str); + if (strcmp(model_str, "host-aware") == 0) + model = ZBD_DM_HOST_AWARE; + else if (strcmp(model_str, "host-managed") == 0) + model = ZBD_DM_HOST_MANAGED; + +out: + free(model_str); + free(zoned_attr_path); + free(part_str); + free(part_attr_path); + free(sys_devno_path); + return model; +} + +static int ilog2(uint64_t i) +{ + int log = -1; + + while (i) { + i >>= 1; + log++; + } + return log; +} + +/* + * Initialize f->zbd_info for devices that are not zoned block devices. This + * allows to execute a ZBD workload against a non-ZBD device. 
+ */ +static int init_zone_info(struct thread_data *td, struct fio_file *f) +{ + uint32_t nr_zones; + struct fio_zone_info *p; + uint64_t zone_size = td->o.zone_size; + struct zoned_block_device_info *zbd_info = NULL; + pthread_mutexattr_t attr; + int i; + + if (zone_size == 0) { + log_err("%s: Specifying the zone size is mandatory for regular block devices with --zonemode=zbd\n\n", + f->file_name); + return 1; + } + + if (zone_size < 512) { + log_err("%s: zone size must be at least 512 bytes for --zonemode=zbd\n\n", + f->file_name); + return 1; + } + + nr_zones = (f->real_file_size + zone_size - 1) / zone_size; + zbd_info = scalloc(1, sizeof(*zbd_info) + + (nr_zones + 1) * sizeof(zbd_info->zone_info[0])); + if (!zbd_info) + return -ENOMEM; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutexattr_setpshared(&attr, true); + pthread_mutex_init(&zbd_info->mutex, &attr); + zbd_info->refcount = 1; + p = &zbd_info->zone_info[0]; + for (i = 0; i < nr_zones; i++, p++) { + pthread_mutex_init(&p->mutex, &attr); + p->start = i * zone_size; + p->wp = p->start + zone_size; + p->type = BLK_ZONE_TYPE_SEQWRITE_REQ; + p->cond = BLK_ZONE_COND_EMPTY; + } + /* a sentinel */ + p->start = nr_zones * zone_size; + + f->zbd_info = zbd_info; + f->zbd_info->zone_size = zone_size; + f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ? + ilog2(zone_size) : -1; + f->zbd_info->nr_zones = nr_zones; + pthread_mutexattr_destroy(&attr); + return 0; +} + +/* + * Parse the BLKREPORTZONE output and store it in f->zbd_info. Must be called + * only for devices that support this ioctl, namely zoned block devices. 
+ */ +static int parse_zone_info(struct thread_data *td, struct fio_file *f) +{ + const unsigned int bufsz = sizeof(struct blk_zone_report) + + 4096 * sizeof(struct blk_zone); + uint32_t nr_zones; + struct blk_zone_report *hdr; + const struct blk_zone *z; + struct fio_zone_info *p; + uint64_t zone_size, start_sector; + struct zoned_block_device_info *zbd_info = NULL; + pthread_mutexattr_t attr; + void *buf; + int fd, i, j, ret = 0; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutexattr_setpshared(&attr, true); + + buf = malloc(bufsz); + if (!buf) + goto out; + + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) { + ret = -errno; + goto free; + } + + ret = read_zone_info(fd, 0, buf, bufsz); + if (ret < 0) { + log_info("fio: BLKREPORTZONE(%lu) failed for %s (%d).\n", + 0UL, f->file_name, -ret); + goto close; + } + hdr = buf; + if (hdr->nr_zones < 1) { + log_info("fio: %s has invalid zone information.\n", + f->file_name); + goto close; + } + z = (void *)(hdr + 1); + zone_size = z->len << 9; + nr_zones = (f->real_file_size + zone_size - 1) / zone_size; + + if (td->o.zone_size == 0) { + td->o.zone_size = zone_size; + } else if (td->o.zone_size != zone_size) { + log_err("fio: %s job parameter zonesize %llu does not match disk zone size %llu.\n", + f->file_name, (unsigned long long) td->o.zone_size, + (unsigned long long) zone_size); + ret = -EINVAL; + goto close; + } + + dprint(FD_ZBD, "Device %s has %d zones of size %llu KB\n", f->file_name, + nr_zones, (unsigned long long) zone_size / 1024); + + zbd_info = scalloc(1, sizeof(*zbd_info) + + (nr_zones + 1) * sizeof(zbd_info->zone_info[0])); + ret = -ENOMEM; + if (!zbd_info) + goto close; + pthread_mutex_init(&zbd_info->mutex, &attr); + zbd_info->refcount = 1; + p = &zbd_info->zone_info[0]; + for (start_sector = 0, j = 0; j < nr_zones;) { + z = (void *)(hdr + 1); + for (i = 0; i < hdr->nr_zones; i++, j++, z++, p++) { + pthread_mutex_init(&p->mutex, 
&attr); + p->start = z->start << 9; + switch (z->cond) { + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_FULL: + p->wp = p->start + zone_size; + break; + default: + assert(z->start <= z->wp); + assert(z->wp <= z->start + (zone_size >> 9)); + p->wp = z->wp << 9; + break; + } + p->type = z->type; + p->cond = z->cond; + if (j > 0 && p->start != p[-1].start + zone_size) { + log_info("%s: invalid zone data\n", + f->file_name); + ret = -EINVAL; + goto close; + } + } + z--; + start_sector = z->start + z->len; + if (j >= nr_zones) + break; + ret = read_zone_info(fd, start_sector, buf, bufsz); + if (ret < 0) { + log_info("fio: BLKREPORTZONE(%llu) failed for %s (%d).\n", + (unsigned long long) start_sector, f->file_name, -ret); + goto close; + } + } + /* a sentinel */ + zbd_info->zone_info[nr_zones].start = start_sector << 9; + + f->zbd_info = zbd_info; + f->zbd_info->zone_size = zone_size; + f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ? + ilog2(zone_size) : -1; + f->zbd_info->nr_zones = nr_zones; + zbd_info = NULL; + ret = 0; + +close: + sfree(zbd_info); + close(fd); +free: + free(buf); +out: + pthread_mutexattr_destroy(&attr); + return ret; +} + +/* + * Allocate zone information and store it into f->zbd_info if zonemode=zbd. + * + * Returns 0 upon success and a negative error code upon failure. 
+ */ +static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) +{ + enum blk_zoned_model zbd_model; + int ret = 0; + + assert(td->o.zone_mode == ZONE_MODE_ZBD); + + zbd_model = get_zbd_model(f->file_name); + switch (zbd_model) { + case ZBD_DM_HOST_AWARE: + case ZBD_DM_HOST_MANAGED: + ret = parse_zone_info(td, f); + break; + case ZBD_DM_NONE: + ret = init_zone_info(td, f); + break; + } + if (ret == 0) + f->zbd_info->model = zbd_model; + return ret; +} + +void zbd_free_zone_info(struct fio_file *f) +{ + uint32_t refcount; + + if (!f->zbd_info) + return; + + pthread_mutex_lock(&f->zbd_info->mutex); + refcount = --f->zbd_info->refcount; + pthread_mutex_unlock(&f->zbd_info->mutex); + + assert((int32_t)refcount >= 0); + if (refcount == 0) + sfree(f->zbd_info); + f->zbd_info = NULL; +} + +/* + * Initialize f->zbd_info. + * + * Returns 0 upon success and a negative error code upon failure. + * + * Note: this function can only work correctly if it is called before the first + * fio fork() call. 
+ */ +static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file) +{ + struct thread_data *td2; + struct fio_file *f2; + int i, j, ret; + + for_each_td(td2, i) { + for_each_file(td2, f2, j) { + if (td2 == td && f2 == file) + continue; + if (!f2->zbd_info || + strcmp(f2->file_name, file->file_name) != 0) + continue; + file->zbd_info = f2->zbd_info; + file->zbd_info->refcount++; + return 0; + } + } + + ret = zbd_create_zone_info(td, file); + if (ret < 0) + td_verror(td, -ret, "zbd_create_zone_info() failed"); + return ret; +} + +int zbd_init(struct thread_data *td) +{ + struct fio_file *f; + int i; + + for_each_file(td, f, i) { + if (f->filetype != FIO_TYPE_BLOCK) + continue; + if (zbd_init_zone_info(td, f)) + return 1; + } + + if (!zbd_using_direct_io()) { + log_err("Using direct I/O is mandatory for writing to ZBD drives\n\n"); + return 1; + } + + if (!zbd_verify_sizes()) + return 1; + + if (!zbd_verify_bs()) + return 1; + + return 0; +} + +/** + * zbd_reset_range - reset zones for a range of sectors + * @td: FIO thread data. + * @f: Fio file for which to reset zones + * @sector: Starting sector in units of 512 bytes + * @nr_sectors: Number of sectors in units of 512 bytes + * + * Returns 0 upon success and a negative error code upon failure. 
+ */ +static int zbd_reset_range(struct thread_data *td, const struct fio_file *f, + uint64_t offset, uint64_t length) +{ + struct blk_zone_range zr = { + .sector = offset >> 9, + .nr_sectors = length >> 9, + }; + uint32_t zone_idx_b, zone_idx_e; + struct fio_zone_info *zb, *ze, *z; + int ret = 0; + + assert(f->fd != -1); + assert(is_valid_offset(f, offset + length - 1)); + switch (f->zbd_info->model) { + case ZBD_DM_HOST_AWARE: + case ZBD_DM_HOST_MANAGED: + ret = ioctl(f->fd, BLKRESETZONE, &zr); + if (ret < 0) { + td_verror(td, errno, "resetting wp failed"); + log_err("%s: resetting wp for %llu sectors at sector %llu failed (%d).\n", + f->file_name, zr.nr_sectors, zr.sector, errno); + return ret; + } + break; + case ZBD_DM_NONE: + break; + } + + zone_idx_b = zbd_zone_idx(f, offset); + zb = &f->zbd_info->zone_info[zone_idx_b]; + zone_idx_e = zbd_zone_idx(f, offset + length); + ze = &f->zbd_info->zone_info[zone_idx_e]; + for (z = zb; z < ze; z++) { + pthread_mutex_lock(&z->mutex); + pthread_mutex_lock(&f->zbd_info->mutex); + f->zbd_info->sectors_with_data -= z->wp - z->start; + pthread_mutex_unlock(&f->zbd_info->mutex); + z->wp = z->start; + z->verify_block = 0; + pthread_mutex_unlock(&z->mutex); + } + + td->ts.nr_zone_resets += ze - zb; + + return ret; +} + +static unsigned int zbd_zone_nr(struct zoned_block_device_info *zbd_info, + struct fio_zone_info *zone) +{ + return zone - zbd_info->zone_info; +} + +/** + * zbd_reset_zone - reset the write pointer of a single zone + * @td: FIO thread data. + * @f: FIO file associated with the disk for which to reset a write pointer. + * @z: Zone to reset. + * + * Returns 0 upon success and a negative error code upon failure. 
+ */ +static int zbd_reset_zone(struct thread_data *td, const struct fio_file *f, + struct fio_zone_info *z) +{ + dprint(FD_ZBD, "%s: resetting wp of zone %u.\n", f->file_name, + zbd_zone_nr(f->zbd_info, z)); + + return zbd_reset_range(td, f, z->start, (z+1)->start - z->start); +} + +/* + * Reset a range of zones. Returns 0 upon success and 1 upon failure. + * @td: fio thread data. + * @f: fio file for which to reset zones + * @zb: first zone to reset. + * @ze: first zone not to reset. + * @all_zones: whether to reset all zones or only those zones for which the + * write pointer is not a multiple of td->o.min_bs[DDIR_WRITE]. + */ +static int zbd_reset_zones(struct thread_data *td, struct fio_file *f, + struct fio_zone_info *const zb, + struct fio_zone_info *const ze, bool all_zones) +{ + struct fio_zone_info *z, *start_z = ze; + const uint32_t min_bs = td->o.min_bs[DDIR_WRITE]; + bool reset_wp; + int res = 0; + + dprint(FD_ZBD, "%s: examining zones %u .. %u\n", f->file_name, + zbd_zone_nr(f->zbd_info, zb), zbd_zone_nr(f->zbd_info, ze)); + assert(f->fd != -1); + for (z = zb; z < ze; z++) { + pthread_mutex_lock(&z->mutex); + switch (z->type) { + case BLK_ZONE_TYPE_SEQWRITE_REQ: + reset_wp = all_zones ? z->wp != z->start : + (td->o.td_ddir & TD_DDIR_WRITE) && + z->wp % min_bs != 0; + if (start_z == ze && reset_wp) { + start_z = z; + } else if (start_z < ze && !reset_wp) { + dprint(FD_ZBD, + "%s: resetting zones %u .. %u\n", + f->file_name, + zbd_zone_nr(f->zbd_info, start_z), + zbd_zone_nr(f->zbd_info, z)); + if (zbd_reset_range(td, f, start_z->start, + z->start - start_z->start) < 0) + res = 1; + start_z = ze; + } + break; + default: + if (start_z == ze) + break; + dprint(FD_ZBD, "%s: resetting zones %u .. 
%u\n", + f->file_name, zbd_zone_nr(f->zbd_info, start_z), + zbd_zone_nr(f->zbd_info, z)); + if (zbd_reset_range(td, f, start_z->start, + z->start - start_z->start) < 0) + res = 1; + start_z = ze; + break; + } + } + if (start_z < ze) { + dprint(FD_ZBD, "%s: resetting zones %u .. %u\n", f->file_name, + zbd_zone_nr(f->zbd_info, start_z), + zbd_zone_nr(f->zbd_info, z)); + if (zbd_reset_range(td, f, start_z->start, + z->start - start_z->start) < 0) + res = 1; + } + for (z = zb; z < ze; z++) + pthread_mutex_unlock(&z->mutex); + + return res; +} + +/* + * Reset zbd_info.write_cnt, the counter that counts down towards the next + * zone reset. + */ +static void zbd_reset_write_cnt(const struct thread_data *td, + const struct fio_file *f) +{ + assert(0 <= td->o.zrf.u.f && td->o.zrf.u.f <= 1); + + pthread_mutex_lock(&f->zbd_info->mutex); + f->zbd_info->write_cnt = td->o.zrf.u.f ? + min(1.0 / td->o.zrf.u.f, 0.0 + UINT_MAX) : UINT_MAX; + pthread_mutex_unlock(&f->zbd_info->mutex); +} + +static bool zbd_dec_and_reset_write_cnt(const struct thread_data *td, + const struct fio_file *f) +{ + uint32_t write_cnt = 0; + + pthread_mutex_lock(&f->zbd_info->mutex); + assert(f->zbd_info->write_cnt); + if (f->zbd_info->write_cnt) + write_cnt = --f->zbd_info->write_cnt; + if (write_cnt == 0) + zbd_reset_write_cnt(td, f); + pthread_mutex_unlock(&f->zbd_info->mutex); + + return write_cnt == 0; +} + +enum swd_action { + CHECK_SWD, + SET_SWD, +}; + +/* Calculate the number of sectors with data (swd) and perform action 'a' */ +static uint64_t zbd_process_swd(const struct fio_file *f, enum swd_action a) +{ + struct fio_zone_info *zb, *ze, *z; + uint64_t swd = 0; + + zb = &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset)]; + ze = &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset + + f->io_size)]; + for (z = zb; z < ze; z++) { + pthread_mutex_lock(&z->mutex); + swd += z->wp - z->start; + } + pthread_mutex_lock(&f->zbd_info->mutex); + switch (a) { + case CHECK_SWD: + 
assert(f->zbd_info->sectors_with_data == swd); + break; + case SET_SWD: + f->zbd_info->sectors_with_data = swd; + break; + } + pthread_mutex_unlock(&f->zbd_info->mutex); + for (z = zb; z < ze; z++) + pthread_mutex_unlock(&z->mutex); + + return swd; +} + +/* + * The swd check is useful for debugging but takes too much time to leave + * it enabled all the time. Hence it is disabled by default. + */ +static const bool enable_check_swd = false; + +/* Check whether the value of zbd_info.sectors_with_data is correct. */ +static void zbd_check_swd(const struct fio_file *f) +{ + if (!enable_check_swd) + return; + + zbd_process_swd(f, CHECK_SWD); +} + +static void zbd_init_swd(struct fio_file *f) +{ + uint64_t swd; + + swd = zbd_process_swd(f, SET_SWD); + dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n", __func__, f->file_name, + swd); +} + +void zbd_file_reset(struct thread_data *td, struct fio_file *f) +{ + struct fio_zone_info *zb, *ze; + uint32_t zone_idx_e; + + if (!f->zbd_info) + return; + + zb = &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset)]; + zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size); + ze = &f->zbd_info->zone_info[zone_idx_e]; + zbd_init_swd(f); + /* + * If data verification is enabled reset the affected zones before + * writing any data to avoid that a zone reset has to be issued while + * writing data, which causes data loss. + */ + zbd_reset_zones(td, f, zb, ze, td->o.verify != VERIFY_NONE && + (td->o.td_ddir & TD_DDIR_WRITE) && + td->runstate != TD_VERIFYING); + zbd_reset_write_cnt(td, f); +} + +/* The caller must hold f->zbd_info->mutex. 
*/ +static bool is_zone_open(const struct thread_data *td, const struct fio_file *f, + unsigned int zone_idx) +{ + struct zoned_block_device_info *zbdi = f->zbd_info; + int i; + + assert(td->o.max_open_zones <= ARRAY_SIZE(zbdi->open_zones)); + assert(zbdi->num_open_zones <= td->o.max_open_zones); + + for (i = 0; i < zbdi->num_open_zones; i++) + if (zbdi->open_zones[i] == zone_idx) + return true; + + return false; +} + +/* + * Open a ZBD zone if it was not yet open. Returns true if either the zone was + * already open or if opening a new zone is allowed. Returns false if the zone + * was not yet open and opening a new zone would cause the zone limit to be + * exceeded. + */ +static bool zbd_open_zone(struct thread_data *td, const struct io_u *io_u, + uint32_t zone_idx) +{ + const uint32_t min_bs = td->o.min_bs[DDIR_WRITE]; + const struct fio_file *f = io_u->file; + struct fio_zone_info *z = &f->zbd_info->zone_info[zone_idx]; + bool res = true; + + if (z->cond == BLK_ZONE_COND_OFFLINE) + return false; + + /* + * Skip full zones with data verification enabled because resetting a + * zone causes data loss and hence causes verification to fail. 
+ */ + if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs)) + return false; + + /* Zero means no limit */ + if (!td->o.max_open_zones) + return true; + + pthread_mutex_lock(&f->zbd_info->mutex); + if (is_zone_open(td, f, zone_idx)) + goto out; + res = false; + if (f->zbd_info->num_open_zones >= td->o.max_open_zones) + goto out; + dprint(FD_ZBD, "%s: opening zone %d\n", f->file_name, zone_idx); + f->zbd_info->open_zones[f->zbd_info->num_open_zones++] = zone_idx; + z->open = 1; + res = true; + +out: + pthread_mutex_unlock(&f->zbd_info->mutex); + return res; +} + +/* The caller must hold f->zbd_info->mutex */ +static void zbd_close_zone(struct thread_data *td, const struct fio_file *f, + unsigned int open_zone_idx) +{ + uint32_t zone_idx; + + assert(open_zone_idx < f->zbd_info->num_open_zones); + zone_idx = f->zbd_info->open_zones[open_zone_idx]; + memmove(f->zbd_info->open_zones + open_zone_idx, + f->zbd_info->open_zones + open_zone_idx + 1, + (FIO_MAX_OPEN_ZBD_ZONES - (open_zone_idx + 1)) * + sizeof(f->zbd_info->open_zones[0])); + f->zbd_info->num_open_zones--; + f->zbd_info->zone_info[zone_idx].open = 0; +} + +/* + * Modify the offset of an I/O unit that does not refer to an open zone such + * that it refers to an open zone. Close an open zone and open a new zone if + * necessary. This algorithm can only work correctly if all write pointers are + * a multiple of the fio block size. The caller must neither hold z->mutex + * nor f->zbd_info->mutex. Returns with z->mutex held upon success. + */ +static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td, + struct io_u *io_u) +{ + const uint32_t min_bs = td->o.min_bs[io_u->ddir]; + const struct fio_file *f = io_u->file; + struct fio_zone_info *z; + unsigned int open_zone_idx = -1; + uint32_t zone_idx, new_zone_idx; + int i; + + assert(is_valid_offset(f, io_u->offset)); + + if (td->o.max_open_zones) { + /* + * This statement accesses f->zbd_info->open_zones[] on purpose + * without locking. 
+ */ + zone_idx = f->zbd_info->open_zones[(io_u->offset - + f->file_offset) * + f->zbd_info->num_open_zones / f->io_size]; + } else { + zone_idx = zbd_zone_idx(f, io_u->offset); + } + dprint(FD_ZBD, "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n", + __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen); + + /* + * Since z->mutex is the outer lock and f->zbd_info->mutex the inner + * lock it can happen that the state of the zone with index zone_idx + * has changed after 'z' has been assigned and before f->zbd_info->mutex + * has been obtained. Hence the loop. + */ + for (;;) { + z = &f->zbd_info->zone_info[zone_idx]; + + pthread_mutex_lock(&z->mutex); + pthread_mutex_lock(&f->zbd_info->mutex); + if (td->o.max_open_zones == 0) + goto examine_zone; + if (f->zbd_info->num_open_zones == 0) { + pthread_mutex_unlock(&f->zbd_info->mutex); + pthread_mutex_unlock(&z->mutex); + dprint(FD_ZBD, "%s(%s): no zones are open\n", + __func__, f->file_name); + return NULL; + } + open_zone_idx = (io_u->offset - f->file_offset) * + f->zbd_info->num_open_zones / f->io_size; + assert(open_zone_idx < f->zbd_info->num_open_zones); + new_zone_idx = f->zbd_info->open_zones[open_zone_idx]; + if (new_zone_idx == zone_idx) + break; + zone_idx = new_zone_idx; + pthread_mutex_unlock(&f->zbd_info->mutex); + pthread_mutex_unlock(&z->mutex); + } + + /* Both z->mutex and f->zbd_info->mutex are held. */ + +examine_zone: + if (z->wp + min_bs <= (z+1)->start) { + pthread_mutex_unlock(&f->zbd_info->mutex); + goto out; + } + dprint(FD_ZBD, "%s(%s): closing zone %d\n", __func__, f->file_name, + zone_idx); + if (td->o.max_open_zones) + zbd_close_zone(td, f, open_zone_idx); + pthread_mutex_unlock(&f->zbd_info->mutex); + + /* Only z->mutex is held. */ + + /* Zone 'z' is full, so try to open a new zone. */ + for (i = f->io_size / f->zbd_info->zone_size; i > 0; i--) { + zone_idx++; + pthread_mutex_unlock(&z->mutex); + z++; + if (!is_valid_offset(f, z->start)) { + /* Wrap-around. 
*/ + zone_idx = zbd_zone_idx(f, f->file_offset); + z = &f->zbd_info->zone_info[zone_idx]; + } + assert(is_valid_offset(f, z->start)); + pthread_mutex_lock(&z->mutex); + if (z->open) + continue; + if (zbd_open_zone(td, io_u, zone_idx)) + goto out; + } + + /* Only z->mutex is held. */ + + /* Check whether the write fits in any of the already opened zones. */ + pthread_mutex_lock(&f->zbd_info->mutex); + for (i = 0; i < f->zbd_info->num_open_zones; i++) { + zone_idx = f->zbd_info->open_zones[i]; + pthread_mutex_unlock(&f->zbd_info->mutex); + pthread_mutex_unlock(&z->mutex); + + z = &f->zbd_info->zone_info[zone_idx]; + + pthread_mutex_lock(&z->mutex); + if (z->wp + min_bs <= (z+1)->start) + goto out; + pthread_mutex_lock(&f->zbd_info->mutex); + } + pthread_mutex_unlock(&f->zbd_info->mutex); + pthread_mutex_unlock(&z->mutex); + dprint(FD_ZBD, "%s(%s): did not open another zone\n", __func__, + f->file_name); + return NULL; + +out: + dprint(FD_ZBD, "%s(%s): returning zone %d\n", __func__, f->file_name, + zone_idx); + io_u->offset = z->start; + return z; +} + +/* The caller must hold z->mutex. */ +static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td, + struct io_u *io_u, + struct fio_zone_info *z) +{ + const struct fio_file *f = io_u->file; + const uint32_t min_bs = td->o.min_bs[DDIR_WRITE]; + + if (!zbd_open_zone(td, io_u, z - f->zbd_info->zone_info)) { + pthread_mutex_unlock(&z->mutex); + z = zbd_convert_to_open_zone(td, io_u); + assert(z); + } + + if (z->verify_block * min_bs >= f->zbd_info->zone_size) + log_err("%s: %d * %d >= %llu\n", f->file_name, z->verify_block, + min_bs, (unsigned long long) f->zbd_info->zone_size); + io_u->offset = z->start + z->verify_block++ * min_bs; + return z; +} + +/* + * Find another zone for which @io_u fits below the write pointer. Start + * searching in zones @zb + 1 .. @zl and continue searching in zones + * @zf .. @zb - 1. + * + * Either returns NULL or returns a zone pointer and holds the mutex for that + * zone. 
+ */ +static struct fio_zone_info * +zbd_find_zone(struct thread_data *td, struct io_u *io_u, + struct fio_zone_info *zb, struct fio_zone_info *zl) +{ + const uint32_t min_bs = td->o.min_bs[io_u->ddir]; + const struct fio_file *f = io_u->file; + struct fio_zone_info *z1, *z2; + const struct fio_zone_info *const zf = + &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset)]; + + /* + * Skip to the next non-empty zone in case of sequential I/O and to + * the nearest non-empty zone in case of random I/O. + */ + for (z1 = zb + 1, z2 = zb - 1; z1 < zl || z2 >= zf; z1++, z2--) { + if (z1 < zl && z1->cond != BLK_ZONE_COND_OFFLINE) { + pthread_mutex_lock(&z1->mutex); + if (z1->start + min_bs <= z1->wp) + return z1; + pthread_mutex_unlock(&z1->mutex); + } else if (!td_random(td)) { + break; + } + if (td_random(td) && z2 >= zf && + z2->cond != BLK_ZONE_COND_OFFLINE) { + pthread_mutex_lock(&z2->mutex); + if (z2->start + min_bs <= z2->wp) + return z2; + pthread_mutex_unlock(&z2->mutex); + } + } + dprint(FD_ZBD, "%s: adjusting random read offset failed\n", + f->file_name); + return NULL; +} + +/** + * zbd_queue_io - update the write pointer of a sequential zone + * @io_u: I/O unit + * @success: Whether or not the I/O unit has been queued successfully + * @q: queueing status (busy, completed or queued). + * + * For write and trim operations, update the write pointer of the I/O unit + * target zone. 
+ */ +static void zbd_queue_io(struct io_u *io_u, int q, bool success) +{ + const struct fio_file *f = io_u->file; + struct zoned_block_device_info *zbd_info = f->zbd_info; + struct fio_zone_info *z; + uint32_t zone_idx; + uint64_t zone_end; + + if (!zbd_info) + return; + + zone_idx = zbd_zone_idx(f, io_u->offset); + assert(zone_idx < zbd_info->nr_zones); + z = &zbd_info->zone_info[zone_idx]; + + if (z->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return; + + if (!success) + goto unlock; + + dprint(FD_ZBD, + "%s: queued I/O (%lld, %llu) for zone %u\n", + f->file_name, io_u->offset, io_u->buflen, zone_idx); + + switch (io_u->ddir) { + case DDIR_WRITE: + zone_end = min((uint64_t)(io_u->offset + io_u->buflen), + (z + 1)->start); + pthread_mutex_lock(&zbd_info->mutex); + /* + * z->wp > zone_end means that one or more I/O errors + * have occurred. + */ + if (z->wp <= zone_end) + zbd_info->sectors_with_data += zone_end - z->wp; + pthread_mutex_unlock(&zbd_info->mutex); + z->wp = zone_end; + break; + case DDIR_TRIM: + assert(z->wp == z->start); + break; + default: + break; + } + +unlock: + if (!success || q != FIO_Q_QUEUED) { + /* BUSY or COMPLETED: unlock the zone */ + pthread_mutex_unlock(&z->mutex); + io_u->zbd_put_io = NULL; + } +} + +/** + * zbd_put_io - Unlock an I/O unit target zone lock + * @io_u: I/O unit + */ +static void zbd_put_io(const struct io_u *io_u) +{ + const struct fio_file *f = io_u->file; + struct zoned_block_device_info *zbd_info = f->zbd_info; + struct fio_zone_info *z; + uint32_t zone_idx; + + if (!zbd_info) + return; + + zone_idx = zbd_zone_idx(f, io_u->offset); + assert(zone_idx < zbd_info->nr_zones); + z = &zbd_info->zone_info[zone_idx]; + + if (z->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return; + + dprint(FD_ZBD, + "%s: terminate I/O (%lld, %llu) for zone %u\n", + f->file_name, io_u->offset, io_u->buflen, zone_idx); + + assert(pthread_mutex_unlock(&z->mutex) == 0); + zbd_check_swd(f); +} + +bool zbd_unaligned_write(int error_code) +{ + switch 
(error_code) { + case EIO: + case EREMOTEIO: + return true; + } + return false; +} + +/** + * setup_zbd_zone_mode - handle zoneskip as necessary for ZBD drives + * @td: FIO thread data. + * @io_u: FIO I/O unit. + * + * For sequential workloads, change the file offset to skip zoneskip bytes when + * no more IO can be performed in the current zone. + * - For read workloads, zoneskip is applied when the io has reached the end of + * the zone or the zone write position (when td->o.read_beyond_wp is false). + * - For write workloads, zoneskip is applied when the zone is full. + * This applies only to read and write operations. + */ +void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + enum fio_ddir ddir = io_u->ddir; + struct fio_zone_info *z; + uint32_t zone_idx; + + assert(td->o.zone_mode == ZONE_MODE_ZBD); + assert(td->o.zone_size); + + /* + * zone_skip is valid only for sequential workloads. + */ + if (td_random(td) || !td->o.zone_skip) + return; + + /* + * It is time to switch to a new zone if: + * - zone_bytes == zone_size bytes have already been accessed + * - The last position reached the end of the current zone. + * - For reads with td->o.read_beyond_wp == false, the last position + * reached the zone write pointer. + */ + zone_idx = zbd_zone_idx(f, f->last_pos[ddir]); + z = &f->zbd_info->zone_info[zone_idx]; + + if (td->zone_bytes >= td->o.zone_size || + f->last_pos[ddir] >= (z+1)->start || + (ddir == DDIR_READ && + (!td->o.read_beyond_wp) && f->last_pos[ddir] >= z->wp)) { + /* + * Skip zones. 
+ */ + td->zone_bytes = 0; + f->file_offset += td->o.zone_size + td->o.zone_skip; + + /* + * Wrap from the beginning, if we exceed the file size + */ + if (f->file_offset >= f->real_file_size) + f->file_offset = get_start_offset(td, f); + + f->last_pos[ddir] = f->file_offset; + td->io_skip_bytes += td->o.zone_skip; + } +} + +/** + * zbd_adjust_block - adjust the offset and length as necessary for ZBD drives + * @td: FIO thread data. + * @io_u: FIO I/O unit. + * + * Locking strategy: returns with z->mutex locked if and only if z refers + * to a sequential zone and if io_u_accept is returned. z is the zone that + * corresponds to io_u->offset at the end of this function. + */ +enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) +{ + const struct fio_file *f = io_u->file; + uint32_t zone_idx_b; + struct fio_zone_info *zb, *zl, *orig_zb; + uint32_t orig_len = io_u->buflen; + uint32_t min_bs = td->o.min_bs[io_u->ddir]; + uint64_t new_len; + int64_t range; + + if (!f->zbd_info) + return io_u_accept; + + assert(is_valid_offset(f, io_u->offset)); + assert(io_u->buflen); + zone_idx_b = zbd_zone_idx(f, io_u->offset); + zb = &f->zbd_info->zone_info[zone_idx_b]; + orig_zb = zb; + + /* Accept the I/O offset for conventional zones. */ + if (zb->type == BLK_ZONE_TYPE_CONVENTIONAL) + return io_u_accept; + + /* + * Accept the I/O offset for reads if reading beyond the write pointer + * is enabled. + */ + if (zb->cond != BLK_ZONE_COND_OFFLINE && + io_u->ddir == DDIR_READ && td->o.read_beyond_wp) + return io_u_accept; + + zbd_check_swd(f); + + /* + * Lock the io_u target zone. The zone will be unlocked if io_u offset + * is changed or when io_u completes and zbd_put_io() executed. + * To avoid multiple jobs doing asynchronous I/Os from deadlocking each + * other waiting for zone locks when building an io_u batch, first + * only trylock the zone. 
If the zone is already locked by another job, + * process the currently queued I/Os so that I/O progress is made and + * zones unlocked. + */ + if (pthread_mutex_trylock(&zb->mutex) != 0) { + if (!td_ioengine_flagged(td, FIO_SYNCIO)) + io_u_quiesce(td); + pthread_mutex_lock(&zb->mutex); + } + + switch (io_u->ddir) { + case DDIR_READ: + if (td->runstate == TD_VERIFYING) { + zb = zbd_replay_write_order(td, io_u, zb); + goto accept; + } + /* + * Check that there is enough written data in the zone to do an + * I/O of at least min_bs B. If there isn't, find a new zone for + * the I/O. + */ + range = zb->cond != BLK_ZONE_COND_OFFLINE ? + zb->wp - zb->start : 0; + if (range < min_bs || + ((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) { + pthread_mutex_unlock(&zb->mutex); + zl = &f->zbd_info->zone_info[zbd_zone_idx(f, + f->file_offset + f->io_size)]; + zb = zbd_find_zone(td, io_u, zb, zl); + if (!zb) { + dprint(FD_ZBD, + "%s: zbd_find_zone(%lld, %llu) failed\n", + f->file_name, io_u->offset, + io_u->buflen); + goto eof; + } + /* + * zbd_find_zone() returned a zone with a range of at + * least min_bs. + */ + range = zb->wp - zb->start; + assert(range >= min_bs); + + if (!td_random(td)) + io_u->offset = zb->start; + } + /* + * Make sure the I/O is within the zone valid data range while + * maximizing the I/O size and preserving randomness. + */ + if (range <= io_u->buflen) + io_u->offset = zb->start; + else if (td_random(td)) + io_u->offset = zb->start + + ((io_u->offset - orig_zb->start) % + (range - io_u->buflen)) / min_bs * min_bs; + /* + * Make sure the I/O does not cross over the zone wp position. 
+ */ + new_len = min((unsigned long long)io_u->buflen, + (unsigned long long)(zb->wp - io_u->offset)); + new_len = new_len / min_bs * min_bs; + if (new_len < io_u->buflen) { + io_u->buflen = new_len; + dprint(FD_IO, "Changed length from %u into %llu\n", + orig_len, io_u->buflen); + } + assert(zb->start <= io_u->offset); + assert(io_u->offset + io_u->buflen <= zb->wp); + goto accept; + case DDIR_WRITE: + if (io_u->buflen > f->zbd_info->zone_size) + goto eof; + if (!zbd_open_zone(td, io_u, zone_idx_b)) { + pthread_mutex_unlock(&zb->mutex); + zb = zbd_convert_to_open_zone(td, io_u); + if (!zb) + goto eof; + zone_idx_b = zb - f->zbd_info->zone_info; + } + /* Check whether the zone reset threshold has been exceeded */ + if (td->o.zrf.u.f) { + if (f->zbd_info->sectors_with_data >= + f->io_size * td->o.zrt.u.f && + zbd_dec_and_reset_write_cnt(td, f)) { + zb->reset_zone = 1; + } + } + /* Reset the zone pointer if necessary */ + if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) { + assert(td->o.verify == VERIFY_NONE); + /* + * Since previous write requests may have been submitted + * asynchronously and since we will submit the zone + * reset synchronously, wait until previously submitted + * write requests have completed before issuing a + * zone reset. + */ + io_u_quiesce(td); + zb->reset_zone = 0; + if (zbd_reset_zone(td, f, zb) < 0) + goto eof; + } + /* Make writes occur at the write pointer */ + assert(!zbd_zone_full(f, zb, min_bs)); + io_u->offset = zb->wp; + if (!is_valid_offset(f, io_u->offset)) { + dprint(FD_ZBD, "Dropped request with offset %llu\n", + io_u->offset); + goto eof; + } + /* + * Make sure that the buflen is a multiple of the minimal + * block size. Give up if shrinking would make the request too + * small. 
+ */ + new_len = min((unsigned long long)io_u->buflen, + (zb + 1)->start - io_u->offset); + new_len = new_len / min_bs * min_bs; + if (new_len == io_u->buflen) + goto accept; + if (new_len >= min_bs) { + io_u->buflen = new_len; + dprint(FD_IO, "Changed length from %u into %llu\n", + orig_len, io_u->buflen); + goto accept; + } + log_err("Zone remainder %lld smaller than minimum block size %d\n", + ((zb + 1)->start - io_u->offset), + min_bs); + goto eof; + case DDIR_TRIM: + /* fall-through */ + case DDIR_SYNC: + case DDIR_DATASYNC: + case DDIR_SYNC_FILE_RANGE: + case DDIR_WAIT: + case DDIR_LAST: + case DDIR_INVAL: + goto accept; + } + + assert(false); + +accept: + assert(zb); + assert(zb->cond != BLK_ZONE_COND_OFFLINE); + assert(!io_u->zbd_queue_io); + assert(!io_u->zbd_put_io); + io_u->zbd_queue_io = zbd_queue_io; + io_u->zbd_put_io = zbd_put_io; + return io_u_accept; + +eof: + if (zb) + pthread_mutex_unlock(&zb->mutex); + return io_u_eof; +} + +/* Return a string with ZBD statistics */ +char *zbd_write_status(const struct thread_stat *ts) +{ + char *res; + + if (asprintf(&res, "; %llu zone resets", (unsigned long long) ts->nr_zone_resets) < 0) + return NULL; + return res; +} diff -Nru fio-2.1.3/zbd.h fio-3.16/zbd.h --- fio-2.1.3/zbd.h 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/zbd.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2018 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. + */ + +#ifndef FIO_ZBD_H +#define FIO_ZBD_H + +#include <inttypes.h> +#include "fio.h" /* FIO_MAX_OPEN_ZBD_ZONES */ +#ifdef CONFIG_LINUX_BLKZONED +#include <linux/blkzoned.h> +#endif + +struct fio_file; + +/* + * Zoned block device models. 
+ */ +enum blk_zoned_model { + ZBD_DM_NONE, /* Regular block device */ + ZBD_DM_HOST_AWARE, /* Host-aware zoned block device */ + ZBD_DM_HOST_MANAGED, /* Host-managed zoned block device */ +}; + +enum io_u_action { + io_u_accept = 0, + io_u_eof = 1, +}; + +/** + * struct fio_zone_info - information about a single ZBD zone + * @start: zone start location (bytes) + * @wp: zone write pointer location (bytes) + * @verify_block: number of blocks that have been verified for this zone + * @mutex: protects the modifiable members in this structure + * @type: zone type (BLK_ZONE_TYPE_*) + * @cond: zone state (BLK_ZONE_COND_*) + * @open: whether or not this zone is currently open. Only relevant if + * max_open_zones > 0. + * @reset_zone: whether or not this zone should be reset before writing to it + */ +struct fio_zone_info { +#ifdef CONFIG_LINUX_BLKZONED + pthread_mutex_t mutex; + uint64_t start; + uint64_t wp; + uint32_t verify_block; + enum blk_zone_type type:2; + enum blk_zone_cond cond:4; + unsigned int open:1; + unsigned int reset_zone:1; +#endif +}; + +/** + * zoned_block_device_info - zoned block device characteristics + * @model: Device model. + * @mutex: Protects the modifiable members in this structure (refcount and + * num_open_zones). + * @zone_size: size of a single zone in units of 512 bytes + * @sectors_with_data: total size of data in all zones in units of 512 bytes + * @zone_size_log2: log2 of the zone size in bytes if it is a power of 2 or 0 + * if the zone size is not a power of 2. + * @nr_zones: number of zones + * @refcount: number of fio files that share this structure + * @num_open_zones: number of open zones + * @write_cnt: Number of writes since the latest zone reset triggered by + * the zone_reset_frequency fio job parameter. + * @open_zones: zone numbers of open zones + * @zone_info: description of the individual zones + * + * Only devices for which all zones have the same size are supported. 
+ * Note: if the capacity is not a multiple of the zone size then the last zone + * will be smaller than 'zone_size'. + */ +struct zoned_block_device_info { + enum blk_zoned_model model; + pthread_mutex_t mutex; + uint64_t zone_size; + uint64_t sectors_with_data; + uint32_t zone_size_log2; + uint32_t nr_zones; + uint32_t refcount; + uint32_t num_open_zones; + uint32_t write_cnt; + uint32_t open_zones[FIO_MAX_OPEN_ZBD_ZONES]; + struct fio_zone_info zone_info[0]; +}; + +#ifdef CONFIG_LINUX_BLKZONED +void zbd_free_zone_info(struct fio_file *f); +int zbd_init(struct thread_data *td); +void zbd_file_reset(struct thread_data *td, struct fio_file *f); +bool zbd_unaligned_write(int error_code); +void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u); +enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u); +char *zbd_write_status(const struct thread_stat *ts); + +static inline void zbd_queue_io_u(struct io_u *io_u, enum fio_q_status status) +{ + if (io_u->zbd_queue_io) { + io_u->zbd_queue_io(io_u, status, io_u->error == 0); + io_u->zbd_queue_io = NULL; + } +} + +static inline void zbd_put_io_u(struct io_u *io_u) +{ + if (io_u->zbd_put_io) { + io_u->zbd_put_io(io_u); + io_u->zbd_queue_io = NULL; + io_u->zbd_put_io = NULL; + } +} + +#else +static inline void zbd_free_zone_info(struct fio_file *f) +{ +} + +static inline int zbd_init(struct thread_data *td) +{ + return 0; +} + +static inline void zbd_file_reset(struct thread_data *td, struct fio_file *f) +{ +} + +static inline bool zbd_unaligned_write(int error_code) +{ + return false; +} + +static inline enum io_u_action zbd_adjust_block(struct thread_data *td, + struct io_u *io_u) +{ + return io_u_accept; +} + +static inline char *zbd_write_status(const struct thread_stat *ts) +{ + return NULL; +} + +static inline void zbd_queue_io_u(struct io_u *io_u, + enum fio_q_status status) {} +static inline void zbd_put_io_u(struct io_u *io_u) {} + +static inline void setup_zbd_zone_mode(struct 
thread_data *td, + struct io_u *io_u) +{ +} + +#endif + +#endif /* FIO_ZBD_H */ diff -Nru fio-2.1.3/zone-dist.c fio-3.16/zone-dist.c --- fio-2.1.3/zone-dist.c 1970-01-01 00:00:00.000000000 +0000 +++ fio-3.16/zone-dist.c 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,74 @@ +#include <stdlib.h> +#include "fio.h" +#include "zone-dist.h" + +static void __td_zone_gen_index(struct thread_data *td, enum fio_ddir ddir) +{ + unsigned int i, j, sprev, aprev; + uint64_t sprev_sz; + + td->zone_state_index[ddir] = malloc(sizeof(struct zone_split_index) * 100); + + sprev_sz = sprev = aprev = 0; + for (i = 0; i < td->o.zone_split_nr[ddir]; i++) { + struct zone_split *zsp = &td->o.zone_split[ddir][i]; + + for (j = aprev; j < aprev + zsp->access_perc; j++) { + struct zone_split_index *zsi = &td->zone_state_index[ddir][j]; + + zsi->size_perc = sprev + zsp->size_perc; + zsi->size_perc_prev = sprev; + + zsi->size = sprev_sz + zsp->size; + zsi->size_prev = sprev_sz; + } + + aprev += zsp->access_perc; + sprev += zsp->size_perc; + sprev_sz += zsp->size; + } +} + +static bool has_zones(struct thread_data *td) +{ + int i, zones = 0; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + zones += td->o.zone_split_nr[i]; + + return zones != 0; +} + +/* + * Generate state table for indexes, so we don't have to do it inline from + * the hot IO path + */ +void td_zone_gen_index(struct thread_data *td) +{ + int i; + + if (!has_zones(td)) + return; + + td->zone_state_index = malloc(DDIR_RWDIR_CNT * + sizeof(struct zone_split_index *)); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + __td_zone_gen_index(td, i); +} + +void td_zone_free_index(struct thread_data *td) +{ + int i; + + if (!td->zone_state_index) + return; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + free(td->zone_state_index[i]); + td->zone_state_index[i] = NULL; + } + + free(td->zone_state_index); + td->zone_state_index = NULL; +} diff -Nru fio-2.1.3/zone-dist.h fio-3.16/zone-dist.h --- fio-2.1.3/zone-dist.h 1970-01-01 00:00:00.000000000 +0000 +++ 
fio-3.16/zone-dist.h 2019-09-20 01:01:52.000000000 +0000 @@ -0,0 +1,7 @@ +#ifndef FIO_ZONE_DIST_H +#define FIO_ZONE_DIST_H + +void td_zone_gen_index(struct thread_data *td); +void td_zone_free_index(struct thread_data *td); + +#endif