diff -Nru zfs-linux-0.8.3/debian/changelog zfs-linux-0.8.3/debian/changelog --- zfs-linux-0.8.3/debian/changelog 2020-04-14 09:14:33.000000000 +0000 +++ zfs-linux-0.8.3/debian/changelog 2021-04-29 15:09:45.000000000 +0000 @@ -1,3 +1,89 @@ +zfs-linux (0.8.3-1ubuntu12.9) focal; urgency=medium + + * No change rebuild in security pocket. LP: #1914279. + + -- Dimitri John Ledkov Thu, 29 Apr 2021 16:09:45 +0100 + +zfs-linux (0.8.3-1ubuntu12.8) focal; urgency=medium + + * Prevent build of the zfs-dkms binary package for kernels later than 5.4. + This is a re-working of the fix for bug #1902701 with the \ escaped + so that 5.10+ kernels get detected correctly (LP: #1919252) + + -- Colin Ian King Wed, 07 Apr 2021 13:44:14 +0100 + +zfs-linux (0.8.3-1ubuntu12.7) focal; urgency=medium + + * Fix race condition in zfs_iput_async (LP: #1916486) + - Upstream ZFS fix 43eaef6de817 ("Fix zrele race in zrele_async that can + cause hang") + + -- Heitor Alves de Siqueira Thu, 25 Feb 2021 19:48:51 +0000 + +zfs-linux (0.8.3-1ubuntu12.6) focal; urgency=medium + + [ Didier Roche ] + [ Jean-Baptiste Lallement ] + * Generate clone uuid without dd which is flagged as having an executable + stack. Thanks Usarin Heininga for the patch (LP: #1894329) + + [ Andrea Righi ] + * fix potential user-space double free when running "zfs mount -a" + (LP: #1902588) + - 4702-Revert-Let-zfs-mount-all-tolerate-in-progress-mounts.patch + + -- Colin Ian King Mon, 30 Nov 2020 19:00:00 +0000 + +zfs-linux (0.8.3-1ubuntu12.5) focal; urgency=medium + + * Prevent build of the zfs-dkms binary package for kernels later than 5.4. + If that is required, one should use the zfs-dkms package of a later series + (like it is done for built-in modules of Ubuntu kernels). (LP: #1902701) + + -- Stefan Bader Tue, 03 Nov 2020 18:05:38 +0100 + +zfs-linux (0.8.3-1ubuntu12.4) focal; urgency=medium + + * Fix zfs-dkms build on arm64 with PREEMPTION and BLK_CGROUP (LP: #1892001) + - 4700-Fix-DKMS-build-on-arm64-with-PREEMPTION-and-BLK_CGRO.patch + preempt_schedule_notrace is GPL-only so redfine it to preempt_schedule + on arm64 with PREEMPTION and BLK_CGROUP enabled to 'fix' the DKMS + build failure. + + -- Juerg Haefliger Tue, 18 Aug 2020 11:10:41 +0200 + +zfs-linux (0.8.3-1ubuntu12.3) focal; urgency=medium + + * Fix volume wait on locked encrypted zvols (LP: #1888405) + [ James Dingwall ] + - 4620-zfs-vol-wait-fix-locked-encrypted-vols.patch + zfs-volume-wait.service systemd unit does not start if the encrypted + zvol is locked. The /sbin/zvol_wait should not wait for links when the + volume has property keystatus=unavailable. Add a check for this. + + -- Colin Ian King Wed, 22 Jul 2020 09:58:22 +0100 + +zfs-linux (0.8.3-1ubuntu12.2) focal; urgency=medium + + * Don't report errors if modprobe fails (LP: #1880421) + - 4510-silently-ignore-modprobe-failure.patch + loading ZFS modules on zfs-utils installation is a nice + to have feature, but don't throw an error if modules are + not available to load + + -- Colin Ian King Mon, 6 Jul 2020 12:13:15 +0100 + +zfs-linux (0.8.3-1ubuntu12.1) focal; urgency=medium + + * Backport AES-GCM performance accelleration (LP: #1881107) + - backport of upstream zfs commit 31b160f0a6c673c8f926233af2ed6d5354808393 + ("ICP: Improve AES-GCM performance"). + tests on a memory backed pool show performance improvements of ~15-22% + for AES-CCM writes, ~17-20% AES-CCM reads, 34-36% AES-GCM writes and + ~79-80% AES-GCM reads. 
+ + -- Colin Ian King Tue, 28 May 2020 11:54:33 +0100 + zfs-linux (0.8.3-1ubuntu12) focal; urgency=medium [ Jean-Baptiste Lallement ] diff -Nru zfs-linux-0.8.3/debian/patches/4000-zsys-support.patch zfs-linux-0.8.3/debian/patches/4000-zsys-support.patch --- zfs-linux-0.8.3/debian/patches/4000-zsys-support.patch 2020-04-02 10:35:17.000000000 +0000 +++ zfs-linux-0.8.3/debian/patches/4000-zsys-support.patch 2020-11-30 19:00:00.000000000 +0000 @@ -165,7 +165,7 @@ + +uid() +{ -+ dd if=/dev/urandom of=/dev/stdout bs=1 count=100 2>/dev/null | tr -dc 'a-z0-9' | cut -c-6 ++ grep -a -m10 -E "\*" /dev/urandom 2>/dev/null | tr -dc 'a-z0-9' | cut -c-6 +} Index: zfs-linux-0.8.3/etc/systemd/system-generators/zfs-mount-generator.in =================================================================== diff -Nru zfs-linux-0.8.3/debian/patches/4510-silently-ignore-modprobe-failure.patch zfs-linux-0.8.3/debian/patches/4510-silently-ignore-modprobe-failure.patch --- zfs-linux-0.8.3/debian/patches/4510-silently-ignore-modprobe-failure.patch 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.8.3/debian/patches/4510-silently-ignore-modprobe-failure.patch 2020-07-06 11:10:38.000000000 +0000 @@ -0,0 +1,33 @@ +Description: Don't fail if zfs modules can't load on package installation + Ideally, modprobe should be attempted but not fatal (LP: #1880421) +Author: Colin Ian King +Origin: ubuntu +Forwarded: no +Last-Update: 2020-06-04 + +Index: zfs-linux-0.8.4/etc/systemd/system/zfs-load-module.service.in +=================================================================== +--- zfs-linux-0.8.4.orig/etc/systemd/system/zfs-load-module.service.in ++++ zfs-linux-0.8.4/etc/systemd/system/zfs-load-module.service.in +@@ -10,7 +10,7 @@ After=systemd-remount-fs.service + [Service] + Type=oneshot + RemainAfterExit=yes +-ExecStart=/sbin/modprobe zfs ++ExecStart=-/sbin/modprobe zfs + + [Install] + WantedBy=zfs-mount.service +Index: zfs-linux-0.8.4/etc/systemd/system/zfs-share.service.in +=================================================================== +--- zfs-linux-0.8.4.orig/etc/systemd/system/zfs-share.service.in ++++ zfs-linux-0.8.4/etc/systemd/system/zfs-share.service.in +@@ -13,7 +13,7 @@ PartOf=smb.service + Type=oneshot + RemainAfterExit=yes + ExecStartPre=-/bin/rm -f /etc/dfs/sharetab +-ExecStart=@sbindir@/zfs share -a ++ExecStart=-@sbindir@/zfs share -a + + [Install] + WantedBy=zfs.target diff -Nru zfs-linux-0.8.3/debian/patches/4610-ICP-Improve-AES-GCM-performance.patch zfs-linux-0.8.3/debian/patches/4610-ICP-Improve-AES-GCM-performance.patch --- zfs-linux-0.8.3/debian/patches/4610-ICP-Improve-AES-GCM-performance.patch 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.8.3/debian/patches/4610-ICP-Improve-AES-GCM-performance.patch 2020-05-28 10:53:02.000000000 +0000 @@ -0,0 +1,3115 @@ +From 31b160f0a6c673c8f926233af2ed6d5354808393 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= +Date: Mon, 10 Feb 2020 21:59:50 +0100 +Subject: [PATCH] ICP: Improve AES-GCM performance +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 8bit + +Currently SIMD accelerated AES-GCM performance is limited by two +factors: + +a. The need to disable preemption and interrupts and save the FPU +state before using it and to do the reverse when done. Due to the +way the code is organized (see (b) below) we have to pay this price +twice for each 16 byte GCM block processed. + +b. 
Most processing is done in C, operating on single GCM blocks. +The use of SIMD instructions is limited to the AES encryption of the +counter block (AES-NI) and the Galois multiplication (PCLMULQDQ). +This leads to the FPU not being fully utilized for crypto +operations. + +To solve (a) we do crypto processing in larger chunks while owning +the FPU. An `icp_gcm_avx_chunk_size` module parameter was introduced +to make this chunk size tweakable. It defaults to 32 KiB. This step +alone roughly doubles performance. (b) is tackled by porting and +using the highly optimized openssl AES-GCM assembler routines, which +do all the processing (CTR, AES, GMULT) in a single routine. Both +steps together result in up to 32x reduction of the time spend in +the en/decryption routines, leading up to approximately 12x +throughput increase for large (128 KiB) blocks. + +Lastly, this commit changes the default encryption algorithm from +AES-CCM to AES-GCM when setting the `encryption=on` property. + +Reviewed-By: Brian Behlendorf +Reviewed-By: Jason King +Reviewed-By: Tom Caputi +Reviewed-By: Richard Laager +Signed-off-by: Attila Fülöp +Closes #9749 +Signed-off-by: Colin Ian King +--- + COPYRIGHT | 4 + + config/toolchain-simd.m4 | 21 + + include/linux/simd_x86.h | 13 + + include/sys/zio.h | 2 +- + lib/libicp/Makefile.am | 2 + + include/linux/simd.h | 15 +- + man/man8/zfsprops.8 | 2 +- + module/icp/Makefile.in | 9 + + module/icp/algs/modes/gcm.c | 746 ++++++++++++++- + .../modes/THIRDPARTYLICENSE.cryptogams | 36 + + .../THIRDPARTYLICENSE.cryptogams.descrip | 1 + + .../modes/THIRDPARTYLICENSE.openssl | 177 ++++ + .../modes/THIRDPARTYLICENSE.openssl.descrip | 1 + + .../icp/asm-x86_64/modes/aesni-gcm-x86_64.S | 892 ++++++++++++++++++ + module/icp/asm-x86_64/modes/ghash-x86_64.S | 714 ++++++++++++++ + module/icp/include/aes/aes_impl.h | 5 + + module/icp/include/modes/modes.h | 29 +- + .../zfs_create/zfs_create_crypt_combos.ksh | 2 +- + .../zpool_create_crypt_combos.ksh | 2 +- + .../functional/rsend/send_encrypted_props.ksh | 12 +- + 20 files changed, 2654 insertions(+), 31 deletions(-) + create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams + create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip + create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl + create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip + create mode 100644 module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S + create mode 100644 module/icp/asm-x86_64/modes/ghash-x86_64.S + +Index: zfs-linux-0.8.3/COPYRIGHT +=================================================================== +--- zfs-linux-0.8.3.orig/COPYRIGHT ++++ zfs-linux-0.8.3/COPYRIGHT +@@ -20,6 +20,10 @@ notable exceptions and their respective + * AES Implementation: module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl + * PBKDF2 Implementation: lib/libzfs/THIRDPARTYLICENSE.openssl + * SPL Implementation: module/spl/THIRDPARTYLICENSE.gplv2 ++ * GCM Implementaion: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams ++ * GCM Implementaion: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl ++ * GHASH Implementaion: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams ++ * GHASH Implementaion: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl + + This product includes software developed by the OpenSSL Project for use + in the OpenSSL Toolkit (http://www.openssl.org/) +Index: zfs-linux-0.8.3/config/toolchain-simd.m4 +=================================================================== +--- 
zfs-linux-0.8.3.orig/config/toolchain-simd.m4 ++++ zfs-linux-0.8.3/config/toolchain-simd.m4 +@@ -23,6 +23,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ ++ ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE + ;; + esac + ]) +@@ -400,4 +401,24 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BU + ], [ + AC_MSG_RESULT([no]) + ]) ++]) ++ ++dnl # ++dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE ++dnl # ++AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [ ++ AC_MSG_CHECKING([whether host toolchain supports MOVBE]) ++ ++ AC_LINK_IFELSE([AC_LANG_SOURCE([ ++ [ ++ void main() ++ { ++ __asm__ __volatile__("movbe 0(%eax), %eax"); ++ } ++ ]])], [ ++ AC_MSG_RESULT([yes]) ++ AC_DEFINE([HAVE_MOVBE], 1, [Define if host toolchain supports MOVBE]) ++ ], [ ++ AC_MSG_RESULT([no]) ++ ]) + ]) +Index: zfs-linux-0.8.3/include/linux/simd_x86.h +=================================================================== +--- zfs-linux-0.8.3.orig/include/linux/simd_x86.h ++++ zfs-linux-0.8.3/include/linux/simd_x86.h +@@ -382,7 +382,8 @@ typedef enum cpuid_inst_sets { + AVX512ER, + AVX512VL, + AES, +- PCLMULQDQ ++ PCLMULQDQ, ++ MOVBE + } cpuid_inst_sets_t; + + /* +@@ -406,6 +407,7 @@ typedef struct cpuid_feature_desc { + #define _AVX512VL_BIT (1U << 31) /* if used also check other levels */ + #define _AES_BIT (1U << 25) + #define _PCLMULQDQ_BIT (1U << 1) ++#define _MOVBE_BIT (1U << 22) + + /* + * Descriptions of supported instruction sets +@@ -433,6 +435,7 @@ static const cpuid_feature_desc_t cpuid_ + [AVX512VL] = {7U, 0U, _AVX512ER_BIT, EBX }, + [AES] = {1U, 0U, _AES_BIT, ECX }, + [PCLMULQDQ] = {1U, 0U, _PCLMULQDQ_BIT, ECX }, ++ [MOVBE] = {1U, 0U, _MOVBE_BIT, ECX }, + }; + + /* +@@ -505,6 +508,7 @@ CPUID_FEATURE_CHECK(avx512er, AVX512ER); + CPUID_FEATURE_CHECK(avx512vl, AVX512VL); + CPUID_FEATURE_CHECK(aes, AES); + CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); ++CPUID_FEATURE_CHECK(movbe, MOVBE); + + #endif /* !defined(_KERNEL) */ + +@@ -719,6 +723,19 @@ zfs_pclmulqdq_available(void) + #endif + } + ++/* ++ * Check if MOVBE instruction is available ++ */ ++static inline boolean_t ++zfs_movbe_available(void) ++{ ++#if defined(X86_FEATURE_MOVBE) ++ return (!!boot_cpu_has(X86_FEATURE_MOVBE)); ++#else ++ return (B_FALSE); ++#endif ++} ++ + /* + * AVX-512 family of instruction sets: + * +Index: zfs-linux-0.8.3/include/sys/zio.h +=================================================================== +--- zfs-linux-0.8.3.orig/include/sys/zio.h ++++ zfs-linux-0.8.3/include/sys/zio.h +@@ -118,7 +118,7 @@ enum zio_encrypt { + ZIO_CRYPT_FUNCTIONS + }; + +-#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_CCM ++#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM + #define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF + + /* macros defining encryption lengths */ +Index: zfs-linux-0.8.3/lib/libicp/Makefile.am +=================================================================== +--- zfs-linux-0.8.3.orig/lib/libicp/Makefile.am ++++ zfs-linux-0.8.3/lib/libicp/Makefile.am +@@ -20,6 +20,8 @@ ASM_SOURCES_AS = \ + asm-x86_64/aes/aes_amd64.S \ + asm-x86_64/aes/aes_aesni.S \ + asm-x86_64/modes/gcm_pclmulqdq.S \ ++ asm-x86_64/modes/aesni-gcm-x86_64.S \ ++ asm-x86_64/modes/ghash-x86_64.S \ + asm-x86_64/sha1/sha1-x86_64.S \ + asm-x86_64/sha2/sha256_impl.S \ + asm-x86_64/sha2/sha512_impl.S +Index: zfs-linux-0.8.3/module/icp/Makefile.in +=================================================================== +--- zfs-linux-0.8.3.orig/module/icp/Makefile.in ++++ 
zfs-linux-0.8.3/module/icp/Makefile.in +@@ -69,9 +69,18 @@ $(MODULE)-objs += algs/skein/skein_iv.o + $(MODULE)-objs += $(ASM_SOURCES) + + $(MODULE)-$(CONFIG_X86) += algs/modes/gcm_pclmulqdq.o ++$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/aesni-gcm-x86_64.o ++$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/ghash-x86_64.o + $(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_aesni.o + $(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_x86-64.o + ++# Suppress objtool "can't find jump dest instruction at" warnings. They ++# are caused by the constants which are defined in the text section of the ++# assembly file using .byte instructions (e.g. bswap_mask). The objtool ++# utility tries to interpret them as opcodes and obviously fails doing so. ++OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y ++OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y ++ + ICP_DIRS = \ + api \ + core \ +Index: zfs-linux-0.8.3/module/icp/algs/modes/gcm.c +=================================================================== +--- zfs-linux-0.8.3.orig/module/icp/algs/modes/gcm.c ++++ zfs-linux-0.8.3/module/icp/algs/modes/gcm.c +@@ -30,12 +30,46 @@ + #include + #include + #include ++#ifdef CAN_USE_GCM_ASM ++#include ++#endif + + #define GHASH(c, d, t, o) \ + xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ + (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \ + (uint64_t *)(void *)(t)); + ++/* Select GCM implementation */ ++#define IMPL_FASTEST (UINT32_MAX) ++#define IMPL_CYCLE (UINT32_MAX-1) ++#ifdef CAN_USE_GCM_ASM ++#define IMPL_AVX (UINT32_MAX-2) ++#endif ++#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) ++static uint32_t icp_gcm_impl = IMPL_FASTEST; ++static uint32_t user_sel_impl = IMPL_FASTEST; ++ ++#ifdef CAN_USE_GCM_ASM ++/* ++ * Whether to use the optimized openssl gcm and ghash implementations. ++ * Set to true if module parameter icp_gcm_impl == "avx". ++ */ ++static boolean_t gcm_use_avx = B_FALSE; ++#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) ++ ++static inline boolean_t gcm_avx_will_work(void); ++static inline void gcm_set_avx(boolean_t); ++static inline boolean_t gcm_toggle_avx(void); ++ ++static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, ++ crypto_data_t *, size_t); ++ ++static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); ++static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); ++static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *, ++ size_t, size_t); ++#endif /* ifdef CAN_USE_GCM_ASM */ ++ + /* + * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode + * is done in another function. +@@ -47,6 +81,12 @@ gcm_mode_encrypt_contiguous_blocks(gcm_c + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { ++#ifdef CAN_USE_GCM_ASM ++ if (ctx->gcm_use_avx == B_TRUE) ++ return (gcm_mode_encrypt_contiguous_blocks_avx( ++ ctx, data, length, out, block_size)); ++#endif ++ + const gcm_impl_ops_t *gops; + size_t remainder = length; + size_t need = 0; +@@ -109,6 +149,14 @@ gcm_mode_encrypt_contiguous_blocks(gcm_c + + ctx->gcm_processed_data_len += block_size; + ++ /* ++ * The following copies a complete GCM block back to where it ++ * came from if there was a remainder in the last call and out ++ * is NULL. That doesn't seem to make sense. So we assert this ++ * can't happen and leave the code in for reference. 
++ * See https://github.com/zfsonlinux/zfs/issues/9661 ++ */ ++ ASSERT(out != NULL); + if (out == NULL) { + if (ctx->gcm_remainder_len > 0) { + bcopy(blockp, ctx->gcm_copy_to, +@@ -169,6 +217,11 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { ++#ifdef CAN_USE_GCM_ASM ++ if (ctx->gcm_use_avx == B_TRUE) ++ return (gcm_encrypt_final_avx(ctx, out, block_size)); ++#endif ++ + const gcm_impl_ops_t *gops; + uint64_t counter_mask = ntohll(0x00000000ffffffffULL); + uint8_t *ghash, *macp = NULL; +@@ -321,6 +374,11 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) + { ++#ifdef CAN_USE_GCM_ASM ++ if (ctx->gcm_use_avx == B_TRUE) ++ return (gcm_decrypt_final_avx(ctx, out, block_size)); ++#endif ++ + const gcm_impl_ops_t *gops; + size_t pt_len; + size_t remainder; +@@ -526,6 +584,9 @@ gcm_init(gcm_ctx_t *ctx, unsigned char * + return (CRYPTO_SUCCESS); + } + ++/* ++ * Init the GCM context struct. Handle the cycle and avx implementations here. ++ */ + int + gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), +@@ -556,11 +617,37 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *p + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + +- if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, +- gcm_param->pAAD, gcm_param->ulAADLen, block_size, +- encrypt_block, copy_block, xor_block) != 0) { +- rv = CRYPTO_MECHANISM_PARAM_INVALID; ++#ifdef CAN_USE_GCM_ASM ++ /* ++ * Handle the "cycle" implementation by creating avx and non avx ++ * contexts alternately. ++ */ ++ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { ++ gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; ++ } else { ++ gcm_ctx->gcm_use_avx = gcm_toggle_avx(); + } ++ /* We don't handle byte swapped key schedules in the avx code path. */ ++ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; ++ if (ks->ops->needs_byteswap == B_TRUE) { ++ gcm_ctx->gcm_use_avx = B_FALSE; ++ } ++ /* Avx and non avx context initialization differs from here on. */ ++ if (gcm_ctx->gcm_use_avx == B_FALSE) { ++#endif /* ifdef CAN_USE_GCM_ASM */ ++ if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, ++ gcm_param->pAAD, gcm_param->ulAADLen, block_size, ++ encrypt_block, copy_block, xor_block) != 0) { ++ rv = CRYPTO_MECHANISM_PARAM_INVALID; ++ } ++#ifdef CAN_USE_GCM_ASM ++ } else { ++ if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, ++ gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) { ++ rv = CRYPTO_MECHANISM_PARAM_INVALID; ++ } ++ } ++#endif /* ifdef CAN_USE_GCM_ASM */ + + return (rv); + } +@@ -590,11 +677,37 @@ gmac_init_ctx(gcm_ctx_t *gcm_ctx, char * + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + +- if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, +- gmac_param->pAAD, gmac_param->ulAADLen, block_size, +- encrypt_block, copy_block, xor_block) != 0) { +- rv = CRYPTO_MECHANISM_PARAM_INVALID; ++#ifdef CAN_USE_GCM_ASM ++ /* ++ * Handle the "cycle" implementation by creating avx and non avx ++ * contexts alternately. ++ */ ++ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { ++ gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; ++ } else { ++ gcm_ctx->gcm_use_avx = gcm_toggle_avx(); ++ } ++ /* We don't handle byte swapped key schedules in the avx code path. 
*/ ++ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; ++ if (ks->ops->needs_byteswap == B_TRUE) { ++ gcm_ctx->gcm_use_avx = B_FALSE; ++ } ++ /* Avx and non avx context initialization differs from here on. */ ++ if (gcm_ctx->gcm_use_avx == B_FALSE) { ++#endif /* ifdef CAN_USE_GCM_ASM */ ++ if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, ++ gmac_param->pAAD, gmac_param->ulAADLen, block_size, ++ encrypt_block, copy_block, xor_block) != 0) { ++ rv = CRYPTO_MECHANISM_PARAM_INVALID; ++ } ++#ifdef CAN_USE_GCM_ASM ++ } else { ++ if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, ++ gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) { ++ rv = CRYPTO_MECHANISM_PARAM_INVALID; ++ } + } ++#endif /* ifdef CAN_USE_GCM_ASM */ + + return (rv); + } +@@ -645,15 +758,6 @@ const gcm_impl_ops_t *gcm_all_impl[] = { + /* Indicate that benchmark has been completed */ + static boolean_t gcm_impl_initialized = B_FALSE; + +-/* Select GCM implementation */ +-#define IMPL_FASTEST (UINT32_MAX) +-#define IMPL_CYCLE (UINT32_MAX-1) +- +-#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) +- +-static uint32_t icp_gcm_impl = IMPL_FASTEST; +-static uint32_t user_sel_impl = IMPL_FASTEST; +- + /* Hold all supported implementations */ + static size_t gcm_supp_impl_cnt = 0; + static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; +@@ -685,6 +789,16 @@ gcm_impl_get_ops() + size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; + ops = gcm_supp_impl[idx]; + break; ++#ifdef CAN_USE_GCM_ASM ++ case IMPL_AVX: ++ /* ++ * Make sure that we return a valid implementation while ++ * switching to the avx implementation since there still ++ * may be unfinished non-avx contexts around. ++ */ ++ ops = &gcm_generic_impl; ++ break; ++#endif + default: + ASSERT3U(impl, <, gcm_supp_impl_cnt); + ASSERT3U(gcm_supp_impl_cnt, >, 0); +@@ -733,6 +847,16 @@ gcm_impl_init(void) + + strcpy(gcm_fastest_impl.name, "fastest"); + ++#ifdef CAN_USE_GCM_ASM ++ /* ++ * Use the avx implementation if it's available and the implementation ++ * hasn't changed from its default value of fastest on module load. ++ */ ++ if (gcm_avx_will_work() && ++ GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { ++ gcm_set_avx(B_TRUE); ++ } ++#endif + /* Finish initialization */ + atomic_swap_32(&icp_gcm_impl, user_sel_impl); + gcm_impl_initialized = B_TRUE; +@@ -744,6 +868,9 @@ static const struct { + } gcm_impl_opts[] = { + { "cycle", IMPL_CYCLE }, + { "fastest", IMPL_FASTEST }, ++#ifdef CAN_USE_GCM_ASM ++ { "avx", IMPL_AVX }, ++#endif + }; + + /* +@@ -777,6 +904,12 @@ gcm_impl_set(const char *val) + + /* Check mandatory options */ + for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { ++#ifdef CAN_USE_GCM_ASM ++ /* Ignore avx implementation if it won't work. */ ++ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { ++ continue; ++ } ++#endif + if (strcmp(req_name, gcm_impl_opts[i].name) == 0) { + impl = gcm_impl_opts[i].sel; + err = 0; +@@ -795,6 +928,18 @@ gcm_impl_set(const char *val) + } + } + } ++#ifdef CAN_USE_GCM_ASM ++ /* ++ * Use the avx implementation if available and the requested one is ++ * avx or fastest. 
++ */ ++ if (gcm_avx_will_work() == B_TRUE && ++ (impl == IMPL_AVX || impl == IMPL_FASTEST)) { ++ gcm_set_avx(B_TRUE); ++ } else { ++ gcm_set_avx(B_FALSE); ++ } ++#endif + + if (err == 0) { + if (gcm_impl_initialized) +@@ -826,6 +971,12 @@ icp_gcm_impl_get(char *buffer, zfs_kerne + + /* list mandatory options */ + for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { ++#ifdef CAN_USE_GCM_ASM ++ /* Ignore avx implementation if it won't work. */ ++ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { ++ continue; ++ } ++#endif + fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name); + } +@@ -842,4 +993,563 @@ icp_gcm_impl_get(char *buffer, zfs_kerne + module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, + NULL, 0644); + MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); +-#endif ++#endif /* defined(__KERNEL) */ ++ ++#ifdef CAN_USE_GCM_ASM ++#define GCM_BLOCK_LEN 16 ++/* ++ * The openssl asm routines are 6x aggregated and need that many bytes ++ * at minimum. ++ */ ++#define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) ++#define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) ++/* ++ * Ensure the chunk size is reasonable since we are allocating a ++ * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. ++ */ ++#define GCM_AVX_MAX_CHUNK_SIZE \ ++ (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES) ++ ++/* Get the chunk size module parameter. */ ++#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size ++ ++/* Clear the FPU registers since they hold sensitive internal state. */ ++#define clear_fpu_regs() clear_fpu_regs_avx() ++#define GHASH_AVX(ctx, in, len) \ ++ gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t (*)[2])(ctx)->gcm_Htable, \ ++ in, len) ++ ++#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) ++ ++/* ++ * Module parameter: number of bytes to process at once while owning the FPU. ++ * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is ++ * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES. ++ */ ++static uint32_t gcm_avx_chunk_size = ++ ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; ++ ++extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); ++extern void clear_fpu_regs_avx(void); ++extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); ++extern void aes_encrypt_intel(const uint32_t rk[], int nr, ++ const uint32_t pt[4], uint32_t ct[4]); ++ ++extern void gcm_init_htab_avx(uint64_t Htable[16][2], const uint64_t H[2]); ++extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t Htable[16][2], ++ const uint8_t *in, size_t len); ++ ++extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, ++ const void *, uint64_t *, uint64_t *); ++ ++extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, ++ const void *, uint64_t *, uint64_t *); ++ ++static inline boolean_t ++gcm_avx_will_work(void) ++{ ++ /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. 
*/ ++ return (kfpu_allowed() && ++ zfs_avx_available() && zfs_movbe_available() && ++ zfs_aes_available() && zfs_pclmulqdq_available()); ++} ++ ++static inline void ++gcm_set_avx(boolean_t val) ++{ ++ if (gcm_avx_will_work() == B_TRUE) { ++ atomic_swap_32(&gcm_use_avx, val); ++ } ++} ++ ++static inline boolean_t ++gcm_toggle_avx(void) ++{ ++ if (gcm_avx_will_work() == B_TRUE) { ++ return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); ++ } else { ++ return (B_FALSE); ++ } ++} ++ ++/* ++ * Clear senssitve data in the context. ++ * ++ * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and ++ * ctx->gcm_Htable contain the hash sub key which protects authentication. ++ * ++ * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for ++ * a known plaintext attack, they consists of the IV and the first and last ++ * counter respectively. If they should be cleared is debatable. ++ */ ++static inline void ++gcm_clear_ctx(gcm_ctx_t *ctx) ++{ ++ bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder)); ++ bzero(ctx->gcm_H, sizeof (ctx->gcm_H)); ++ bzero(ctx->gcm_Htable, sizeof (ctx->gcm_Htable)); ++ bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0)); ++ bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp)); ++} ++ ++/* Increment the GCM counter block by n. */ ++static inline void ++gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) ++{ ++ uint64_t counter_mask = ntohll(0x00000000ffffffffULL); ++ uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask); ++ ++ counter = htonll(counter + n); ++ counter &= counter_mask; ++ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; ++} ++ ++/* ++ * Encrypt multiple blocks of data in GCM mode. ++ * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines ++ * if possible. While processing a chunk the FPU is "locked". ++ */ ++static int ++gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, ++ size_t length, crypto_data_t *out, size_t block_size) ++{ ++ size_t bleft = length; ++ size_t need = 0; ++ size_t done = 0; ++ uint8_t *datap = (uint8_t *)data; ++ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; ++ const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); ++ uint64_t *ghash = ctx->gcm_ghash; ++ uint64_t *cb = ctx->gcm_cb; ++ uint8_t *ct_buf = NULL; ++ uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; ++ int rv = CRYPTO_SUCCESS; ++ ++ ASSERT(block_size == GCM_BLOCK_LEN); ++ /* ++ * If the last call left an incomplete block, try to fill ++ * it first. ++ */ ++ if (ctx->gcm_remainder_len > 0) { ++ need = block_size - ctx->gcm_remainder_len; ++ if (length < need) { ++ /* Accumulate bytes here and return. */ ++ bcopy(datap, (uint8_t *)ctx->gcm_remainder + ++ ctx->gcm_remainder_len, length); ++ ++ ctx->gcm_remainder_len += length; ++ if (ctx->gcm_copy_to == NULL) { ++ ctx->gcm_copy_to = datap; ++ } ++ return (CRYPTO_SUCCESS); ++ } else { ++ /* Complete incomplete block. */ ++ bcopy(datap, (uint8_t *)ctx->gcm_remainder + ++ ctx->gcm_remainder_len, need); ++ ++ ctx->gcm_copy_to = NULL; ++ } ++ } ++ ++ /* Allocate a buffer to encrypt to if there is enough input. */ ++ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { ++ ct_buf = vmem_alloc(chunk_size, ctx->gcm_kmflag); ++ if (ct_buf == NULL) { ++ return (CRYPTO_HOST_MEMORY); ++ } ++ } ++ ++ /* If we completed an incomplete block, encrypt and write it out. 
*/ ++ if (ctx->gcm_remainder_len > 0) { ++ kfpu_begin(); ++ aes_encrypt_intel(key->encr_ks.ks32, key->nr, ++ (const uint32_t *)cb, (uint32_t *)tmp); ++ ++ gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); ++ GHASH_AVX(ctx, tmp, block_size); ++ clear_fpu_regs(); ++ kfpu_end(); ++ /* ++ * We don't follow gcm_mode_encrypt_contiguous_blocks() here ++ * but assert that out is not null. ++ * See gcm_mode_encrypt_contiguous_blocks() above and ++ * https://github.com/zfsonlinux/zfs/issues/9661 ++ */ ++ ASSERT(out != NULL); ++ rv = crypto_put_output_data(tmp, out, block_size); ++ out->cd_offset += block_size; ++ gcm_incr_counter_block(ctx); ++ ctx->gcm_processed_data_len += block_size; ++ bleft -= need; ++ datap += need; ++ ctx->gcm_remainder_len = 0; ++ } ++ ++ /* Do the bulk encryption in chunk_size blocks. */ ++ for (; bleft >= chunk_size; bleft -= chunk_size) { ++ kfpu_begin(); ++ done = aesni_gcm_encrypt( ++ datap, ct_buf, chunk_size, key, cb, ghash); ++ ++ clear_fpu_regs(); ++ kfpu_end(); ++ if (done != chunk_size) { ++ rv = CRYPTO_FAILED; ++ goto out_nofpu; ++ } ++ if (out != NULL) { ++ rv = crypto_put_output_data(ct_buf, out, chunk_size); ++ if (rv != CRYPTO_SUCCESS) { ++ goto out_nofpu; ++ } ++ out->cd_offset += chunk_size; ++ } ++ datap += chunk_size; ++ ctx->gcm_processed_data_len += chunk_size; ++ } ++ /* Check if we are already done. */ ++ if (bleft == 0) { ++ goto out_nofpu; ++ } ++ /* Bulk encrypt the remaining data. */ ++ kfpu_begin(); ++ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { ++ done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); ++ if (done == 0) { ++ rv = CRYPTO_FAILED; ++ goto out; ++ } ++ if (out != NULL) { ++ rv = crypto_put_output_data(ct_buf, out, done); ++ if (rv != CRYPTO_SUCCESS) { ++ goto out; ++ } ++ out->cd_offset += done; ++ } ++ ctx->gcm_processed_data_len += done; ++ datap += done; ++ bleft -= done; ++ ++ } ++ /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */ ++ while (bleft > 0) { ++ if (bleft < block_size) { ++ bcopy(datap, ctx->gcm_remainder, bleft); ++ ctx->gcm_remainder_len = bleft; ++ ctx->gcm_copy_to = datap; ++ goto out; ++ } ++ /* Encrypt, hash and write out. */ ++ aes_encrypt_intel(key->encr_ks.ks32, key->nr, ++ (const uint32_t *)cb, (uint32_t *)tmp); ++ ++ gcm_xor_avx(datap, tmp); ++ GHASH_AVX(ctx, tmp, block_size); ++ if (out != NULL) { ++ rv = crypto_put_output_data(tmp, out, block_size); ++ if (rv != CRYPTO_SUCCESS) { ++ goto out; ++ } ++ out->cd_offset += block_size; ++ } ++ gcm_incr_counter_block(ctx); ++ ctx->gcm_processed_data_len += block_size; ++ datap += block_size; ++ bleft -= block_size; ++ } ++out: ++ clear_fpu_regs(); ++ kfpu_end(); ++out_nofpu: ++ if (ct_buf != NULL) { ++ vmem_free(ct_buf, chunk_size); ++ } ++ return (rv); ++} ++ ++/* ++ * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual ++ * incomplete last block. Encrypt the ICB. Calculate the tag and write it out. 
++ */ ++static int ++gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) ++{ ++ uint8_t *ghash = (uint8_t *)ctx->gcm_ghash; ++ uint32_t *J0 = (uint32_t *)ctx->gcm_J0; ++ uint8_t *remainder = (uint8_t *)ctx->gcm_remainder; ++ size_t rem_len = ctx->gcm_remainder_len; ++ const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; ++ int aes_rounds = ((aes_key_t *)keysched)->nr; ++ int rv; ++ ++ ASSERT(block_size == GCM_BLOCK_LEN); ++ ++ if (out->cd_length < (rem_len + ctx->gcm_tag_len)) { ++ return (CRYPTO_DATA_LEN_RANGE); ++ } ++ ++ kfpu_begin(); ++ /* Pad last incomplete block with zeros, encrypt and hash. */ ++ if (rem_len > 0) { ++ uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; ++ const uint32_t *cb = (uint32_t *)ctx->gcm_cb; ++ ++ aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp); ++ bzero(remainder + rem_len, block_size - rem_len); ++ for (int i = 0; i < rem_len; i++) { ++ remainder[i] ^= tmp[i]; ++ } ++ GHASH_AVX(ctx, remainder, block_size); ++ ctx->gcm_processed_data_len += rem_len; ++ /* No need to increment counter_block, it's the last block. */ ++ } ++ /* Finish tag. */ ++ ctx->gcm_len_a_len_c[1] = ++ htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); ++ GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size); ++ aes_encrypt_intel(keysched, aes_rounds, J0, J0); ++ ++ gcm_xor_avx((uint8_t *)J0, ghash); ++ clear_fpu_regs(); ++ kfpu_end(); ++ ++ /* Output remainder. */ ++ if (rem_len > 0) { ++ rv = crypto_put_output_data(remainder, out, rem_len); ++ if (rv != CRYPTO_SUCCESS) ++ return (rv); ++ } ++ out->cd_offset += rem_len; ++ ctx->gcm_remainder_len = 0; ++ rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); ++ if (rv != CRYPTO_SUCCESS) ++ return (rv); ++ ++ out->cd_offset += ctx->gcm_tag_len; ++ /* Clear sensitive data in the context before returning. */ ++ gcm_clear_ctx(ctx); ++ return (CRYPTO_SUCCESS); ++} ++ ++/* ++ * Finalize decryption: We just have accumulated crypto text, so now we ++ * decrypt it here inplace. ++ */ ++static int ++gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) ++{ ++ ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len); ++ ASSERT3U(block_size, ==, 16); ++ ++ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; ++ size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; ++ uint8_t *datap = ctx->gcm_pt_buf; ++ const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); ++ uint32_t *cb = (uint32_t *)ctx->gcm_cb; ++ uint64_t *ghash = ctx->gcm_ghash; ++ uint32_t *tmp = (uint32_t *)ctx->gcm_tmp; ++ int rv = CRYPTO_SUCCESS; ++ size_t bleft, done; ++ ++ /* ++ * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be ++ * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of ++ * GCM_AVX_MIN_DECRYPT_BYTES. ++ */ ++ for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { ++ kfpu_begin(); ++ done = aesni_gcm_decrypt(datap, datap, chunk_size, ++ (const void *)key, ctx->gcm_cb, ghash); ++ clear_fpu_regs(); ++ kfpu_end(); ++ if (done != chunk_size) { ++ return (CRYPTO_FAILED); ++ } ++ datap += done; ++ } ++ /* Decrypt remainder, which is less then chunk size, in one go. 
*/ ++ kfpu_begin(); ++ if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { ++ done = aesni_gcm_decrypt(datap, datap, bleft, ++ (const void *)key, ctx->gcm_cb, ghash); ++ if (done == 0) { ++ clear_fpu_regs(); ++ kfpu_end(); ++ return (CRYPTO_FAILED); ++ } ++ datap += done; ++ bleft -= done; ++ } ++ ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES); ++ ++ /* ++ * Now less then GCM_AVX_MIN_DECRYPT_BYTES bytes remain, ++ * decrypt them block by block. ++ */ ++ while (bleft > 0) { ++ /* Incomplete last block. */ ++ if (bleft < block_size) { ++ uint8_t *lastb = (uint8_t *)ctx->gcm_remainder; ++ ++ bzero(lastb, block_size); ++ bcopy(datap, lastb, bleft); ++ /* The GCM processing. */ ++ GHASH_AVX(ctx, lastb, block_size); ++ aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); ++ for (size_t i = 0; i < bleft; i++) { ++ datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i]; ++ } ++ break; ++ } ++ /* The GCM processing. */ ++ GHASH_AVX(ctx, datap, block_size); ++ aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); ++ gcm_xor_avx((uint8_t *)tmp, datap); ++ gcm_incr_counter_block(ctx); ++ ++ datap += block_size; ++ bleft -= block_size; ++ } ++ if (rv != CRYPTO_SUCCESS) { ++ clear_fpu_regs(); ++ kfpu_end(); ++ return (rv); ++ } ++ /* Decryption done, finish the tag. */ ++ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); ++ GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size); ++ aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0, ++ (uint32_t *)ctx->gcm_J0); ++ ++ gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash); ++ ++ /* We are done with the FPU, restore its state. */ ++ clear_fpu_regs(); ++ kfpu_end(); ++ ++ /* Compare the input authentication tag with what we calculated. */ ++ if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { ++ /* They don't match. */ ++ return (CRYPTO_INVALID_MAC); ++ } ++ rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); ++ if (rv != CRYPTO_SUCCESS) { ++ return (rv); ++ } ++ out->cd_offset += pt_len; ++ gcm_clear_ctx(ctx); ++ return (CRYPTO_SUCCESS); ++} ++ ++/* ++ * Initialize the GCM params H, Htabtle and the counter block. Save the ++ * initial counter block. ++ */ ++static int ++gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, ++ unsigned char *auth_data, size_t auth_data_len, size_t block_size) ++{ ++ uint8_t *cb = (uint8_t *)ctx->gcm_cb; ++ uint64_t *H = ctx->gcm_H; ++ const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; ++ int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr; ++ uint8_t *datap = auth_data; ++ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; ++ size_t bleft; ++ ++ ASSERT(block_size == GCM_BLOCK_LEN); ++ ++ /* Init H (encrypt zero block) and create the initial counter block. */ ++ bzero(ctx->gcm_ghash, sizeof (ctx->gcm_ghash)); ++ bzero(H, sizeof (ctx->gcm_H)); ++ kfpu_begin(); ++ aes_encrypt_intel(keysched, aes_rounds, ++ (const uint32_t *)H, (uint32_t *)H); ++ ++ gcm_init_htab_avx(ctx->gcm_Htable, H); ++ ++ if (iv_len == 12) { ++ bcopy(iv, cb, 12); ++ cb[12] = 0; ++ cb[13] = 0; ++ cb[14] = 0; ++ cb[15] = 1; ++ /* We need the ICB later. */ ++ bcopy(cb, ctx->gcm_J0, sizeof (ctx->gcm_J0)); ++ } else { ++ /* ++ * Most consumers use 12 byte IVs, so it's OK to use the ++ * original routines for other IV sizes, just avoid nesting ++ * kfpu_begin calls. ++ */ ++ clear_fpu_regs(); ++ kfpu_end(); ++ gcm_format_initial_blocks(iv, iv_len, ctx, block_size, ++ aes_copy_block, aes_xor_block); ++ kfpu_begin(); ++ } ++ ++ /* Openssl post increments the counter, adjust for that. 
*/ ++ gcm_incr_counter_block(ctx); ++ ++ /* Ghash AAD in chunk_size blocks. */ ++ for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) { ++ GHASH_AVX(ctx, datap, chunk_size); ++ datap += chunk_size; ++ clear_fpu_regs(); ++ kfpu_end(); ++ kfpu_begin(); ++ } ++ /* Ghash the remainder and handle possible incomplete GCM block. */ ++ if (bleft > 0) { ++ size_t incomp = bleft % block_size; ++ ++ bleft -= incomp; ++ if (bleft > 0) { ++ GHASH_AVX(ctx, datap, bleft); ++ datap += bleft; ++ } ++ if (incomp > 0) { ++ /* Zero pad and hash incomplete last block. */ ++ uint8_t *authp = (uint8_t *)ctx->gcm_tmp; ++ ++ bzero(authp, block_size); ++ bcopy(datap, authp, incomp); ++ GHASH_AVX(ctx, authp, block_size); ++ } ++ } ++ clear_fpu_regs(); ++ kfpu_end(); ++ return (CRYPTO_SUCCESS); ++} ++ ++#if defined(_KERNEL) ++static int ++icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) ++{ ++ unsigned long val; ++ char val_rounded[16]; ++ int error = 0; ++ ++ error = kstrtoul(buf, 0, &val); ++ if (error) ++ return (error); ++ ++ val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; ++ ++ if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) ++ return (-EINVAL); ++ ++ snprintf(val_rounded, 16, "%u", (uint32_t)val); ++ error = param_set_uint(val_rounded, kp); ++ return (error); ++} ++ ++module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, ++ param_get_uint, &gcm_avx_chunk_size, 0644); ++ ++MODULE_PARM_DESC(icp_gcm_avx_chunk_size, ++ "How many bytes to process while owning the FPU"); ++ ++#endif /* defined(__KERNEL) */ ++#endif /* ifdef CAN_USE_GCM_ASM */ +Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams +=================================================================== +--- /dev/null ++++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams +@@ -0,0 +1,36 @@ ++Copyright (c) 2006-2017, CRYPTOGAMS by ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions ++are met: ++ ++ * Redistributions of source code must retain copyright notices, ++ this list of conditions and the following disclaimer. ++ ++ * Redistributions in binary form must reproduce the above ++ copyright notice, this list of conditions and the following ++ disclaimer in the documentation and/or other materials ++ provided with the distribution. ++ ++ * Neither the name of the CRYPTOGAMS nor the names of its ++ copyright holder and contributors may be used to endorse or ++ promote products derived from this software without specific ++ prior written permission. ++ ++ALTERNATIVELY, provided that this notice is retained in full, this ++product may be distributed under the terms of the GNU General Public ++License (GPL), in which case the provisions of the GPL apply INSTEAD OF ++those given above. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ++OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip +=================================================================== +--- /dev/null ++++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip +@@ -0,0 +1 @@ ++PORTIONS OF GCM and GHASH FUNCTIONALITY +Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl +=================================================================== +--- /dev/null ++++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl +@@ -0,0 +1,177 @@ ++ ++ Apache License ++ Version 2.0, January 2004 ++ https://www.apache.org/licenses/ ++ ++ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION ++ ++ 1. Definitions. ++ ++ "License" shall mean the terms and conditions for use, reproduction, ++ and distribution as defined by Sections 1 through 9 of this document. ++ ++ "Licensor" shall mean the copyright owner or entity authorized by ++ the copyright owner that is granting the License. ++ ++ "Legal Entity" shall mean the union of the acting entity and all ++ other entities that control, are controlled by, or are under common ++ control with that entity. For the purposes of this definition, ++ "control" means (i) the power, direct or indirect, to cause the ++ direction or management of such entity, whether by contract or ++ otherwise, or (ii) ownership of fifty percent (50%) or more of the ++ outstanding shares, or (iii) beneficial ownership of such entity. ++ ++ "You" (or "Your") shall mean an individual or Legal Entity ++ exercising permissions granted by this License. ++ ++ "Source" form shall mean the preferred form for making modifications, ++ including but not limited to software source code, documentation ++ source, and configuration files. ++ ++ "Object" form shall mean any form resulting from mechanical ++ transformation or translation of a Source form, including but ++ not limited to compiled object code, generated documentation, ++ and conversions to other media types. ++ ++ "Work" shall mean the work of authorship, whether in Source or ++ Object form, made available under the License, as indicated by a ++ copyright notice that is included in or attached to the work ++ (an example is provided in the Appendix below). ++ ++ "Derivative Works" shall mean any work, whether in Source or Object ++ form, that is based on (or derived from) the Work and for which the ++ editorial revisions, annotations, elaborations, or other modifications ++ represent, as a whole, an original work of authorship. For the purposes ++ of this License, Derivative Works shall not include works that remain ++ separable from, or merely link (or bind by name) to the interfaces of, ++ the Work and Derivative Works thereof. 
++ ++ "Contribution" shall mean any work of authorship, including ++ the original version of the Work and any modifications or additions ++ to that Work or Derivative Works thereof, that is intentionally ++ submitted to Licensor for inclusion in the Work by the copyright owner ++ or by an individual or Legal Entity authorized to submit on behalf of ++ the copyright owner. For the purposes of this definition, "submitted" ++ means any form of electronic, verbal, or written communication sent ++ to the Licensor or its representatives, including but not limited to ++ communication on electronic mailing lists, source code control systems, ++ and issue tracking systems that are managed by, or on behalf of, the ++ Licensor for the purpose of discussing and improving the Work, but ++ excluding communication that is conspicuously marked or otherwise ++ designated in writing by the copyright owner as "Not a Contribution." ++ ++ "Contributor" shall mean Licensor and any individual or Legal Entity ++ on behalf of whom a Contribution has been received by Licensor and ++ subsequently incorporated within the Work. ++ ++ 2. Grant of Copyright License. Subject to the terms and conditions of ++ this License, each Contributor hereby grants to You a perpetual, ++ worldwide, non-exclusive, no-charge, royalty-free, irrevocable ++ copyright license to reproduce, prepare Derivative Works of, ++ publicly display, publicly perform, sublicense, and distribute the ++ Work and such Derivative Works in Source or Object form. ++ ++ 3. Grant of Patent License. Subject to the terms and conditions of ++ this License, each Contributor hereby grants to You a perpetual, ++ worldwide, non-exclusive, no-charge, royalty-free, irrevocable ++ (except as stated in this section) patent license to make, have made, ++ use, offer to sell, sell, import, and otherwise transfer the Work, ++ where such license applies only to those patent claims licensable ++ by such Contributor that are necessarily infringed by their ++ Contribution(s) alone or by combination of their Contribution(s) ++ with the Work to which such Contribution(s) was submitted. If You ++ institute patent litigation against any entity (including a ++ cross-claim or counterclaim in a lawsuit) alleging that the Work ++ or a Contribution incorporated within the Work constitutes direct ++ or contributory patent infringement, then any patent licenses ++ granted to You under this License for that Work shall terminate ++ as of the date such litigation is filed. ++ ++ 4. Redistribution. 
You may reproduce and distribute copies of the ++ Work or Derivative Works thereof in any medium, with or without ++ modifications, and in Source or Object form, provided that You ++ meet the following conditions: ++ ++ (a) You must give any other recipients of the Work or ++ Derivative Works a copy of this License; and ++ ++ (b) You must cause any modified files to carry prominent notices ++ stating that You changed the files; and ++ ++ (c) You must retain, in the Source form of any Derivative Works ++ that You distribute, all copyright, patent, trademark, and ++ attribution notices from the Source form of the Work, ++ excluding those notices that do not pertain to any part of ++ the Derivative Works; and ++ ++ (d) If the Work includes a "NOTICE" text file as part of its ++ distribution, then any Derivative Works that You distribute must ++ include a readable copy of the attribution notices contained ++ within such NOTICE file, excluding those notices that do not ++ pertain to any part of the Derivative Works, in at least one ++ of the following places: within a NOTICE text file distributed ++ as part of the Derivative Works; within the Source form or ++ documentation, if provided along with the Derivative Works; or, ++ within a display generated by the Derivative Works, if and ++ wherever such third-party notices normally appear. The contents ++ of the NOTICE file are for informational purposes only and ++ do not modify the License. You may add Your own attribution ++ notices within Derivative Works that You distribute, alongside ++ or as an addendum to the NOTICE text from the Work, provided ++ that such additional attribution notices cannot be construed ++ as modifying the License. ++ ++ You may add Your own copyright statement to Your modifications and ++ may provide additional or different license terms and conditions ++ for use, reproduction, or distribution of Your modifications, or ++ for any such Derivative Works as a whole, provided Your use, ++ reproduction, and distribution of the Work otherwise complies with ++ the conditions stated in this License. ++ ++ 5. Submission of Contributions. Unless You explicitly state otherwise, ++ any Contribution intentionally submitted for inclusion in the Work ++ by You to the Licensor shall be under the terms and conditions of ++ this License, without any additional terms or conditions. ++ Notwithstanding the above, nothing herein shall supersede or modify ++ the terms of any separate license agreement you may have executed ++ with Licensor regarding such Contributions. ++ ++ 6. Trademarks. This License does not grant permission to use the trade ++ names, trademarks, service marks, or product names of the Licensor, ++ except as required for reasonable and customary use in describing the ++ origin of the Work and reproducing the content of the NOTICE file. ++ ++ 7. Disclaimer of Warranty. Unless required by applicable law or ++ agreed to in writing, Licensor provides the Work (and each ++ Contributor provides its Contributions) on an "AS IS" BASIS, ++ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or ++ implied, including, without limitation, any warranties or conditions ++ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A ++ PARTICULAR PURPOSE. You are solely responsible for determining the ++ appropriateness of using or redistributing the Work and assume any ++ risks associated with Your exercise of permissions under this License. ++ ++ 8. Limitation of Liability. 
In no event and under no legal theory, ++ whether in tort (including negligence), contract, or otherwise, ++ unless required by applicable law (such as deliberate and grossly ++ negligent acts) or agreed to in writing, shall any Contributor be ++ liable to You for damages, including any direct, indirect, special, ++ incidental, or consequential damages of any character arising as a ++ result of this License or out of the use or inability to use the ++ Work (including but not limited to damages for loss of goodwill, ++ work stoppage, computer failure or malfunction, or any and all ++ other commercial damages or losses), even if such Contributor ++ has been advised of the possibility of such damages. ++ ++ 9. Accepting Warranty or Additional Liability. While redistributing ++ the Work or Derivative Works thereof, You may choose to offer, ++ and charge a fee for, acceptance of support, warranty, indemnity, ++ or other liability obligations and/or rights consistent with this ++ License. However, in accepting such obligations, You may act only ++ on Your own behalf and on Your sole responsibility, not on behalf ++ of any other Contributor, and only if You agree to indemnify, ++ defend, and hold each Contributor harmless for any liability ++ incurred by, or claims asserted against, such Contributor by reason ++ of your accepting any such warranty or additional liability. ++ ++ END OF TERMS AND CONDITIONS +Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip +=================================================================== +--- /dev/null ++++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip +@@ -0,0 +1 @@ ++PORTIONS OF GCM and GHASH FUNCTIONALITY +Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +=================================================================== +--- /dev/null ++++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +@@ -0,0 +1,892 @@ ++# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# ==================================================================== ++# Written by Andy Polyakov for the OpenSSL ++# project. The module is, however, dual licensed under OpenSSL and ++# CRYPTOGAMS licenses depending on where you obtain it. For further ++# details see http://www.openssl.org/~appro/cryptogams/. ++# ==================================================================== ++# ++# ++# AES-NI-CTR+GHASH stitch. ++# ++# February 2013 ++# ++# OpenSSL GCM implementation is organized in such way that its ++# performance is rather close to the sum of its streamed components, ++# in the context parallelized AES-NI CTR and modulo-scheduled ++# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation ++# was observed to perform significantly better than the sum of the ++# components on contemporary CPUs, the effort was deemed impossible to ++# justify. This module is based on combination of Intel submissions, ++# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max ++# Locktyukhin of Intel Corp. 
who verified that it reduces shuffles ++# pressure with notable relative improvement, achieving 1.0 cycle per ++# byte processed with 128-bit key on Haswell processor, 0.74 - on ++# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled ++# measurements for favourable packet size, one divisible by 96. ++# Applications using the EVP interface will observe a few percent ++# worse performance.] ++# ++# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). ++# ++# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest ++# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf ++ ++# Generated once from ++# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl ++# and modified for ICP. Modification are kept at a bare minimum to ease later ++# upstream merges. ++ ++#if defined(__x86_64__) && defined(HAVE_AVX) && \ ++ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) && defined(HAVE_MOVBE) ++ ++.text ++ ++.type _aesni_ctr32_ghash_6x,@function ++.align 32 ++_aesni_ctr32_ghash_6x: ++ vmovdqu 32(%r11),%xmm2 ++ subq $6,%rdx ++ vpxor %xmm4,%xmm4,%xmm4 ++ vmovdqu 0-128(%rcx),%xmm15 ++ vpaddb %xmm2,%xmm1,%xmm10 ++ vpaddb %xmm2,%xmm10,%xmm11 ++ vpaddb %xmm2,%xmm11,%xmm12 ++ vpaddb %xmm2,%xmm12,%xmm13 ++ vpaddb %xmm2,%xmm13,%xmm14 ++ vpxor %xmm15,%xmm1,%xmm9 ++ vmovdqu %xmm4,16+8(%rsp) ++ jmp .Loop6x ++ ++.align 32 ++.Loop6x: ++ addl $100663296,%ebx ++ jc .Lhandle_ctr32 ++ vmovdqu 0-32(%r9),%xmm3 ++ vpaddb %xmm2,%xmm14,%xmm1 ++ vpxor %xmm15,%xmm10,%xmm10 ++ vpxor %xmm15,%xmm11,%xmm11 ++ ++.Lresume_ctr32: ++ vmovdqu %xmm1,(%r8) ++ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 ++ vpxor %xmm15,%xmm12,%xmm12 ++ vmovups 16-128(%rcx),%xmm2 ++ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 ++ xorq %r12,%r12 ++ cmpq %r14,%r15 ++ ++ vaesenc %xmm2,%xmm9,%xmm9 ++ vmovdqu 48+8(%rsp),%xmm0 ++ vpxor %xmm15,%xmm13,%xmm13 ++ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 ++ vaesenc %xmm2,%xmm10,%xmm10 ++ vpxor %xmm15,%xmm14,%xmm14 ++ setnc %r12b ++ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 ++ vaesenc %xmm2,%xmm11,%xmm11 ++ vmovdqu 16-32(%r9),%xmm3 ++ negq %r12 ++ vaesenc %xmm2,%xmm12,%xmm12 ++ vpxor %xmm5,%xmm6,%xmm6 ++ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 ++ vpxor %xmm4,%xmm8,%xmm8 ++ vaesenc %xmm2,%xmm13,%xmm13 ++ vpxor %xmm5,%xmm1,%xmm4 ++ andq $0x60,%r12 ++ vmovups 32-128(%rcx),%xmm15 ++ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 ++ vaesenc %xmm2,%xmm14,%xmm14 ++ ++ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 ++ leaq (%r14,%r12,1),%r14 ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vpxor 16+8(%rsp),%xmm8,%xmm8 ++ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 ++ vmovdqu 64+8(%rsp),%xmm0 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ movbeq 88(%r14),%r13 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ movbeq 80(%r14),%r12 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ movq %r13,32+8(%rsp) ++ vaesenc %xmm15,%xmm13,%xmm13 ++ movq %r12,40+8(%rsp) ++ vmovdqu 48-32(%r9),%xmm5 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ ++ vmovups 48-128(%rcx),%xmm15 ++ vpxor %xmm1,%xmm6,%xmm6 ++ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vpxor %xmm2,%xmm6,%xmm6 ++ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ vpxor %xmm3,%xmm7,%xmm7 ++ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 ++ vmovdqu 80+8(%rsp),%xmm0 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ vaesenc %xmm15,%xmm13,%xmm13 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vmovdqu 64-32(%r9),%xmm1 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ ++ vmovups 64-128(%rcx),%xmm15 ++ vpxor %xmm2,%xmm6,%xmm6 ++ vpclmulqdq 
$0x00,%xmm1,%xmm0,%xmm2 ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vpxor %xmm3,%xmm6,%xmm6 ++ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ movbeq 72(%r14),%r13 ++ vpxor %xmm5,%xmm7,%xmm7 ++ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ movbeq 64(%r14),%r12 ++ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 ++ vmovdqu 96+8(%rsp),%xmm0 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ movq %r13,48+8(%rsp) ++ vaesenc %xmm15,%xmm13,%xmm13 ++ movq %r12,56+8(%rsp) ++ vpxor %xmm2,%xmm4,%xmm4 ++ vmovdqu 96-32(%r9),%xmm2 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ ++ vmovups 80-128(%rcx),%xmm15 ++ vpxor %xmm3,%xmm6,%xmm6 ++ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vpxor %xmm5,%xmm6,%xmm6 ++ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ movbeq 56(%r14),%r13 ++ vpxor %xmm1,%xmm7,%xmm7 ++ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 ++ vpxor 112+8(%rsp),%xmm8,%xmm8 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ movbeq 48(%r14),%r12 ++ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ movq %r13,64+8(%rsp) ++ vaesenc %xmm15,%xmm13,%xmm13 ++ movq %r12,72+8(%rsp) ++ vpxor %xmm3,%xmm4,%xmm4 ++ vmovdqu 112-32(%r9),%xmm3 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ ++ vmovups 96-128(%rcx),%xmm15 ++ vpxor %xmm5,%xmm6,%xmm6 ++ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vpxor %xmm1,%xmm6,%xmm6 ++ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ movbeq 40(%r14),%r13 ++ vpxor %xmm2,%xmm7,%xmm7 ++ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ movbeq 32(%r14),%r12 ++ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ movq %r13,80+8(%rsp) ++ vaesenc %xmm15,%xmm13,%xmm13 ++ movq %r12,88+8(%rsp) ++ vpxor %xmm5,%xmm6,%xmm6 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ vpxor %xmm1,%xmm6,%xmm6 ++ ++ vmovups 112-128(%rcx),%xmm15 ++ vpslldq $8,%xmm6,%xmm5 ++ vpxor %xmm2,%xmm4,%xmm4 ++ vmovdqu 16(%r11),%xmm3 ++ ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vpxor %xmm8,%xmm7,%xmm7 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ vpxor %xmm5,%xmm4,%xmm4 ++ movbeq 24(%r14),%r13 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ movbeq 16(%r14),%r12 ++ vpalignr $8,%xmm4,%xmm4,%xmm0 ++ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 ++ movq %r13,96+8(%rsp) ++ vaesenc %xmm15,%xmm12,%xmm12 ++ movq %r12,104+8(%rsp) ++ vaesenc %xmm15,%xmm13,%xmm13 ++ vmovups 128-128(%rcx),%xmm1 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ ++ vaesenc %xmm1,%xmm9,%xmm9 ++ vmovups 144-128(%rcx),%xmm15 ++ vaesenc %xmm1,%xmm10,%xmm10 ++ vpsrldq $8,%xmm6,%xmm6 ++ vaesenc %xmm1,%xmm11,%xmm11 ++ vpxor %xmm6,%xmm7,%xmm7 ++ vaesenc %xmm1,%xmm12,%xmm12 ++ vpxor %xmm0,%xmm4,%xmm4 ++ movbeq 8(%r14),%r13 ++ vaesenc %xmm1,%xmm13,%xmm13 ++ movbeq 0(%r14),%r12 ++ vaesenc %xmm1,%xmm14,%xmm14 ++ vmovups 160-128(%rcx),%xmm1 ++ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. ++ jb .Lenc_tail ++ ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ vaesenc %xmm15,%xmm13,%xmm13 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ ++ vaesenc %xmm1,%xmm9,%xmm9 ++ vaesenc %xmm1,%xmm10,%xmm10 ++ vaesenc %xmm1,%xmm11,%xmm11 ++ vaesenc %xmm1,%xmm12,%xmm12 ++ vaesenc %xmm1,%xmm13,%xmm13 ++ vmovups 176-128(%rcx),%xmm15 ++ vaesenc %xmm1,%xmm14,%xmm14 ++ vmovups 192-128(%rcx),%xmm1 ++ cmpl $14,%ebp // ICP does not zero key schedule. 
++ jb .Lenc_tail ++ ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ vaesenc %xmm15,%xmm13,%xmm13 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ ++ vaesenc %xmm1,%xmm9,%xmm9 ++ vaesenc %xmm1,%xmm10,%xmm10 ++ vaesenc %xmm1,%xmm11,%xmm11 ++ vaesenc %xmm1,%xmm12,%xmm12 ++ vaesenc %xmm1,%xmm13,%xmm13 ++ vmovups 208-128(%rcx),%xmm15 ++ vaesenc %xmm1,%xmm14,%xmm14 ++ vmovups 224-128(%rcx),%xmm1 ++ jmp .Lenc_tail ++ ++.align 32 ++.Lhandle_ctr32: ++ vmovdqu (%r11),%xmm0 ++ vpshufb %xmm0,%xmm1,%xmm6 ++ vmovdqu 48(%r11),%xmm5 ++ vpaddd 64(%r11),%xmm6,%xmm10 ++ vpaddd %xmm5,%xmm6,%xmm11 ++ vmovdqu 0-32(%r9),%xmm3 ++ vpaddd %xmm5,%xmm10,%xmm12 ++ vpshufb %xmm0,%xmm10,%xmm10 ++ vpaddd %xmm5,%xmm11,%xmm13 ++ vpshufb %xmm0,%xmm11,%xmm11 ++ vpxor %xmm15,%xmm10,%xmm10 ++ vpaddd %xmm5,%xmm12,%xmm14 ++ vpshufb %xmm0,%xmm12,%xmm12 ++ vpxor %xmm15,%xmm11,%xmm11 ++ vpaddd %xmm5,%xmm13,%xmm1 ++ vpshufb %xmm0,%xmm13,%xmm13 ++ vpshufb %xmm0,%xmm14,%xmm14 ++ vpshufb %xmm0,%xmm1,%xmm1 ++ jmp .Lresume_ctr32 ++ ++.align 32 ++.Lenc_tail: ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vmovdqu %xmm7,16+8(%rsp) ++ vpalignr $8,%xmm4,%xmm4,%xmm8 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 ++ vpxor 0(%rdi),%xmm1,%xmm2 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ vpxor 16(%rdi),%xmm1,%xmm0 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ vpxor 32(%rdi),%xmm1,%xmm5 ++ vaesenc %xmm15,%xmm13,%xmm13 ++ vpxor 48(%rdi),%xmm1,%xmm6 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ vpxor 64(%rdi),%xmm1,%xmm7 ++ vpxor 80(%rdi),%xmm1,%xmm3 ++ vmovdqu (%r8),%xmm1 ++ ++ vaesenclast %xmm2,%xmm9,%xmm9 ++ vmovdqu 32(%r11),%xmm2 ++ vaesenclast %xmm0,%xmm10,%xmm10 ++ vpaddb %xmm2,%xmm1,%xmm0 ++ movq %r13,112+8(%rsp) ++ leaq 96(%rdi),%rdi ++ vaesenclast %xmm5,%xmm11,%xmm11 ++ vpaddb %xmm2,%xmm0,%xmm5 ++ movq %r12,120+8(%rsp) ++ leaq 96(%rsi),%rsi ++ vmovdqu 0-128(%rcx),%xmm15 ++ vaesenclast %xmm6,%xmm12,%xmm12 ++ vpaddb %xmm2,%xmm5,%xmm6 ++ vaesenclast %xmm7,%xmm13,%xmm13 ++ vpaddb %xmm2,%xmm6,%xmm7 ++ vaesenclast %xmm3,%xmm14,%xmm14 ++ vpaddb %xmm2,%xmm7,%xmm3 ++ ++ addq $0x60,%r10 ++ subq $0x6,%rdx ++ jc .L6x_done ++ ++ vmovups %xmm9,-96(%rsi) ++ vpxor %xmm15,%xmm1,%xmm9 ++ vmovups %xmm10,-80(%rsi) ++ vmovdqa %xmm0,%xmm10 ++ vmovups %xmm11,-64(%rsi) ++ vmovdqa %xmm5,%xmm11 ++ vmovups %xmm12,-48(%rsi) ++ vmovdqa %xmm6,%xmm12 ++ vmovups %xmm13,-32(%rsi) ++ vmovdqa %xmm7,%xmm13 ++ vmovups %xmm14,-16(%rsi) ++ vmovdqa %xmm3,%xmm14 ++ vmovdqu 32+8(%rsp),%xmm7 ++ jmp .Loop6x ++ ++.L6x_done: ++ vpxor 16+8(%rsp),%xmm8,%xmm8 ++ vpxor %xmm4,%xmm8,%xmm8 ++ ++ .byte 0xf3,0xc3 ++.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x ++.globl aesni_gcm_decrypt ++.type aesni_gcm_decrypt,@function ++.align 32 ++aesni_gcm_decrypt: ++.cfi_startproc ++ xorq %r10,%r10 ++ cmpq $0x60,%rdx ++ jb .Lgcm_dec_abort ++ ++ leaq (%rsp),%rax ++.cfi_def_cfa_register %rax ++ pushq %rbx ++.cfi_offset %rbx,-16 ++ pushq %rbp ++.cfi_offset %rbp,-24 ++ pushq %r12 ++.cfi_offset %r12,-32 ++ pushq %r13 ++.cfi_offset %r13,-40 ++ pushq %r14 ++.cfi_offset %r14,-48 ++ pushq %r15 ++.cfi_offset %r15,-56 ++ vzeroupper ++ ++ vmovdqu (%r8),%xmm1 ++ addq $-128,%rsp ++ movl 12(%r8),%ebx ++ leaq .Lbswap_mask(%rip),%r11 ++ leaq -128(%rcx),%r14 ++ movq $0xf80,%r15 ++ vmovdqu (%r9),%xmm8 ++ andq $-128,%rsp ++ vmovdqu (%r11),%xmm0 ++ leaq 128(%rcx),%rcx ++ leaq 32+32(%r9),%r9 ++ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. 
++ vpshufb %xmm0,%xmm8,%xmm8 ++ ++ andq %r15,%r14 ++ andq %rsp,%r15 ++ subq %r14,%r15 ++ jc .Ldec_no_key_aliasing ++ cmpq $768,%r15 ++ jnc .Ldec_no_key_aliasing ++ subq %r15,%rsp ++.Ldec_no_key_aliasing: ++ ++ vmovdqu 80(%rdi),%xmm7 ++ leaq (%rdi),%r14 ++ vmovdqu 64(%rdi),%xmm4 ++ leaq -192(%rdi,%rdx,1),%r15 ++ vmovdqu 48(%rdi),%xmm5 ++ shrq $4,%rdx ++ xorq %r10,%r10 ++ vmovdqu 32(%rdi),%xmm6 ++ vpshufb %xmm0,%xmm7,%xmm7 ++ vmovdqu 16(%rdi),%xmm2 ++ vpshufb %xmm0,%xmm4,%xmm4 ++ vmovdqu (%rdi),%xmm3 ++ vpshufb %xmm0,%xmm5,%xmm5 ++ vmovdqu %xmm4,48(%rsp) ++ vpshufb %xmm0,%xmm6,%xmm6 ++ vmovdqu %xmm5,64(%rsp) ++ vpshufb %xmm0,%xmm2,%xmm2 ++ vmovdqu %xmm6,80(%rsp) ++ vpshufb %xmm0,%xmm3,%xmm3 ++ vmovdqu %xmm2,96(%rsp) ++ vmovdqu %xmm3,112(%rsp) ++ ++ call _aesni_ctr32_ghash_6x ++ ++ vmovups %xmm9,-96(%rsi) ++ vmovups %xmm10,-80(%rsi) ++ vmovups %xmm11,-64(%rsi) ++ vmovups %xmm12,-48(%rsi) ++ vmovups %xmm13,-32(%rsi) ++ vmovups %xmm14,-16(%rsi) ++ ++ vpshufb (%r11),%xmm8,%xmm8 ++ vmovdqu %xmm8,-64(%r9) ++ ++ vzeroupper ++ movq -48(%rax),%r15 ++.cfi_restore %r15 ++ movq -40(%rax),%r14 ++.cfi_restore %r14 ++ movq -32(%rax),%r13 ++.cfi_restore %r13 ++ movq -24(%rax),%r12 ++.cfi_restore %r12 ++ movq -16(%rax),%rbp ++.cfi_restore %rbp ++ movq -8(%rax),%rbx ++.cfi_restore %rbx ++ leaq (%rax),%rsp ++.cfi_def_cfa_register %rsp ++.Lgcm_dec_abort: ++ movq %r10,%rax ++ .byte 0xf3,0xc3 ++.cfi_endproc ++.size aesni_gcm_decrypt,.-aesni_gcm_decrypt ++.type _aesni_ctr32_6x,@function ++.align 32 ++_aesni_ctr32_6x: ++ vmovdqu 0-128(%rcx),%xmm4 ++ vmovdqu 32(%r11),%xmm2 ++ leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. ++ vmovups 16-128(%rcx),%xmm15 ++ leaq 32-128(%rcx),%r12 ++ vpxor %xmm4,%xmm1,%xmm9 ++ addl $100663296,%ebx ++ jc .Lhandle_ctr32_2 ++ vpaddb %xmm2,%xmm1,%xmm10 ++ vpaddb %xmm2,%xmm10,%xmm11 ++ vpxor %xmm4,%xmm10,%xmm10 ++ vpaddb %xmm2,%xmm11,%xmm12 ++ vpxor %xmm4,%xmm11,%xmm11 ++ vpaddb %xmm2,%xmm12,%xmm13 ++ vpxor %xmm4,%xmm12,%xmm12 ++ vpaddb %xmm2,%xmm13,%xmm14 ++ vpxor %xmm4,%xmm13,%xmm13 ++ vpaddb %xmm2,%xmm14,%xmm1 ++ vpxor %xmm4,%xmm14,%xmm14 ++ jmp .Loop_ctr32 ++ ++.align 16 ++.Loop_ctr32: ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ vaesenc %xmm15,%xmm13,%xmm13 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ vmovups (%r12),%xmm15 ++ leaq 16(%r12),%r12 ++ decl %r13d ++ jnz .Loop_ctr32 ++ ++ vmovdqu (%r12),%xmm3 ++ vaesenc %xmm15,%xmm9,%xmm9 ++ vpxor 0(%rdi),%xmm3,%xmm4 ++ vaesenc %xmm15,%xmm10,%xmm10 ++ vpxor 16(%rdi),%xmm3,%xmm5 ++ vaesenc %xmm15,%xmm11,%xmm11 ++ vpxor 32(%rdi),%xmm3,%xmm6 ++ vaesenc %xmm15,%xmm12,%xmm12 ++ vpxor 48(%rdi),%xmm3,%xmm8 ++ vaesenc %xmm15,%xmm13,%xmm13 ++ vpxor 64(%rdi),%xmm3,%xmm2 ++ vaesenc %xmm15,%xmm14,%xmm14 ++ vpxor 80(%rdi),%xmm3,%xmm3 ++ leaq 96(%rdi),%rdi ++ ++ vaesenclast %xmm4,%xmm9,%xmm9 ++ vaesenclast %xmm5,%xmm10,%xmm10 ++ vaesenclast %xmm6,%xmm11,%xmm11 ++ vaesenclast %xmm8,%xmm12,%xmm12 ++ vaesenclast %xmm2,%xmm13,%xmm13 ++ vaesenclast %xmm3,%xmm14,%xmm14 ++ vmovups %xmm9,0(%rsi) ++ vmovups %xmm10,16(%rsi) ++ vmovups %xmm11,32(%rsi) ++ vmovups %xmm12,48(%rsi) ++ vmovups %xmm13,64(%rsi) ++ vmovups %xmm14,80(%rsi) ++ leaq 96(%rsi),%rsi ++ ++ .byte 0xf3,0xc3 ++.align 32 ++.Lhandle_ctr32_2: ++ vpshufb %xmm0,%xmm1,%xmm6 ++ vmovdqu 48(%r11),%xmm5 ++ vpaddd 64(%r11),%xmm6,%xmm10 ++ vpaddd %xmm5,%xmm6,%xmm11 ++ vpaddd %xmm5,%xmm10,%xmm12 ++ vpshufb %xmm0,%xmm10,%xmm10 ++ vpaddd %xmm5,%xmm11,%xmm13 ++ vpshufb %xmm0,%xmm11,%xmm11 ++ vpxor %xmm4,%xmm10,%xmm10 ++ vpaddd 
%xmm5,%xmm12,%xmm14 ++ vpshufb %xmm0,%xmm12,%xmm12 ++ vpxor %xmm4,%xmm11,%xmm11 ++ vpaddd %xmm5,%xmm13,%xmm1 ++ vpshufb %xmm0,%xmm13,%xmm13 ++ vpxor %xmm4,%xmm12,%xmm12 ++ vpshufb %xmm0,%xmm14,%xmm14 ++ vpxor %xmm4,%xmm13,%xmm13 ++ vpshufb %xmm0,%xmm1,%xmm1 ++ vpxor %xmm4,%xmm14,%xmm14 ++ jmp .Loop_ctr32 ++.size _aesni_ctr32_6x,.-_aesni_ctr32_6x ++ ++.globl aesni_gcm_encrypt ++.type aesni_gcm_encrypt,@function ++.align 32 ++aesni_gcm_encrypt: ++.cfi_startproc ++ xorq %r10,%r10 ++ cmpq $288,%rdx ++ jb .Lgcm_enc_abort ++ ++ leaq (%rsp),%rax ++.cfi_def_cfa_register %rax ++ pushq %rbx ++.cfi_offset %rbx,-16 ++ pushq %rbp ++.cfi_offset %rbp,-24 ++ pushq %r12 ++.cfi_offset %r12,-32 ++ pushq %r13 ++.cfi_offset %r13,-40 ++ pushq %r14 ++.cfi_offset %r14,-48 ++ pushq %r15 ++.cfi_offset %r15,-56 ++ vzeroupper ++ ++ vmovdqu (%r8),%xmm1 ++ addq $-128,%rsp ++ movl 12(%r8),%ebx ++ leaq .Lbswap_mask(%rip),%r11 ++ leaq -128(%rcx),%r14 ++ movq $0xf80,%r15 ++ leaq 128(%rcx),%rcx ++ vmovdqu (%r11),%xmm0 ++ andq $-128,%rsp ++ movl 504-128(%rcx),%ebp // ICP has an larger offset for rounds. ++ ++ andq %r15,%r14 ++ andq %rsp,%r15 ++ subq %r14,%r15 ++ jc .Lenc_no_key_aliasing ++ cmpq $768,%r15 ++ jnc .Lenc_no_key_aliasing ++ subq %r15,%rsp ++.Lenc_no_key_aliasing: ++ ++ leaq (%rsi),%r14 ++ leaq -192(%rsi,%rdx,1),%r15 ++ shrq $4,%rdx ++ ++ call _aesni_ctr32_6x ++ vpshufb %xmm0,%xmm9,%xmm8 ++ vpshufb %xmm0,%xmm10,%xmm2 ++ vmovdqu %xmm8,112(%rsp) ++ vpshufb %xmm0,%xmm11,%xmm4 ++ vmovdqu %xmm2,96(%rsp) ++ vpshufb %xmm0,%xmm12,%xmm5 ++ vmovdqu %xmm4,80(%rsp) ++ vpshufb %xmm0,%xmm13,%xmm6 ++ vmovdqu %xmm5,64(%rsp) ++ vpshufb %xmm0,%xmm14,%xmm7 ++ vmovdqu %xmm6,48(%rsp) ++ ++ call _aesni_ctr32_6x ++ ++ vmovdqu (%r9),%xmm8 ++ leaq 32+32(%r9),%r9 ++ subq $12,%rdx ++ movq $192,%r10 ++ vpshufb %xmm0,%xmm8,%xmm8 ++ ++ call _aesni_ctr32_ghash_6x ++ vmovdqu 32(%rsp),%xmm7 ++ vmovdqu (%r11),%xmm0 ++ vmovdqu 0-32(%r9),%xmm3 ++ vpunpckhqdq %xmm7,%xmm7,%xmm1 ++ vmovdqu 32-32(%r9),%xmm15 ++ vmovups %xmm9,-96(%rsi) ++ vpshufb %xmm0,%xmm9,%xmm9 ++ vpxor %xmm7,%xmm1,%xmm1 ++ vmovups %xmm10,-80(%rsi) ++ vpshufb %xmm0,%xmm10,%xmm10 ++ vmovups %xmm11,-64(%rsi) ++ vpshufb %xmm0,%xmm11,%xmm11 ++ vmovups %xmm12,-48(%rsi) ++ vpshufb %xmm0,%xmm12,%xmm12 ++ vmovups %xmm13,-32(%rsi) ++ vpshufb %xmm0,%xmm13,%xmm13 ++ vmovups %xmm14,-16(%rsi) ++ vpshufb %xmm0,%xmm14,%xmm14 ++ vmovdqu %xmm9,16(%rsp) ++ vmovdqu 48(%rsp),%xmm6 ++ vmovdqu 16-32(%r9),%xmm0 ++ vpunpckhqdq %xmm6,%xmm6,%xmm2 ++ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 ++ vpxor %xmm6,%xmm2,%xmm2 ++ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 ++ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 ++ ++ vmovdqu 64(%rsp),%xmm9 ++ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 ++ vmovdqu 48-32(%r9),%xmm3 ++ vpxor %xmm5,%xmm4,%xmm4 ++ vpunpckhqdq %xmm9,%xmm9,%xmm5 ++ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 ++ vpxor %xmm9,%xmm5,%xmm5 ++ vpxor %xmm7,%xmm6,%xmm6 ++ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 ++ vmovdqu 80-32(%r9),%xmm15 ++ vpxor %xmm1,%xmm2,%xmm2 ++ ++ vmovdqu 80(%rsp),%xmm1 ++ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 ++ vmovdqu 64-32(%r9),%xmm0 ++ vpxor %xmm4,%xmm7,%xmm7 ++ vpunpckhqdq %xmm1,%xmm1,%xmm4 ++ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpxor %xmm6,%xmm9,%xmm9 ++ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 ++ vpxor %xmm2,%xmm5,%xmm5 ++ ++ vmovdqu 96(%rsp),%xmm2 ++ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 ++ vmovdqu 96-32(%r9),%xmm3 ++ vpxor %xmm7,%xmm6,%xmm6 ++ vpunpckhqdq %xmm2,%xmm2,%xmm7 ++ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 ++ vpxor %xmm2,%xmm7,%xmm7 ++ vpxor %xmm9,%xmm1,%xmm1 ++ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 
++ vmovdqu 128-32(%r9),%xmm15 ++ vpxor %xmm5,%xmm4,%xmm4 ++ ++ vpxor 112(%rsp),%xmm8,%xmm8 ++ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 ++ vmovdqu 112-32(%r9),%xmm0 ++ vpunpckhqdq %xmm8,%xmm8,%xmm9 ++ vpxor %xmm6,%xmm5,%xmm5 ++ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 ++ vpxor %xmm8,%xmm9,%xmm9 ++ vpxor %xmm1,%xmm2,%xmm2 ++ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 ++ vpxor %xmm4,%xmm7,%xmm4 ++ ++ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 ++ vmovdqu 0-32(%r9),%xmm3 ++ vpunpckhqdq %xmm14,%xmm14,%xmm1 ++ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 ++ vpxor %xmm14,%xmm1,%xmm1 ++ vpxor %xmm5,%xmm6,%xmm5 ++ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 ++ vmovdqu 32-32(%r9),%xmm15 ++ vpxor %xmm2,%xmm8,%xmm7 ++ vpxor %xmm4,%xmm9,%xmm6 ++ ++ vmovdqu 16-32(%r9),%xmm0 ++ vpxor %xmm5,%xmm7,%xmm9 ++ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 ++ vpxor %xmm9,%xmm6,%xmm6 ++ vpunpckhqdq %xmm13,%xmm13,%xmm2 ++ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 ++ vpxor %xmm13,%xmm2,%xmm2 ++ vpslldq $8,%xmm6,%xmm9 ++ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 ++ vpxor %xmm9,%xmm5,%xmm8 ++ vpsrldq $8,%xmm6,%xmm6 ++ vpxor %xmm6,%xmm7,%xmm7 ++ ++ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 ++ vmovdqu 48-32(%r9),%xmm3 ++ vpxor %xmm4,%xmm5,%xmm5 ++ vpunpckhqdq %xmm12,%xmm12,%xmm9 ++ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 ++ vpxor %xmm12,%xmm9,%xmm9 ++ vpxor %xmm14,%xmm13,%xmm13 ++ vpalignr $8,%xmm8,%xmm8,%xmm14 ++ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 ++ vmovdqu 80-32(%r9),%xmm15 ++ vpxor %xmm1,%xmm2,%xmm2 ++ ++ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 ++ vmovdqu 64-32(%r9),%xmm0 ++ vpxor %xmm5,%xmm4,%xmm4 ++ vpunpckhqdq %xmm11,%xmm11,%xmm1 ++ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 ++ vpxor %xmm11,%xmm1,%xmm1 ++ vpxor %xmm13,%xmm12,%xmm12 ++ vxorps 16(%rsp),%xmm7,%xmm7 ++ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 ++ vpxor %xmm2,%xmm9,%xmm9 ++ ++ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 ++ vxorps %xmm14,%xmm8,%xmm8 ++ ++ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 ++ vmovdqu 96-32(%r9),%xmm3 ++ vpxor %xmm4,%xmm5,%xmm5 ++ vpunpckhqdq %xmm10,%xmm10,%xmm2 ++ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 ++ vpxor %xmm10,%xmm2,%xmm2 ++ vpalignr $8,%xmm8,%xmm8,%xmm14 ++ vpxor %xmm12,%xmm11,%xmm11 ++ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 ++ vmovdqu 128-32(%r9),%xmm15 ++ vpxor %xmm9,%xmm1,%xmm1 ++ ++ vxorps %xmm7,%xmm14,%xmm14 ++ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 ++ vxorps %xmm14,%xmm8,%xmm8 ++ ++ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 ++ vmovdqu 112-32(%r9),%xmm0 ++ vpxor %xmm5,%xmm4,%xmm4 ++ vpunpckhqdq %xmm8,%xmm8,%xmm9 ++ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 ++ vpxor %xmm8,%xmm9,%xmm9 ++ vpxor %xmm11,%xmm10,%xmm10 ++ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 ++ vpxor %xmm1,%xmm2,%xmm2 ++ ++ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 ++ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 ++ vpxor %xmm4,%xmm5,%xmm5 ++ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 ++ vpxor %xmm10,%xmm7,%xmm7 ++ vpxor %xmm2,%xmm6,%xmm6 ++ ++ vpxor %xmm5,%xmm7,%xmm4 ++ vpxor %xmm4,%xmm6,%xmm6 ++ vpslldq $8,%xmm6,%xmm1 ++ vmovdqu 16(%r11),%xmm3 ++ vpsrldq $8,%xmm6,%xmm6 ++ vpxor %xmm1,%xmm5,%xmm8 ++ vpxor %xmm6,%xmm7,%xmm7 ++ ++ vpalignr $8,%xmm8,%xmm8,%xmm2 ++ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 ++ vpxor %xmm2,%xmm8,%xmm8 ++ ++ vpalignr $8,%xmm8,%xmm8,%xmm2 ++ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 ++ vpxor %xmm7,%xmm2,%xmm2 ++ vpxor %xmm2,%xmm8,%xmm8 ++ vpshufb (%r11),%xmm8,%xmm8 ++ vmovdqu %xmm8,-64(%r9) ++ ++ vzeroupper ++ movq -48(%rax),%r15 ++.cfi_restore %r15 ++ movq -40(%rax),%r14 ++.cfi_restore %r14 ++ movq -32(%rax),%r13 ++.cfi_restore %r13 ++ movq -24(%rax),%r12 ++.cfi_restore %r12 ++ movq -16(%rax),%rbp ++.cfi_restore %rbp ++ movq -8(%rax),%rbx ++.cfi_restore %rbx ++ leaq (%rax),%rsp 
++.cfi_def_cfa_register %rsp ++.Lgcm_enc_abort: ++ movq %r10,%rax ++ .byte 0xf3,0xc3 ++.cfi_endproc ++.size aesni_gcm_encrypt,.-aesni_gcm_encrypt ++ ++/* Some utility routines */ ++ ++/* ++ * clear all fpu registers ++ * void clear_fpu_regs_avx(void); ++ */ ++.globl clear_fpu_regs_avx ++.type clear_fpu_regs_avx,@function ++.align 32 ++clear_fpu_regs_avx: ++ vzeroall ++ ret ++.size clear_fpu_regs_avx,.-clear_fpu_regs_avx ++ ++/* ++ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); ++ * ++ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and ++ * stores the result at `dst'. The XOR is performed using FPU registers, ++ * so make sure FPU state is saved when running this in the kernel. ++ */ ++.globl gcm_xor_avx ++.type gcm_xor_avx,@function ++.align 32 ++gcm_xor_avx: ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ pxor %xmm1, %xmm0 ++ movdqu %xmm0, (%rsi) ++ ret ++.size gcm_xor_avx,.-gcm_xor_avx ++ ++/* ++ * Toggle a boolean_t value atomically and return the new value. ++ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); ++ */ ++.globl atomic_toggle_boolean_nv ++.type atomic_toggle_boolean_nv,@function ++.align 32 ++atomic_toggle_boolean_nv: ++ xorl %eax, %eax ++ lock ++ xorl $1, (%rdi) ++ jz 1f ++ movl $1, %eax ++1: ++ ret ++.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv ++ ++.align 64 ++.Lbswap_mask: ++.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ++.Lpoly: ++.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 ++.Lone_msb: ++.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 ++.Ltwo_lsb: ++.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ++.Lone_lsb: ++.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ++.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 ++.align 64 ++ ++/* Mark the stack non-executable. */ ++#if defined(__linux__) && defined(__ELF__) ++.section .note.GNU-stack,"",%progbits ++#endif ++ ++#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ +Index: zfs-linux-0.8.3/module/icp/asm-x86_64/modes/ghash-x86_64.S +=================================================================== +--- /dev/null ++++ zfs-linux-0.8.3/module/icp/asm-x86_64/modes/ghash-x86_64.S +@@ -0,0 +1,714 @@ ++# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# ==================================================================== ++# Written by Andy Polyakov for the OpenSSL ++# project. The module is, however, dual licensed under OpenSSL and ++# CRYPTOGAMS licenses depending on where you obtain it. For further ++# details see http://www.openssl.org/~appro/cryptogams/. ++# ==================================================================== ++# ++# March, June 2010 ++# ++# The module implements "4-bit" GCM GHASH function and underlying ++# single multiplication operation in GF(2^128). "4-bit" means that ++# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH ++# function features so called "528B" variant utilizing additional ++# 256+16 bytes of per-key storage [+512 bytes shared table]. 
++# Performance results are for this streamed GHASH subroutine and are ++# expressed in cycles per processed byte, less is better: ++# ++# gcc 3.4.x(*) assembler ++# ++# P4 28.6 14.0 +100% ++# Opteron 19.3 7.7 +150% ++# Core2 17.8 8.1(**) +120% ++# Atom 31.6 16.8 +88% ++# VIA Nano 21.8 10.1 +115% ++# ++# (*) comparison is not completely fair, because C results are ++# for vanilla "256B" implementation, while assembler results ++# are for "528B";-) ++# (**) it's mystery [to me] why Core2 result is not same as for ++# Opteron; ++ ++# May 2010 ++# ++# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. ++# See ghash-x86.pl for background information and details about coding ++# techniques. ++# ++# Special thanks to David Woodhouse for providing access to a ++# Westmere-based system on behalf of Intel Open Source Technology Centre. ++ ++# December 2012 ++# ++# Overhaul: aggregate Karatsuba post-processing, improve ILP in ++# reduction_alg9, increase reduction aggregate factor to 4x. As for ++# the latter. ghash-x86.pl discusses that it makes lesser sense to ++# increase aggregate factor. Then why increase here? Critical path ++# consists of 3 independent pclmulqdq instructions, Karatsuba post- ++# processing and reduction. "On top" of this we lay down aggregated ++# multiplication operations, triplets of independent pclmulqdq's. As ++# issue rate for pclmulqdq is limited, it makes lesser sense to ++# aggregate more multiplications than it takes to perform remaining ++# non-multiplication operations. 2x is near-optimal coefficient for ++# contemporary Intel CPUs (therefore modest improvement coefficient), ++# but not for Bulldozer. Latter is because logical SIMD operations ++# are twice as slow in comparison to Intel, so that critical path is ++# longer. A CPU with higher pclmulqdq issue rate would also benefit ++# from higher aggregate factor... ++# ++# Westmere 1.78(+13%) ++# Sandy Bridge 1.80(+8%) ++# Ivy Bridge 1.80(+7%) ++# Haswell 0.55(+93%) (if system doesn't support AVX) ++# Broadwell 0.45(+110%)(if system doesn't support AVX) ++# Skylake 0.44(+110%)(if system doesn't support AVX) ++# Bulldozer 1.49(+27%) ++# Silvermont 2.88(+13%) ++# Knights L 2.12(-) (if system doesn't support AVX) ++# Goldmont 1.08(+24%) ++ ++# March 2013 ++# ++# ... 8x aggregate factor AVX code path is using reduction algorithm ++# suggested by Shay Gueron[1]. Even though contemporary AVX-capable ++# CPUs such as Sandy and Ivy Bridge can execute it, the code performs ++# sub-optimally in comparison to above mentioned version. But thanks ++# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that ++# it performs in 0.41 cycles per byte on Haswell processor, in ++# 0.29 on Broadwell, and in 0.36 on Skylake. ++# ++# Knights Landing achieves 1.09 cpb. ++# ++# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest ++ ++# Generated once from ++# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl ++# and modified for ICP. Modification are kept at a bare minimum to ease later ++# upstream merges. 
++ ++#if defined(__x86_64__) && defined(HAVE_AVX) && \ ++ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) ++ ++.text ++ ++.globl gcm_gmult_clmul ++.type gcm_gmult_clmul,@function ++.align 16 ++gcm_gmult_clmul: ++.cfi_startproc ++.L_gmult_clmul: ++ movdqu (%rdi),%xmm0 ++ movdqa .Lbswap_mask(%rip),%xmm5 ++ movdqu (%rsi),%xmm2 ++ movdqu 32(%rsi),%xmm4 ++.byte 102,15,56,0,197 ++ movdqa %xmm0,%xmm1 ++ pshufd $78,%xmm0,%xmm3 ++ pxor %xmm0,%xmm3 ++.byte 102,15,58,68,194,0 ++.byte 102,15,58,68,202,17 ++.byte 102,15,58,68,220,0 ++ pxor %xmm0,%xmm3 ++ pxor %xmm1,%xmm3 ++ ++ movdqa %xmm3,%xmm4 ++ psrldq $8,%xmm3 ++ pslldq $8,%xmm4 ++ pxor %xmm3,%xmm1 ++ pxor %xmm4,%xmm0 ++ ++ movdqa %xmm0,%xmm4 ++ movdqa %xmm0,%xmm3 ++ psllq $5,%xmm0 ++ pxor %xmm0,%xmm3 ++ psllq $1,%xmm0 ++ pxor %xmm3,%xmm0 ++ psllq $57,%xmm0 ++ movdqa %xmm0,%xmm3 ++ pslldq $8,%xmm0 ++ psrldq $8,%xmm3 ++ pxor %xmm4,%xmm0 ++ pxor %xmm3,%xmm1 ++ ++ ++ movdqa %xmm0,%xmm4 ++ psrlq $1,%xmm0 ++ pxor %xmm4,%xmm1 ++ pxor %xmm0,%xmm4 ++ psrlq $5,%xmm0 ++ pxor %xmm4,%xmm0 ++ psrlq $1,%xmm0 ++ pxor %xmm1,%xmm0 ++.byte 102,15,56,0,197 ++ movdqu %xmm0,(%rdi) ++ .byte 0xf3,0xc3 ++.cfi_endproc ++.size gcm_gmult_clmul,.-gcm_gmult_clmul ++ ++.globl gcm_init_htab_avx ++.type gcm_init_htab_avx,@function ++.align 32 ++gcm_init_htab_avx: ++.cfi_startproc ++ vzeroupper ++ ++ vmovdqu (%rsi),%xmm2 ++ // KCF/ICP stores H in network byte order with the hi qword first ++ // so we need to swap all bytes, not the 2 qwords. ++ vmovdqu .Lbswap_mask(%rip),%xmm4 ++ vpshufb %xmm4,%xmm2,%xmm2 ++ ++ ++ vpshufd $255,%xmm2,%xmm4 ++ vpsrlq $63,%xmm2,%xmm3 ++ vpsllq $1,%xmm2,%xmm2 ++ vpxor %xmm5,%xmm5,%xmm5 ++ vpcmpgtd %xmm4,%xmm5,%xmm5 ++ vpslldq $8,%xmm3,%xmm3 ++ vpor %xmm3,%xmm2,%xmm2 ++ ++ ++ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 ++ vpxor %xmm5,%xmm2,%xmm2 ++ ++ vpunpckhqdq %xmm2,%xmm2,%xmm6 ++ vmovdqa %xmm2,%xmm0 ++ vpxor %xmm2,%xmm6,%xmm6 ++ movq $4,%r10 ++ jmp .Linit_start_avx ++.align 32 ++.Linit_loop_avx: ++ vpalignr $8,%xmm3,%xmm4,%xmm5 ++ vmovdqu %xmm5,-16(%rdi) ++ vpunpckhqdq %xmm0,%xmm0,%xmm3 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 ++ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 ++ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 ++ vpxor %xmm0,%xmm1,%xmm4 ++ vpxor %xmm4,%xmm3,%xmm3 ++ ++ vpslldq $8,%xmm3,%xmm4 ++ vpsrldq $8,%xmm3,%xmm3 ++ vpxor %xmm4,%xmm0,%xmm0 ++ vpxor %xmm3,%xmm1,%xmm1 ++ vpsllq $57,%xmm0,%xmm3 ++ vpsllq $62,%xmm0,%xmm4 ++ vpxor %xmm3,%xmm4,%xmm4 ++ vpsllq $63,%xmm0,%xmm3 ++ vpxor %xmm3,%xmm4,%xmm4 ++ vpslldq $8,%xmm4,%xmm3 ++ vpsrldq $8,%xmm4,%xmm4 ++ vpxor %xmm3,%xmm0,%xmm0 ++ vpxor %xmm4,%xmm1,%xmm1 ++ ++ vpsrlq $1,%xmm0,%xmm4 ++ vpxor %xmm0,%xmm1,%xmm1 ++ vpxor %xmm4,%xmm0,%xmm0 ++ vpsrlq $5,%xmm4,%xmm4 ++ vpxor %xmm4,%xmm0,%xmm0 ++ vpsrlq $1,%xmm0,%xmm0 ++ vpxor %xmm1,%xmm0,%xmm0 ++.Linit_start_avx: ++ vmovdqa %xmm0,%xmm5 ++ vpunpckhqdq %xmm0,%xmm0,%xmm3 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 ++ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 ++ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 ++ vpxor %xmm0,%xmm1,%xmm4 ++ vpxor %xmm4,%xmm3,%xmm3 ++ ++ vpslldq $8,%xmm3,%xmm4 ++ vpsrldq $8,%xmm3,%xmm3 ++ vpxor %xmm4,%xmm0,%xmm0 ++ vpxor %xmm3,%xmm1,%xmm1 ++ vpsllq $57,%xmm0,%xmm3 ++ vpsllq $62,%xmm0,%xmm4 ++ vpxor %xmm3,%xmm4,%xmm4 ++ vpsllq $63,%xmm0,%xmm3 ++ vpxor %xmm3,%xmm4,%xmm4 ++ vpslldq $8,%xmm4,%xmm3 ++ vpsrldq $8,%xmm4,%xmm4 ++ vpxor %xmm3,%xmm0,%xmm0 ++ vpxor %xmm4,%xmm1,%xmm1 ++ ++ vpsrlq $1,%xmm0,%xmm4 ++ vpxor %xmm0,%xmm1,%xmm1 ++ vpxor %xmm4,%xmm0,%xmm0 ++ vpsrlq $5,%xmm4,%xmm4 ++ vpxor %xmm4,%xmm0,%xmm0 ++ vpsrlq 
$1,%xmm0,%xmm0 ++ vpxor %xmm1,%xmm0,%xmm0 ++ vpshufd $78,%xmm5,%xmm3 ++ vpshufd $78,%xmm0,%xmm4 ++ vpxor %xmm5,%xmm3,%xmm3 ++ vmovdqu %xmm5,0(%rdi) ++ vpxor %xmm0,%xmm4,%xmm4 ++ vmovdqu %xmm0,16(%rdi) ++ leaq 48(%rdi),%rdi ++ subq $1,%r10 ++ jnz .Linit_loop_avx ++ ++ vpalignr $8,%xmm4,%xmm3,%xmm5 ++ vmovdqu %xmm5,-16(%rdi) ++ ++ vzeroupper ++ .byte 0xf3,0xc3 ++.cfi_endproc ++.size gcm_init_htab_avx,.-gcm_init_htab_avx ++ ++.globl gcm_gmult_avx ++.type gcm_gmult_avx,@function ++.align 32 ++gcm_gmult_avx: ++.cfi_startproc ++ jmp .L_gmult_clmul ++.cfi_endproc ++.size gcm_gmult_avx,.-gcm_gmult_avx ++.globl gcm_ghash_avx ++.type gcm_ghash_avx,@function ++.align 32 ++gcm_ghash_avx: ++.cfi_startproc ++ vzeroupper ++ ++ vmovdqu (%rdi),%xmm10 ++ leaq .L0x1c2_polynomial(%rip),%r10 ++ leaq 64(%rsi),%rsi ++ vmovdqu .Lbswap_mask(%rip),%xmm13 ++ vpshufb %xmm13,%xmm10,%xmm10 ++ cmpq $0x80,%rcx ++ jb .Lshort_avx ++ subq $0x80,%rcx ++ ++ vmovdqu 112(%rdx),%xmm14 ++ vmovdqu 0-64(%rsi),%xmm6 ++ vpshufb %xmm13,%xmm14,%xmm14 ++ vmovdqu 32-64(%rsi),%xmm7 ++ ++ vpunpckhqdq %xmm14,%xmm14,%xmm9 ++ vmovdqu 96(%rdx),%xmm15 ++ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 ++ vpxor %xmm14,%xmm9,%xmm9 ++ vpshufb %xmm13,%xmm15,%xmm15 ++ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 ++ vmovdqu 16-64(%rsi),%xmm6 ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vmovdqu 80(%rdx),%xmm14 ++ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 ++ vpxor %xmm15,%xmm8,%xmm8 ++ ++ vpshufb %xmm13,%xmm14,%xmm14 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 ++ vpunpckhqdq %xmm14,%xmm14,%xmm9 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 ++ vmovdqu 48-64(%rsi),%xmm6 ++ vpxor %xmm14,%xmm9,%xmm9 ++ vmovdqu 64(%rdx),%xmm15 ++ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 ++ vmovdqu 80-64(%rsi),%xmm7 ++ ++ vpshufb %xmm13,%xmm15,%xmm15 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 ++ vmovdqu 64-64(%rsi),%xmm6 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 ++ vpxor %xmm15,%xmm8,%xmm8 ++ ++ vmovdqu 48(%rdx),%xmm14 ++ vpxor %xmm3,%xmm0,%xmm0 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 ++ vpxor %xmm4,%xmm1,%xmm1 ++ vpshufb %xmm13,%xmm14,%xmm14 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 ++ vmovdqu 96-64(%rsi),%xmm6 ++ vpxor %xmm5,%xmm2,%xmm2 ++ vpunpckhqdq %xmm14,%xmm14,%xmm9 ++ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 ++ vmovdqu 128-64(%rsi),%xmm7 ++ vpxor %xmm14,%xmm9,%xmm9 ++ ++ vmovdqu 32(%rdx),%xmm15 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpshufb %xmm13,%xmm15,%xmm15 ++ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 ++ vmovdqu 112-64(%rsi),%xmm6 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 ++ vpxor %xmm15,%xmm8,%xmm8 ++ ++ vmovdqu 16(%rdx),%xmm14 ++ vpxor %xmm3,%xmm0,%xmm0 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 ++ vpxor %xmm4,%xmm1,%xmm1 ++ vpshufb %xmm13,%xmm14,%xmm14 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 ++ vmovdqu 144-64(%rsi),%xmm6 ++ vpxor %xmm5,%xmm2,%xmm2 ++ vpunpckhqdq %xmm14,%xmm14,%xmm9 ++ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 ++ vmovdqu 176-64(%rsi),%xmm7 ++ vpxor %xmm14,%xmm9,%xmm9 ++ ++ vmovdqu (%rdx),%xmm15 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpshufb %xmm13,%xmm15,%xmm15 ++ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 ++ vmovdqu 160-64(%rsi),%xmm6 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 ++ ++ leaq 128(%rdx),%rdx ++ cmpq $0x80,%rcx ++ jb .Ltail_avx ++ ++ vpxor %xmm10,%xmm15,%xmm15 ++ subq $0x80,%rcx ++ jmp 
.Loop8x_avx ++ ++.align 32 ++.Loop8x_avx: ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vmovdqu 112(%rdx),%xmm14 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 ++ vpshufb %xmm13,%xmm14,%xmm14 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 ++ vmovdqu 0-64(%rsi),%xmm6 ++ vpunpckhqdq %xmm14,%xmm14,%xmm9 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 ++ vmovdqu 32-64(%rsi),%xmm7 ++ vpxor %xmm14,%xmm9,%xmm9 ++ ++ vmovdqu 96(%rdx),%xmm15 ++ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 ++ vpxor %xmm3,%xmm10,%xmm10 ++ vpshufb %xmm13,%xmm15,%xmm15 ++ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 ++ vxorps %xmm4,%xmm11,%xmm11 ++ vmovdqu 16-64(%rsi),%xmm6 ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 ++ vpxor %xmm5,%xmm12,%xmm12 ++ vxorps %xmm15,%xmm8,%xmm8 ++ ++ vmovdqu 80(%rdx),%xmm14 ++ vpxor %xmm10,%xmm12,%xmm12 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 ++ vpxor %xmm11,%xmm12,%xmm12 ++ vpslldq $8,%xmm12,%xmm9 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 ++ vpsrldq $8,%xmm12,%xmm12 ++ vpxor %xmm9,%xmm10,%xmm10 ++ vmovdqu 48-64(%rsi),%xmm6 ++ vpshufb %xmm13,%xmm14,%xmm14 ++ vxorps %xmm12,%xmm11,%xmm11 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpunpckhqdq %xmm14,%xmm14,%xmm9 ++ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 ++ vmovdqu 80-64(%rsi),%xmm7 ++ vpxor %xmm14,%xmm9,%xmm9 ++ vpxor %xmm2,%xmm5,%xmm5 ++ ++ vmovdqu 64(%rdx),%xmm15 ++ vpalignr $8,%xmm10,%xmm10,%xmm12 ++ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 ++ vpshufb %xmm13,%xmm15,%xmm15 ++ vpxor %xmm3,%xmm0,%xmm0 ++ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 ++ vmovdqu 64-64(%rsi),%xmm6 ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpxor %xmm4,%xmm1,%xmm1 ++ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 ++ vxorps %xmm15,%xmm8,%xmm8 ++ vpxor %xmm5,%xmm2,%xmm2 ++ ++ vmovdqu 48(%rdx),%xmm14 ++ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 ++ vpshufb %xmm13,%xmm14,%xmm14 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 ++ vmovdqu 96-64(%rsi),%xmm6 ++ vpunpckhqdq %xmm14,%xmm14,%xmm9 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 ++ vmovdqu 128-64(%rsi),%xmm7 ++ vpxor %xmm14,%xmm9,%xmm9 ++ vpxor %xmm2,%xmm5,%xmm5 ++ ++ vmovdqu 32(%rdx),%xmm15 ++ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 ++ vpshufb %xmm13,%xmm15,%xmm15 ++ vpxor %xmm3,%xmm0,%xmm0 ++ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 ++ vmovdqu 112-64(%rsi),%xmm6 ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpxor %xmm4,%xmm1,%xmm1 ++ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vpxor %xmm5,%xmm2,%xmm2 ++ vxorps %xmm12,%xmm10,%xmm10 ++ ++ vmovdqu 16(%rdx),%xmm14 ++ vpalignr $8,%xmm10,%xmm10,%xmm12 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 ++ vpshufb %xmm13,%xmm14,%xmm14 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 ++ vmovdqu 144-64(%rsi),%xmm6 ++ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 ++ vxorps %xmm11,%xmm12,%xmm12 ++ vpunpckhqdq %xmm14,%xmm14,%xmm9 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 ++ vmovdqu 176-64(%rsi),%xmm7 ++ vpxor %xmm14,%xmm9,%xmm9 ++ vpxor %xmm2,%xmm5,%xmm5 ++ ++ vmovdqu (%rdx),%xmm15 ++ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 ++ vpshufb %xmm13,%xmm15,%xmm15 ++ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 ++ vmovdqu 160-64(%rsi),%xmm6 ++ vpxor %xmm12,%xmm15,%xmm15 ++ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 ++ vpxor %xmm10,%xmm15,%xmm15 ++ ++ leaq 128(%rdx),%rdx ++ subq $0x80,%rcx ++ jnc .Loop8x_avx ++ ++ addq $0x80,%rcx ++ jmp .Ltail_no_xor_avx ++ ++.align 32 ++.Lshort_avx: ++ vmovdqu -16(%rdx,%rcx,1),%xmm14 ++ leaq (%rdx,%rcx,1),%rdx ++ 
vmovdqu 0-64(%rsi),%xmm6 ++ vmovdqu 32-64(%rsi),%xmm7 ++ vpshufb %xmm13,%xmm14,%xmm15 ++ ++ vmovdqa %xmm0,%xmm3 ++ vmovdqa %xmm1,%xmm4 ++ vmovdqa %xmm2,%xmm5 ++ subq $0x10,%rcx ++ jz .Ltail_avx ++ ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vmovdqu -32(%rdx),%xmm14 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 ++ vmovdqu 16-64(%rsi),%xmm6 ++ vpshufb %xmm13,%xmm14,%xmm15 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 ++ vpsrldq $8,%xmm7,%xmm7 ++ subq $0x10,%rcx ++ jz .Ltail_avx ++ ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vmovdqu -48(%rdx),%xmm14 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 ++ vmovdqu 48-64(%rsi),%xmm6 ++ vpshufb %xmm13,%xmm14,%xmm15 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 ++ vmovdqu 80-64(%rsi),%xmm7 ++ subq $0x10,%rcx ++ jz .Ltail_avx ++ ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vmovdqu -64(%rdx),%xmm14 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 ++ vmovdqu 64-64(%rsi),%xmm6 ++ vpshufb %xmm13,%xmm14,%xmm15 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 ++ vpsrldq $8,%xmm7,%xmm7 ++ subq $0x10,%rcx ++ jz .Ltail_avx ++ ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vmovdqu -80(%rdx),%xmm14 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 ++ vmovdqu 96-64(%rsi),%xmm6 ++ vpshufb %xmm13,%xmm14,%xmm15 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 ++ vmovdqu 128-64(%rsi),%xmm7 ++ subq $0x10,%rcx ++ jz .Ltail_avx ++ ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vmovdqu -96(%rdx),%xmm14 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 ++ vmovdqu 112-64(%rsi),%xmm6 ++ vpshufb %xmm13,%xmm14,%xmm15 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 ++ vpsrldq $8,%xmm7,%xmm7 ++ subq $0x10,%rcx ++ jz .Ltail_avx ++ ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vmovdqu -112(%rdx),%xmm14 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 ++ vmovdqu 144-64(%rsi),%xmm6 ++ vpshufb %xmm13,%xmm14,%xmm15 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 ++ vmovq 184-64(%rsi),%xmm7 ++ subq $0x10,%rcx ++ jmp .Ltail_avx ++ ++.align 32 ++.Ltail_avx: ++ vpxor %xmm10,%xmm15,%xmm15 ++.Ltail_no_xor_avx: ++ vpunpckhqdq %xmm15,%xmm15,%xmm8 ++ vpxor %xmm0,%xmm3,%xmm3 ++ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 ++ vpxor %xmm15,%xmm8,%xmm8 ++ vpxor %xmm1,%xmm4,%xmm4 ++ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 ++ vpxor %xmm2,%xmm5,%xmm5 ++ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 ++ ++ vmovdqu (%r10),%xmm12 ++ ++ vpxor %xmm0,%xmm3,%xmm10 ++ vpxor %xmm1,%xmm4,%xmm11 ++ vpxor %xmm2,%xmm5,%xmm5 ++ ++ vpxor %xmm10,%xmm5,%xmm5 ++ vpxor %xmm11,%xmm5,%xmm5 ++ vpslldq $8,%xmm5,%xmm9 ++ vpsrldq $8,%xmm5,%xmm5 ++ vpxor %xmm9,%xmm10,%xmm10 ++ vpxor %xmm5,%xmm11,%xmm11 ++ ++ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 ++ vpalignr $8,%xmm10,%xmm10,%xmm10 ++ vpxor %xmm9,%xmm10,%xmm10 ++ ++ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 ++ vpalignr $8,%xmm10,%xmm10,%xmm10 ++ vpxor %xmm11,%xmm10,%xmm10 ++ vpxor 
%xmm9,%xmm10,%xmm10 ++ ++ cmpq $0,%rcx ++ jne .Lshort_avx ++ ++ vpshufb %xmm13,%xmm10,%xmm10 ++ vmovdqu %xmm10,(%rdi) ++ vzeroupper ++ .byte 0xf3,0xc3 ++.cfi_endproc ++.size gcm_ghash_avx,.-gcm_ghash_avx ++.align 64 ++.Lbswap_mask: ++.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ++.L0x1c2_polynomial: ++.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 ++.L7_mask: ++.long 7,0,7,0 ++.L7_mask_poly: ++.long 7,0,450,0 ++.align 64 ++.type .Lrem_4bit,@object ++.Lrem_4bit: ++.long 0,0,0,471859200,0,943718400,0,610271232 ++.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 ++.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 ++.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 ++.type .Lrem_8bit,@object ++.Lrem_8bit: ++.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E ++.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E ++.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E ++.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E ++.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E ++.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E ++.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E ++.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E ++.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE ++.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE ++.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE ++.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE ++.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E ++.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E ++.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE ++.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE ++.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E ++.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E ++.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E ++.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E ++.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E ++.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E ++.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E ++.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E ++.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE ++.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE ++.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE ++.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE ++.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E ++.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E ++.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE ++.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE ++ ++.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 ++.align 64 ++ ++/* Mark the stack non-executable. */ ++#if defined(__linux__) && defined(__ELF__) ++.section .note.GNU-stack,"",%progbits ++#endif ++ ++#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... 
*/ +Index: zfs-linux-0.8.3/module/icp/include/aes/aes_impl.h +=================================================================== +--- zfs-linux-0.8.3.orig/module/icp/include/aes/aes_impl.h ++++ zfs-linux-0.8.3/module/icp/include/aes/aes_impl.h +@@ -107,6 +107,11 @@ typedef union { + } aes_ks_t; + + typedef struct aes_impl_ops aes_impl_ops_t; ++ ++/* ++ * The absolute offset of the encr_ks (0) and the nr (504) fields are hard ++ * coded in aesni-gcm-x86_64, so please don't change (or adjust accordingly). ++ */ + typedef struct aes_key aes_key_t; + struct aes_key { + aes_ks_t encr_ks; /* encryption key schedule */ +Index: zfs-linux-0.8.3/module/icp/include/modes/modes.h +=================================================================== +--- zfs-linux-0.8.3.orig/module/icp/include/modes/modes.h ++++ zfs-linux-0.8.3/module/icp/include/modes/modes.h +@@ -34,6 +34,16 @@ extern "C" { + #include + #include + ++/* ++ * Does the build chain support all instructions needed for the GCM assembler ++ * routines. AVX support should imply AES-NI and PCLMULQDQ, but make sure ++ * anyhow. ++ */ ++#if defined(__x86_64__) && defined(HAVE_AVX) && \ ++ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) && defined(HAVE_MOVBE) ++#define CAN_USE_GCM_ASM ++#endif ++ + #define ECB_MODE 0x00000002 + #define CBC_MODE 0x00000004 + #define CTR_MODE 0x00000008 +@@ -189,13 +199,17 @@ typedef struct ccm_ctx { + * + * gcm_H: Subkey. + * ++ * gcm_Htable: Pre-computed and pre-shifted H, H^2, ... H^6 for the ++ * Karatsuba Algorithm in host byte order. ++ * + * gcm_J0: Pre-counter block generated from the IV. + * + * gcm_len_a_len_c: 64-bit representations of the bit lengths of + * AAD and ciphertext. + * +- * gcm_kmflag: Current value of kmflag. Used only for allocating +- * the plaintext buffer during decryption. ++ * gcm_kmflag: Current value of kmflag. Used for allocating ++ * the plaintext buffer during decryption and a ++ * gcm_avx_chunk_size'd buffer for avx enabled encryption. + */ + typedef struct gcm_ctx { + struct common_ctx gcm_common; +@@ -203,12 +217,23 @@ typedef struct gcm_ctx { + size_t gcm_processed_data_len; + size_t gcm_pt_buf_len; + uint32_t gcm_tmp[4]; ++ /* ++ * The relative positions of gcm_ghash, gcm_H and pre-computed ++ * gcm_Htable are hard coded in aesni-gcm-x86_64.S and ghash-x86_64.S, ++ * so please don't change (or adjust accordingly). 
++ */ + uint64_t gcm_ghash[2]; + uint64_t gcm_H[2]; ++#ifdef CAN_USE_GCM_ASM ++ uint64_t gcm_Htable[12][2]; ++#endif + uint64_t gcm_J0[2]; + uint64_t gcm_len_a_len_c[2]; + uint8_t *gcm_pt_buf; + int gcm_kmflag; ++#ifdef CAN_USE_GCM_ASM ++ boolean_t gcm_use_avx; ++#endif + } gcm_ctx_t; + + #define gcm_keysched gcm_common.cc_keysched +Index: zfs-linux-0.8.3/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh +=================================================================== +--- zfs-linux-0.8.3.orig/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh ++++ zfs-linux-0.8.3/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh +@@ -53,7 +53,7 @@ set -A ENCRYPTION_ALGS \ + "encryption=aes-256-gcm" + + set -A ENCRYPTION_PROPS \ +- "encryption=aes-256-ccm" \ ++ "encryption=aes-256-gcm" \ + "encryption=aes-128-ccm" \ + "encryption=aes-192-ccm" \ + "encryption=aes-256-ccm" \ +Index: zfs-linux-0.8.3/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh +=================================================================== +--- zfs-linux-0.8.3.orig/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh ++++ zfs-linux-0.8.3/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh +@@ -48,7 +48,7 @@ set -A ENCRYPTION_ALGS "encryption=on" \ + "encryption=aes-192-gcm" \ + "encryption=aes-256-gcm" + +-set -A ENCRYPTION_PROPS "encryption=aes-256-ccm" \ ++set -A ENCRYPTION_PROPS "encryption=aes-256-gcm" \ + "encryption=aes-128-ccm" \ + "encryption=aes-192-ccm" \ + "encryption=aes-256-ccm" \ +Index: zfs-linux-0.8.3/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh +=================================================================== +--- zfs-linux-0.8.3.orig/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh ++++ zfs-linux-0.8.3/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh +@@ -124,7 +124,7 @@ ds=$TESTPOOL/recv + log_must eval "zfs send $snap > $sendfile" + log_must eval "zfs recv -o encryption=on -o keyformat=passphrase" \ + "-o keylocation=file://$keyfile $ds < $sendfile" +-log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" ++log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" + log_must test "$(get_prop 'encryptionroot' $ds)" == "$ds" + log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" + log_must test "$(get_prop 'keylocation' $ds)" == "file://$keyfile" +@@ -140,7 +140,7 @@ ds=$TESTPOOL/recv + log_must eval "zfs send -p $snap > $sendfile" + log_must eval "zfs recv -o encryption=on -o keyformat=passphrase" \ + "-o keylocation=file://$keyfile $ds < $sendfile" +-log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" ++log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" + log_must test "$(get_prop 'encryptionroot' $ds)" == "$ds" + log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" + log_must test "$(get_prop 'keylocation' $ds)" == "file://$keyfile" +@@ -158,7 +158,7 @@ ds=$TESTPOOL/recv + log_must eval "zfs send -R $snap > $sendfile" + log_must eval "zfs recv -o encryption=on -o keyformat=passphrase" \ + "-o keylocation=file://$keyfile $ds < $sendfile" +-log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" ++log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" + log_must test "$(get_prop 'encryptionroot' $ds)" == "$ds" + log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" + log_must test "$(get_prop 'keylocation' 
$ds)" == "file://$keyfile" +@@ -174,7 +174,7 @@ ds=$TESTPOOL/crypt/recv + log_must eval "zfs send -p $snap > $sendfile" + log_must eval "zfs recv -x encryption $ds < $sendfile" + log_must test "$(get_prop 'encryptionroot' $ds)" == "$TESTPOOL/crypt" +-log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" ++log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" + log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" + log_must test "$(get_prop 'mounted' $ds)" == "yes" + recv_cksum=$(md5digest /$ds/$TESTFILE0) +@@ -188,7 +188,7 @@ ds=$TESTPOOL/crypt/recv + log_must eval "zfs send -R $snap > $sendfile" + log_must eval "zfs recv -x encryption $ds < $sendfile" + log_must test "$(get_prop 'encryptionroot' $ds)" == "$TESTPOOL/crypt" +-log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" ++log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" + log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" + log_must test "$(get_prop 'mounted' $ds)" == "yes" + recv_cksum=$(md5digest /$ds/$TESTFILE0) +@@ -202,7 +202,7 @@ ds=$TESTPOOL/crypt/recv + log_must eval "zfs send -R $snap2 > $sendfile" + log_must eval "zfs recv -x encryption $ds < $sendfile" + log_must test "$(get_prop 'encryptionroot' $ds)" == "$TESTPOOL/crypt" +-log_must test "$(get_prop 'encryption' $ds)" == "aes-256-ccm" ++log_must test "$(get_prop 'encryption' $ds)" == "aes-256-gcm" + log_must test "$(get_prop 'keyformat' $ds)" == "passphrase" + log_must test "$(get_prop 'mounted' $ds)" == "yes" + recv_cksum=$(md5digest /$ds/$TESTFILE0) diff -Nru zfs-linux-0.8.3/debian/patches/4620-zfs-vol-wait-fix-locked-encrypted-vols.patch zfs-linux-0.8.3/debian/patches/4620-zfs-vol-wait-fix-locked-encrypted-vols.patch --- zfs-linux-0.8.3/debian/patches/4620-zfs-vol-wait-fix-locked-encrypted-vols.patch 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.8.3/debian/patches/4620-zfs-vol-wait-fix-locked-encrypted-vols.patch 2020-07-22 08:56:05.000000000 +0000 @@ -0,0 +1,39 @@ +Description: don't wait for links when volume has property keystatus=unavailable + zfs-volume-wait.service systemd unit does not start if the encrypted + zvol is locked. The /sbin/zvol_wait should not wait for links when the + volume has property keystatus=unavailable. This patch fixes this issue +Bug: https://bugs.launchpad.net/ubuntu/+source/zfs-linux/+bug/1888405 +Author: James Dingwall +Origin: ubuntu +Forwarded: no +Reviewed-By: Colin Ian King +Last-Update: 2020-07-22 + +Index: zfs-linux-0.8.3/cmd/zvol_wait/zvol_wait +=================================================================== +--- zfs-linux-0.8.3.orig/cmd/zvol_wait/zvol_wait ++++ zfs-linux-0.8.3/cmd/zvol_wait/zvol_wait +@@ -24,6 +24,14 @@ filter_out_deleted_zvols() { + done + } + ++filter_out_locked_zvols() { ++ while read -r zvol; do ++ if ! 
[ "$(zfs list -H -o keystatus rpool/export/vault/block "$zvol")" = "unavailable" ]; then ++ echo "$zvol" ++ fi ++ done ++} ++ + list_zvols() { + zfs list -t volume -H -o name,volmode,receive_resume_token | + while read -r zvol_line; do +@@ -71,7 +79,7 @@ while [ "$outer_loop" -lt 20 ]; do + while [ "$inner_loop" -lt 30 ]; do + inner_loop=$((inner_loop + 1)) + +- zvols="$(echo "$zvols" | filter_out_zvols_with_links)" ++ zvols="$(echo "$zvols" | filter_out_zvols_with_links | filter_out_locked_zvols)" + + zvols_count=$(count_zvols) + if [ "$zvols_count" -eq 0 ]; then diff -Nru zfs-linux-0.8.3/debian/patches/4700-Fix-DKMS-build-on-arm64-with-PREEMPTION-and-BLK_CGRO.patch zfs-linux-0.8.3/debian/patches/4700-Fix-DKMS-build-on-arm64-with-PREEMPTION-and-BLK_CGRO.patch --- zfs-linux-0.8.3/debian/patches/4700-Fix-DKMS-build-on-arm64-with-PREEMPTION-and-BLK_CGRO.patch 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.8.3/debian/patches/4700-Fix-DKMS-build-on-arm64-with-PREEMPTION-and-BLK_CGRO.patch 2020-08-18 09:10:41.000000000 +0000 @@ -0,0 +1,56 @@ +From 46cd180400093965271820d34fa1071f9769a0fb Mon Sep 17 00:00:00 2001 +From: Juerg Haefliger +Date: Tue, 18 Aug 2020 10:52:25 +0200 +Subject: [PATCH] Fix DKMS build on arm64 with PREEMPTION and BLK_CGROUP + enabled + +With PREEMPTION=y and BLK_CGROUP=y preempt_schedule_notrace() is being +used on arm64 which is a GPL-only function and hence the build of the +DKMS kernel module fails. + +'Fix' that by redefining preempt_schedule_notrace() to preempt_schedule() +which should be safe as long as tracing is not used. + +Signed-off-by: Juerg Haefliger +--- + module/zfs/vdev_disk.c | 2 ++ + module/zfs/zfs_compat.h | 14 ++++++++++++++ + 2 files changed, 16 insertions(+) + create mode 100644 module/zfs/zfs_compat.h + +diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c +index 8544bb8ffb6f..2a7096a6436d 100644 +--- a/module/zfs/vdev_disk.c ++++ b/module/zfs/vdev_disk.c +@@ -26,6 +26,8 @@ + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + */ + ++#include "zfs_compat.h" ++ + #include + #include + #include +diff --git a/module/zfs/zfs_compat.h b/module/zfs/zfs_compat.h +new file mode 100644 +index 000000000000..6ef26f436f3c +--- /dev/null ++++ b/module/zfs/zfs_compat.h +@@ -0,0 +1,14 @@ ++#ifndef _ZFS_COMPAT_H_ ++#define _ZFS_COMPAT_H_ ++ ++/* ++ * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so ++ * replace it with preempt_schedule under the following condition: ++*/ ++#if defined(CONFIG_ARM64) && \ ++ defined(CONFIG_PREEMPTION) && \ ++ defined(CONFIG_BLK_CGROUP) ++#define preempt_schedule_notrace(x) preempt_schedule(x) ++#endif ++ ++#endif /* _ZFS_COMPAT_H_ */ +-- +2.25.1 + diff -Nru zfs-linux-0.8.3/debian/patches/4702-Revert-Let-zfs-mount-all-tolerate-in-progress-mounts.patch zfs-linux-0.8.3/debian/patches/4702-Revert-Let-zfs-mount-all-tolerate-in-progress-mounts.patch --- zfs-linux-0.8.3/debian/patches/4702-Revert-Let-zfs-mount-all-tolerate-in-progress-mounts.patch 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.8.3/debian/patches/4702-Revert-Let-zfs-mount-all-tolerate-in-progress-mounts.patch 2020-11-30 19:00:00.000000000 +0000 @@ -0,0 +1,53 @@ +From d1b84da8c1a69c084f04b504beefe804591bca07 Mon Sep 17 00:00:00 2001 +From: Brian Behlendorf +Date: Tue, 26 May 2020 16:07:50 -0700 +Subject: [PATCH] Revert "Let zfs mount all tolerate in-progress mounts" + +This reverts commit a9cd8bf which introduced a segfault when running +`zfs mount -a` multiple times when there are mountpoints which are +not empty. 
This segfault is now seen frequently by the CI after +the mount code was updated to directly call mount(2). + +The original reason this logic was added is described in #8881. +Since then the systemd `zfs-share.target` has been updated to run +"After" the `zfs-mount.server` which should avoid this issue. + +Reviewed-by: Don Brady +Signed-off-by: Brian Behlendorf +Closes #9560 +Closes #10364 +--- + cmd/zfs/zfs_main.c | 19 +------------------ + 1 file changed, 1 insertion(+), 18 deletions(-) + +Index: zfs-linux-0.8.3/cmd/zfs/zfs_main.c +=================================================================== +--- zfs-linux-0.8.3.orig/cmd/zfs/zfs_main.c ++++ zfs-linux-0.8.3/cmd/zfs/zfs_main.c +@@ -6447,25 +6447,8 @@ share_mount_one(zfs_handle_t *zhp, int o + return (1); + } + +- if (zfs_mount(zhp, options, flags) != 0) { +- /* +- * Check if a mount sneaked in after we checked +- */ +- if (!explicit && +- libzfs_errno(g_zfs) == EZFS_MOUNTFAILED) { +- usleep(10 * MILLISEC); +- libzfs_mnttab_cache(g_zfs, B_FALSE); +- +- if (zfs_is_mounted(zhp, NULL)) { +- (void) fprintf(stderr, gettext( +- "Ignoring previous 'already " +- "mounted' error for '%s'\n"), +- zfs_get_name(zhp)); +- return (0); +- } +- } ++ if (zfs_mount(zhp, options, flags) != 0) + return (1); +- } + break; + } + diff -Nru zfs-linux-0.8.3/debian/patches/4800-fix-iput-race-in-zfs_iput_async.patch zfs-linux-0.8.3/debian/patches/4800-fix-iput-race-in-zfs_iput_async.patch --- zfs-linux-0.8.3/debian/patches/4800-fix-iput-race-in-zfs_iput_async.patch 1970-01-01 00:00:00.000000000 +0000 +++ zfs-linux-0.8.3/debian/patches/4800-fix-iput-race-in-zfs_iput_async.patch 2021-02-25 19:48:51.000000000 +0000 @@ -0,0 +1,52 @@ +From 43eaef6de817dab3e098488f8e02a11fe57944d0 Mon Sep 17 00:00:00 2001 +From: Paul Dagnelie +Date: Wed, 27 Jan 2021 21:29:58 -0800 +Subject: [PATCH] Fix zrele race in zrele_async that can cause hang + +There is a race condition in zfs_zrele_async when we are checking if +we would be the one to evict an inode. This can lead to a txg sync +deadlock. + +Instead of calling into iput directly, we attempt to perform the atomic +decrement ourselves, unless that would set the i_count value to zero. +In that case, we dispatch a call to iput to run later, to prevent a +deadlock from occurring. + +Reviewed-by: Brian Behlendorf +Reviewed-by: Matthew Ahrens +Signed-off-by: Paul Dagnelie +Closes #11527 +Closes #11530 + +Origin: backport, https://github.com/openzfs/zfs/commit/43eaef6de817 +Bug-Ubuntu: https://bugs.launchpad.net/bugs/1916486 +--- + module/zfs/zfs_vnops.c | 13 +++++++++++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +Index: zfs-linux/module/zfs/zfs_vnops.c +=================================================================== +--- zfs-linux.orig/module/zfs/zfs_vnops.c ++++ zfs-linux/module/zfs/zfs_vnops.c +@@ -987,11 +987,18 @@ zfs_iput_async(struct inode *ip) + ASSERT(atomic_read(&ip->i_count) > 0); + ASSERT(os != NULL); + +- if (atomic_read(&ip->i_count) == 1) ++ /* ++ * If decrementing the count would put us at 0, we can't do it inline ++ * here, because that would be synchronous. Instead, dispatch an iput ++ * to run later. ++ * ++ * For more information on the dangers of a synchronous iput, see the ++ * header comment of this file. 
++ */ ++ if (!atomic_add_unless(&ip->i_count, -1, 1)) { + VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)), + (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); +- else +- iput(ip); ++ } + } + + /* ARGSUSED */ diff -Nru zfs-linux-0.8.3/debian/patches/series zfs-linux-0.8.3/debian/patches/series --- zfs-linux-0.8.3/debian/patches/series 2020-04-14 09:14:33.000000000 +0000 +++ zfs-linux-0.8.3/debian/patches/series 2021-02-25 19:48:51.000000000 +0000 @@ -12,9 +12,15 @@ force-verbose-rules.patch #unapplied/init-debian-openrc-workaround.patch # OpenRC users can apply this locally +4510-silently-ignore-modprobe-failure.patch 4550-Linux-5.5-compat-blkg_tryget.patch 4600-Linux-5.6-compat-struct-proc_ops.patch 4601-Linux-5.6-compat-timestamp_truncate.patch 4602-Linux-5.6-compat-ktime_get_raw_ts64.patch 4603-Linux-5.6-compat-time_t.patch zfs-mount-container-start.patch +4610-ICP-Improve-AES-GCM-performance.patch +4620-zfs-vol-wait-fix-locked-encrypted-vols.patch +4700-Fix-DKMS-build-on-arm64-with-PREEMPTION-and-BLK_CGRO.patch +4702-Revert-Let-zfs-mount-all-tolerate-in-progress-mounts.patch +4800-fix-iput-race-in-zfs_iput_async.patch diff -Nru zfs-linux-0.8.3/debian/rules zfs-linux-0.8.3/debian/rules --- zfs-linux-0.8.3/debian/rules 2020-01-21 12:40:40.000000000 +0000 +++ zfs-linux-0.8.3/debian/rules 2021-04-07 12:42:29.000000000 +0000 @@ -130,6 +130,8 @@ override_dh_dkms: '$(CURDIR)/scripts/dkms.mkconf' -n $(NAME) -v $(DEB_VERSION_UPSTREAM) -f '$(CURDIR)/scripts/zfs-dkms.dkms' + sed -ie '/^PACKAGE_VERSION/a BUILD_EXCLUSIVE_KERNEL="^(4\\.[0-9]+|5\\.[01234])\\."' \ + '$(CURDIR)/scripts/zfs-dkms.dkms' dh_dkms rm -f '$(CURDIR)/scripts/zfs-dkms.dkms'
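
The sed call in override_dh_dkms above appends a BUILD_EXCLUSIVE_KERNEL line to the generated zfs-dkms.dkms so that dkms only attempts the module build on 4.x and 5.0-5.4 kernels. Below is a minimal, illustrative shell sketch (not part of the packaging) for sanity-checking that pattern: the unescaped extended regex is what should land in the dkms conf once sed has processed the doubled backslashes, and the sample kernel version strings are made up for the test.

#!/bin/sh
# Illustrative check of the BUILD_EXCLUSIVE_KERNEL pattern added above:
# 4.x and 5.0-5.4 kernels should match (zfs-dkms gets built),
# while 5.5 and later (including 5.10+) should not.
regex='^(4\.[0-9]+|5\.[01234])\.'
for v in 4.15.0-142 5.4.0-73 5.8.0-50 5.10.0-14 5.11.0-16; do
    if echo "$v" | grep -Eq "$regex"; then
        echo "$v: matches, zfs-dkms would be built"
    else
        echo "$v: no match, zfs-dkms build skipped"
    fi
done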