diff -Nru python-sabyenc-5.3.0/debian/changelog python-sabyenc-5.4.2/debian/changelog
--- python-sabyenc-5.3.0/debian/changelog	2022-05-19 11:11:00.000000000 +0000
+++ python-sabyenc-5.4.2/debian/changelog	2022-05-29 14:18:54.000000000 +0000
@@ -1,3 +1,10 @@
+python-sabyenc (5.4.2-0ubuntu1~jcfp1~21.10) impish; urgency=medium
+
+  * New upstream release.
+  * Control: bump version of breaks to 3.6.0~rc3.
+
+ -- Jeroen Ploemen  Sun, 29 May 2022 14:18:54 +0000
+
 python-sabyenc (5.3.0-0ubuntu1~jcfp1~21.10) impish; urgency=medium
 
   * New upstream release.
diff -Nru python-sabyenc-5.3.0/debian/control python-sabyenc-5.4.2/debian/control
--- python-sabyenc-5.3.0/debian/control	2022-05-19 11:11:00.000000000 +0000
+++ python-sabyenc-5.4.2/debian/control	2022-05-29 14:18:54.000000000 +0000
@@ -18,7 +18,7 @@
 Package: python3-sabyenc
 Architecture: any
 Depends: ${shlibs:Depends}, ${misc:Depends}, ${python3:Depends}
-Breaks: sabnzbdplus (<< 3.6.0~rc2)
+Breaks: sabnzbdplus (<< 3.6.0~rc3)
 Multi-Arch: same
 Description: yEnc extension for Python, optimized for Usenet
  Modified version of the original python-yenc module by Alessandro Duca,
diff -Nru python-sabyenc-5.3.0/setup.py python-sabyenc-5.4.2/setup.py
--- python-sabyenc-5.3.0/setup.py	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/setup.py	2022-05-29 12:28:02.000000000 +0000
@@ -62,6 +62,7 @@
         f.flush()
 
         try:
+            log.info("==> Please ignore any errors shown below!")
             result_files = compiler.compile([f.name], extra_postargs=extra_postargs)
             log.info("==> Success!")
         except CompileError:
@@ -89,6 +90,7 @@
         gcc_arm_neon_flags = []
         gcc_arm_crc_flags = []
         gcc_vpclmulqdq_flags = []
+        gcc_vbmi2_flags = []
         gcc_macros = []
         if self.compiler.compiler_type == "msvc":
             # LTCG not enabled due to issues seen with code generation where
@@ -135,23 +137,34 @@
                     IS_AARCH64 = False
             if autoconf_check(self.compiler, flag_check="-march=armv8-a+crc"):
                 gcc_arm_crc_flags.append("-march=armv8-a+crc")
-            if autoconf_check(self.compiler, flag_check="-mfpu=neon"):
+            if not IS_AARCH64 and autoconf_check(self.compiler, flag_check="-mfpu=neon"):
                 gcc_arm_neon_flags.append("-mfpu=neon")
 
         # Check for special x32 case
         if (
-            IS_X86
-            and not IS_MACOS
+            IS_X86
+            and not IS_MACOS
             and autoconf_check(self.compiler, define_check="__ILP32__")
            and autoconf_check(self.compiler, define_check="__x86_64__")
         ):
             log.info("==> Detected x32 platform, setting CRCUTIL_USE_ASM=0")
             ext.define_macros.append(("CRCUTIL_USE_ASM", "0"))
             gcc_macros.append(("CRCUTIL_USE_ASM", "0"))
-
+
         if IS_X86 and autoconf_check(self.compiler, flag_check="-mvpclmulqdq"):
             gcc_vpclmulqdq_flags = ["-mavx2", "-mvpclmulqdq", "-mpclmul"]
+        if IS_X86 and autoconf_check(self.compiler, flag_check="-mavx512vbmi2"):
+            gcc_vbmi2_flags = [
+                "-mavx512vbmi2",
+                "-mavx512vl",
+                "-mavx512bw",
+                "-mpopcnt",
+                "-mbmi",
+                "-mbmi2",
+                "-mlzcnt",
+            ]
+
         srcdeps_crc_common = ["src/yencode/common.h", "src/yencode/crc_common.h", "src/yencode/crc.h"]
         srcdeps_dec_common = ["src/yencode/common.h", "src/yencode/decoder_common.h", "src/yencode/decoder.h"]
         srcdeps_enc_common = ["src/yencode/common.h", "src/yencode/encoder_common.h", "src/yencode/encoder.h"]
@@ -225,6 +238,18 @@
                 "msvc_x86_flags": ["/arch:AVX2"],
             },
             {
+                "sources": ["src/yencode/encoder_vbmi2.cc"],
+                "depends": srcdeps_enc_common + ["encoder_avx_base.h"],
+                "gcc_x86_flags": gcc_vbmi2_flags,
+                "msvc_x86_flags": ["/arch:AVX512"],
+            },
+            {
+                "sources": ["src/yencode/decoder_vbmi2.cc"],
+                "depends": srcdeps_dec_common + ["decoder_avx2_base.h"],
+                "gcc_x86_flags": gcc_vbmi2_flags,
+                "msvc_x86_flags": ["/arch:AVX512"],
["/arch:AVX512"], + }, + { "sources": ["src/yencode/encoder_neon.cc"], "depends": srcdeps_enc_common, "gcc_arm_flags": gcc_arm_neon_flags, diff -Nru python-sabyenc-5.3.0/src/sabyenc3.cc python-sabyenc-5.4.2/src/sabyenc3.cc --- python-sabyenc-5.3.0/src/sabyenc3.cc 2022-05-19 08:10:58.000000000 +0000 +++ python-sabyenc-5.4.2/src/sabyenc3.cc 2022-05-29 12:28:02.000000000 +0000 @@ -536,10 +536,10 @@ #if PY_MINOR_VERSION < 9 Py_SIZE(sv) = output_len; #else - Py_SET_SIZE(sv, output_len); + Py_SET_SIZE(sv, output_len); #endif sv->ob_sval[output_len] = '\0'; - // Reset hash, this was removed in Python 3.11 + // Reset hash, this was removed in Python 3.11 #if PY_MINOR_VERSION < 11 sv->ob_shash = -1; #endif @@ -556,30 +556,30 @@ static inline size_t YENC_MAX_SIZE(size_t len, size_t line_size) { - size_t ret = len * 2 /* all characters escaped */ - + 2 /* allocation for offset and that a newline may occur early */ + size_t ret = len * 2 /* all characters escaped */ + + 2 /* allocation for offset and that a newline may occur early */ #if !defined(YENC_DISABLE_AVX256) - + 64 /* allocation for YMM overflowing */ + + 64 /* allocation for YMM overflowing */ #else - + 32 /* allocation for XMM overflowing */ + + 32 /* allocation for XMM overflowing */ #endif - ; - /* add newlines, considering the possibility of all chars escaped */ - if(line_size == 128) // optimize common case - return ret + 2 * (len >> 6); - return ret + 2 * ((len*2) / line_size); + ; + /* add newlines, considering the possibility of all chars escaped */ + if(line_size == 128) // optimize common case + return ret + 2 * (len >> 6); + return ret + 2 * ((len*2) / line_size); } PyObject* encode(PyObject* self, PyObject* Py_input_string) { (void)self; - PyObject *Py_output_string; - PyObject *retval = NULL; + PyObject *Py_output_string; + PyObject *retval = NULL; - char *input_buffer = NULL; - char *output_buffer = NULL; - size_t input_len = 0; - size_t output_len = 0; + char *input_buffer = NULL; + char *output_buffer = NULL; + size_t input_len = 0; + size_t output_len = 0; uint32_t crc; // Verify the input is a bytes string @@ -589,29 +589,29 @@ } // Initialize buffers and CRC's - input_len = PyBytes_Size(Py_input_string); - input_buffer = (char *)PyBytes_AsString(Py_input_string); - output_buffer = (char *)malloc(YENC_MAX_SIZE(input_len, LINESIZE)); - if(!output_buffer) - return PyErr_NoMemory(); + input_len = PyBytes_Size(Py_input_string); + input_buffer = (char *)PyBytes_AsString(Py_input_string); + output_buffer = (char *)malloc(YENC_MAX_SIZE(input_len, LINESIZE)); + if(!output_buffer) + return PyErr_NoMemory(); - // Free GIL, in case it helps + // Free GIL, in case it helps Py_BEGIN_ALLOW_THREADS; // Encode result int column = 0; - output_len = do_encode(LINESIZE, &column, (unsigned char*)input_buffer, (unsigned char*)output_buffer, input_len, 1); + output_len = do_encode(LINESIZE, &column, (unsigned char*)input_buffer, (unsigned char*)output_buffer, input_len, 1); crc = do_crc32(input_buffer, input_len, 0); - // Restore GIL so we can build Python strings - Py_END_ALLOW_THREADS; + // Restore GIL so we can build Python strings + Py_END_ALLOW_THREADS; - // Build output string - Py_output_string = PyBytes_FromStringAndSize((char *)output_buffer, output_len); - if(Py_output_string) - retval = Py_BuildValue("(S,L)", Py_output_string, (long long)crc); + // Build output string + Py_output_string = PyBytes_FromStringAndSize((char *)output_buffer, output_len); + if(Py_output_string) + retval = Py_BuildValue("(S,L)", Py_output_string, (long 
     Py_XDECREF(Py_output_string);
-    free(output_buffer);
-    return retval;
+    free(output_buffer);
+    return retval;
 }
diff -Nru python-sabyenc-5.3.0/src/sabyenc3.h python-sabyenc-5.4.2/src/sabyenc3.h
--- python-sabyenc-5.3.0/src/sabyenc3.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/sabyenc3.h	2022-05-29 12:28:02.000000000 +0000
@@ -25,7 +25,7 @@
 #include 
 
 /* Version information */
-#define SABYENC_VERSION "5.3.0"
+#define SABYENC_VERSION "5.4.2"
 
 /* Do we CRC check? */
 #define CRC_CHECK 1
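
Aside, not part of the patch: the YENC_MAX_SIZE() bound quoted in the sabyenc3.cc hunk above can be sanity-checked in isolation. The sketch below restates the same arithmetic under the AVX-256 build assumption (the 64-byte overrun allowance); the function name and the 1 MiB example input are illustrative only.

    #include <cstdio>
    #include <cstddef>

    // Worst-case yEnc output size, restated from YENC_MAX_SIZE() above:
    // every input byte may escape to two bytes, a small constant covers the
    // column offset plus SIMD overrun, and each line may add a CRLF pair
    // even when all characters are escaped.
    static size_t yenc_worst_case(size_t len, size_t line_size) {
        size_t ret = len * 2    // all characters escaped
                   + 2          // offset / early newline allowance
                   + 64;        // YMM overrun allowance (AVX-256 build)
        if (line_size == 128)                  // optimized common case
            return ret + 2 * (len >> 6);
        return ret + 2 * ((len * 2) / line_size);
    }

    int main() {
        // e.g. a 1 MiB block encoded with 128-character lines
        std::printf("%zu\n", yenc_worst_case(1 << 20, 128));   // prints 2129986
        return 0;
    }
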
diff -Nru python-sabyenc-5.3.0/src/yencode/common.h python-sabyenc-5.4.2/src/yencode/common.h
--- python-sabyenc-5.3.0/src/yencode/common.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/common.h	2022-05-29 12:28:02.000000000 +0000
@@ -40,22 +40,22 @@
 #include 
 
 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
-    // MSVC doesn't support C11 aligned_alloc: https://stackoverflow.com/a/62963007
-    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align)
-    #define ALIGN_FREE _aligned_free
+    // MSVC doesn't support C11 aligned_alloc: https://stackoverflow.com/a/62963007
+    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align)
+    #define ALIGN_FREE _aligned_free
 #elif defined(_ISOC11_SOURCE)
-    // C11 method
-    // len needs to be a multiple of alignment, although it sometimes works if it isn't...
-    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
-    #define ALIGN_FREE free
+    // C11 method
+    // len needs to be a multiple of alignment, although it sometimes works if it isn't...
+    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
+    #define ALIGN_FREE free
 #elif defined(__cplusplus) && __cplusplus >= 201700
-    // C++17 method
-    #include 
-    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = std::aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
-    #define ALIGN_FREE free
+    // C++17 method
+    #include 
+    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = std::aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
+    #define ALIGN_FREE free
 #else
-    #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL
-    #define ALIGN_FREE free
+    #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL
+    #define ALIGN_FREE free
 #endif
@@ -185,18 +185,24 @@
 # endif
 }
 
+# ifdef _MSC_VER
+# define _CREATE_TUPLE(type, ...) type{{ __VA_ARGS__ }}
+# else
+# define _CREATE_TUPLE(type, ...) (type){{ __VA_ARGS__ }}
+# endif
 static HEDLEY_ALWAYS_INLINE uint8x16x2_t vcreate2_u8(uint8x16_t a, uint8x16_t b) {
-    return {a, b};
+    return _CREATE_TUPLE(uint8x16x2_t, a, b);
 }
 static HEDLEY_ALWAYS_INLINE int8x16x2_t vcreate2_s8(int8x16_t a, int8x16_t b) {
-    return {a, b};
+    return _CREATE_TUPLE(int8x16x2_t, a, b);
 }
 static HEDLEY_ALWAYS_INLINE uint8x16x3_t vcreate3_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
-    return {a, b, c};
+    return _CREATE_TUPLE(uint8x16x3_t, a, b, c);
 }
 static HEDLEY_ALWAYS_INLINE uint8x16x4_t vcreate4_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d) {
-    return {a, b, c, d};
+    return _CREATE_TUPLE(uint8x16x4_t, a, b, c, d);
 }
+# undef _CREATE_TUPLE
 #endif
 #ifdef PLATFORM_ARM
 bool cpu_supports_neon();
@@ -253,8 +259,6 @@
 int cpu_supports_isa();
 #endif // PLATFORM_X86
 
-const char* simd_detected();
-
 #include 
 
 #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
@@ -269,7 +273,7 @@
 
 
 // GCC 8/9/10(dev) fails to optimize cases where KNOT should be used, so use intrinsic explicitly; Clang 6+ has no issue, but Clang 6/7 doesn't have the intrinsic; MSVC 2019 also fails and lacks the intrinsic
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KNOT16 _knot_mask16
 # define KNOT32 _knot_mask32
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/crc_arm.cc python-sabyenc-5.4.2/src/yencode/crc_arm.cc
--- python-sabyenc-5.3.0/src/yencode/crc_arm.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/crc_arm.cc	2022-05-29 12:28:02.000000000 +0000
@@ -26,14 +26,30 @@
 #include 
 #endif
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# ifdef __GNUC__
+# define _LE16 __builtin_bswap16
+# define _LE32 __builtin_bswap32
+# define _LE64 __builtin_bswap64
+# else
+// currently not supported
+# error No endian swap intrinsic defined
+# endif
+#else
+# define _LE16(x) (x)
+# define _LE32(x) (x)
+# define _LE64(x) (x)
+#endif
+
 #ifdef __aarch64__
 # define WORD_T uint64_t
 # define WORDSIZE_LOG 3 // sizeof(WORD_T) == 1<= 1920 && defined(PLATFORM_X86)))
+
+#if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
 #include 
 #include 
@@ -62,11 +62,11 @@
 static void partial_fold(const size_t len, __m256i *crc0, __m256i *crc1, __m256i crc_part) {
     __m256i shuf = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(pshufb_rot_table + (len&15))));
     __m256i mask = _mm256_cmpgt_epi8(shuf, _mm256_set1_epi8(15));
-
+
     *crc0 = _mm256_shuffle_epi8(*crc0, shuf);
     *crc1 = _mm256_shuffle_epi8(*crc1, shuf);
     crc_part = _mm256_shuffle_epi8(crc_part, shuf);
-
+
     __m256i crc_out = _mm256_permute2x128_si256(*crc0, *crc0, 0x08); // move bottom->top
     __m256i crc01, crc1p;
     if(len >= 16) {
@@ -81,10 +81,10 @@
         crc01 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
         crc1p = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
     }
-
+
     *crc0 = MM256_BLENDV(*crc0, crc01, mask);
     *crc1 = MM256_BLENDV(*crc1, crc1p, mask);
-
+
     *crc1 = do_one_fold(crc_out, *crc1);
 }
@@ -103,7 +103,7 @@
     // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
     // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
     __m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
-
+
     xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
     __m128i reduction = _mm_set_epi32( // polynomial reduction factors
         1, 0xdb710640, // G* = 0x04c11db7
@@ -111,11 +111,11 @@
     );
     __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
     xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
-
+
     xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
     __m256i crc0 = zext128_256(xmm_t0);
     __m256i crc1 = _mm256_setzero_si256();
-
+
     if (len < 32) {
         if (len == 0)
             return initial;
@@ -129,31 +129,31 @@
             src += algn_diff;
             len -= algn_diff;
         }
-
+
         while (len >= 64) {
             crc0 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
             crc1 = do_one_fold(crc1, _mm256_load_si256((__m256i*)src + 1));
             src += 64;
             len -= 64;
         }
-
+
         if (len >= 32) {
             __m256i old = crc1;
             crc1 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
             crc0 = old;
-
+
             len -= 32;
             src += 32;
         }
-
+
        if(len != 0) {
             partial_fold(len, &crc0, &crc1, _mm256_load_si256((__m256i *)src));
         }
     }
-
+
     const __m128i xmm_mask = _mm_set_epi32(-1,-1,-1,0);
     __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
-
+
     __m128i xmm_crc0 = _mm256_castsi256_si128(crc0);
     __m128i xmm_crc1 = _mm256_extracti128_si256(crc0, 1);
     __m128i xmm_crc2 = _mm256_castsi256_si128(crc1);
diff -Nru python-sabyenc-5.3.0/src/yencode/crc_folding.cc python-sabyenc-5.4.2/src/yencode/crc_folding.cc
--- python-sabyenc-5.3.0/src/yencode/crc_folding.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/crc_folding.cc	2022-05-29 12:28:02.000000000 +0000
@@ -1,6 +1,6 @@
 // taken from zlib-ng / Intel's zlib patch, modified to remove zlib dependencies
 /*
- * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
  * instruction.
 *
  * A white paper describing this algorithm can be found at:
@@ -19,7 +19,7 @@
 #include "crc_common.h"
 
-#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
+#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86) && !defined(__clang__))
 #include 
 #include 
 #include 
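
Aside, not part of the patch: the _LE16/_LE32/_LE64 macros added to crc_arm.cc above are identity operations on little-endian targets and byte swaps (via the GCC/Clang __builtin_bswap* intrinsics) on big-endian ones. A minimal sketch of the same idea, with an invented helper name:

    #include <cstdint>
    #include <cstring>

    // Read a 32-bit word stored little-endian, regardless of host byte order.
    // The swap is only compiled in on big-endian targets, mirroring _LE32.
    static inline uint32_t load_le32(const unsigned char* p) {
        uint32_t v;
        std::memcpy(&v, p, sizeof(v));        // unaligned-safe load
    #if defined(__GNUC__) && defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        v = __builtin_bswap32(v);             // bring bytes into little-endian order
    #endif
        return v;
    }
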
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_avx2_base.h python-sabyenc-5.4.2/src/yencode/decoder_avx2_base.h
--- python-sabyenc-5.3.0/src/yencode/decoder_avx2_base.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_avx2_base.h	2022-05-29 12:28:02.000000000 +0000
@@ -1,8 +1,8 @@
 #ifdef __AVX2__
 
-// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine; functions added in Clang 8
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST32(a, b) !_kortestz_mask32_u8((a), (b))
 # define KAND32(a, b) _kand_mask32((a), (b))
 # define KOR32(a, b) _kor_mask32((a), (b))
@@ -60,6 +60,17 @@
         '.','.','.','.','.','.','.','.','.','.','.','.','.','.',_nextMask==2?0:'.',_nextMask==1?0:'.'
     );
 }
+
+    // for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
+    // the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
+    // so just disable the optimisation as it seems to be problematic there
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+    const bool useAVX3MaskCmp = false;
+# else
+    const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
+#endif
     intptr_t i;
     for(i = -len; i; i += sizeof(__m256i)*2) {
         __m256i oDataA = _mm256_load_si256((__m256i *)(src+i));
@@ -126,7 +137,7 @@
             __mmask32 match2EqMaskA, match2EqMaskB;
             __mmask32 match0CrMaskA, match0CrMaskB;
             __mmask32 match2CrXDtMaskA, match2CrXDtMaskB;
-            if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+            if(useAVX3MaskCmp && searchEnd) {
                 match2EqMaskA = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2A);
                 match2EqMaskB = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2B);
             } else
@@ -142,7 +153,7 @@
             // find patterns of \r_.
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-            if(use_isa >= ISA_LEVEL_AVX3) {
+            if(useAVX3MaskCmp) {
                 match0CrMaskA = _mm256_cmpeq_epi8_mask(oDataA, _mm256_set1_epi8('\r'));
                 match0CrMaskB = _mm256_cmpeq_epi8_mask(oDataB, _mm256_set1_epi8('\r'));
                 match2CrXDtMaskA = _mm256_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm256_set1_epi8('.'));
@@ -172,7 +183,7 @@
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
                 __mmask32 match1NlMaskA, match1NlMaskB;
                 __mmask32 match2NlDotMaskA, match2NlDotMaskB;
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match1NlMaskA = _mm256_mask_cmpeq_epi8_mask(
                         match0CrMaskA,
                         _mm256_set1_epi8('\n'),
@@ -228,7 +239,7 @@
                 int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     __mmask32 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
                         match2EqMaskA,
                         _mm256_set1_epi8('y'),
@@ -307,7 +318,7 @@
                     }
                 }
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     mask |= (uint64_t)match2NlDotMaskA << 2;
                     mask |= (uint64_t)match2NlDotMaskB << 34;
                     minMask = _mm256_maskz_mov_epi8(~(match2NlDotMaskB>>30), _mm256_set1_epi8('.'));
@@ -325,7 +336,7 @@
                 __m256i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
                 __mmask32 match3EqYMaskA, match3EqYMaskB;
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
                         match2EqMaskA,
                         _mm256_set1_epi8('y'),
@@ -355,7 +366,7 @@
             if(LIKELIHOOD(0.002, partialEndFound)) {
                 bool endFound;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     __mmask32 match3LfEqYMaskA = _mm256_mask_cmpeq_epi8_mask(
                         match3EqYMaskA,
                         _mm256_set1_epi8('\n'),
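
Aside, not part of the patch: the new useAVX3MaskCmp flag in decoder_avx2_base.h chooses between two equivalent ways of turning a byte comparison into a 32-bit mask; the masked-compare form is skipped on 32-bit MSVC because of the compiler crash described in the added comment. A minimal illustration of the two forms (helper names invented; the first needs AVX-512VL/BW, the second plain AVX2):

    #include <immintrin.h>
    #include <cstdint>

    #if defined(__AVX512VL__) && defined(__AVX512BW__)
    // AVX-512VL/BW: compare straight into a mask register (the useAVX3MaskCmp path).
    static inline uint32_t eq_mask_avx512(__m256i data, char c) {
        return _mm256_cmpeq_epi8_mask(data, _mm256_set1_epi8(c));
    }
    #endif

    // AVX2 fallback: vector compare, then movemask (used when the masked
    // compare is disabled, e.g. on 32-bit MSVC builds).
    static inline uint32_t eq_mask_avx2(__m256i data, char c) {
        return (uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(data, _mm256_set1_epi8(c)));
    }
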
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_avx2.cc python-sabyenc-5.4.2/src/yencode/decoder_avx2.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_avx2.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_avx2.cc	2022-05-29 12:28:02.000000000 +0000
@@ -9,6 +9,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_AVX2;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_avx.cc python-sabyenc-5.4.2/src/yencode/decoder_avx.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_avx.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_avx.cc	2022-05-29 12:28:02.000000000 +0000
@@ -9,6 +9,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_AVX;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder.cc python-sabyenc-5.4.2/src/yencode/decoder.cc
--- python-sabyenc-5.3.0/src/yencode/decoder.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder.cc	2022-05-29 12:28:02.000000000 +0000
@@ -10,6 +10,7 @@
                            YencDecoderState *) = &do_decode_scalar;
 YencDecoderEnd (*_do_decode_end_raw)(const unsigned char *HEDLEY_RESTRICT *, unsigned char *HEDLEY_RESTRICT *, size_t,
                                      YencDecoderState *) = &do_decode_end_scalar;
+int _decode_simd_level = 0;
 }
 
 void decoder_set_sse2_funcs();
@@ -20,6 +21,8 @@
 
 void decoder_set_avx2_funcs();
 
+void decoder_set_vbmi2_funcs();
+
 void decoder_set_neon_funcs();
 
 
@@ -32,6 +35,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_NATIVE;
 }
 # else
 # include "decoder_sse_base.h"
@@ -41,6 +45,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_NATIVE;
 }
 # endif
 #endif
@@ -51,7 +56,9 @@
         decoder_set_native_funcs();
 # else
         int use_isa = cpu_supports_isa();
-        if(use_isa >= ISA_LEVEL_AVX2) {
+        if(use_isa >= ISA_LEVEL_VBMI2) {
+            decoder_set_vbmi2_funcs();
+        } else if(use_isa >= ISA_LEVEL_AVX2) {
            decoder_set_avx2_funcs();
         } else if(use_isa >= ISA_LEVEL_AVX) {
             decoder_set_avx_funcs();
@@ -68,3 +75,31 @@
     }
 #endif
 }
+
+const char* simd_detected() {
+#ifdef PLATFORM_X86
+    if(_decode_simd_level >= ISA_LEVEL_VBMI2)
+        return "AVX512VL+VBMI2";
+    if(_decode_simd_level >= ISA_LEVEL_AVX3)
+        return "AVX512VL";
+    if(_decode_simd_level >= ISA_LEVEL_AVX2)
+        return "AVX2";
+    if(_decode_simd_level >= ISA_LEVEL_AVX)
+        return "AVX";
+    if(_decode_simd_level >= ISA_LEVEL_SSE4_POPCNT)
+        return "SSE4.1+POPCNT";
+    if(_decode_simd_level >= ISA_LEVEL_SSE41)
+        return "SSE4.1";
+    if(_decode_simd_level >= ISA_LEVEL_SSSE3)
+        return "SSSE3";
+    if(_decode_simd_level >= (ISA_LEVEL_SSE2 | ISA_FEATURE_POPCNT | ISA_FEATURE_LZCNT))
+        return "SSE2+ABM";
+    return "SSE2";
+#endif
+#ifdef PLATFORM_ARM
+    if(_decode_simd_level >= 1) {
+        return "NEON";
+    }
+#endif
+    return "";
+}
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder.h python-sabyenc-5.4.2/src/yencode/decoder.h
--- python-sabyenc-5.3.0/src/yencode/decoder.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder.h	2022-05-29 12:28:02.000000000 +0000
@@ -38,6 +38,8 @@
               (*_do_decode_end_raw)(const unsigned char *HEDLEY_RESTRICT *, unsigned char *HEDLEY_RESTRICT *, size_t,
                                     YencDecoderState *);
 
+extern int _decode_simd_level;
+
 static inline size_t do_decode(int isRaw, const unsigned char *HEDLEY_RESTRICT src, unsigned char *HEDLEY_RESTRICT dest,
                                size_t len, YencDecoderState *state) {
@@ -54,6 +56,8 @@
 
 void decoder_init();
 
+const char* simd_detected();
+
 #ifdef __cplusplus
 }
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_neon64.cc python-sabyenc-5.4.2/src/yencode/decoder_neon64.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_neon64.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_neon64.cc	2022-05-29 12:28:02.000000000 +0000
@@ -448,6 +448,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = 1;
 }
 #else
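
Aside, not part of the patch: decoder_init() above now probes the CPU once and installs the most capable decoder it supports, checking the new VBMI2 level before AVX2, and the relocated simd_detected() reports whichever level actually got installed rather than re-running detection. The overall shape, reduced to a standalone sketch (the enum values and detect_isa() stub are illustrative, not the real cpu_supports_isa()):

    #include <cstdio>

    // Illustrative ISA ladder, ordered so ">=" comparisons pick the best level.
    enum IsaLevel { ISA_SSE2 = 0, ISA_SSSE3, ISA_AVX, ISA_AVX2, ISA_VBMI2 };

    static IsaLevel g_level = ISA_SSE2;                 // what init ended up installing
    static IsaLevel detect_isa() { return ISA_AVX2; }   // stand-in for cpu_supports_isa()

    static void decoder_init_sketch() {
        IsaLevel isa = detect_isa();
        if (isa >= ISA_VBMI2)      g_level = ISA_VBMI2;  // decoder_set_vbmi2_funcs()
        else if (isa >= ISA_AVX2)  g_level = ISA_AVX2;   // decoder_set_avx2_funcs()
        else if (isa >= ISA_AVX)   g_level = ISA_AVX;    // decoder_set_avx_funcs()
        else                       g_level = ISA_SSE2;   // SSE2/scalar fallback
    }

    static const char* simd_name() {                    // same idea as simd_detected()
        switch (g_level) {
            case ISA_VBMI2: return "AVX512VL+VBMI2";
            case ISA_AVX2:  return "AVX2";
            case ISA_AVX:   return "AVX";
            default:        return "SSE2";
        }
    }

    int main() {
        decoder_init_sketch();
        std::printf("decoder: %s\n", simd_name());      // "AVX2" with the stub above
        return 0;
    }
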
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_neon.cc python-sabyenc-5.4.2/src/yencode/decoder_neon.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_neon.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_neon.cc	2022-05-29 12:28:02.000000000 +0000
@@ -20,8 +20,8 @@
 #endif
 
-// for compilers that lack these functions
-#if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
+// for compilers that lack these functions (Clang armv7 9-12 seems to have issues with multi-vector loads)
+#if (defined(__clang__) && (defined(__aarch64__) || __clang_major__<9 || __clang_major__>12)) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
 # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
 #else
 # define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
@@ -469,6 +469,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = 1;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_sse2.cc python-sabyenc-5.4.2/src/yencode/decoder_sse2.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_sse2.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_sse2.cc	2022-05-29 12:28:02.000000000 +0000
@@ -10,6 +10,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_SSE2;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_sse_base.h python-sabyenc-5.4.2/src/yencode/decoder_sse_base.h
--- python-sabyenc-5.3.0/src/yencode/decoder_sse_base.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_sse_base.h	2022-05-29 12:28:02.000000000 +0000
@@ -8,7 +8,7 @@
 #endif
 
 // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
 # define KAND16(a, b) _kand_mask16((a), (b))
 # define KOR16(a, b) _kor_mask16((a), (b))
@@ -112,7 +112,7 @@
         -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
     ) : _mm_set1_epi8(-42);
 
-#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__) && !defined(__tune_btver2__)
     const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
 #else
     const bool _USING_FAST_MATCH = false;
@@ -122,6 +122,13 @@
 #else
     const bool _USING_BLEND_ADD = false;
 #endif
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+    const bool useAVX3MaskCmp = false;
+# else
+    const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
+#endif
 
     __m128i lfCompare = _mm_set1_epi8('\n');
     __m128i minMask = _mm_set1_epi8('.');
@@ -214,7 +221,7 @@
             __mmask16 match2EqMaskA, match2EqMaskB;
             __mmask16 match0CrMaskA, match0CrMaskB;
             __mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
-            if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+            if(useAVX3MaskCmp && searchEnd) {
                 match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
                 match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
             } else
@@ -230,7 +237,7 @@
             __m128i match2CrXDtA, match2CrXDtB;
             if(isRaw) {
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
                     match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
                     match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
@@ -256,7 +263,7 @@
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
                 __mmask16 match1NlMaskA, match1NlMaskB;
                 __mmask16 match2NlDotMaskA, match2NlDotMaskB;
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
                         match0CrMaskA,
                         _mm_set1_epi8('\n'),
@@ -299,7 +306,7 @@
                 int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     __mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
                         match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
                     );
@@ -373,7 +380,7 @@
                     }
                 }
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     mask |= match2NlDotMaskA << 2;
                     mask |= (match2NlDotMaskB << 18) & 0xffffffff;
                     minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
@@ -398,7 +405,7 @@
                 __m128i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
                 __mmask16 match3EqYMaskA, match3EqYMaskB;
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
                         match2EqMaskA,
                         _mm_set1_epi8('y'),
@@ -434,7 +441,7 @@
                 bool endFound;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     __mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
                         match3EqYMaskA,
                         _mm_set1_epi8('\n'),
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_ssse3.cc python-sabyenc-5.4.2/src/yencode/decoder_ssse3.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_ssse3.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_ssse3.cc	2022-05-29 12:28:02.000000000 +0000
@@ -9,6 +9,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_SSSE3;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_vbmi2.cc python-sabyenc-5.4.2/src/yencode/decoder_vbmi2.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_vbmi2.cc	1970-01-01 00:00:00.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_vbmi2.cc	2022-05-29 12:28:02.000000000 +0000
@@ -0,0 +1,34 @@
+#include "common.h"
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# include "decoder_common.h"
+# ifndef YENC_DISABLE_AVX256
+# include "decoder_avx2_base.h"
+void decoder_set_vbmi2_funcs() {
+    ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+    // TODO: consider removing compact LUT
+    decoder_init_lut(lookups->eqFix, lookups->compact);
+    _do_decode = &do_decode_simd >;
+    _do_decode_raw = &do_decode_simd >;
+    _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_VBMI2;
+}
+# else
+# include "decoder_sse_base.h"
+void decoder_set_vbmi2_funcs() {
+    decoder_sse_init();
+    decoder_init_lut(lookups->eqFix, lookups->compact);
+    _do_decode = &do_decode_simd >;
+    _do_decode_raw = &do_decode_simd >;
+    _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_VBMI2;
+}
+# endif
+#else
+
+void decoder_set_avx2_funcs();
+
+void decoder_set_vbmi2_funcs() {
+    decoder_set_avx2_funcs();
+}
+#endif
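
Aside, not part of the patch: the new decoder_vbmi2.cc above follows the same per-ISA translation-unit pattern as the existing decoder files. If the compiler could not target AVX-512 VBMI2 (the flags probed in setup.py), the file still defines decoder_set_vbmi2_funcs(), which simply forwards to the AVX2 initialiser, so decoder_init() can call it unconditionally. The pattern in generic form (names are placeholders):

    // One translation unit per ISA level, each compiled with its own flags.
    // When the required ISA macros are absent, the init symbol still exists
    // but degrades to the next level down -- callers never need an #ifdef.
    #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
    void set_vbmi2_kernels() {
        // install the AVX-512 VBMI2 kernels here
    }
    #else
    void set_avx2_kernels();                             // provided by the AVX2 unit
    void set_vbmi2_kernels() { set_avx2_kernels(); }     // graceful degradation
    #endif
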
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder_avx_base.h python-sabyenc-5.4.2/src/yencode/encoder_avx_base.h
--- python-sabyenc-5.3.0/src/yencode/encoder_avx_base.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder_avx_base.h	2022-05-29 12:28:02.000000000 +0000
@@ -7,7 +7,7 @@
 #include "encoder_common.h"
 #define YMM_SIZE 32
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD32(a, offs) _load_mask32((__mmask32*)(a) + (offs))
 #else
 # define KLOAD32(a, offs) (((uint32_t*)(a))[(offs)])
@@ -294,7 +294,7 @@
             asm(
                 "shrq $1, %[eqMask] \n"
                 "shrq %%cl, %[eqMask] \n"
-                "adcq %[col], %[p] \n"
+                "adcq %q[col], %q[p] \n"
                 : [eqMask]"+r"(eqMask), [p]"+r"(p)
                 : "c"(shiftAmt), [col]"r"(~col)
             );
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder.cc python-sabyenc-5.4.2/src/yencode/encoder.cc
--- python-sabyenc-5.3.0/src/yencode/encoder.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder.cc	2022-05-29 12:28:02.000000000 +0000
@@ -134,6 +134,8 @@
 
 void encoder_avx2_init();
 
+void encoder_vbmi2_init();
+
 void encoder_neon_init();
 
 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE != 0
@@ -159,7 +161,9 @@
         encoder_native_init();
 # else
         int use_isa = cpu_supports_isa();
-        if(use_isa >= ISA_LEVEL_AVX2) {
+        if(use_isa >= ISA_LEVEL_VBMI2) {
+            encoder_vbmi2_init();
+        } else if(use_isa >= ISA_LEVEL_AVX2) {
             encoder_avx2_init();
         } else if(use_isa >= ISA_LEVEL_AVX) {
             encoder_avx_init();
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder_neon.cc python-sabyenc-5.4.2/src/yencode/encoder_neon.cc
--- python-sabyenc-5.3.0/src/yencode/encoder_neon.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder_neon.cc	2022-05-29 12:28:02.000000000 +0000
@@ -241,7 +241,7 @@
 }
 
 
-static HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
+HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
     // offset position to enable simpler loop condition checking
     const int INPUT_OFFSET = sizeof(uint8x16_t)*4 -1; // extra chars for EOL handling, -1 to change <= to <
     if(len <= INPUT_OFFSET || line_size < (int)sizeof(uint8x16_t)*4) return;
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder_sse_base.h python-sabyenc-5.4.2/src/yencode/encoder_sse_base.h
--- python-sabyenc-5.3.0/src/yencode/encoder_sse_base.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder_sse_base.h	2022-05-29 12:28:02.000000000 +0000
@@ -8,7 +8,7 @@
 # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
 #endif
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
 #else
 # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
@@ -167,7 +167,7 @@
     if (len <= INPUT_OFFSET || line_size < XMM_SIZE) return;
 
     // slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
-#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
+#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__) || defined(__tune_btver2__)
     const bool _PREFER_BRANCHING = true;
 #else
     const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
@@ -424,8 +424,8 @@
             asm(
                 "shrl $1, %[eqMask] \n"
                 "shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
-# if defined(PLATFORM_AMD64)
-                "adcq %[col], %[p] \n"
+# if defined(PLATFORM_AMD64) && !defined(__ILP32__)
+                "adcq %q[col], %q[p] \n"
 # else
                 "adcl %[col], %[p] \n"
 # endif
@@ -551,7 +551,6 @@
                 dataA = _mm_shuffle_epi8(dataA, shufMaskA);
 
 # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
-                // unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
                 if(use_isa >= ISA_LEVEL_SSE41) {
                     dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
                 } else
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder_vbmi2.cc python-sabyenc-5.4.2/src/yencode/encoder_vbmi2.cc
--- python-sabyenc-5.3.0/src/yencode/encoder_vbmi2.cc	1970-01-01 00:00:00.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder_vbmi2.cc	2022-05-29 12:28:02.000000000 +0000
@@ -0,0 +1,25 @@
+#include "common.h"
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# ifndef YENC_DISABLE_AVX256
+# include "encoder_avx_base.h"
+
+void encoder_vbmi2_init() {
+    _do_encode = &do_encode_simd< do_encode_avx2 >;
+    encoder_avx2_lut();
+}
+# else
+# include "encoder_sse_base.h"
+void encoder_vbmi2_init() {
+    _do_encode = &do_encode_simd< do_encode_sse >;
+    encoder_sse_lut();
+}
+# endif
+#else
+
+void encoder_avx2_init();
+
+void encoder_vbmi2_init() {
+    encoder_avx2_init();
+}
+#endif
diff -Nru python-sabyenc-5.3.0/src/yencode/platform.cc python-sabyenc-5.4.2/src/yencode/platform.cc
--- python-sabyenc-5.3.0/src/yencode/platform.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/platform.cc	2022-05-29 12:28:02.000000000 +0000
@@ -149,7 +149,7 @@
 int cpu_supports_crc_isa() {
     int flags[4];
     _cpuid1(flags);
-
+
     if((flags[2] & 0x80202) == 0x80202) { // SSE4.1 + SSSE3 + CLMUL
         if((flags[2] & 0x18000000) == 0x18000000) { // OSXSAVE + AVX
             int xcr = _GET_XCR() & 0xff; // ignore unused bits
@@ -167,24 +167,3 @@
 }
 
 #endif // PLATFORM_X86
-
-const char* simd_detected() {
-#ifdef PLATFORM_X86
-    int use_isa = cpu_supports_isa();
-    if(use_isa >= ISA_LEVEL_AVX2) {
-        return "AVX2";
-    } else if(use_isa >= ISA_LEVEL_AVX) {
-        return "AVX";
-    } else if(use_isa >= ISA_LEVEL_SSSE3) {
-        return "SSSE3";
-    } else {
-        return "SSE2";
-    }
-#endif
-#ifdef PLATFORM_ARM
-    if(cpu_supports_neon()) {
-        return "NEON";
-    }
-#endif
-    return "";
-}
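
Aside, not part of the patch: platform.cc keeps its hand-rolled CPUID/XGETBV probing (cpu_supports_crc_isa() above tests the CLMUL, OSXSAVE and AVX bits directly); only simd_detected() moved into decoder.cc. On GCC and Clang a comparable runtime check can also be written with the compilers' built-in probe, shown here purely as an illustration:

    #include <cstdio>

    // GCC/Clang built-in CPU feature probe -- an alternative to raw CPUID
    // bit tests for the common levels (feature strings are the compiler's own).
    int main() {
    #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        __builtin_cpu_init();
        std::printf("ssse3: %d\n", __builtin_cpu_supports("ssse3"));
        std::printf("avx:   %d\n", __builtin_cpu_supports("avx"));
        std::printf("avx2:  %d\n", __builtin_cpu_supports("avx2"));
    #endif
        return 0;
    }
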
diff -Nru python-sabyenc-5.3.0/src/yencode/stdint.h python-sabyenc-5.4.2/src/yencode/stdint.h
--- python-sabyenc-5.3.0/src/yencode/stdint.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/stdint.h	2022-05-29 12:28:02.000000000 +0000
@@ -1,32 +1,32 @@
 // ISO C9x compliant stdint.h for Microsoft Visual Studio
-// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
-//
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
 //  Copyright (c) 2006-2008 Alexander Chemeris
-//
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
-//
+//
 //   1. Redistributions of source code must retain the above copyright notice,
 //      this list of conditions and the following disclaimer.
-//
+//
 //   2. Redistributions in binary form must reproduce the above copyright
 //      notice, this list of conditions and the following disclaimer in the
 //      documentation and/or other materials provided with the distribution.
-//
+//
 //   3. The name of the author may be used to endorse or promote products
 //      derived from this software without specific prior written permission.
-//
+//
 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+//
 ///////////////////////////////////////////////////////////////////////////////
 
 #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900