diff -Nru python-sabyenc-5.3.0/debian/changelog python-sabyenc-5.4.2/debian/changelog
--- python-sabyenc-5.3.0/debian/changelog	2022-05-19 11:11:00.000000000 +0000
+++ python-sabyenc-5.4.2/debian/changelog	2022-05-29 14:18:54.000000000 +0000
@@ -1,3 +1,10 @@
+python-sabyenc (5.4.2-0ubuntu1~jcfp1~21.10) impish; urgency=medium
+
+  * New upstream release.
+  * Control: bump version of breaks to 3.6.0~rc3.
+
+ -- Jeroen Ploemen  Sun, 29 May 2022 14:18:54 +0000
+
 python-sabyenc (5.3.0-0ubuntu1~jcfp1~21.10) impish; urgency=medium
 
   * New upstream release.
diff -Nru python-sabyenc-5.3.0/debian/control python-sabyenc-5.4.2/debian/control
--- python-sabyenc-5.3.0/debian/control	2022-05-19 11:11:00.000000000 +0000
+++ python-sabyenc-5.4.2/debian/control	2022-05-29 14:18:54.000000000 +0000
@@ -18,7 +18,7 @@
 Package: python3-sabyenc
 Architecture: any
 Depends: ${shlibs:Depends}, ${misc:Depends}, ${python3:Depends}
-Breaks: sabnzbdplus (<< 3.6.0~rc2)
+Breaks: sabnzbdplus (<< 3.6.0~rc3)
 Multi-Arch: same
 Description: yEnc extension for Python, optimized for Usenet
  Modified version of the original python-yenc module by Alessandro Duca,
diff -Nru python-sabyenc-5.3.0/setup.py python-sabyenc-5.4.2/setup.py
--- python-sabyenc-5.3.0/setup.py	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/setup.py	2022-05-29 12:28:02.000000000 +0000
@@ -62,6 +62,7 @@
         f.flush()
 
         try:
+            log.info("==> Please ignore any errors shown below!")
             result_files = compiler.compile([f.name], extra_postargs=extra_postargs)
             log.info("==> Success!")
         except CompileError:
@@ -89,6 +90,7 @@
         gcc_arm_neon_flags = []
         gcc_arm_crc_flags = []
         gcc_vpclmulqdq_flags = []
+        gcc_vbmi2_flags = []
         gcc_macros = []
         if self.compiler.compiler_type == "msvc":
             # LTCG not enabled due to issues seen with code generation where
@@ -135,23 +137,34 @@
                     IS_AARCH64 = False
             if autoconf_check(self.compiler, flag_check="-march=armv8-a+crc"):
                 gcc_arm_crc_flags.append("-march=armv8-a+crc")
-            if autoconf_check(self.compiler, flag_check="-mfpu=neon"):
+            if not IS_AARCH64 and autoconf_check(self.compiler, flag_check="-mfpu=neon"):
                 gcc_arm_neon_flags.append("-mfpu=neon")
 
         # Check for special x32 case
         if (
-            IS_X86
-            and not IS_MACOS
+            IS_X86
+            and not IS_MACOS
             and autoconf_check(self.compiler, define_check="__ILP32__")
            and autoconf_check(self.compiler, define_check="__x86_64__")
         ):
             log.info("==> Detected x32 platform, setting CRCUTIL_USE_ASM=0")
             ext.define_macros.append(("CRCUTIL_USE_ASM", "0"))
             gcc_macros.append(("CRCUTIL_USE_ASM", "0"))
-
+
         if IS_X86 and autoconf_check(self.compiler, flag_check="-mvpclmulqdq"):
             gcc_vpclmulqdq_flags = ["-mavx2", "-mvpclmulqdq", "-mpclmul"]
+        if IS_X86 and autoconf_check(self.compiler, flag_check="-mavx512vbmi2"):
+            gcc_vbmi2_flags = [
+                "-mavx512vbmi2",
+                "-mavx512vl",
+                "-mavx512bw",
+                "-mpopcnt",
+                "-mbmi",
+                "-mbmi2",
+                "-mlzcnt",
+            ]
+
         srcdeps_crc_common = ["src/yencode/common.h", "src/yencode/crc_common.h", "src/yencode/crc.h"]
         srcdeps_dec_common = ["src/yencode/common.h", "src/yencode/decoder_common.h", "src/yencode/decoder.h"]
         srcdeps_enc_common = ["src/yencode/common.h", "src/yencode/encoder_common.h", "src/yencode/encoder.h"]
@@ -225,6 +238,18 @@
                 "msvc_x86_flags": ["/arch:AVX2"],
             },
             {
+                "sources": ["src/yencode/encoder_vbmi2.cc"],
+                "depends": srcdeps_enc_common + ["encoder_avx_base.h"],
+                "gcc_x86_flags": gcc_vbmi2_flags,
+                "msvc_x86_flags": ["/arch:AVX512"],
+            },
+            {
+                "sources": ["src/yencode/decoder_vbmi2.cc"],
+                "depends": srcdeps_dec_common + ["decoder_avx2_base.h"],
+                "gcc_x86_flags": gcc_vbmi2_flags,
+                "msvc_x86_flags": ["/arch:AVX512"],
["/arch:AVX512"], + }, + { "sources": ["src/yencode/encoder_neon.cc"], "depends": srcdeps_enc_common, "gcc_arm_flags": gcc_arm_neon_flags, diff -Nru python-sabyenc-5.3.0/src/sabyenc3.cc python-sabyenc-5.4.2/src/sabyenc3.cc --- python-sabyenc-5.3.0/src/sabyenc3.cc 2022-05-19 08:10:58.000000000 +0000 +++ python-sabyenc-5.4.2/src/sabyenc3.cc 2022-05-29 12:28:02.000000000 +0000 @@ -536,10 +536,10 @@ #if PY_MINOR_VERSION < 9 Py_SIZE(sv) = output_len; #else - Py_SET_SIZE(sv, output_len); + Py_SET_SIZE(sv, output_len); #endif sv->ob_sval[output_len] = '\0'; - // Reset hash, this was removed in Python 3.11 + // Reset hash, this was removed in Python 3.11 #if PY_MINOR_VERSION < 11 sv->ob_shash = -1; #endif @@ -556,30 +556,30 @@ static inline size_t YENC_MAX_SIZE(size_t len, size_t line_size) { - size_t ret = len * 2 /* all characters escaped */ - + 2 /* allocation for offset and that a newline may occur early */ + size_t ret = len * 2 /* all characters escaped */ + + 2 /* allocation for offset and that a newline may occur early */ #if !defined(YENC_DISABLE_AVX256) - + 64 /* allocation for YMM overflowing */ + + 64 /* allocation for YMM overflowing */ #else - + 32 /* allocation for XMM overflowing */ + + 32 /* allocation for XMM overflowing */ #endif - ; - /* add newlines, considering the possibility of all chars escaped */ - if(line_size == 128) // optimize common case - return ret + 2 * (len >> 6); - return ret + 2 * ((len*2) / line_size); + ; + /* add newlines, considering the possibility of all chars escaped */ + if(line_size == 128) // optimize common case + return ret + 2 * (len >> 6); + return ret + 2 * ((len*2) / line_size); } PyObject* encode(PyObject* self, PyObject* Py_input_string) { (void)self; - PyObject *Py_output_string; - PyObject *retval = NULL; + PyObject *Py_output_string; + PyObject *retval = NULL; - char *input_buffer = NULL; - char *output_buffer = NULL; - size_t input_len = 0; - size_t output_len = 0; + char *input_buffer = NULL; + char *output_buffer = NULL; + size_t input_len = 0; + size_t output_len = 0; uint32_t crc; // Verify the input is a bytes string @@ -589,29 +589,29 @@ } // Initialize buffers and CRC's - input_len = PyBytes_Size(Py_input_string); - input_buffer = (char *)PyBytes_AsString(Py_input_string); - output_buffer = (char *)malloc(YENC_MAX_SIZE(input_len, LINESIZE)); - if(!output_buffer) - return PyErr_NoMemory(); + input_len = PyBytes_Size(Py_input_string); + input_buffer = (char *)PyBytes_AsString(Py_input_string); + output_buffer = (char *)malloc(YENC_MAX_SIZE(input_len, LINESIZE)); + if(!output_buffer) + return PyErr_NoMemory(); - // Free GIL, in case it helps + // Free GIL, in case it helps Py_BEGIN_ALLOW_THREADS; // Encode result int column = 0; - output_len = do_encode(LINESIZE, &column, (unsigned char*)input_buffer, (unsigned char*)output_buffer, input_len, 1); + output_len = do_encode(LINESIZE, &column, (unsigned char*)input_buffer, (unsigned char*)output_buffer, input_len, 1); crc = do_crc32(input_buffer, input_len, 0); - // Restore GIL so we can build Python strings - Py_END_ALLOW_THREADS; + // Restore GIL so we can build Python strings + Py_END_ALLOW_THREADS; - // Build output string - Py_output_string = PyBytes_FromStringAndSize((char *)output_buffer, output_len); - if(Py_output_string) - retval = Py_BuildValue("(S,L)", Py_output_string, (long long)crc); + // Build output string + Py_output_string = PyBytes_FromStringAndSize((char *)output_buffer, output_len); + if(Py_output_string) + retval = Py_BuildValue("(S,L)", Py_output_string, (long 
     Py_XDECREF(Py_output_string);
-    free(output_buffer);
-    return retval;
+    free(output_buffer);
+    return retval;
 }
diff -Nru python-sabyenc-5.3.0/src/sabyenc3.h python-sabyenc-5.4.2/src/sabyenc3.h
--- python-sabyenc-5.3.0/src/sabyenc3.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/sabyenc3.h	2022-05-29 12:28:02.000000000 +0000
@@ -25,7 +25,7 @@
 #include 
 
 /* Version information */
-#define SABYENC_VERSION "5.3.0"
+#define SABYENC_VERSION "5.4.2"
 
 /* Do we CRC check? */
 #define CRC_CHECK 1
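
Aside, not part of the patch: the YENC_MAX_SIZE() bound quoted in the sabyenc3.cc hunk above can be sanity-checked in isolation. The sketch below restates the same arithmetic under the AVX-256 build assumption (the 64-byte overrun allowance); the function name and the 1 MiB example input are illustrative only.

    #include <cstdio>
    #include <cstddef>

    // Worst-case yEnc output size, restated from YENC_MAX_SIZE() above:
    // every input byte may escape to two bytes, a small constant covers the
    // column offset plus SIMD overrun, and each line may add a CRLF pair
    // even when all characters are escaped.
    static size_t yenc_worst_case(size_t len, size_t line_size) {
        size_t ret = len * 2    // all characters escaped
                   + 2          // offset / early newline allowance
                   + 64;        // YMM overrun allowance (AVX-256 build)
        if (line_size == 128)                  // optimized common case
            return ret + 2 * (len >> 6);
        return ret + 2 * ((len * 2) / line_size);
    }

    int main() {
        // e.g. a 1 MiB block encoded with 128-character lines
        std::printf("%zu\n", yenc_worst_case(1 << 20, 128));   // prints 2129986
        return 0;
    }
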
diff -Nru python-sabyenc-5.3.0/src/yencode/common.h python-sabyenc-5.4.2/src/yencode/common.h
--- python-sabyenc-5.3.0/src/yencode/common.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/common.h	2022-05-29 12:28:02.000000000 +0000
@@ -40,22 +40,22 @@
 #include 
 
 #if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
-    // MSVC doesn't support C11 aligned_alloc: https://stackoverflow.com/a/62963007
-    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align)
-    #define ALIGN_FREE _aligned_free
+    // MSVC doesn't support C11 aligned_alloc: https://stackoverflow.com/a/62963007
+    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align)
+    #define ALIGN_FREE _aligned_free
 #elif defined(_ISOC11_SOURCE)
-    // C11 method
-    // len needs to be a multiple of alignment, although it sometimes works if it isn't...
-    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
-    #define ALIGN_FREE free
+    // C11 method
+    // len needs to be a multiple of alignment, although it sometimes works if it isn't...
+    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
+    #define ALIGN_FREE free
 #elif defined(__cplusplus) && __cplusplus >= 201700
-    // C++17 method
-    #include 
-    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = std::aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
-    #define ALIGN_FREE free
+    // C++17 method
+    #include 
+    #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = std::aligned_alloc(align, ((len) + (align)-1) & ~((align)-1))
+    #define ALIGN_FREE free
 #else
-    #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL
-    #define ALIGN_FREE free
+    #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL
+    #define ALIGN_FREE free
 #endif
@@ -185,18 +185,24 @@
 # endif
 }
 
+# ifdef _MSC_VER
+# define _CREATE_TUPLE(type, ...) type{{ __VA_ARGS__ }}
+# else
+# define _CREATE_TUPLE(type, ...) (type){{ __VA_ARGS__ }}
+# endif
 static HEDLEY_ALWAYS_INLINE uint8x16x2_t vcreate2_u8(uint8x16_t a, uint8x16_t b) {
-    return {a, b};
+    return _CREATE_TUPLE(uint8x16x2_t, a, b);
 }
 static HEDLEY_ALWAYS_INLINE int8x16x2_t vcreate2_s8(int8x16_t a, int8x16_t b) {
-    return {a, b};
+    return _CREATE_TUPLE(int8x16x2_t, a, b);
 }
 static HEDLEY_ALWAYS_INLINE uint8x16x3_t vcreate3_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
-    return {a, b, c};
+    return _CREATE_TUPLE(uint8x16x3_t, a, b, c);
 }
 static HEDLEY_ALWAYS_INLINE uint8x16x4_t vcreate4_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d) {
-    return {a, b, c, d};
+    return _CREATE_TUPLE(uint8x16x4_t, a, b, c, d);
 }
+# undef _CREATE_TUPLE
 #endif
 #ifdef PLATFORM_ARM
 bool cpu_supports_neon();
@@ -253,8 +259,6 @@
 int cpu_supports_isa();
 #endif // PLATFORM_X86
 
-const char* simd_detected();
-
 #include 
 
 #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900
@@ -269,7 +273,7 @@
 
 
 // GCC 8/9/10(dev) fails to optimize cases where KNOT should be used, so use intrinsic explicitly; Clang 6+ has no issue, but Clang 6/7 doesn't have the intrinsic; MSVC 2019 also fails and lacks the intrinsic
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KNOT16 _knot_mask16
 # define KNOT32 _knot_mask32
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/crc_arm.cc python-sabyenc-5.4.2/src/yencode/crc_arm.cc
--- python-sabyenc-5.3.0/src/yencode/crc_arm.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/crc_arm.cc	2022-05-29 12:28:02.000000000 +0000
@@ -26,14 +26,30 @@
 #include 
 #endif
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# ifdef __GNUC__
+# define _LE16 __builtin_bswap16
+# define _LE32 __builtin_bswap32
+# define _LE64 __builtin_bswap64
+# else
+// currently not supported
+# error No endian swap intrinsic defined
+# endif
+#else
+# define _LE16(x) (x)
+# define _LE32(x) (x)
+# define _LE64(x) (x)
+#endif
+
 #ifdef __aarch64__
 # define WORD_T uint64_t
 # define WORDSIZE_LOG 3 // sizeof(WORD_T) == 1<= 1920 && defined(PLATFORM_X86)))
+
+#if !defined(YENC_DISABLE_AVX256) && ((defined(__VPCLMULQDQ__) && defined(__AVX2__) && defined(__PCLMUL__)) || (defined(_MSC_VER) && _MSC_VER >= 1920 && defined(PLATFORM_X86) && !defined(__clang__)))
 #include 
 #include 
@@ -62,11 +62,11 @@
 static void partial_fold(const size_t len, __m256i *crc0, __m256i *crc1, __m256i crc_part) {
     __m256i shuf = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(pshufb_rot_table + (len&15))));
     __m256i mask = _mm256_cmpgt_epi8(shuf, _mm256_set1_epi8(15));
-
+
     *crc0 = _mm256_shuffle_epi8(*crc0, shuf);
     *crc1 = _mm256_shuffle_epi8(*crc1, shuf);
     crc_part = _mm256_shuffle_epi8(crc_part, shuf);
-
+
     __m256i crc_out = _mm256_permute2x128_si256(*crc0, *crc0, 0x08); // move bottom->top
     __m256i crc01, crc1p;
     if(len >= 16) {
@@ -81,10 +81,10 @@
         crc01 = _mm256_permute2x128_si256(*crc0, *crc1, 0x21);
         crc1p = _mm256_permute2x128_si256(*crc1, crc_part, 0x21);
     }
-
+
     *crc0 = MM256_BLENDV(*crc0, crc01, mask);
     *crc1 = MM256_BLENDV(*crc1, crc1p, mask);
-
+
     *crc1 = do_one_fold(crc_out, *crc1);
 }
@@ -103,7 +103,7 @@
     // info from https://www.reddit.com/r/ReverseEngineering/comments/2zwhl3/mystery_constant_0x9db42487_in_intels_crc32ieee/
     // firstly, calculate: xmm_crc0 = (intial * 0x487b9c8a) mod 0x104c11db7, where 0x487b9c8a = inverse(1<<512) mod 0x104c11db7
     __m128i xmm_t0 = _mm_cvtsi32_si128(~initial);
-
+
     xmm_t0 = _mm_clmulepi64_si128(xmm_t0, _mm_set_epi32(0, 0, 0xa273bc24, 0), 0); // reverse(0x487b9c8a)<<1 == 0xa273bc24
     __m128i reduction = _mm_set_epi32( // polynomial reduction factors
         1, 0xdb710640, // G* = 0x04c11db7
@@ -111,11 +111,11 @@
     );
     __m128i xmm_t1 = _mm_clmulepi64_si128(xmm_t0, reduction, 0);
     xmm_t1 = _mm_clmulepi64_si128(xmm_t1, reduction, 0x10);
-
+
     xmm_t0 = _mm_srli_si128(_mm_xor_si128(xmm_t0, xmm_t1), 8);
     __m256i crc0 = zext128_256(xmm_t0);
     __m256i crc1 = _mm256_setzero_si256();
-
+
     if (len < 32) {
         if (len == 0)
             return initial;
@@ -129,31 +129,31 @@
             src += algn_diff;
             len -= algn_diff;
         }
-
+
         while (len >= 64) {
             crc0 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
             crc1 = do_one_fold(crc1, _mm256_load_si256((__m256i*)src + 1));
             src += 64;
             len -= 64;
         }
-
+
         if (len >= 32) {
             __m256i old = crc1;
             crc1 = do_one_fold(crc0, _mm256_load_si256((__m256i*)src));
             crc0 = old;
-
+
             len -= 32;
             src += 32;
         }
-
+
        if(len != 0) {
             partial_fold(len, &crc0, &crc1, _mm256_load_si256((__m256i *)src));
         }
     }
-
+
     const __m128i xmm_mask = _mm_set_epi32(-1,-1,-1,0);
     __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
-
+
     __m128i xmm_crc0 = _mm256_castsi256_si128(crc0);
     __m128i xmm_crc1 = _mm256_extracti128_si256(crc0, 1);
     __m128i xmm_crc2 = _mm256_castsi256_si128(crc1);
diff -Nru python-sabyenc-5.3.0/src/yencode/crc_folding.cc python-sabyenc-5.4.2/src/yencode/crc_folding.cc
--- python-sabyenc-5.3.0/src/yencode/crc_folding.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/crc_folding.cc	2022-05-29 12:28:02.000000000 +0000
@@ -1,6 +1,6 @@
 // taken from zlib-ng / Intel's zlib patch, modified to remove zlib dependencies
 /*
- * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
  * instruction.
 *
  * A white paper describing this algorithm can be found at:
@@ -19,7 +19,7 @@
 #include "crc_common.h"
 
-#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86))
+#if (defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__)) || (defined(_MSC_VER) && _MSC_VER >= 1600 && defined(PLATFORM_X86) && !defined(__clang__))
 #include 
 #include 
 #include 
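
Aside, not part of the patch: the _LE16/_LE32/_LE64 macros added to crc_arm.cc above are identity operations on little-endian targets and byte swaps (via the GCC/Clang __builtin_bswap* intrinsics) on big-endian ones. A minimal sketch of the same idea, with an invented helper name:

    #include <cstdint>
    #include <cstring>

    // Read a 32-bit word stored little-endian, regardless of host byte order.
    // The swap is only compiled in on big-endian targets, mirroring _LE32.
    static inline uint32_t load_le32(const unsigned char* p) {
        uint32_t v;
        std::memcpy(&v, p, sizeof(v));        // unaligned-safe load
    #if defined(__GNUC__) && defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        v = __builtin_bswap32(v);             // bring bytes into little-endian order
    #endif
        return v;
    }
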
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_avx2_base.h python-sabyenc-5.4.2/src/yencode/decoder_avx2_base.h
--- python-sabyenc-5.3.0/src/yencode/decoder_avx2_base.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_avx2_base.h	2022-05-29 12:28:02.000000000 +0000
@@ -1,8 +1,8 @@
 #ifdef __AVX2__
 
-// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+// GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine; functions added in Clang 8
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST32(a, b) !_kortestz_mask32_u8((a), (b))
 # define KAND32(a, b) _kand_mask32((a), (b))
 # define KOR32(a, b) _kor_mask32((a), (b))
@@ -60,6 +60,17 @@
         '.','.','.','.','.','.','.','.','.','.','.','.','.','.',_nextMask==2?0:'.',_nextMask==1?0:'.'
     );
 }
+
+    // for some reason, MSVC Win32 seems to crash when trying to compile _mm256_mask_cmpeq_epi8_mask
+    // the crash can be fixed by switching the order of the last two arguments, but it seems to generate wrong code
+    // so just disable the optimisation as it seems to be problematic there
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+    const bool useAVX3MaskCmp = false;
+# else
+    const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
+#endif
     intptr_t i;
     for(i = -len; i; i += sizeof(__m256i)*2) {
         __m256i oDataA = _mm256_load_si256((__m256i *)(src+i));
@@ -126,7 +137,7 @@
             __mmask32 match2EqMaskA, match2EqMaskB;
             __mmask32 match0CrMaskA, match0CrMaskB;
             __mmask32 match2CrXDtMaskA, match2CrXDtMaskB;
-            if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+            if(useAVX3MaskCmp && searchEnd) {
                 match2EqMaskA = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2A);
                 match2EqMaskB = _mm256_cmpeq_epi8_mask(_mm256_set1_epi8('='), tmpData2B);
             } else
@@ -142,7 +153,7 @@
             // find patterns of \r_.
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-            if(use_isa >= ISA_LEVEL_AVX3) {
+            if(useAVX3MaskCmp) {
                 match0CrMaskA = _mm256_cmpeq_epi8_mask(oDataA, _mm256_set1_epi8('\r'));
                 match0CrMaskB = _mm256_cmpeq_epi8_mask(oDataB, _mm256_set1_epi8('\r'));
                 match2CrXDtMaskA = _mm256_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm256_set1_epi8('.'));
@@ -172,7 +183,7 @@
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
                 __mmask32 match1NlMaskA, match1NlMaskB;
                 __mmask32 match2NlDotMaskA, match2NlDotMaskB;
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match1NlMaskA = _mm256_mask_cmpeq_epi8_mask(
                         match0CrMaskA,
                         _mm256_set1_epi8('\n'),
@@ -228,7 +239,7 @@
                 int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     __mmask32 match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
                         match2EqMaskA,
                         _mm256_set1_epi8('y'),
@@ -307,7 +318,7 @@
                     }
                 }
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     mask |= (uint64_t)match2NlDotMaskA << 2;
                     mask |= (uint64_t)match2NlDotMaskB << 34;
                     minMask = _mm256_maskz_mov_epi8(~(match2NlDotMaskB>>30), _mm256_set1_epi8('.'));
@@ -325,7 +336,7 @@
                 __m256i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
                 __mmask32 match3EqYMaskA, match3EqYMaskB;
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match3EqYMaskA = _mm256_mask_cmpeq_epi8_mask(
                         match2EqMaskA,
                         _mm256_set1_epi8('y'),
@@ -355,7 +366,7 @@
             if(LIKELIHOOD(0.002, partialEndFound)) {
                 bool endFound;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     __mmask32 match3LfEqYMaskA = _mm256_mask_cmpeq_epi8_mask(
                         match3EqYMaskA,
                         _mm256_set1_epi8('\n'),
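
Aside, not part of the patch: the new useAVX3MaskCmp flag in decoder_avx2_base.h chooses between two equivalent ways of turning a byte comparison into a 32-bit mask; the masked-compare form is skipped on 32-bit MSVC because of the compiler crash described in the added comment. A minimal illustration of the two forms (helper names invented; the first needs AVX-512VL/BW, the second plain AVX2):

    #include <immintrin.h>
    #include <cstdint>

    #if defined(__AVX512VL__) && defined(__AVX512BW__)
    // AVX-512VL/BW: compare straight into a mask register (the useAVX3MaskCmp path).
    static inline uint32_t eq_mask_avx512(__m256i data, char c) {
        return _mm256_cmpeq_epi8_mask(data, _mm256_set1_epi8(c));
    }
    #endif

    // AVX2 fallback: vector compare, then movemask (used when the masked
    // compare is disabled, e.g. on 32-bit MSVC builds).
    static inline uint32_t eq_mask_avx2(__m256i data, char c) {
        return (uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(data, _mm256_set1_epi8(c)));
    }
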
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_avx2.cc python-sabyenc-5.4.2/src/yencode/decoder_avx2.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_avx2.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_avx2.cc	2022-05-29 12:28:02.000000000 +0000
@@ -9,6 +9,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_AVX2;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_avx.cc python-sabyenc-5.4.2/src/yencode/decoder_avx.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_avx.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_avx.cc	2022-05-29 12:28:02.000000000 +0000
@@ -9,6 +9,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_AVX;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder.cc python-sabyenc-5.4.2/src/yencode/decoder.cc
--- python-sabyenc-5.3.0/src/yencode/decoder.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder.cc	2022-05-29 12:28:02.000000000 +0000
@@ -10,6 +10,7 @@
                            YencDecoderState *) = &do_decode_scalar;
 YencDecoderEnd (*_do_decode_end_raw)(const unsigned char *HEDLEY_RESTRICT *, unsigned char *HEDLEY_RESTRICT *, size_t,
                                      YencDecoderState *) = &do_decode_end_scalar;
+int _decode_simd_level = 0;
 }
 
 void decoder_set_sse2_funcs();
@@ -20,6 +21,8 @@
 
 void decoder_set_avx2_funcs();
 
+void decoder_set_vbmi2_funcs();
+
 void decoder_set_neon_funcs();
 
 
@@ -32,6 +35,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_NATIVE;
 }
 # else
 # include "decoder_sse_base.h"
@@ -41,6 +45,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_NATIVE;
 }
 # endif
 #endif
@@ -51,7 +56,9 @@
         decoder_set_native_funcs();
 # else
         int use_isa = cpu_supports_isa();
-        if(use_isa >= ISA_LEVEL_AVX2) {
+        if(use_isa >= ISA_LEVEL_VBMI2) {
+            decoder_set_vbmi2_funcs();
+        } else if(use_isa >= ISA_LEVEL_AVX2) {
            decoder_set_avx2_funcs();
         } else if(use_isa >= ISA_LEVEL_AVX) {
             decoder_set_avx_funcs();
@@ -68,3 +75,31 @@
     }
 #endif
 }
+
+const char* simd_detected() {
+#ifdef PLATFORM_X86
+    if(_decode_simd_level >= ISA_LEVEL_VBMI2)
+        return "AVX512VL+VBMI2";
+    if(_decode_simd_level >= ISA_LEVEL_AVX3)
+        return "AVX512VL";
+    if(_decode_simd_level >= ISA_LEVEL_AVX2)
+        return "AVX2";
+    if(_decode_simd_level >= ISA_LEVEL_AVX)
+        return "AVX";
+    if(_decode_simd_level >= ISA_LEVEL_SSE4_POPCNT)
+        return "SSE4.1+POPCNT";
+    if(_decode_simd_level >= ISA_LEVEL_SSE41)
+        return "SSE4.1";
+    if(_decode_simd_level >= ISA_LEVEL_SSSE3)
+        return "SSSE3";
+    if(_decode_simd_level >= (ISA_LEVEL_SSE2 | ISA_FEATURE_POPCNT | ISA_FEATURE_LZCNT))
+        return "SSE2+ABM";
+    return "SSE2";
+#endif
+#ifdef PLATFORM_ARM
+    if(_decode_simd_level >= 1) {
+        return "NEON";
+    }
+#endif
+    return "";
+}
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder.h python-sabyenc-5.4.2/src/yencode/decoder.h
--- python-sabyenc-5.3.0/src/yencode/decoder.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder.h	2022-05-29 12:28:02.000000000 +0000
@@ -38,6 +38,8 @@
               (*_do_decode_end_raw)(const unsigned char *HEDLEY_RESTRICT *, unsigned char *HEDLEY_RESTRICT *, size_t,
                                     YencDecoderState *);
 
+extern int _decode_simd_level;
+
 static inline size_t do_decode(int isRaw, const unsigned char *HEDLEY_RESTRICT src, unsigned char *HEDLEY_RESTRICT dest,
                                size_t len, YencDecoderState *state) {
@@ -54,6 +56,8 @@
 
 void decoder_init();
 
+const char* simd_detected();
+
 #ifdef __cplusplus
 }
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_neon64.cc python-sabyenc-5.4.2/src/yencode/decoder_neon64.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_neon64.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_neon64.cc	2022-05-29 12:28:02.000000000 +0000
@@ -448,6 +448,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = 1;
 }
 #else
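
Aside, not part of the patch: decoder_init() above now probes the CPU once and installs the most capable decoder it supports, checking the new VBMI2 level before AVX2, and the relocated simd_detected() reports whichever level actually got installed rather than re-running detection. The overall shape, reduced to a standalone sketch (the enum values and detect_isa() stub are illustrative, not the real cpu_supports_isa()):

    #include <cstdio>

    // Illustrative ISA ladder, ordered so ">=" comparisons pick the best level.
    enum IsaLevel { ISA_SSE2 = 0, ISA_SSSE3, ISA_AVX, ISA_AVX2, ISA_VBMI2 };

    static IsaLevel g_level = ISA_SSE2;                 // what init ended up installing
    static IsaLevel detect_isa() { return ISA_AVX2; }   // stand-in for cpu_supports_isa()

    static void decoder_init_sketch() {
        IsaLevel isa = detect_isa();
        if (isa >= ISA_VBMI2)      g_level = ISA_VBMI2;  // decoder_set_vbmi2_funcs()
        else if (isa >= ISA_AVX2)  g_level = ISA_AVX2;   // decoder_set_avx2_funcs()
        else if (isa >= ISA_AVX)   g_level = ISA_AVX;    // decoder_set_avx_funcs()
        else                       g_level = ISA_SSE2;   // SSE2/scalar fallback
    }

    static const char* simd_name() {                    // same idea as simd_detected()
        switch (g_level) {
            case ISA_VBMI2: return "AVX512VL+VBMI2";
            case ISA_AVX2:  return "AVX2";
            case ISA_AVX:   return "AVX";
            default:        return "SSE2";
        }
    }

    int main() {
        decoder_init_sketch();
        std::printf("decoder: %s\n", simd_name());      // "AVX2" with the stub above
        return 0;
    }
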
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_neon.cc python-sabyenc-5.4.2/src/yencode/decoder_neon.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_neon.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_neon.cc	2022-05-29 12:28:02.000000000 +0000
@@ -20,8 +20,8 @@
 #endif
 
-// for compilers that lack these functions
-#if defined(__clang__) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
+// for compilers that lack these functions (Clang armv7 9-12 seems to have issues with multi-vector loads)
+#if (defined(__clang__) && (defined(__aarch64__) || __clang_major__<9 || __clang_major__>12)) || (defined(__GNUC__) && (defined(__aarch64__) && __GNUC__ >= 8))
 # define vld1q_u8_x2_align(p, n) vld1q_u8_x2((uint8_t*)__builtin_assume_aligned(p, n))
 #else
 # define vld1q_u8_x2_align(p, n) vcreate2_u8(vld1q_u8_align(p, (n)/2), vld1q_u8_align((p)+16, (n)/2))
@@ -469,6 +469,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = 1;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_sse2.cc python-sabyenc-5.4.2/src/yencode/decoder_sse2.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_sse2.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_sse2.cc	2022-05-29 12:28:02.000000000 +0000
@@ -10,6 +10,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_SSE2;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_sse_base.h python-sabyenc-5.4.2/src/yencode/decoder_sse_base.h
--- python-sabyenc-5.3.0/src/yencode/decoder_sse_base.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_sse_base.h	2022-05-29 12:28:02.000000000 +0000
@@ -8,7 +8,7 @@
 #endif
 
 // GCC (ver 6-10(dev)) fails to optimize pure C version of mask testing, but has this intrinsic; Clang >= 7 optimizes C version fine
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KORTEST16(a, b) !_kortestz_mask16_u8((a), (b))
 # define KAND16(a, b) _kand_mask16((a), (b))
 # define KOR16(a, b) _kor_mask16((a), (b))
@@ -112,7 +112,7 @@
         -42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42,-42-64
     ) : _mm_set1_epi8(-42);
 
-#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__)
+#if defined(__SSSE3__) && !defined(__tune_atom__) && !defined(__tune_slm__) && !defined(__tune_btver1__) && !defined(__tune_btver2__)
     const bool _USING_FAST_MATCH = (use_isa >= ISA_LEVEL_SSSE3);
 #else
     const bool _USING_FAST_MATCH = false;
@@ -122,6 +122,13 @@
 #else
     const bool _USING_BLEND_ADD = false;
 #endif
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+# if defined(_MSC_VER) && !defined(PLATFORM_AMD64) && !defined(__clang__)
+    const bool useAVX3MaskCmp = false;
+# else
+    const bool useAVX3MaskCmp = (use_isa >= ISA_LEVEL_AVX3);
+# endif
+#endif
 
     __m128i lfCompare = _mm_set1_epi8('\n');
     __m128i minMask = _mm_set1_epi8('.');
@@ -214,7 +221,7 @@
             __mmask16 match2EqMaskA, match2EqMaskB;
             __mmask16 match0CrMaskA, match0CrMaskB;
             __mmask16 match2CrXDtMaskA, match2CrXDtMaskB;
-            if(use_isa >= ISA_LEVEL_AVX3 && searchEnd) {
+            if(useAVX3MaskCmp && searchEnd) {
                 match2EqMaskA = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2A);
                 match2EqMaskB = _mm_cmpeq_epi8_mask(_mm_set1_epi8('='), tmpData2B);
             } else
@@ -230,7 +237,7 @@
             __m128i match2CrXDtA, match2CrXDtB;
             if(isRaw) {
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match0CrMaskA = _mm_cmpeq_epi8_mask(oDataA, _mm_set1_epi8('\r'));
                     match0CrMaskB = _mm_cmpeq_epi8_mask(oDataB, _mm_set1_epi8('\r'));
                     match2CrXDtMaskA = _mm_mask_cmpeq_epi8_mask(match0CrMaskA, tmpData2A, _mm_set1_epi8('.'));
@@ -256,7 +263,7 @@
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
                 __mmask16 match1NlMaskA, match1NlMaskB;
                 __mmask16 match2NlDotMaskA, match2NlDotMaskB;
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match1NlMaskA = _mm_mask_cmpeq_epi8_mask(
                         match0CrMaskA,
                         _mm_set1_epi8('\n'),
@@ -299,7 +306,7 @@
                 int matchEnd;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     __mmask16 match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
                         match2EqMaskA, _mm_set1_epi8('y'), tmpData3A
                     );
@@ -373,7 +380,7 @@
                     }
                 }
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     mask |= match2NlDotMaskA << 2;
                     mask |= (match2NlDotMaskB << 18) & 0xffffffff;
                     minMask = _mm_maskz_mov_epi8(~(match2NlDotMaskB>>14), _mm_set1_epi8('.'));
@@ -398,7 +405,7 @@
                 __m128i match3EqYA, match3EqYB;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
                 __mmask16 match3EqYMaskA, match3EqYMaskB;
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     match3EqYMaskA = _mm_mask_cmpeq_epi8_mask(
                         match2EqMaskA,
                         _mm_set1_epi8('y'),
@@ -434,7 +441,7 @@
                 bool endFound;
 #if defined(__AVX512VL__) && defined(__AVX512BW__)
-                if(use_isa >= ISA_LEVEL_AVX3) {
+                if(useAVX3MaskCmp) {
                     __mmask16 match3LfEqYMaskA = _mm_mask_cmpeq_epi8_mask(
                         match3EqYMaskA,
                         _mm_set1_epi8('\n'),
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_ssse3.cc python-sabyenc-5.4.2/src/yencode/decoder_ssse3.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_ssse3.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_ssse3.cc	2022-05-29 12:28:02.000000000 +0000
@@ -9,6 +9,7 @@
     _do_decode = &do_decode_simd >;
     _do_decode_raw = &do_decode_simd >;
     _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_SSSE3;
 }
 #else
diff -Nru python-sabyenc-5.3.0/src/yencode/decoder_vbmi2.cc python-sabyenc-5.4.2/src/yencode/decoder_vbmi2.cc
--- python-sabyenc-5.3.0/src/yencode/decoder_vbmi2.cc	1970-01-01 00:00:00.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/decoder_vbmi2.cc	2022-05-29 12:28:02.000000000 +0000
@@ -0,0 +1,34 @@
+#include "common.h"
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# include "decoder_common.h"
+# ifndef YENC_DISABLE_AVX256
+# include "decoder_avx2_base.h"
+void decoder_set_vbmi2_funcs() {
+    ALIGN_ALLOC(lookups, sizeof(*lookups), 16);
+    // TODO: consider removing compact LUT
+    decoder_init_lut(lookups->eqFix, lookups->compact);
+    _do_decode = &do_decode_simd >;
+    _do_decode_raw = &do_decode_simd >;
+    _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_VBMI2;
+}
+# else
+# include "decoder_sse_base.h"
+void decoder_set_vbmi2_funcs() {
+    decoder_sse_init();
+    decoder_init_lut(lookups->eqFix, lookups->compact);
+    _do_decode = &do_decode_simd >;
+    _do_decode_raw = &do_decode_simd >;
+    _do_decode_end_raw = &do_decode_simd >;
+    _decode_simd_level = ISA_LEVEL_VBMI2;
+}
+# endif
+#else
+
+void decoder_set_avx2_funcs();
+
+void decoder_set_vbmi2_funcs() {
+    decoder_set_avx2_funcs();
+}
+#endif
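
Aside, not part of the patch: the new decoder_vbmi2.cc above follows the same per-ISA translation-unit pattern as the existing decoder files. If the compiler could not target AVX-512 VBMI2 (the flags probed in setup.py), the file still defines decoder_set_vbmi2_funcs(), which simply forwards to the AVX2 initialiser, so decoder_init() can call it unconditionally. The pattern in generic form (names are placeholders):

    // One translation unit per ISA level, each compiled with its own flags.
    // When the required ISA macros are absent, the init symbol still exists
    // but degrades to the next level down -- callers never need an #ifdef.
    #if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
    void set_vbmi2_kernels() {
        // install the AVX-512 VBMI2 kernels here
    }
    #else
    void set_avx2_kernels();                             // provided by the AVX2 unit
    void set_vbmi2_kernels() { set_avx2_kernels(); }     // graceful degradation
    #endif
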
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder_avx_base.h python-sabyenc-5.4.2/src/yencode/encoder_avx_base.h
--- python-sabyenc-5.3.0/src/yencode/encoder_avx_base.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder_avx_base.h	2022-05-29 12:28:02.000000000 +0000
@@ -7,7 +7,7 @@
 #include "encoder_common.h"
 #define YMM_SIZE 32
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD32(a, offs) _load_mask32((__mmask32*)(a) + (offs))
 #else
 # define KLOAD32(a, offs) (((uint32_t*)(a))[(offs)])
@@ -294,7 +294,7 @@
             asm(
                 "shrq $1, %[eqMask] \n"
                 "shrq %%cl, %[eqMask] \n"
-                "adcq %[col], %[p] \n"
+                "adcq %q[col], %q[p] \n"
                 : [eqMask]"+r"(eqMask), [p]"+r"(p)
                 : "c"(shiftAmt), [col]"r"(~col)
             );
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder.cc python-sabyenc-5.4.2/src/yencode/encoder.cc
--- python-sabyenc-5.3.0/src/yencode/encoder.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder.cc	2022-05-29 12:28:02.000000000 +0000
@@ -134,6 +134,8 @@
 
 void encoder_avx2_init();
 
+void encoder_vbmi2_init();
+
 void encoder_neon_init();
 
 #if defined(PLATFORM_X86) && defined(YENC_BUILD_NATIVE) && YENC_BUILD_NATIVE != 0
@@ -159,7 +161,9 @@
         encoder_native_init();
 # else
         int use_isa = cpu_supports_isa();
-        if(use_isa >= ISA_LEVEL_AVX2) {
+        if(use_isa >= ISA_LEVEL_VBMI2) {
+            encoder_vbmi2_init();
+        } else if(use_isa >= ISA_LEVEL_AVX2) {
             encoder_avx2_init();
         } else if(use_isa >= ISA_LEVEL_AVX) {
             encoder_avx_init();
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder_neon.cc python-sabyenc-5.4.2/src/yencode/encoder_neon.cc
--- python-sabyenc-5.3.0/src/yencode/encoder_neon.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder_neon.cc	2022-05-29 12:28:02.000000000 +0000
@@ -241,7 +241,7 @@
 }
 
 
-static HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
+HEDLEY_ALWAYS_INLINE void do_encode_neon(int line_size, int* colOffset, const uint8_t* HEDLEY_RESTRICT srcEnd, uint8_t* HEDLEY_RESTRICT& dest, size_t& len) {
     // offset position to enable simpler loop condition checking
     const int INPUT_OFFSET = sizeof(uint8x16_t)*4 -1; // extra chars for EOL handling, -1 to change <= to <
     if(len <= INPUT_OFFSET || line_size < (int)sizeof(uint8x16_t)*4) return;
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder_sse_base.h python-sabyenc-5.4.2/src/yencode/encoder_sse_base.h
--- python-sabyenc-5.3.0/src/yencode/encoder_sse_base.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder_sse_base.h	2022-05-29 12:28:02.000000000 +0000
@@ -8,7 +8,7 @@
 # define _mm_mask_expand_epi8 _mm128_mask_expand_epi8
 #endif
 
-#if defined(__GNUC__) && __GNUC__ >= 7
+#if (defined(__GNUC__) && __GNUC__ >= 7) || (defined(_MSC_VER) && _MSC_VER >= 1924)
 # define KLOAD16(a, offs) _load_mask16((__mmask16*)(a) + (offs))
 #else
 # define KLOAD16(a, offs) (((uint16_t*)(a))[(offs)])
@@ -167,7 +167,7 @@
     if (len <= INPUT_OFFSET || line_size < XMM_SIZE) return;
 
     // slower CPUs prefer to branch as mispredict penalty is probably small relative to general execution
-#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__)
+#if defined(__tune_atom__) || defined(__tune_slm__) || defined(__tune_btver1__) || defined(__tune_btver2__)
     const bool _PREFER_BRANCHING = true;
 #else
     const bool _PREFER_BRANCHING = (use_isa < ISA_LEVEL_SSSE3);
@@ -424,8 +424,8 @@
             asm(
                 "shrl $1, %[eqMask] \n"
                 "shrl %%cl, %[eqMask] \n" // TODO: can use shrq to avoid above shift?
-# if defined(PLATFORM_AMD64)
-                "adcq %[col], %[p] \n"
+# if defined(PLATFORM_AMD64) && !defined(__ILP32__)
+                "adcq %q[col], %q[p] \n"
 # else
                 "adcl %[col], %[p] \n"
 # endif
@@ -551,7 +551,6 @@
                 dataA = _mm_shuffle_epi8(dataA, shufMaskA);
 
 # if defined(__SSE4_1__) && !defined(__tune_slm__) && !defined(__tune_goldmont__) && !defined(__tune_goldmont_plus__) && !defined(__tune_tremont__)
-                // unsure if worth on: Jaguar/Puma (3|2), Core2 (2|2)
                 if(use_isa >= ISA_LEVEL_SSE41) {
                     dataB = _mm_blendv_epi8(dataBShifted, dataB, mergeMaskB);
                 } else
diff -Nru python-sabyenc-5.3.0/src/yencode/encoder_vbmi2.cc python-sabyenc-5.4.2/src/yencode/encoder_vbmi2.cc
--- python-sabyenc-5.3.0/src/yencode/encoder_vbmi2.cc	1970-01-01 00:00:00.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/encoder_vbmi2.cc	2022-05-29 12:28:02.000000000 +0000
@@ -0,0 +1,25 @@
+#include "common.h"
+
+#if defined(__AVX512VL__) && defined(__AVX512VBMI2__) && defined(__AVX512BW__)
+# ifndef YENC_DISABLE_AVX256
+# include "encoder_avx_base.h"
+
+void encoder_vbmi2_init() {
+    _do_encode = &do_encode_simd< do_encode_avx2 >;
+    encoder_avx2_lut();
+}
+# else
+# include "encoder_sse_base.h"
+void encoder_vbmi2_init() {
+    _do_encode = &do_encode_simd< do_encode_sse >;
+    encoder_sse_lut();
+}
+# endif
+#else
+
+void encoder_avx2_init();
+
+void encoder_vbmi2_init() {
+    encoder_avx2_init();
+}
+#endif
diff -Nru python-sabyenc-5.3.0/src/yencode/platform.cc python-sabyenc-5.4.2/src/yencode/platform.cc
--- python-sabyenc-5.3.0/src/yencode/platform.cc	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/platform.cc	2022-05-29 12:28:02.000000000 +0000
@@ -149,7 +149,7 @@
 int cpu_supports_crc_isa() {
     int flags[4];
     _cpuid1(flags);
-
+
     if((flags[2] & 0x80202) == 0x80202) { // SSE4.1 + SSSE3 + CLMUL
         if((flags[2] & 0x18000000) == 0x18000000) { // OSXSAVE + AVX
             int xcr = _GET_XCR() & 0xff; // ignore unused bits
@@ -167,24 +167,3 @@
 }
 
 #endif // PLATFORM_X86
-
-const char* simd_detected() {
-#ifdef PLATFORM_X86
-    int use_isa = cpu_supports_isa();
-    if(use_isa >= ISA_LEVEL_AVX2) {
-        return "AVX2";
-    } else if(use_isa >= ISA_LEVEL_AVX) {
-        return "AVX";
-    } else if(use_isa >= ISA_LEVEL_SSSE3) {
-        return "SSSE3";
-    } else {
-        return "SSE2";
-    }
-#endif
-#ifdef PLATFORM_ARM
-    if(cpu_supports_neon()) {
-        return "NEON";
-    }
-#endif
-    return "";
-}
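
Aside, not part of the patch: platform.cc keeps its hand-rolled CPUID/XGETBV probing (cpu_supports_crc_isa() above tests the CLMUL, OSXSAVE and AVX bits directly); only simd_detected() moved into decoder.cc. On GCC and Clang a comparable runtime check can also be written with the compilers' built-in probe, shown here purely as an illustration:

    #include <cstdio>

    // GCC/Clang built-in CPU feature probe -- an alternative to raw CPUID
    // bit tests for the common levels (feature strings are the compiler's own).
    int main() {
    #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        __builtin_cpu_init();
        std::printf("ssse3: %d\n", __builtin_cpu_supports("ssse3"));
        std::printf("avx:   %d\n", __builtin_cpu_supports("avx"));
        std::printf("avx2:  %d\n", __builtin_cpu_supports("avx2"));
    #endif
        return 0;
    }
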
diff -Nru python-sabyenc-5.3.0/src/yencode/stdint.h python-sabyenc-5.4.2/src/yencode/stdint.h
--- python-sabyenc-5.3.0/src/yencode/stdint.h	2022-05-19 08:10:58.000000000 +0000
+++ python-sabyenc-5.4.2/src/yencode/stdint.h	2022-05-29 12:28:02.000000000 +0000
@@ -1,32 +1,32 @@
 // ISO C9x compliant stdint.h for Microsoft Visual Studio
-// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
-//
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
 //  Copyright (c) 2006-2008 Alexander Chemeris
-//
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
-//
+//
 //   1. Redistributions of source code must retain the above copyright notice,
 //      this list of conditions and the following disclaimer.
-//
+//
 //   2. Redistributions in binary form must reproduce the above copyright
 //      notice, this list of conditions and the following disclaimer in the
 //      documentation and/or other materials provided with the distribution.
-//
+//
 //   3. The name of the author may be used to endorse or promote products
 //      derived from this software without specific prior written permission.
-//
+//
 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
 // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+//
 ///////////////////////////////////////////////////////////////////////////////
 
 #if !defined(_MSC_VER) || defined(_STDINT) || _MSC_VER >= 1900