diff -Nru minimap2-2.17+dfsg/debian/changelog minimap2-2.17+dfsg/debian/changelog
--- minimap2-2.17+dfsg/debian/changelog 2019-08-01 13:23:40.000000000 +0000
+++ minimap2-2.17+dfsg/debian/changelog 2020-01-12 17:22:11.000000000 +0000
@@ -1,3 +1,16 @@
+minimap2 (2.17+dfsg-2) unstable; urgency=medium
+
+  [ Michael R. Crusoe ]
+  * Team upload.
+  * Add support for more architectures using the SIMD Everywhere library
+    Closes: #922620
+  * Standards-Version: 4.4.1
+
+  [ Andreas Tille ]
+  * Set upstream metadata fields: Bug-Database, Bug-Submit.
+
+ -- Michael R. Crusoe  Sun, 12 Jan 2020 18:22:11 +0100
+
 minimap2 (2.17+dfsg-1) unstable; urgency=medium
 
   * New upstream version
diff -Nru minimap2-2.17+dfsg/debian/control minimap2-2.17+dfsg/debian/control
--- minimap2-2.17+dfsg/debian/control 2019-08-01 13:23:40.000000000 +0000
+++ minimap2-2.17+dfsg/debian/control 2020-01-12 17:22:11.000000000 +0000
@@ -11,7 +11,7 @@
 texlive-font-utils,
 ghostscript,
 gnuplot
-Standards-Version: 4.4.0
+Standards-Version: 4.4.1
 Vcs-Browser: https://salsa.debian.org/med-team/minimap2
 Vcs-Git: https://salsa.debian.org/med-team/minimap2.git
 Homepage: https://github.com/lh3/minimap2
diff -Nru minimap2-2.17+dfsg/debian/include/simde/check.h minimap2-2.17+dfsg/debian/include/simde/check.h
--- minimap2-2.17+dfsg/debian/include/simde/check.h 1970-01-01 00:00:00.000000000 +0000
+++ minimap2-2.17+dfsg/debian/include/simde/check.h 2020-01-12 17:22:11.000000000 +0000
@@ -0,0 +1,249 @@
+/* Check (assertions)
+ * Portable Snippets - https://github.com/nemequ/portable-snippets
+ * Created by Evan Nemerson
+ *
+ * To the extent possible under law, the authors have waived all
+ * copyright and related or neighboring rights to this code. For
+ * details, see the Creative Commons Zero 1.0 Universal license at
+ * https://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+#if !defined(SIMDE_CHECK_H)
+#define SIMDE_CHECK_H
+
+#if !defined(SIMDE_NDEBUG) && !defined(SIMDE_DEBUG)
+# define SIMDE_NDEBUG 1
+#endif
+
+#include <stdint.h>
+
+#if !defined(_WIN32)
+# define SIMDE_SIZE_MODIFIER "z"
+# define SIMDE_CHAR_MODIFIER "hh"
+# define SIMDE_SHORT_MODIFIER "h"
+#else
+# if defined(_M_X64) || defined(__amd64__)
+# define SIMDE_SIZE_MODIFIER "I64"
+# else
+# define SIMDE_SIZE_MODIFIER ""
+# endif
+# define SIMDE_CHAR_MODIFIER ""
+# define SIMDE_SHORT_MODIFIER ""
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1500)
+# define SIMDE__PUSH_DISABLE_MSVC_C4127 __pragma(warning(push)) __pragma(warning(disable:4127))
+# define SIMDE__POP_DISABLE_MSVC_C4127 __pragma(warning(pop))
+#else
+# define SIMDE__PUSH_DISABLE_MSVC_C4127
+# define SIMDE__POP_DISABLE_MSVC_C4127
+#endif
+
+#if !defined(simde_errorf)
+# include <stdio.h>
+# include <stdlib.h>
+# define simde_errorf(format, ...) (fprintf(stderr, format, __VA_ARGS__), abort())
+#endif
+
+#define simde_error(msg) simde_errorf("%s", msg)
+
+#if defined(SIMDE_NDEBUG)
+# if defined(SIMDE_CHECK_FAIL_DEFINED)
+# define simde_assert(expr)
+# else
+# if defined(HEDLEY_ASSUME)
+# define simde_assert(expr) HEDLEY_ASSUME(expr)
+# elif HEDLEY_GCC_VERSION_CHECK(4,5,0)
+# define simde_assert(expr) ((void) (!!(expr) ?
1 : (__builtin_unreachable(), 1))) +# elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) +# define simde_assert(expr) __assume(expr) +# else +# define simde_assert(expr) +# endif +# endif +# define simde_assert_true(expr) simde_assert(expr) +# define simde_assert_false(expr) simde_assert(!(expr)) +# define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) simde_assert(((a) op (b))) +# define simde_assert_double_equal(a, b, precision) +# define simde_assert_string_equal(a, b) +# define simde_assert_string_not_equal(a, b) +# define simde_assert_memory_equal(size, a, b) +# define simde_assert_memory_not_equal(size, a, b) +#else +# define simde_assert(expr) \ + do { \ + if (!HEDLEY_LIKELY(expr)) { \ + simde_error("assertion failed: " #expr "\n"); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + SIMDE__POP_DISABLE_MSVC_C4127 + +# define simde_assert_true(expr) \ + do { \ + if (!HEDLEY_LIKELY(expr)) { \ + simde_error("assertion failed: " #expr " is not true\n"); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + SIMDE__POP_DISABLE_MSVC_C4127 + +# define simde_assert_false(expr) \ + do { \ + if (!HEDLEY_LIKELY(!(expr))) { \ + simde_error("assertion failed: " #expr " is not false\n"); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + SIMDE__POP_DISABLE_MSVC_C4127 + +# define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ + do { \ + T simde_tmp_a_ = (a); \ + T simde_tmp_b_ = (b); \ + if (!(simde_tmp_a_ op simde_tmp_b_)) { \ + simde_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")\n", \ + #a, #op, #b, simde_tmp_a_, #op, simde_tmp_b_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + SIMDE__POP_DISABLE_MSVC_C4127 + +# define simde_assert_double_equal(a, b, precision) \ + do { \ + const double simde_tmp_a_ = (a); \ + const double simde_tmp_b_ = (b); \ + const double simde_tmp_diff_ = ((simde_tmp_a_ - simde_tmp_b_) < 0) ? \ + -(simde_tmp_a_ - simde_tmp_b_) : \ + (simde_tmp_a_ - simde_tmp_b_); \ + if (HEDLEY_UNLIKELY(simde_tmp_diff_ > 1e-##precision)) { \ + simde_errorf("assertion failed: %s == %s (%0." #precision "g == %0." 
#precision "g)\n", \ + #a, #b, simde_tmp_a_, simde_tmp_b_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + SIMDE__POP_DISABLE_MSVC_C4127 + +# include +# define simde_assert_string_equal(a, b) \ + do { \ + const char* simde_tmp_a_ = a; \ + const char* simde_tmp_b_ = b; \ + if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) != 0)) { \ + simde_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")\n", \ + #a, #b, simde_tmp_a_, simde_tmp_b_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + SIMDE__POP_DISABLE_MSVC_C4127 + +# define simde_assert_string_not_equal(a, b) \ + do { \ + const char* simde_tmp_a_ = a; \ + const char* simde_tmp_b_ = b; \ + if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) == 0)) { \ + simde_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")\n", \ + #a, #b, simde_tmp_a_, simde_tmp_b_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + SIMDE__POP_DISABLE_MSVC_C4127 + +# define simde_assert_memory_equal(size, a, b) \ + do { \ + const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \ + const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \ + const size_t simde_tmp_size_ = (size); \ + if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) != 0) { \ + size_t simde_tmp_pos_; \ + for (simde_tmp_pos_ = 0 ; simde_tmp_pos_ < simde_tmp_size_ ; simde_tmp_pos_++) { \ + if (simde_tmp_a_[simde_tmp_pos_] != simde_tmp_b_[simde_tmp_pos_]) { \ + simde_errorf("assertion failed: memory %s == %s, at offset %" SIMDE_SIZE_MODIFIER "u\n", \ + #a, #b, simde_tmp_pos_); \ + break; \ + } \ + } \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + SIMDE__POP_DISABLE_MSVC_C4127 + +# define simde_assert_memory_not_equal(size, a, b) \ + do { \ + const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \ + const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \ + const size_t simde_tmp_size_ = (size); \ + if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) == 0) { \ + simde_errorf("assertion failed: memory %s != %s (%" SIMDE_SIZE_MODIFIER "u bytes)\n", \ + #a, #b, simde_tmp_size_); \ + } \ + SIMDE__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + SIMDE__POP_DISABLE_MSVC_C4127 +#endif + +#define simde_assert_type(T, fmt, a, op, b) \ + simde_assert_type_full("", "", T, fmt, a, op, b) + +#define simde_assert_char(a, op, b) \ + simde_assert_type_full("'\\x", "'", char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b) +#define simde_assert_uchar(a, op, b) \ + simde_assert_type_full("'\\x", "'", unsigned char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b) +#define simde_assert_short(a, op, b) \ + simde_assert_type(short, SIMDE_SHORT_MODIFIER "d", a, op, b) +#define simde_assert_ushort(a, op, b) \ + simde_assert_type(unsigned short, SIMDE_SHORT_MODIFIER "u", a, op, b) +#define simde_assert_int(a, op, b) \ + simde_assert_type(int, "d", a, op, b) +#define simde_assert_uint(a, op, b) \ + simde_assert_type(unsigned int, "u", a, op, b) +#define simde_assert_long(a, op, b) \ + simde_assert_type(long int, "ld", a, op, b) +#define simde_assert_ulong(a, op, b) \ + simde_assert_type(unsigned long int, "lu", a, op, b) +#define simde_assert_llong(a, op, b) \ + simde_assert_type(long long int, "lld", a, op, b) +#define simde_assert_ullong(a, op, b) \ + simde_assert_type(unsigned long long int, "llu", a, op, b) + +#define simde_assert_size(a, op, b) \ + simde_assert_type(size_t, SIMDE_SIZE_MODIFIER "u", a, op, b) + +#define simde_assert_float(a, op, b) \ + simde_assert_type(float, "f", a, op, b) 
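As an aside (this sketch is illustrative and not part of the patch): every typed wrapper in this file funnels into simde_assert_type_full(), which evaluates each operand once and, on failure, prints both values with the printf length modifiers selected at the top of the header; because SIMDE_NDEBUG is defined by default, the checks compile away unless SIMDE_DEBUG is defined before inclusion. A minimal usage sketch, assuming the bundled headers are reachable on the include path as simde/hedley.h and simde/check.h:

#include "simde/hedley.h"   /* HEDLEY_LIKELY/HEDLEY_UNLIKELY used by the active checks */
#define SIMDE_DEBUG         /* keep the assertions; otherwise they expand to no-ops */
#include "simde/check.h"
#include <inttypes.h>       /* PRIi32 etc., needed where the fixed-width variants expand */
#include <stdint.h>

int main(void) {
  simde_assert_size(sizeof(uint32_t), ==, 4);
  simde_assert_int32(INT32_C(40) + 2, ==, 42);
  simde_assert_double_equal(0.1 + 0.2, 0.3, 9);   /* passes while |a - b| <= 1e-9 */
  simde_assert_string_equal("simde", "si" "mde");
  return 0;
}

On failure each macro calls simde_errorf(), which is fprintf(stderr, ...) followed by abort() unless the including project defines simde_errorf first.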
+#define simde_assert_double(a, op, b) \ + simde_assert_type(double, "g", a, op, b) +#define simde_assert_ptr(a, op, b) \ + simde_assert_type(const void*, "p", a, op, b) + +#define simde_assert_int8(a, op, b) \ + simde_assert_type(int8_t, PRIi8, a, op, b) +#define simde_assert_uint8(a, op, b) \ + simde_assert_type(uint8_t, PRIu8, a, op, b) +#define simde_assert_int16(a, op, b) \ + simde_assert_type(int16_t, PRIi16, a, op, b) +#define simde_assert_uint16(a, op, b) \ + simde_assert_type(uint16_t, PRIu16, a, op, b) +#define simde_assert_int32(a, op, b) \ + simde_assert_type(int32_t, PRIi32, a, op, b) +#define simde_assert_uint32(a, op, b) \ + simde_assert_type(uint32_t, PRIu32, a, op, b) +#define simde_assert_int64(a, op, b) \ + simde_assert_type(int64_t, PRIi64, a, op, b) +#define simde_assert_uint64(a, op, b) \ + simde_assert_type(uint64_t, PRIu64, a, op, b) + +#define simde_assert_ptr_equal(a, b) \ + simde_assert_ptr(a, ==, b) +#define simde_assert_ptr_not_equal(a, b) \ + simde_assert_ptr(a, !=, b) +#define simde_assert_null(ptr) \ + simde_assert_ptr(ptr, ==, NULL) +#define simde_assert_not_null(ptr) \ + simde_assert_ptr(ptr, !=, NULL) +#define simde_assert_ptr_null(ptr) \ + simde_assert_ptr(ptr, ==, NULL) +#define simde_assert_ptr_not_null(ptr) \ + simde_assert_ptr(ptr, !=, NULL) + +#endif /* !defined(SIMDE_CHECK_H) */ diff -Nru minimap2-2.17+dfsg/debian/include/simde/hedley.h minimap2-2.17+dfsg/debian/include/simde/hedley.h --- minimap2-2.17+dfsg/debian/include/simde/hedley.h 1970-01-01 00:00:00.000000000 +0000 +++ minimap2-2.17+dfsg/debian/include/simde/hedley.h 2020-01-12 17:22:11.000000000 +0000 @@ -0,0 +1,1899 @@ +/* Hedley - https://nemequ.github.io/hedley + * Created by Evan Nemerson + * + * To the extent possible under law, the author(s) have dedicated all + * copyright and related and neighboring rights to this software to + * the public domain worldwide. This software is distributed without + * any warranty. + * + * For details, see . 
+ * SPDX-License-Identifier: CC0-1.0 + */ + +#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 12) +#if defined(HEDLEY_VERSION) +# undef HEDLEY_VERSION +#endif +#define HEDLEY_VERSION 12 + +#if defined(HEDLEY_STRINGIFY_EX) +# undef HEDLEY_STRINGIFY_EX +#endif +#define HEDLEY_STRINGIFY_EX(x) #x + +#if defined(HEDLEY_STRINGIFY) +# undef HEDLEY_STRINGIFY +#endif +#define HEDLEY_STRINGIFY(x) HEDLEY_STRINGIFY_EX(x) + +#if defined(HEDLEY_CONCAT_EX) +# undef HEDLEY_CONCAT_EX +#endif +#define HEDLEY_CONCAT_EX(a,b) a##b + +#if defined(HEDLEY_CONCAT) +# undef HEDLEY_CONCAT +#endif +#define HEDLEY_CONCAT(a,b) HEDLEY_CONCAT_EX(a,b) + +#if defined(HEDLEY_VERSION_ENCODE) +# undef HEDLEY_VERSION_ENCODE +#endif +#define HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision)) + +#if defined(HEDLEY_VERSION_DECODE_MAJOR) +# undef HEDLEY_VERSION_DECODE_MAJOR +#endif +#define HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) + +#if defined(HEDLEY_VERSION_DECODE_MINOR) +# undef HEDLEY_VERSION_DECODE_MINOR +#endif +#define HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) + +#if defined(HEDLEY_VERSION_DECODE_REVISION) +# undef HEDLEY_VERSION_DECODE_REVISION +#endif +#define HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) + +#if defined(HEDLEY_GNUC_VERSION) +# undef HEDLEY_GNUC_VERSION +#endif +#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) +# define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#elif defined(__GNUC__) +# define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) +#endif + +#if defined(HEDLEY_GNUC_VERSION_CHECK) +# undef HEDLEY_GNUC_VERSION_CHECK +#endif +#if defined(HEDLEY_GNUC_VERSION) +# define HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (HEDLEY_GNUC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_MSVC_VERSION) +# undef HEDLEY_MSVC_VERSION +#endif +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) +# define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) +#elif defined(_MSC_FULL_VER) +# define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) +#elif defined(_MSC_VER) +# define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) +#endif + +#if defined(HEDLEY_MSVC_VERSION_CHECK) +# undef HEDLEY_MSVC_VERSION_CHECK +#endif +#if !defined(_MSC_VER) +# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) +# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) +#elif defined(_MSC_VER) && (_MSC_VER >= 1200) +# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) +#else +# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor))) +#endif + +#if defined(HEDLEY_INTEL_VERSION) +# undef HEDLEY_INTEL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) +# define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) +#elif defined(__INTEL_COMPILER) +# define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) 
+#endif + +#if defined(HEDLEY_INTEL_VERSION_CHECK) +# undef HEDLEY_INTEL_VERSION_CHECK +#endif +#if defined(HEDLEY_INTEL_VERSION) +# define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (HEDLEY_INTEL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_PGI_VERSION) +# undef HEDLEY_PGI_VERSION +#endif +#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) +# define HEDLEY_PGI_VERSION HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) +#endif + +#if defined(HEDLEY_PGI_VERSION_CHECK) +# undef HEDLEY_PGI_VERSION_CHECK +#endif +#if defined(HEDLEY_PGI_VERSION) +# define HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (HEDLEY_PGI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_SUNPRO_VERSION) +# undef HEDLEY_SUNPRO_VERSION +#endif +#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) +# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10) +#elif defined(__SUNPRO_C) +# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf) +#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) +# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10) +#elif defined(__SUNPRO_CC) +# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf) +#endif + +#if defined(HEDLEY_SUNPRO_VERSION_CHECK) +# undef HEDLEY_SUNPRO_VERSION_CHECK +#endif +#if defined(HEDLEY_SUNPRO_VERSION) +# define HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (HEDLEY_SUNPRO_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_EMSCRIPTEN_VERSION) +# undef HEDLEY_EMSCRIPTEN_VERSION +#endif +#if defined(__EMSCRIPTEN__) +# define HEDLEY_EMSCRIPTEN_VERSION HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) +#endif + +#if defined(HEDLEY_EMSCRIPTEN_VERSION_CHECK) +# undef HEDLEY_EMSCRIPTEN_VERSION_CHECK +#endif +#if defined(HEDLEY_EMSCRIPTEN_VERSION) +# define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (HEDLEY_EMSCRIPTEN_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_ARM_VERSION) +# undef HEDLEY_ARM_VERSION +#endif +#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) +# define HEDLEY_ARM_VERSION HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100) +#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) +# define HEDLEY_ARM_VERSION HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100) +#endif + +#if defined(HEDLEY_ARM_VERSION_CHECK) +# undef HEDLEY_ARM_VERSION_CHECK +#endif +#if defined(HEDLEY_ARM_VERSION) +# define HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (HEDLEY_ARM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_ARM_VERSION_CHECK(major,minor,patch) 
(0) +#endif + +#if defined(HEDLEY_IBM_VERSION) +# undef HEDLEY_IBM_VERSION +#endif +#if defined(__ibmxl__) +# define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__) +#elif defined(__xlC__) && defined(__xlC_ver__) +# define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff) +#elif defined(__xlC__) +# define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) +#endif + +#if defined(HEDLEY_IBM_VERSION_CHECK) +# undef HEDLEY_IBM_VERSION_CHECK +#endif +#if defined(HEDLEY_IBM_VERSION) +# define HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (HEDLEY_IBM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_VERSION) +# undef HEDLEY_TI_VERSION +#endif +#if \ + defined(__TI_COMPILER_VERSION__) && \ + ( \ + defined(__TMS470__) || defined(__TI_ARM__) || \ + defined(__MSP430__) || \ + defined(__TMS320C2000__) \ + ) +# if (__TI_COMPILER_VERSION__ >= 16000000) +# define HEDLEY_TI_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +# endif +#endif + +#if defined(HEDLEY_TI_VERSION_CHECK) +# undef HEDLEY_TI_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_VERSION) +# define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL2000_VERSION) +# undef HEDLEY_TI_CL2000_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) +# define HEDLEY_TI_CL2000_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL2000_VERSION_CHECK) +# undef HEDLEY_TI_CL2000_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL2000_VERSION) +# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL2000_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL430_VERSION) +# undef HEDLEY_TI_CL430_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) +# define HEDLEY_TI_CL430_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL430_VERSION_CHECK) +# undef HEDLEY_TI_CL430_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL430_VERSION) +# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL430_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_ARMCL_VERSION) +# undef HEDLEY_TI_ARMCL_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) +# define HEDLEY_TI_ARMCL_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_ARMCL_VERSION_CHECK) +# undef HEDLEY_TI_ARMCL_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_ARMCL_VERSION) +# define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_ARMCL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define 
HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL6X_VERSION) +# undef HEDLEY_TI_CL6X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) +# define HEDLEY_TI_CL6X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL6X_VERSION_CHECK) +# undef HEDLEY_TI_CL6X_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL6X_VERSION) +# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL6X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL7X_VERSION) +# undef HEDLEY_TI_CL7X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) +# define HEDLEY_TI_CL7X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL7X_VERSION_CHECK) +# undef HEDLEY_TI_CL7X_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL7X_VERSION) +# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL7X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CLPRU_VERSION) +# undef HEDLEY_TI_CLPRU_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) +# define HEDLEY_TI_CLPRU_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CLPRU_VERSION_CHECK) +# undef HEDLEY_TI_CLPRU_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CLPRU_VERSION) +# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CLPRU_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_CRAY_VERSION) +# undef HEDLEY_CRAY_VERSION +#endif +#if defined(_CRAYC) +# if defined(_RELEASE_PATCHLEVEL) +# define HEDLEY_CRAY_VERSION HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL) +# else +# define HEDLEY_CRAY_VERSION HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) +# endif +#endif + +#if defined(HEDLEY_CRAY_VERSION_CHECK) +# undef HEDLEY_CRAY_VERSION_CHECK +#endif +#if defined(HEDLEY_CRAY_VERSION) +# define HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (HEDLEY_CRAY_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_IAR_VERSION) +# undef HEDLEY_IAR_VERSION +#endif +#if defined(__IAR_SYSTEMS_ICC__) +# if __VER__ > 1000 +# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) +# else +# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(VER / 100, __VER__ % 100, 0) +# endif +#endif + +#if defined(HEDLEY_IAR_VERSION_CHECK) +# undef HEDLEY_IAR_VERSION_CHECK +#endif +#if defined(HEDLEY_IAR_VERSION) +# define HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (HEDLEY_IAR_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TINYC_VERSION) +# undef HEDLEY_TINYC_VERSION +#endif +#if defined(__TINYC__) +# define HEDLEY_TINYC_VERSION HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100) 
+#endif + +#if defined(HEDLEY_TINYC_VERSION_CHECK) +# undef HEDLEY_TINYC_VERSION_CHECK +#endif +#if defined(HEDLEY_TINYC_VERSION) +# define HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (HEDLEY_TINYC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_DMC_VERSION) +# undef HEDLEY_DMC_VERSION +#endif +#if defined(__DMC__) +# define HEDLEY_DMC_VERSION HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) +#endif + +#if defined(HEDLEY_DMC_VERSION_CHECK) +# undef HEDLEY_DMC_VERSION_CHECK +#endif +#if defined(HEDLEY_DMC_VERSION) +# define HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (HEDLEY_DMC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_COMPCERT_VERSION) +# undef HEDLEY_COMPCERT_VERSION +#endif +#if defined(__COMPCERT_VERSION__) +# define HEDLEY_COMPCERT_VERSION HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100) +#endif + +#if defined(HEDLEY_COMPCERT_VERSION_CHECK) +# undef HEDLEY_COMPCERT_VERSION_CHECK +#endif +#if defined(HEDLEY_COMPCERT_VERSION) +# define HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (HEDLEY_COMPCERT_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_PELLES_VERSION) +# undef HEDLEY_PELLES_VERSION +#endif +#if defined(__POCC__) +# define HEDLEY_PELLES_VERSION HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) +#endif + +#if defined(HEDLEY_PELLES_VERSION_CHECK) +# undef HEDLEY_PELLES_VERSION_CHECK +#endif +#if defined(HEDLEY_PELLES_VERSION) +# define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (HEDLEY_PELLES_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_GCC_VERSION) +# undef HEDLEY_GCC_VERSION +#endif +#if \ + defined(HEDLEY_GNUC_VERSION) && \ + !defined(__clang__) && \ + !defined(HEDLEY_INTEL_VERSION) && \ + !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_ARM_VERSION) && \ + !defined(HEDLEY_TI_VERSION) && \ + !defined(HEDLEY_TI_ARMCL_VERSION) && \ + !defined(HEDLEY_TI_CL430_VERSION) && \ + !defined(HEDLEY_TI_CL2000_VERSION) && \ + !defined(HEDLEY_TI_CL6X_VERSION) && \ + !defined(HEDLEY_TI_CL7X_VERSION) && \ + !defined(HEDLEY_TI_CLPRU_VERSION) && \ + !defined(__COMPCERT__) +# define HEDLEY_GCC_VERSION HEDLEY_GNUC_VERSION +#endif + +#if defined(HEDLEY_GCC_VERSION_CHECK) +# undef HEDLEY_GCC_VERSION_CHECK +#endif +#if defined(HEDLEY_GCC_VERSION) +# define HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (HEDLEY_GCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_HAS_ATTRIBUTE) +# undef HEDLEY_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) +# define HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) +#else +# define HEDLEY_HAS_ATTRIBUTE(attribute) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_ATTRIBUTE) +# undef HEDLEY_GNUC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) +# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +#else +# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_GCC_HAS_ATTRIBUTE) +# undef HEDLEY_GCC_HAS_ATTRIBUTE +#endif 
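A brief illustration of the convention repeated above (an editorial sketch, not part of the patch): HEDLEY_VERSION_ENCODE() packs a version as major*1000000 + minor*1000 + revision, each HEDLEY_*_VERSION records the detected compiler in that encoding, and the matching HEDLEY_*_VERSION_CHECK() collapses to (0) when that compiler is not in use, so the checks remain safe inside #if lines. Note that HEDLEY_GCC_VERSION is only defined for real GCC; clang, ICC, PGI and the other GNUC-compatible front ends are filtered out. A small sketch, assuming the bundled copy is reachable as simde/hedley.h:

#include "simde/hedley.h"
#include <stdio.h>

int main(void) {
  /* 9.3.1 encodes to 9*1000000 + 3*1000 + 1 */
  printf("%d\n", HEDLEY_VERSION_ENCODE(9, 3, 1));   /* prints 9003001 */

#if HEDLEY_GCC_VERSION_CHECK(4,6,0)
  puts("real GCC >= 4.6");
#elif HEDLEY_MSVC_VERSION_CHECK(19,20,0)
  puts("MSVC >= 19.20");
#else
  puts("neither check matched; both macros expanded to 0");
#endif
  return 0;
}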
+#if defined(__has_attribute) +# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +#else +# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_HAS_CPP_ATTRIBUTE) +# undef HEDLEY_HAS_CPP_ATTRIBUTE +#endif +#if \ + defined(__has_cpp_attribute) && \ + defined(__cplusplus) && \ + (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) +# define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) +#else +# define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) +#endif + +#if defined(HEDLEY_HAS_CPP_ATTRIBUTE_NS) +# undef HEDLEY_HAS_CPP_ATTRIBUTE_NS +#endif +#if !defined(__cplusplus) || !defined(__has_cpp_attribute) +# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#elif \ + !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_IAR_VERSION) && \ + (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) +# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) +#else +# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) +# undef HEDLEY_GNUC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) +# define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else +# define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_GCC_HAS_CPP_ATTRIBUTE) +# undef HEDLEY_GCC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) +# define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else +# define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_HAS_BUILTIN) +# undef HEDLEY_HAS_BUILTIN +#endif +#if defined(__has_builtin) +# define HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else +# define HEDLEY_HAS_BUILTIN(builtin) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_BUILTIN) +# undef HEDLEY_GNUC_HAS_BUILTIN +#endif +#if defined(__has_builtin) +# define HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else +# define HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_GCC_HAS_BUILTIN) +# undef HEDLEY_GCC_HAS_BUILTIN +#endif +#if defined(__has_builtin) +# define HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else +# define HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_HAS_FEATURE) +# undef HEDLEY_HAS_FEATURE +#endif +#if defined(__has_feature) +# define HEDLEY_HAS_FEATURE(feature) __has_feature(feature) +#else +# define HEDLEY_HAS_FEATURE(feature) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_FEATURE) +# undef HEDLEY_GNUC_HAS_FEATURE +#endif +#if defined(__has_feature) +# define HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else +# define HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_GCC_HAS_FEATURE) +# undef HEDLEY_GCC_HAS_FEATURE +#endif +#if defined(__has_feature) +# define HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else +# define 
HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_HAS_EXTENSION) +# undef HEDLEY_HAS_EXTENSION +#endif +#if defined(__has_extension) +# define HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) +#else +# define HEDLEY_HAS_EXTENSION(extension) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_EXTENSION) +# undef HEDLEY_GNUC_HAS_EXTENSION +#endif +#if defined(__has_extension) +# define HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else +# define HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_GCC_HAS_EXTENSION) +# undef HEDLEY_GCC_HAS_EXTENSION +#endif +#if defined(__has_extension) +# define HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else +# define HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_HAS_DECLSPEC_ATTRIBUTE) +# undef HEDLEY_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) +# define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute) +#else +# define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) +# undef HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) +# define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else +# define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) +# undef HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) +# define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else +# define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_HAS_WARNING) +# undef HEDLEY_HAS_WARNING +#endif +#if defined(__has_warning) +# define HEDLEY_HAS_WARNING(warning) __has_warning(warning) +#else +# define HEDLEY_HAS_WARNING(warning) (0) +#endif + +#if defined(HEDLEY_GNUC_HAS_WARNING) +# undef HEDLEY_GNUC_HAS_WARNING +#endif +#if defined(__has_warning) +# define HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else +# define HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_GCC_HAS_WARNING) +# undef HEDLEY_GCC_HAS_WARNING +#endif +#if defined(__has_warning) +# define HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else +# define HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +/* HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. 
*/ +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) +# undef HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#endif +#if defined(__cplusplus) +# if HEDLEY_HAS_WARNING("-Wc++98-compat") +# if HEDLEY_HAS_WARNING("-Wc++17-extensions") +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + xpr \ + HEDLEY_DIAGNOSTIC_POP +# else +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + xpr \ + HEDLEY_DIAGNOSTIC_POP +# endif +# endif +#endif +#if !defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x +#endif + +#if defined(HEDLEY_CONST_CAST) +# undef HEDLEY_CONST_CAST +#endif +#if defined(__cplusplus) +# define HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) +#elif \ + HEDLEY_HAS_WARNING("-Wcast-qual") || \ + HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ + HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + ((T) (expr)); \ + HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define HEDLEY_CONST_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(HEDLEY_REINTERPRET_CAST) +# undef HEDLEY_REINTERPRET_CAST +#endif +#if defined(__cplusplus) +# define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) +#else +# define HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(HEDLEY_STATIC_CAST) +# undef HEDLEY_STATIC_CAST +#endif +#if defined(__cplusplus) +# define HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) +#else +# define HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(HEDLEY_CPP_CAST) +# undef HEDLEY_CPP_CAST +#endif +#if defined(__cplusplus) +# if HEDLEY_HAS_WARNING("-Wold-style-cast") +# define HEDLEY_CPP_CAST(T, expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ + ((T) (expr)) \ + HEDLEY_DIAGNOSTIC_POP +# elif HEDLEY_IAR_VERSION_CHECK(8,3,0) +# define HEDLEY_CPP_CAST(T, expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("diag_suppress=Pe137") \ + HEDLEY_DIAGNOSTIC_POP \ +# else +# define HEDLEY_CPP_CAST(T, expr) ((T) (expr)) +# endif +#else +# define HEDLEY_CPP_CAST(T, expr) (expr) +#endif + +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + defined(__clang__) || \ + HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ + HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ + HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ + (HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR)) +# define HEDLEY_PRAGMA(value) _Pragma(#value) +#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) +# define HEDLEY_PRAGMA(value) __pragma(value) +#else +# define HEDLEY_PRAGMA(value) +#endif + +#if defined(HEDLEY_DIAGNOSTIC_PUSH) +# undef HEDLEY_DIAGNOSTIC_PUSH +#endif +#if defined(HEDLEY_DIAGNOSTIC_POP) +# undef HEDLEY_DIAGNOSTIC_POP +#endif +#if 
defined(__clang__) +# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") +# define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") +#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") +# define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#elif HEDLEY_GCC_VERSION_CHECK(4,6,0) +# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") +# define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") +#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) +# define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) +# define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) +#elif HEDLEY_ARM_VERSION_CHECK(5,6,0) +# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") +# define HEDLEY_DIAGNOSTIC_POP _Pragma("pop") +#elif \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") +# define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") +#elif HEDLEY_PELLES_VERSION_CHECK(2,90,0) +# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") +# define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#else +# define HEDLEY_DIAGNOSTIC_PUSH +# define HEDLEY_DIAGNOSTIC_POP +#endif + +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) +# undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif +#if HEDLEY_HAS_WARNING("-Wdeprecated-declarations") +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") +#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") +#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) +#elif \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)") +#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215") +#elif HEDLEY_PELLES_VERSION_CHECK(2,90,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") +#else +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED 
+#endif + +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) +# undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") +#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") +#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") +#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") +#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) +#elif \ + HEDLEY_TI_VERSION_CHECK(16,9,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") +#else +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif + +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) +# undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-attributes") +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") +#elif HEDLEY_GCC_VERSION_CHECK(4,6,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif HEDLEY_INTEL_VERSION_CHECK(17,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") +#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) +#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") +#elif \ + HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") +#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") +#else +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif + +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) +# undef HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif +#if HEDLEY_HAS_WARNING("-Wcast-qual") +# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"") +#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)") +#elif HEDLEY_GCC_VERSION_CHECK(3,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") +#else +# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif + +#if defined(HEDLEY_DEPRECATED) +# undef HEDLEY_DEPRECATED +#endif +#if defined(HEDLEY_DEPRECATED_FOR) +# undef HEDLEY_DEPRECATED_FOR +#endif +#if defined(__cplusplus) && 
(__cplusplus >= 201402L) +# define HEDLEY_DEPRECATED(since) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) +# define HEDLEY_DEPRECATED_FOR(since, replacement) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) +#elif \ + HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \ + HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ + HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) +# define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) +# define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) +#elif \ + HEDLEY_HAS_ATTRIBUTE(deprecated) || \ + HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) +# define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) +#elif HEDLEY_MSVC_VERSION_CHECK(14,0,0) +# define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) +# define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + HEDLEY_PELLES_VERSION_CHECK(6,50,0) +# define HEDLEY_DEPRECATED(since) __declspec(deprecated) +# define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) +#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_DEPRECATED(since) _Pragma("deprecated") +# define HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated") +#else +# define HEDLEY_DEPRECATED(since) +# define HEDLEY_DEPRECATED_FOR(since, replacement) +#endif + +#if defined(HEDLEY_UNAVAILABLE) +# undef HEDLEY_UNAVAILABLE +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(warning) || \ + HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) +#else +# define HEDLEY_UNAVAILABLE(available_since) +#endif + +#if defined(HEDLEY_WARN_UNUSED_RESULT) +# undef HEDLEY_WARN_UNUSED_RESULT +#endif +#if defined(HEDLEY_WARN_UNUSED_RESULT_MSG) +# undef HEDLEY_WARN_UNUSED_RESULT_MSG +#endif +#if (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) +# define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) +#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) +# define HEDLEY_WARN_UNUSED_RESULT 
HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +#elif \ + HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ + HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + HEDLEY_PGI_VERSION_CHECK(17,10,0) +# define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) +#elif defined(_Check_return_) /* SAL */ +# define HEDLEY_WARN_UNUSED_RESULT _Check_return_ +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ +#else +# define HEDLEY_WARN_UNUSED_RESULT +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) +#endif + +#if defined(HEDLEY_SENTINEL) +# undef HEDLEY_SENTINEL +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(sentinel) || \ + HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_ARM_VERSION_CHECK(5,4,0) +# define HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) +#else +# define HEDLEY_SENTINEL(position) +#endif + +#if defined(HEDLEY_NO_RETURN) +# undef HEDLEY_NO_RETURN +#endif +#if HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_NO_RETURN __noreturn +#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +# define HEDLEY_NO_RETURN _Noreturn +#elif defined(__cplusplus) && (__cplusplus >= 201103L) +# define HEDLEY_NO_RETURN HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) +#elif \ + HEDLEY_HAS_ATTRIBUTE(noreturn) || \ + HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_NO_RETURN _Pragma("does_not_return") +#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) +# define HEDLEY_NO_RETURN __declspec(noreturn) +#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) +# define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") +#elif 
HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) +# define HEDLEY_NO_RETURN __attribute((noreturn)) +#elif HEDLEY_PELLES_VERSION_CHECK(9,0,0) +# define HEDLEY_NO_RETURN __declspec(noreturn) +#else +# define HEDLEY_NO_RETURN +#endif + +#if defined(HEDLEY_NO_ESCAPE) +# undef HEDLEY_NO_ESCAPE +#endif +#if HEDLEY_HAS_ATTRIBUTE(noescape) +# define HEDLEY_NO_ESCAPE __attribute__((__noescape__)) +#else +# define HEDLEY_NO_ESCAPE +#endif + +#if defined(HEDLEY_UNREACHABLE) +# undef HEDLEY_UNREACHABLE +#endif +#if defined(HEDLEY_UNREACHABLE_RETURN) +# undef HEDLEY_UNREACHABLE_RETURN +#endif +#if defined(HEDLEY_ASSUME) +# undef HEDLEY_ASSUME +#endif +#if \ + HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_ASSUME(expr) __assume(expr) +#elif HEDLEY_HAS_BUILTIN(__builtin_assume) +# define HEDLEY_ASSUME(expr) __builtin_assume(expr) +#elif \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) +# if defined(__cplusplus) +# define HEDLEY_ASSUME(expr) std::_nassert(expr) +# else +# define HEDLEY_ASSUME(expr) _nassert(expr) +# endif +#endif +#if \ + (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(HEDLEY_ARM_VERSION))) || \ + HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_IBM_VERSION_CHECK(13,1,5) +# define HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif defined(HEDLEY_ASSUME) +# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) +#endif +#if !defined(HEDLEY_ASSUME) +# if defined(HEDLEY_UNREACHABLE) +# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (HEDLEY_UNREACHABLE(), 1))) +# else +# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, expr) +# endif +#endif +#if defined(HEDLEY_UNREACHABLE) +# if \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) +# define HEDLEY_UNREACHABLE_RETURN(value) return (HEDLEY_STATIC_CAST(void, HEDLEY_ASSUME(0)), (value)) +# else +# define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() +# endif +#else +# define HEDLEY_UNREACHABLE_RETURN(value) return (value) +#endif +#if !defined(HEDLEY_UNREACHABLE) +# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) +#endif + +HEDLEY_DIAGNOSTIC_PUSH +#if HEDLEY_HAS_WARNING("-Wpedantic") +# pragma clang diagnostic ignored "-Wpedantic" +#endif +#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) +# pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) +# if defined(__clang__) +# pragma clang diagnostic ignored "-Wvariadic-macros" +# elif defined(HEDLEY_GCC_VERSION) +# pragma GCC diagnostic ignored "-Wvariadic-macros" +# endif +#endif +#if defined(HEDLEY_NON_NULL) +# undef HEDLEY_NON_NULL +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(nonnull) || \ + HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) +# define HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) +#else +# define HEDLEY_NON_NULL(...) 
+#endif +HEDLEY_DIAGNOSTIC_POP + +#if defined(HEDLEY_PRINTF_FORMAT) +# undef HEDLEY_PRINTF_FORMAT +#endif +#if defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) +# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check))) +#elif defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) +# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check))) +#elif \ + HEDLEY_HAS_ATTRIBUTE(format) || \ + HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) +#elif HEDLEY_PELLES_VERSION_CHECK(6,0,0) +# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) +#else +# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) +#endif + +#if defined(HEDLEY_CONSTEXPR) +# undef HEDLEY_CONSTEXPR +#endif +#if defined(__cplusplus) +# if __cplusplus >= 201103L +# define HEDLEY_CONSTEXPR HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) +# endif +#endif +#if !defined(HEDLEY_CONSTEXPR) +# define HEDLEY_CONSTEXPR +#endif + +#if defined(HEDLEY_PREDICT) +# undef HEDLEY_PREDICT +#endif +#if defined(HEDLEY_LIKELY) +# undef HEDLEY_LIKELY +#endif +#if defined(HEDLEY_UNLIKELY) +# undef HEDLEY_UNLIKELY +#endif +#if defined(HEDLEY_UNPREDICTABLE) +# undef HEDLEY_UNPREDICTABLE +#endif +#if HEDLEY_HAS_BUILTIN(__builtin_unpredictable) +# define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) +#endif +#if \ + HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \ + HEDLEY_GCC_VERSION_CHECK(9,0,0) +# define HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) +# define HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1 , (probability)) +# define HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) +# define HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) +# define HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) +#elif \ + HEDLEY_HAS_BUILTIN(__builtin_expect) || \ + HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || 
\ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ + HEDLEY_CRAY_VERSION_CHECK(8,1,0) +# define HEDLEY_PREDICT(expr, expected, probability) \ + (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (HEDLEY_STATIC_CAST(void, expected), (expr))) +# define HEDLEY_PREDICT_TRUE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ + })) +# define HEDLEY_PREDICT_FALSE(expr, probability) \ + (__extension__ ({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ + })) +# define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) +# define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#else +# define HEDLEY_PREDICT(expr, expected, probability) (HEDLEY_STATIC_CAST(void, expected), (expr)) +# define HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) +# define HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) +# define HEDLEY_LIKELY(expr) (!!(expr)) +# define HEDLEY_UNLIKELY(expr) (!!(expr)) +#endif +#if !defined(HEDLEY_UNPREDICTABLE) +# define HEDLEY_UNPREDICTABLE(expr) HEDLEY_PREDICT(expr, 1, 0.5) +#endif + +#if defined(HEDLEY_MALLOC) +# undef HEDLEY_MALLOC +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(malloc) || \ + HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define HEDLEY_MALLOC __attribute__((__malloc__)) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_MALLOC _Pragma("returns_new_memory") +#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) +# define HEDLEY_MALLOC __declspec(restrict) +#else +# define HEDLEY_MALLOC +#endif + +#if defined(HEDLEY_PURE) +# undef HEDLEY_PURE +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(pure) || \ + HEDLEY_GCC_VERSION_CHECK(2,96,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + 
HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_PGI_VERSION_CHECK(17,10,0) +# define HEDLEY_PURE __attribute__((__pure__)) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_PURE _Pragma("does_not_write_global_data") +#elif defined(__cplusplus) && \ + ( \ + HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ + ) +# define HEDLEY_PURE _Pragma("FUNC_IS_PURE;") +#else +# define HEDLEY_PURE +#endif + +#if defined(HEDLEY_CONST) +# undef HEDLEY_CONST +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(const) || \ + HEDLEY_GCC_VERSION_CHECK(2,5,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_PGI_VERSION_CHECK(17,10,0) +# define HEDLEY_CONST __attribute__((__const__)) +#elif \ + HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_CONST _Pragma("no_side_effect") +#else +# define HEDLEY_CONST HEDLEY_PURE +#endif + +#if defined(HEDLEY_RESTRICT) +# undef HEDLEY_RESTRICT +#endif +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus) +# define HEDLEY_RESTRICT restrict +#elif \ + HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ + HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + defined(__clang__) +# define HEDLEY_RESTRICT __restrict +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) +# define HEDLEY_RESTRICT _Restrict +#else +# define HEDLEY_RESTRICT +#endif + +#if defined(HEDLEY_INLINE) +# undef HEDLEY_INLINE +#endif +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + (defined(__cplusplus) && (__cplusplus >= 199711L)) +# define HEDLEY_INLINE inline +#elif \ + defined(HEDLEY_GCC_VERSION) || \ + HEDLEY_ARM_VERSION_CHECK(6,2,0) +# define HEDLEY_INLINE __inline__ +#elif \ + HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define HEDLEY_INLINE __inline +#else +# define HEDLEY_INLINE +#endif + +#if defined(HEDLEY_ALWAYS_INLINE) +# undef HEDLEY_ALWAYS_INLINE +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(always_inline) || \ + HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + 
HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE +#elif HEDLEY_MSVC_VERSION_CHECK(12,0,0) +# define HEDLEY_ALWAYS_INLINE __forceinline +#elif defined(__cplusplus) && \ + ( \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ + ) +# define HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") +#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") +#else +# define HEDLEY_ALWAYS_INLINE HEDLEY_INLINE +#endif + +#if defined(HEDLEY_NEVER_INLINE) +# undef HEDLEY_NEVER_INLINE +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(noinline) || \ + HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) +# define HEDLEY_NEVER_INLINE __attribute__((__noinline__)) +#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) +# define HEDLEY_NEVER_INLINE __declspec(noinline) +#elif HEDLEY_PGI_VERSION_CHECK(10,2,0) +# define HEDLEY_NEVER_INLINE _Pragma("noinline") +#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) +# define HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") +#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_NEVER_INLINE _Pragma("inline=never") +#elif HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) +# define HEDLEY_NEVER_INLINE __attribute((noinline)) +#elif HEDLEY_PELLES_VERSION_CHECK(9,0,0) +# define HEDLEY_NEVER_INLINE __declspec(noinline) +#else +# define HEDLEY_NEVER_INLINE +#endif + +#if defined(HEDLEY_PRIVATE) +# undef HEDLEY_PRIVATE +#endif +#if defined(HEDLEY_PUBLIC) +# undef HEDLEY_PUBLIC +#endif +#if defined(HEDLEY_IMPORT) +# undef HEDLEY_IMPORT +#endif +#if defined(_WIN32) || defined(__CYGWIN__) +# define HEDLEY_PRIVATE +# define HEDLEY_PUBLIC __declspec(dllexport) +# define HEDLEY_IMPORT __declspec(dllimport) +#else +# if \ + HEDLEY_HAS_ATTRIBUTE(visibility) || \ + HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ 
+ HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + ( \ + defined(__TI_EABI__) && \ + ( \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ + ) \ + ) +# define HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) +# define HEDLEY_PUBLIC __attribute__((__visibility__("default"))) +# else +# define HEDLEY_PRIVATE +# define HEDLEY_PUBLIC +# endif +# define HEDLEY_IMPORT extern +#endif + +#if defined(HEDLEY_NO_THROW) +# undef HEDLEY_NO_THROW +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(nothrow) || \ + HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_NO_THROW __attribute__((__nothrow__)) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) +# define HEDLEY_NO_THROW __declspec(nothrow) +#else +# define HEDLEY_NO_THROW +#endif + +#if defined(HEDLEY_FALL_THROUGH) +# undef HEDLEY_FALL_THROUGH +#endif +#if HEDLEY_GNUC_HAS_ATTRIBUTE(fallthrough,7,0,0) && !defined(HEDLEY_PGI_VERSION) +# define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) +# define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) +#elif HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) +# define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) +#elif defined(__fallthrough) /* SAL */ +# define HEDLEY_FALL_THROUGH __fallthrough +#else +# define HEDLEY_FALL_THROUGH +#endif + +#if defined(HEDLEY_RETURNS_NON_NULL) +# undef HEDLEY_RETURNS_NON_NULL +#endif +#if \ + HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ + HEDLEY_GCC_VERSION_CHECK(4,9,0) +# define HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) +#elif defined(_Ret_notnull_) /* SAL */ +# define HEDLEY_RETURNS_NON_NULL _Ret_notnull_ +#else +# define HEDLEY_RETURNS_NON_NULL +#endif + +#if defined(HEDLEY_ARRAY_PARAM) +# undef HEDLEY_ARRAY_PARAM +#endif +#if \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ + !defined(__STDC_NO_VLA__) && \ + !defined(__cplusplus) && \ + !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_TINYC_VERSION) +# define HEDLEY_ARRAY_PARAM(name) (name) +#else +# define HEDLEY_ARRAY_PARAM(name) +#endif + +#if defined(HEDLEY_IS_CONSTANT) +# undef HEDLEY_IS_CONSTANT +#endif +#if defined(HEDLEY_REQUIRE_CONSTEXPR) +# undef HEDLEY_REQUIRE_CONSTEXPR +#endif +/* HEDLEY_IS_CONSTEXPR_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. 
*/ +#if defined(HEDLEY_IS_CONSTEXPR_) +# undef HEDLEY_IS_CONSTEXPR_ +#endif +#if \ + HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ + HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ + HEDLEY_CRAY_VERSION_CHECK(8,1,0) +# define HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) +#endif +#if !defined(__cplusplus) +# if \ + HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \ + HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + HEDLEY_TINYC_VERSION_CHECK(0,9,24) +# if defined(__INTPTR_TYPE__) +# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) +# else +# include <stdint.h> +# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) +# endif +# elif \ + ( \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(HEDLEY_SUNPRO_VERSION) && \ + !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_IAR_VERSION)) || \ + HEDLEY_HAS_EXTENSION(c_generic_selections) || \ + HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ + HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + HEDLEY_ARM_VERSION_CHECK(5,3,0) +# if defined(__INTPTR_TYPE__) +# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) +# else +# include <stdint.h> +# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) +# endif +# elif \ + defined(HEDLEY_GCC_VERSION) || \ + defined(HEDLEY_INTEL_VERSION) || \ + defined(HEDLEY_TINYC_VERSION) || \ + defined(HEDLEY_TI_ARMCL_VERSION) || \ + HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \ + defined(HEDLEY_TI_CL2000_VERSION) || \ + defined(HEDLEY_TI_CL6X_VERSION) || \ + defined(HEDLEY_TI_CL7X_VERSION) || \ + defined(HEDLEY_TI_CLPRU_VERSION) || \ + defined(__clang__) +# define HEDLEY_IS_CONSTEXPR_(expr) ( \ + sizeof(void) != \ + sizeof(*( \ + 1 ? \ + ((void*) ((expr) * 0L) ) : \ + ((struct { char v[sizeof(void) * 2]; } *) 1) \ + ) \ + ) \ + ) +# endif +#endif +#if defined(HEDLEY_IS_CONSTEXPR_) +# if !defined(HEDLEY_IS_CONSTANT) +# define HEDLEY_IS_CONSTANT(expr) HEDLEY_IS_CONSTEXPR_(expr) +# endif +# define HEDLEY_REQUIRE_CONSTEXPR(expr) (HEDLEY_IS_CONSTEXPR_(expr) ?
(expr) : (-1)) +#else +# if !defined(HEDLEY_IS_CONSTANT) +# define HEDLEY_IS_CONSTANT(expr) (0) +# endif +# define HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) +#endif + +#if defined(HEDLEY_BEGIN_C_DECLS) +# undef HEDLEY_BEGIN_C_DECLS +#endif +#if defined(HEDLEY_END_C_DECLS) +# undef HEDLEY_END_C_DECLS +#endif +#if defined(HEDLEY_C_DECL) +# undef HEDLEY_C_DECL +#endif +#if defined(__cplusplus) +# define HEDLEY_BEGIN_C_DECLS extern "C" { +# define HEDLEY_END_C_DECLS } +# define HEDLEY_C_DECL extern "C" +#else +# define HEDLEY_BEGIN_C_DECLS +# define HEDLEY_END_C_DECLS +# define HEDLEY_C_DECL +#endif + +#if defined(HEDLEY_STATIC_ASSERT) +# undef HEDLEY_STATIC_ASSERT +#endif +#if \ + !defined(__cplusplus) && ( \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ + HEDLEY_HAS_FEATURE(c_static_assert) || \ + HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + defined(_Static_assert) \ + ) +# define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) +#elif \ + (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ + HEDLEY_MSVC_VERSION_CHECK(16,0,0) +# define HEDLEY_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) +#else +# define HEDLEY_STATIC_ASSERT(expr, message) +#endif + +#if defined(HEDLEY_NULL) +# undef HEDLEY_NULL +#endif +#if defined(__cplusplus) +# if __cplusplus >= 201103L +# define HEDLEY_NULL HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) +# elif defined(NULL) +# define HEDLEY_NULL NULL +# else +# define HEDLEY_NULL HEDLEY_STATIC_CAST(void*, 0) +# endif +#elif defined(NULL) +# define HEDLEY_NULL NULL +#else +# define HEDLEY_NULL ((void*) 0) +#endif + +#if defined(HEDLEY_MESSAGE) +# undef HEDLEY_MESSAGE +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define HEDLEY_MESSAGE(msg) \ + HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + HEDLEY_PRAGMA(message msg) \ + HEDLEY_DIAGNOSTIC_POP +#elif \ + HEDLEY_GCC_VERSION_CHECK(4,4,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message msg) +#elif HEDLEY_CRAY_VERSION_CHECK(5,0,0) +# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(_CRI message msg) +#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg)) +#elif HEDLEY_PELLES_VERSION_CHECK(2,0,0) +# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg)) +#else +# define HEDLEY_MESSAGE(msg) +#endif + +#if defined(HEDLEY_WARNING) +# undef HEDLEY_WARNING +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define HEDLEY_WARNING(msg) \ + HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + HEDLEY_PRAGMA(clang warning msg) \ + HEDLEY_DIAGNOSTIC_POP +#elif \ + HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ + HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg) +#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) +# define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg)) +#else +# define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg) +#endif + +#if defined(HEDLEY_REQUIRE) +# undef HEDLEY_REQUIRE +#endif +#if defined(HEDLEY_REQUIRE_MSG) +# undef HEDLEY_REQUIRE_MSG +#endif +#if HEDLEY_HAS_ATTRIBUTE(diagnose_if) +# if HEDLEY_HAS_WARNING("-Wgcc-compat") +# define HEDLEY_REQUIRE(expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) \ + HEDLEY_DIAGNOSTIC_POP +# define HEDLEY_REQUIRE_MSG(expr,msg) \ + HEDLEY_DIAGNOSTIC_PUSH 
\ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), msg, "error"))) \ + HEDLEY_DIAGNOSTIC_POP +# else +# define HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) +# define HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) +# endif +#else +# define HEDLEY_REQUIRE(expr) +# define HEDLEY_REQUIRE_MSG(expr,msg) +#endif + +#if defined(HEDLEY_FLAGS) +# undef HEDLEY_FLAGS +#endif +#if HEDLEY_HAS_ATTRIBUTE(flag_enum) +# define HEDLEY_FLAGS __attribute__((__flag_enum__)) +#endif + +#if defined(HEDLEY_FLAGS_CAST) +# undef HEDLEY_FLAGS_CAST +#endif +#if HEDLEY_INTEL_VERSION_CHECK(19,0,0) +# define HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("warning(disable:188)") \ + ((T) (expr)); \ + HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define HEDLEY_FLAGS_CAST(T, expr) HEDLEY_STATIC_CAST(T, expr) +#endif + +#if defined(HEDLEY_EMPTY_BASES) +# undef HEDLEY_EMPTY_BASES +#endif +#if HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !HEDLEY_MSVC_VERSION_CHECK(20,0,0) +# define HEDLEY_EMPTY_BASES __declspec(empty_bases) +#else +# define HEDLEY_EMPTY_BASES +#endif + +/* Remaining macros are deprecated. */ + +#if defined(HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) +# undef HEDLEY_GCC_NOT_CLANG_VERSION_CHECK +#endif +#if defined(__clang__) +# define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) +#else +# define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(HEDLEY_CLANG_HAS_ATTRIBUTE) +# undef HEDLEY_CLANG_HAS_ATTRIBUTE +#endif +#define HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) HEDLEY_HAS_ATTRIBUTE(attribute) + +#if defined(HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) +# undef HEDLEY_CLANG_HAS_CPP_ATTRIBUTE +#endif +#define HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) HEDLEY_HAS_CPP_ATTRIBUTE(attribute) + +#if defined(HEDLEY_CLANG_HAS_BUILTIN) +# undef HEDLEY_CLANG_HAS_BUILTIN +#endif +#define HEDLEY_CLANG_HAS_BUILTIN(builtin) HEDLEY_HAS_BUILTIN(builtin) + +#if defined(HEDLEY_CLANG_HAS_FEATURE) +# undef HEDLEY_CLANG_HAS_FEATURE +#endif +#define HEDLEY_CLANG_HAS_FEATURE(feature) HEDLEY_HAS_FEATURE(feature) + +#if defined(HEDLEY_CLANG_HAS_EXTENSION) +# undef HEDLEY_CLANG_HAS_EXTENSION +#endif +#define HEDLEY_CLANG_HAS_EXTENSION(extension) HEDLEY_HAS_EXTENSION(extension) + +#if defined(HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) +# undef HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE +#endif +#define HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) + +#if defined(HEDLEY_CLANG_HAS_WARNING) +# undef HEDLEY_CLANG_HAS_WARNING +#endif +#define HEDLEY_CLANG_HAS_WARNING(warning) HEDLEY_HAS_WARNING(warning) + +#endif /* !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < X) */ diff -Nru minimap2-2.17+dfsg/debian/include/simde/simde-arch.h minimap2-2.17+dfsg/debian/include/simde/simde-arch.h --- minimap2-2.17+dfsg/debian/include/simde/simde-arch.h 1970-01-01 00:00:00.000000000 +0000 +++ minimap2-2.17+dfsg/debian/include/simde/simde-arch.h 2020-01-12 17:22:11.000000000 +0000 @@ -0,0 +1,347 @@ +/* Architecture detection + * Created by Evan Nemerson + * + * To the extent possible under law, the authors have waived all + * copyright and related or neighboring rights to this code. For + * details, see the Creative Commons Zero 1.0 Universal license at + * + * + * Different compilers define different preprocessor macros for the + * same architecture. 
This is an attempt to provide a single + * interface which is usable on any compiler. + * + * In general, a macro named SIMDE_ARCH_* is defined for each + * architecture the CPU supports. When there are multiple possible + * versions, we try to define the macro to the target version. For + * example, if you want to check for i586+, you could do something + * like: + * + * #if defined(SIMDE_ARCH_X86) && (SIMDE_ARCH_X86 >= 5) + * ... + * #endif + * + * You could also just check that SIMDE_ARCH_X86 >= 5 without checking + * if it's defined first, but some compilers may emit a warning about + * an undefined macro being used (e.g., GCC with -Wundef). + * + * This was originally created for SIMDe + * (hence the prefix), but this + * header has no dependencies and may be used anywhere. It is + * originally based on information from + * , though it + * has been enhanced with additional information. + * + * If you improve this file, or find a bug, please file the issue at + * . If you copy this into + * your project, even if you change the prefix, please keep the links + * to SIMDe intact so others know where to report issues, submit + * enhancements, and find the latest version. */ + +#if !defined(SIMDE_ARCH_H) +#define SIMDE_ARCH_H + +/* Alpha + */ +#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) +# if defined(__alpha_ev6__) +# define SIMDE_ARCH_ALPHA 6 +# elif defined(__alpha_ev5__) +# define SIMDE_ARCH_ALPHA 5 +# elif defined(__alpha_ev4__) +# define SIMDE_ARCH_ALPHA 4 +# else +# define SIMDE_ARCH_ALPHA 1 +# endif +#endif + +/* Atmel AVR + */ +#if defined(__AVR_ARCH__) +# define SIMDE_ARCH_AVR __AVR_ARCH__ +#endif + +/* AMD64 / x86_64 + */ +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X66) || defined(_M_AMD64) +# define SIMDE_ARCH_AMD64 1 +#endif + +/* ARM + */ +#if defined(__ARM_ARCH_8A__) +# define SIMDE_ARCH_ARM 82 +#elif defined(__ARM_ARCH_8R__) +# define SIMDE_ARCH_ARM 81 +#elif defined(__ARM_ARCH_8__) +# define SIMDE_ARCH_ARM 80 +#elif defined(__ARM_ARCH_7S__) +# define SIMDE_ARCH_ARM 74 +#elif defined(__ARM_ARCH_7M__) +# define SIMDE_ARCH_ARM 73 +#elif defined(__ARM_ARCH_7R__) +# define SIMDE_ARCH_ARM 72 +#elif defined(__ARM_ARCH_7A__) +# define SIMDE_ARCH_ARM 71 +#elif defined(__ARM_ARCH_7__) +# define SIMDE_ARCH_ARM 70 +#elif defined(__ARM_ARCH) +# define SIMDE_ARCH_ARM (__ARM_ARCH * 10) +#elif defined(_M_ARM) +# define SIMDE_ARCH_ARM (_M_ARM * 10) +#elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) || defined(_ARM) || defined(_M_ARM) || defined(_M_ARM) +# define SIMDE_ARCH_ARM 1 +#endif + +/* AArch64 + */ +#if defined(__aarch64__) || defined(_M_ARM64) +# define SIMDE_ARCH_AARCH64 10 +#endif + +/* Blackfin + */ +#if defined(__bfin) || defined(__BFIN__) || defined(__bfin__) +# define SIMDE_ARCH_BLACKFIN 1 +#endif + +/* CRIS + */ +#if defined(__CRIS_arch_version) +# define SIMDE_ARCH_CRIS __CRIS_arch_version +#elif defined(__cris__) || defined(__cris) || defined(__CRIS) || defined(__CRIS__) +# define SIMDE_ARCH_CRIS 1 +#endif + +/* Convex + */ +#if defined(__convex_c38__) +# define SIMDE_ARCH_CONVEX 38 +#elif defined(__convex_c34__) +# define SIMDE_ARCH_CONVEX 34 +#elif defined(__convex_c32__) +# define SIMDE_ARCH_CONVEX 32 +#elif defined(__convex_c2__) +# define SIMDE_ARCH_CONVEX 2 +#elif defined(__convex__) +# define SIMDE_ARCH_CONVEX 1 +#endif + +/* Adapteva Epiphany + */ +#if defined(__epiphany__) +# define SIMDE_ARCH_EPIPHANY 1 +#endif + +/* Fujitsu FR-V + */ +#if defined(__frv__) +# 
define SIMDE_ARCH_FRV 1 +#endif + +/* H8/300 + */ +#if defined(__H8300__) +# define SIMDE_ARCH_H8300 +#endif + +/* HP/PA / PA-RISC + */ +#if defined(__PA8000__) || defined(__HPPA20__) || defined(__RISC2_0__) || defined(_PA_RISC2_0) +# define SIMDE_ARCH_HPPA 20 +#elif defined(__PA7100__) || defined(__HPPA11__) || defined(_PA_RISC1_1) +# define SIMDE_ARCH_HPPA 11 +#elif defined(_PA_RISC1_0) +# define SIMDE_ARCH_HPPA 10 +#elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) +# define SIMDE_ARCH_HPPA 1 +#endif + +/* x86 + */ +#if defined(_M_IX86) +# define SIMDE_ARCH_X86 (_M_IX86 / 100) +#elif defined(__I86__) +# define SIMDE_ARCH_X86 __I86__ +#elif defined(i686) || defined(__i686) || defined(__i686__) +# define SIMDE_ARCH_X86 6 +#elif defined(i586) || defined(__i586) || defined(__i586__) +# define SIMDE_ARCH_X86 5 +#elif defined(i486) || defined(__i486) || defined(__i486__) +# define SIMDE_ARCH_X86 4 +#elif defined(i386) || defined(__i386) || defined(__i386__) +# define SIMDE_ARCH_X86 3 +#elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__) +# define SIMDE_ARCH_X86 3 +#endif + +/* Itanium + */ +#if defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || defined(__itanium__) +# define SIMDE_ARCH_IA64 1 +#endif + +/* Renesas M32R + */ +#if defined(__m32r__) || defined(__M32R__) +# define SIMDE_ARCH_M32R +#endif + +/* Motorola 68000 + */ +#if defined(__mc68060__) || defined(__MC68060__) +# define SIMDE_ARCH_M68K 68060 +#elif defined(__mc68040__) || defined(__MC68040__) +# define SIMDE_ARCH_M68K 68040 +#elif defined(__mc68030__) || defined(__MC68030__) +# define SIMDE_ARCH_M68K 68030 +#elif defined(__mc68020__) || defined(__MC68020__) +# define SIMDE_ARCH_M68K 68020 +#elif defined(__mc68010__) || defined(__MC68010__) +# define SIMDE_ARCH_M68K 68010 +#elif defined(__mc68000__) || defined(__MC68000__) +# define SIMDE_ARCH_M68K 68000 +#endif + +/* Xilinx MicroBlaze + */ +#if defined(__MICROBLAZE__) || defined(__microblaze__) +# define SIMDE_ARCH_MICROBLAZE +#endif + +/* MIPS + */ +#if defined(_MIPS_ISA_MIPS64R2) +# define SIMDE_ARCH_MIPS 642 +#elif defined(_MIPS_ISA_MIPS64) +# define SIMDE_ARCH_MIPS 640 +#elif defined(_MIPS_ISA_MIPS32R2) +# define SIMDE_ARCH_MIPS 322 +#elif defined(_MIPS_ISA_MIPS32) +# define SIMDE_ARCH_MIPS 320 +#elif defined(_MIPS_ISA_MIPS4) +# define SIMDE_ARCH_MIPS 4 +#elif defined(_MIPS_ISA_MIPS3) +# define SIMDE_ARCH_MIPS 3 +#elif defined(_MIPS_ISA_MIPS2) +# define SIMDE_ARCH_MIPS 2 +#elif defined(_MIPS_ISA_MIPS1) +# define SIMDE_ARCH_MIPS 1 +#elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__) +# define SIMDE_ARCH_MIPS 1 +#endif + +/* Matsushita MN10300 + */ +#if defined(__MN10300__) || defined(__mn10300__) +# define SIMDE_ARCH_MN10300 1 +#endif + +/* POWER + */ +#if defined(_M_PPC) +# define SIMDE_ARCH_POWER _M_PPC +#elif defined(_ARCH_PWR8) +# define SIMDE_ARCH_POWER 800 +#elif defined(_ARCH_PWR7) +# define SIMDE_ARCH_POWER 700 +#elif defined(_ARCH_PWR6) +# define SIMDE_ARCH_POWER 600 +#elif defined(_ARCH_PWR5) +# define SIMDE_ARCH_POWER 500 +#elif defined(_ARCH_PWR4) +# define SIMDE_ARCH_POWER 400 +#elif defined(_ARCH_440) || defined(__ppc440__) +# define SIMDE_ARCH_POWER 440 +#elif defined(_ARCH_450) || defined(__ppc450__) +# define SIMDE_ARCH_POWER 450 +#elif defined(_ARCH_601) || defined(__ppc601__) +# define SIMDE_ARCH_POWER 601 +#elif defined(_ARCH_603) || defined(__ppc603__) +# define SIMDE_ARCH_POWER 603 +#elif defined(_ARCH_604) || defined(__ppc604__) +# define SIMDE_ARCH_POWER 604 
+#elif defined(_ARCH_605) || defined(__ppc605__) +# define SIMDE_ARCH_POWER 605 +#elif defined(_ARCH_620) || defined(__ppc620__) +# define SIMDE_ARCH_POWER 620 +#elif defined(__powerpc) || defined(__powerpc__) || defined(__POWERPC__) || defined(__ppc__) || defined(__PPC__) || defined(_ARCH_PPC) || defined(__ppc) +# define SIMDE_ARCH_POWER 1 +#endif + +/* SPARC + */ +#if defined(__sparc_v9__) || defined(__sparcv9) +# define SIMDE_ARCH_SPARC 9 +#elif defined(__sparc_v8__) || defined(__sparcv8) +# define SIMDE_ARCH_SPARC 8 +#elif defined(__sparc_v7__) || defined(__sparcv7) +# define SIMDE_ARCH_SPARC 7 +#elif defined(__sparc_v6__) || defined(__sparcv6) +# define SIMDE_ARCH_SPARC 6 +#elif defined(__sparc_v5__) || defined(__sparcv5) +# define SIMDE_ARCH_SPARC 5 +#elif defined(__sparc_v4__) || defined(__sparcv4) +# define SIMDE_ARCH_SPARC 4 +#elif defined(__sparc_v3__) || defined(__sparcv3) +# define SIMDE_ARCH_SPARC 3 +#elif defined(__sparc_v2__) || defined(__sparcv2) +# define SIMDE_ARCH_SPARC 2 +#elif defined(__sparc_v1__) || defined(__sparcv1) +# define SIMDE_ARCH_SPARC 1 +#elif defined(__sparc__) || defined(__sparc) +# define SIMDE_ARCH_SPARC 1 +#endif + +/* SuperH + */ +#if defined(__sh5__) || defined(__SH5__) +# define SIMDE_ARCH_SUPERH 5 +#elif defined(__sh4__) || defined(__SH4__) +# define SIMDE_ARCH_SUPERH 4 +#elif defined(__sh3__) || defined(__SH3__) +# define SIMDE_ARCH_SUPERH 3 +#elif defined(__sh2__) || defined(__SH2__) +# define SIMDE_ARCH_SUPERH 2 +#elif defined(__sh1__) || defined(__SH1__) +# define SIMDE_ARCH_SUPERH 1 +#elif defined(__sh__) || defined(__SH__) +# define SIMDE_ARCH_SUPERH 1 +#endif + +/* IBM System z + */ +#if defined(__370__) || defined(__THW_370__) || defined(__s390__) || defined(__s390x__) || defined(__zarch__) || defined(__SYSC_ZARCH__) +# define SIMDE_ARCH_SYSTEMZ +#endif + +/* TMS320 DSP + */ +#if defined(_TMS320C6740) || defined(__TMS320C6740__) +# define SIMDE_ARCH_TMS320 6740 +#elif defined(_TMS320C6700_PLUS) || defined(__TMS320C6700_PLUS__) +# define SIMDE_ARCH_TMS320 6701 +#elif defined(_TMS320C6700) || defined(__TMS320C6700__) +# define SIMDE_ARCH_TMS320 6700 +#elif defined(_TMS320C6600) || defined(__TMS320C6600__) +# define SIMDE_ARCH_TMS320 6600 +#elif defined(_TMS320C6400_PLUS) || defined(__TMS320C6400_PLUS__) +# define SIMDE_ARCH_TMS320 6401 +#elif defined(_TMS320C6400) || defined(__TMS320C6400__) +# define SIMDE_ARCH_TMS320 6400 +#elif defined(_TMS320C6200) || defined(__TMS320C6200__) +# define SIMDE_ARCH_TMS320 6200 +#elif defined(_TMS320C55X) || defined(__TMS320C55X__) +# define SIMDE_ARCH_TMS320 550 +#elif defined(_TMS320C54X) || defined(__TMS320C54X__) +# define SIMDE_ARCH_TMS320 540 +#elif defined(_TMS320C28X) || defined(__TMS320C28X__) +# define SIMDE_ARCH_TMS320 280 +#endif + +/* Xtensa + */ +#if defined(__xtensa__) || defined(__XTENSA__) +# define SIMDE_ARCH_XTENSA 1 +#endif + +#endif /* !defined(SIMDE_ARCH_H) */ diff -Nru minimap2-2.17+dfsg/debian/include/simde/simde-common.h minimap2-2.17+dfsg/debian/include/simde/simde-common.h --- minimap2-2.17+dfsg/debian/include/simde/simde-common.h 1970-01-01 00:00:00.000000000 +0000 +++ minimap2-2.17+dfsg/debian/include/simde/simde-common.h 2020-01-12 17:22:11.000000000 +0000 @@ -0,0 +1,364 @@ +/* Copyright (c) 2017-2019 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + 
* modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(SIMDE_COMMON_H) +#define SIMDE_COMMON_H + +#include "hedley.h" +#include "check.h" +#include "simde-arch.h" + +#if \ + HEDLEY_HAS_ATTRIBUTE(aligned) || \ + HEDLEY_GCC_VERSION_CHECK(2,95,0) || \ + HEDLEY_CRAY_VERSION_CHECK(8,4,0) || \ + HEDLEY_IBM_VERSION_CHECK(11,1,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_PGI_VERSION_CHECK(19,4,0) || \ + HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + HEDLEY_TINYC_VERSION_CHECK(0,9,24) || \ + HEDLEY_TI_VERSION_CHECK(8,1,0) +# define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment))) +#elif defined(_MSC_VER) && !(defined(_M_ARM) && !defined(_M_ARM64)) +# define SIMDE_ALIGN(alignment) __declspec(align(alignment)) +#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +# define SIMDE_ALIGN(alignment) _Alignas(alignment) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) +# define SIMDE_ALIGN(alignment) alignas(alignment) +#else +# define SIMDE_ALIGN(alignment) +#endif + +#define simde_assert_aligned(alignment, val) \ + simde_assert_int(HEDLEY_REINTERPRET_CAST(uintptr_t, HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, (val)))) % (alignment), ==, 0) + +/* TODO: this should really do something like + HEDLEY_STATIC_CAST(T, (simde_assert_int(alignment, v), v)) + but I need to think about how to handle it in all compilers... + may end up moving to Hedley, too. */ +#if HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) +# define SIMDE_CAST_ALIGN(alignment, T, v) HEDLEY_REINTERPRET_CAST(T, __builtin_assume_aligned(v, alignment)) +#elif HEDLEY_HAS_WARNING("-Wcast-align") +# define SIMDE_CAST_ALIGN(alignment, T, v) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wcast-align\"") \ + HEDLEY_REINTERPRET_CAST(T, (v)) \ + HEDLEY_DIAGNOSTIC_POP +#else +# define SIMDE_CAST_ALIGN(alignment, T, v) HEDLEY_REINTERPRET_CAST(T, (v)) +#endif + +#if HEDLEY_GCC_HAS_ATTRIBUTE(vector_size,4,6,0) +# define SIMDE__ENABLE_GCC_VEC_EXT +/* clang had a bug (present in 3.5 at least) where it wouldn't + shift by a scalar value. I have no idea how to detect when + it was fixed, so we just blacklist clang from certain functions. 
*/ +# if !defined(__clang__) +# define SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR +# endif +#endif + +#if !defined(SIMDE_ENABLE_OPENMP) && ((defined(_OPENMP) && (_OPENMP >= 201307L)) || (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L))) +# define SIMDE_ENABLE_OPENMP +#endif + +#if !defined(SIMDE_ENABLE_CILKPLUS) && defined(__cilk) +# define SIMDE_ENABLE_CILKPLUS +#endif + +#if defined(SIMDE_ENABLE_OPENMP) +# define SIMDE__VECTORIZE _Pragma("omp simd") +# define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l)) +# define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r)) +# define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a)) +#elif defined(SIMDE_ENABLE_CILKPLUS) +# define SIMDE__VECTORIZE _Pragma("simd") +# define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) +# define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) +# define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a)) +#elif defined(__INTEL_COMPILER) +# define SIMDE__VECTORIZE _Pragma("simd") +# define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) +# define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) +# define SIMDE__VECTORIZE_ALIGNED(a) +#elif defined(__clang__) +# define SIMDE__VECTORIZE _Pragma("clang loop vectorize(enable)") +# define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l)) +# define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE +# define SIMDE__VECTORIZE_ALIGNED(a) +#elif HEDLEY_GCC_VERSION_CHECK(4,9,0) +# define SIMDE__VECTORIZE _Pragma("GCC ivdep") +# define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE +# define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE +# define SIMDE__VECTORIZE_ALIGNED(a) +#elif HEDLEY_CRAY_VERSION_CHECK(5,0,0) +# define SIMDE__VECTORIZE _Pragma("_CRI ivdep") +# define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE +# define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE +# define SIMDE__VECTORIZE_ALIGNED(a) +#else +# define SIMDE__VECTORIZE +# define SIMDE__VECTORIZE_SAFELEN(l) +# define SIMDE__VECTORIZE_REDUCTION(r) +# define SIMDE__VECTORIZE_ALIGNED(a) +#endif + +#if HEDLEY_GCC_HAS_ATTRIBUTE(unused,3,1,0) +# define SIMDE__UNUSED __attribute__((__unused__)) +#else +# define SIMDE__UNUSED +#endif + +#if HEDLEY_GCC_HAS_ATTRIBUTE(artificial,4,3,0) +# define SIMDE__ARTIFICIAL __attribute__((__artificial__)) +#else +# define SIMDE__ARTIFICIAL +#endif + +/* Intended for checking coverage, you should never use this in + production. 
*/ +#if defined(SIMDE_NO_INLINE) +# define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE SIMDE__UNUSED static +#else +# define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_INLINE SIMDE__ARTIFICIAL static +#endif + +#if defined(_MSC_VER) +# define SIMDE__BEGIN_DECLS HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable:4996 4204)) HEDLEY_BEGIN_C_DECLS +# define SIMDE__END_DECLS HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS +#else +# define SIMDE__BEGIN_DECLS HEDLEY_BEGIN_C_DECLS +# define SIMDE__END_DECLS HEDLEY_END_C_DECLS +#endif + +#if HEDLEY_HAS_WARNING("-Wpedantic") +# define SIMDE_DIAGNOSTIC_DISABLE_INT128 _Pragma("clang diagnostic ignored \"-Wpedantic\"") +#elif defined(HEDLEY_GCC_VERSION) +# define SIMDE_DIAGNOSTIC_DISABLE_INT128 _Pragma("GCC diagnostic ignored \"-Wpedantic\"") +#else +# define SIMDE_DIAGNOSTIC_DISABLE_INT128 +#endif + +#if defined(__SIZEOF_INT128__) +# define SIMDE__HAVE_INT128 +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_INT128 +typedef __int128 simde_int128; +typedef unsigned __int128 simde_uint128; +HEDLEY_DIAGNOSTIC_POP +#endif + +/* TODO: we should at least make an attempt to detect the correct + types for simde_float32/float64 instead of just assuming float and + double. */ + +#if !defined(SIMDE_FLOAT32_TYPE) +# define SIMDE_FLOAT32_TYPE float +# define SIMDE_FLOAT32_C(value) value##f +#else +# define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE) value) +#endif +typedef SIMDE_FLOAT32_TYPE simde_float32; +HEDLEY_STATIC_ASSERT(sizeof(simde_float32) == 4, "Unable to find 32-bit floating-point type."); + +#if !defined(SIMDE_FLOAT64_TYPE) +# define SIMDE_FLOAT64_TYPE double +# define SIMDE_FLOAT64_C(value) value +#else +# define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT64_TYPE) value) +#endif +typedef SIMDE_FLOAT64_TYPE simde_float64; +HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, "Unable to find 64-bit floating-point type."); + +/* Whether to assume that the compiler can auto-vectorize reasonably + well. This will cause SIMDe to attempt to compose vector + operations using more simple vector operations instead of minimize + serial work. + + As an example, consider the _mm_add_ss(a, b) function from SSE, + which returns { a0 + b0, a1, a2, a3 }. This pattern is repeated + for other operations (sub, mul, etc.). + + The naïve implementation would result in loading a0 and b0, adding + them into a temporary variable, then splicing that value into a new + vector with the remaining elements from a. + + On platforms which support vectorization, it's generally faster to + simply perform the operation on the entire vector to avoid having + to move data between SIMD registers and non-SIMD registers. + Basically, instead of the temporary variable being (a0 + b0) it + would be a vector of (a + b), which is then combined with a to form + the result. + + By default, SIMDe will prefer the pure-vector versions if we detect + a vector ISA extension, but this can be overridden by defining + SIMDE_NO_ASSUME_VECTORIZATION. You can also define + SIMDE_ASSUME_VECTORIZATION if you want to force SIMDe to use the + vectorized version. */ +#if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && !defined(SIMDE_ASSUME_VECTORIZATION) +# if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || defined(__ALTIVEC__) +# define SIMDE_ASSUME_VECTORIZATION +# endif +#endif + +/* GCC and clang have built-in functions to handle shuffling of + vectors, but the implementations are slightly different. This + macro is just an abstraction over them. Note that elem_size is in + bits but vec_size is in bytes. 
*/ +#if !defined(SIMDE_NO_SHUFFLE_VECTOR) +#if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) +# define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__) +#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle,4,7,0) && !defined(__INTEL_COMPILER) +# define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) (__extension__ ({ \ + int##elem_size##_t __attribute__((__vector_size__(vec_size))) simde_shuffle_ = { __VA_ARGS__ }; \ + __builtin_shuffle(a, b, simde_shuffle_); \ + })) +#endif +#endif + +#if HEDLEY_GCC_HAS_BUILTIN(__builtin_convertvector,9,0,0) +# define SIMDE__CONVERT_VECTOR(to, from) ((to) = __builtin_convertvector((from), __typeof__(to))) +#endif + +#if HEDLEY_HAS_WARNING("-Wbad-function-cast") +# define SIMDE_CONVERT_FTOI(T,v) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wbad-function-cast\"") \ + HEDLEY_STATIC_CAST(T, (v)) \ + HEDLEY_DIAGNOSTIC_POP +#else +# define SIMDE_CONVERT_FTOI(T,v) ((T) (v)) +#endif + + +#if HEDLEY_HAS_WARNING("-Wfloat-equal") +# define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("clang diagnostic ignored \"-Wfloat-equal\"") +#elif HEDLEY_GCC_VERSION_CHECK(3,0,0) +# define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") +#else +# define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL +#endif + +/* Some algorithms are iterative, and fewer iterations means less + accuracy. Lower values here will result in faster, but less + accurate, calculations for some functions. */ +#if !defined(SIMDE_ACCURACY_ITERS) +# define SIMDE_ACCURACY_ITERS 2 +#endif + +#if defined(SIMDE__ASSUME_ALIGNED) +# undef SIMDE__ASSUME_ALIGNED +#endif +#if HEDLEY_INTEL_VERSION_CHECK(9,0,0) +# define SIMDE__ASSUME_ALIGNED(ptr, align) __assume_aligned(ptr, align) +#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) +# define SIMDE__ASSUME_ALIGNED(ptr, align) __assume((((char*) ptr) - ((char*) 0)) % (align) == 0) +#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_assume_aligned,4,7,0) +# define SIMDE__ASSUME_ALIGNED(ptr, align) (ptr = (__typeof__(ptr)) __builtin_assume_aligned((ptr), align)) +#elif HEDLEY_CLANG_HAS_BUILTIN(__builtin_assume) +# define SIMDE__ASSUME_ALIGNED(ptr, align) __builtin_assume((((char*) ptr) - ((char*) 0)) % (align) == 0) +#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_unreachable,4,5,0) +# define SIMDE__ASSUME_ALIGNED(ptr, align) ((((char*) ptr) - ((char*) 0)) % (align) == 0) ? (1) : (__builtin_unreachable(), 0) +#else +# define SIMDE__ASSUME_ALIGNED(ptr, align) +#endif + +/* This is only to help us implement functions like _mm_undefined_ps. 
*/ +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +# undef SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ +#endif +#if HEDLEY_HAS_WARNING("-Wuninitialized") +# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("clang diagnostic ignored \"-Wuninitialized\"") +#elif HEDLEY_GCC_VERSION_CHECK(4,2,0) +# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("GCC diagnostic ignored \"-Wuninitialized\"") +#elif HEDLEY_PGI_VERSION_CHECK(19,10,0) +# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 549") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) +# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE,unassigned)") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) +# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE)") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) && defined(__cplusplus) +# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,unassigned)") +/* #elif \ + HEDLEY_TI_VERSION_CHECK(16,9,9) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2) +# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 551") */ +#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("warning(disable:592)") +#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) +# define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ __pragma(warning(disable:4700)) +#endif + +/* Sometimes we run into problems with specific versions of compilers + which make the native versions unusable for us. Often this is due + to missing functions, sometimes buggy implementations, etc. These + macros are how we check for specific bugs. As they are fixed we'll + start only defining them for problematic compiler versions. */ + +#if !defined(SIMDE_IGNORE_COMPILER_BUGS) +# if !HEDLEY_GCC_VERSION_CHECK(4,9,0) +# define SIMDE_BUG_GCC_REV_208793 +# endif +# if !HEDLEY_GCC_VERSION_CHECK(5,0,0) +# define SIMDE_BUG_GCC_BAD_MM_SRA_EPI32 /* TODO: find relevant bug or commit */ +# endif +# if !HEDLEY_GCC_VERSION_CHECK(4,6,0) +# define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */ +# endif +# if !HEDLEY_GCC_VERSION_CHECK(10,0,0) +# define SIMDE_BUG_GCC_REV_274313 +# endif +# if defined(HEDLEY_EMSCRIPTEN_VERSION) +# define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. 
*/ +# define SIMDE_BUG_EMSCRIPTEN_5242 +# endif +#endif + +HEDLEY_ALWAYS_INLINE static +simde_float32 simde_u32_to_f32(uint32_t val) { + union { + uint32_t u32; + simde_float32 f32; + } u; + u.u32 = val; + return u.f32; +} + +HEDLEY_ALWAYS_INLINE static +simde_float64 simde_u64_to_f64(uint64_t val) { + union { + uint64_t u64; + simde_float64 f64; + } u; + u.u64 = val; + return u.f64; +} + +#define SIMDE_F32_ALL_SET (simde_u32_to_f32(~UINT32_C(0))) +#define SIMDE_F32_ALL_UNSET (simde_u32_to_f32( UINT32_C(0))) +#define SIMDE_F64_ALL_SET (simde_u64_to_f64(~UINT64_C(0))) +#define SIMDE_F64_ALL_UNSET (simde_u64_to_f64( UINT64_C(0))) + +#endif /* !defined(SIMDE_COMMON_H) */ diff -Nru minimap2-2.17+dfsg/debian/include/simde/x86/mmx.h minimap2-2.17+dfsg/debian/include/simde/x86/mmx.h --- minimap2-2.17+dfsg/debian/include/simde/x86/mmx.h 1970-01-01 00:00:00.000000000 +0000 +++ minimap2-2.17+dfsg/debian/include/simde/x86/mmx.h 2020-01-12 17:22:11.000000000 +0000 @@ -0,0 +1,2057 @@ +/* Copyright (c) 2017-2019 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(SIMDE__MMX_H) +# if !defined(SIMDE__MMX_H) +# define SIMDE__MMX_H +# endif +# include "../simde-common.h" + +# if defined(SIMDE_MMX_FORCE_NATIVE) +# define SIMDE_MMX_NATIVE +# elif (defined(__MMX__) || (defined(_MSC_VER) && defined(_M_IX86))) && !defined(SIMDE_MMX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +# define SIMDE_MMX_NATIVE +# elif defined(__ARM_NEON) && !defined(SIMDE_MMX_NO_NEON) && !defined(SIMDE_NO_NEON) +# define SIMDE_MMX_NEON +# endif + +# if defined(SIMDE_MMX_NATIVE) +# include <mmintrin.h> +# else +# if defined(SIMDE_MMX_NEON) +# include <arm_neon.h> +# endif +# endif +# include +# include +# include +# include + +SIMDE__BEGIN_DECLS + +typedef union { +#if defined(SIMDE__ENABLE_GCC_VEC_EXT) + SIMDE_ALIGN(8) int8_t i8 __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) int16_t i16 __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) int32_t i32 __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) int64_t i64 __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) uint8_t u8 __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) uint16_t u16 __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) uint32_t u32 __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) uint64_t u64 __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) simde_float32 f32 __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) int_fast32_t i32f __attribute__((__vector_size__(8), __may_alias__)); + SIMDE_ALIGN(8) uint_fast32_t u32f __attribute__((__vector_size__(8), __may_alias__)); +#else + SIMDE_ALIGN(8) int8_t i8[8]; + SIMDE_ALIGN(8) int16_t i16[4]; + SIMDE_ALIGN(8) int32_t i32[2]; + SIMDE_ALIGN(8) int64_t i64[1]; + SIMDE_ALIGN(8) uint8_t u8[8]; + SIMDE_ALIGN(8) uint16_t u16[4]; + SIMDE_ALIGN(8) uint32_t u32[2]; + SIMDE_ALIGN(8) uint64_t u64[1]; + SIMDE_ALIGN(8) simde_float32 f32[2]; + SIMDE_ALIGN(8) int_fast32_t i32f[8 / sizeof(int_fast32_t)]; + SIMDE_ALIGN(8) uint_fast32_t u32f[8 / sizeof(uint_fast32_t)]; +#endif + +#if defined(SIMDE_MMX_NATIVE) + __m64 n; +#elif defined(SIMDE_MMX_NEON) + int8x8_t neon_i8; + int16x4_t neon_i16; + int32x2_t neon_i32; + int64x1_t neon_i64; + uint8x8_t neon_u8; + uint16x4_t neon_u16; + uint32x2_t neon_u32; + uint64x1_t neon_u64; + float32x2_t neon_f32; +#endif +} simde__m64; + +#if !defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +# define SIMDE_MMX_ENABLE_NATIVE_ALIASES + typedef simde__m64 __m64; +#endif + +#if defined(SIMDE_MMX_NATIVE) + HEDLEY_STATIC_ASSERT(sizeof(__m64) == sizeof(simde__m64), "__m64 size doesn't match simde__m64 size"); +#endif +HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect"); + +HEDLEY_DIAGNOSTIC_PUSH + +/* Function has no EMMS instruction */ +#if defined(HEDLEY_MSVC_VERSION) +#pragma warning(disable:4799) +#endif + +#if defined(SIMDE_MMX_NATIVE) + SIMDE__FUNCTION_ATTRIBUTES simde__m64 SIMDE__M64_FROM_NATIVE(__m64 v) { simde__m64 r; r.n = v; return r; } +# define SIMDE__M64_TO_NATIVE(v) (v.n) +#else +# define SIMDE__M64_FROM_NATIVE(val) (val) +# define SIMDE__M64_TO_NATIVE(val) (val) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_add_pi8 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_add_pi8(a.n, b.n); +#elif defined(SIMDE_MMX_NEON) + r.neon_i8 = vadd_s8(a.neon_i8, b.neon_i8); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i8 = a.i8 + b.i8; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ;
i++) { + r.i8[i] = a.i8[i] + b.i8[i]; + } +#endif + + return r; +} +#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_add_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_add_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_add_paddb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_add_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_add_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_add_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vadd_s16(a.neon_i16, b.neon_i16); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i16 = a.i16 + b.i16; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.i16[i] + b.i16[i]; + } +#endif + + return r; +#endif +} +#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_add_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_add_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_add_paddw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_add_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_add_pi32 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_add_pi32(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vadd_s32(a.neon_i32, b.neon_i32); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i32 = a.i32 + b.i32; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] + b.i32[i]; + } +#endif + + return r; +#endif +} +#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_add_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_add_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_add_paddd(a, b) SIMDE__M64_TO_NATIVE(simde_mm_add_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_adds_pi8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_adds_pi8(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i8 = vqadd_s8(a.neon_i8, b.neon_i8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) { + r.i8[i] = INT8_MAX; + } else if ((((b.i8[i]) < 0) && ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) { + r.i8[i] = INT8_MIN; + } else { + r.i8[i] = (a.i8[i]) + (b.i8[i]); + } + } +#endif + + return r; +#endif +} +#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_adds_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_adds_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_add_paddsb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_adds_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_adds_pu8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_adds_pu8(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u8 = vqadd_u8(a.neon_u8, b.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + const uint_fast16_t x 
= ((uint_fast16_t) a.u8[i]) + ((uint_fast16_t) b.u8[i]); + if (x > UINT8_MAX) + r.u8[i] = UINT8_MAX; + else + r.u8[i] = (uint8_t) x; + } +#endif + + return r; +#endif +} +#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_adds_pu8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_adds_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_paddusb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_adds_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_adds_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_adds_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vqadd_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + if ((((b.i16[i]) > 0) && ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) { + r.i16[i] = INT16_MAX; + } else if ((((b.i16[i]) < 0) && ((a.i16[i]) < (SHRT_MIN - (b.i16[i]))))) { + r.i16[i] = SHRT_MIN; + } else { + r.i16[i] = (a.i16[i]) + (b.i16[i]); + } + } +#endif + + return r; +#endif +} +#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_adds_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_adds_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_paddsw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_adds_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_adds_pu16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_adds_pu16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u16 = vqadd_u16(a.neon_u16, b.neon_u16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + const uint32_t x = a.u16[i] + b.u16[i]; + if (x > UINT16_MAX) + r.u16[i] = UINT16_MAX; + else + r.u16[i] = (uint16_t) x; + } +#endif + + return r; +#endif +} +#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_adds_pu16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_adds_pu16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_paddusw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_adds_pu16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_and_si64 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_and_si64(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vand_s32(a.neon_i32, b.neon_i32); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i64 = a.i64 & b.i64; +#else + r.i64[0] = a.i64[0] & b.i64[0]; +#endif + + return r; +#endif +} +#define simde_m_pand(a, b) simde_mm_and_si64(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_and_si64(a, b) SIMDE__M64_TO_NATIVE(simde_mm_and_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pand(a, b) SIMDE__M64_TO_NATIVE(simde_mm_and_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_andnot_si64 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_andnot_si64(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vand_s32(vmvn_s32(a.neon_i32), b.neon_i32); +#elif 
defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i32f = ~a.i32f & b.i32f; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32f) / sizeof(r.i32f[0])) ; i++) { + r.i32f[i] = ~(a.i32f[i]) & b.i32f[i]; + } +#endif + + return r; +#endif +} +#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_andnot_si64(a, b) SIMDE__M64_TO_NATIVE(simde_mm_andnot_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pandn(a, b) SIMDE__M64_TO_NATIVE(simde_mm_andnot_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cmpeq_pi8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_cmpeq_pi8(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i8 = vreinterpret_s8_u8(vceq_s8(a.neon_i8, b.neon_i8)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = (a.i8[i] == b.i8[i]) ? ~INT8_C(0) : INT8_C(0); + } +#endif + + return r; +#endif +} +#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpeq_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pcmpeqb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpeq_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cmpeq_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_cmpeq_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vreinterpret_s16_u16(vceq_s16(a.neon_i16, b.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (a.i16[i] == b.i16[i]) ? ~INT16_C(0) : INT16_C(0); + } +#endif + + return r; +#endif +} +#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpeq_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pcmpeqw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpeq_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cmpeq_pi32 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_cmpeq_pi32(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vreinterpret_s32_u32(vceq_s32(a.neon_i32, b.neon_i32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (a.i32[i] == b.i32[i]) ?
~INT32_C(0) : INT32_C(0); + } +#endif + + return r; +#endif +} +#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpeq_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pcmpeqd(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpeq_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cmpgt_pi8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_cmpgt_pi8(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i8 = vreinterpret_s8_u8(vcgt_s8(a.neon_i8, b.neon_i8)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = (a.i8[i] > b.i8[i]) ? ~INT8_C(0) : INT8_C(0); + } +#endif + + return r; +#endif +} +#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpgt_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pcmpgtb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpgt_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cmpgt_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_cmpgt_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vreinterpret_s16_u16(vcgt_s16(a.neon_i16, b.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (a.i16[i] > b.i16[i]) ? ~INT16_C(0) : INT16_C(0); + } +#endif + + return r; +#endif +} +#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpgt_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pcmpgtw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpgt_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cmpgt_pi32 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_cmpgt_pi32(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vreinterpret_s32_u32(vcgt_s32(a.neon_i32, b.neon_i32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (a.i32[i] > b.i32[i]) ? 
~INT32_C(0) : INT32_C(0); + } +#endif + + return r; +#endif +} +#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpgt_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pcmpgtd(a, b) SIMDE__M64_TO_NATIVE(simde_mm_cmpgt_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int64_t +simde_mm_cvtm64_si64 (simde__m64 a) { +#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) + return _mm_cvtm64_si64(a.n); +#elif defined(SIMDE_MMX_NEON) + return vget_lane_s64(a.neon_i64, 0); +#else + return a.i64[0]; +#endif +} +#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(SIMDE__M64_FROM_NATIVE(a)) +# define _m_to_int64(a) simde_mm_cvtm64_si64(SIMDE__M64_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cvtsi32_si64 (int32_t a) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_cvtsi32_si64(a)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + const int32_t av[sizeof(r.neon_i32) / sizeof(r.neon_i32[0])] = { a, 0 }; + r.neon_i32 = vld1_s32(av); +#else + r.i32[0] = a; + r.i32[1] = 0; +#endif + + return r; +#endif +} +#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi32_si64(a) SIMDE__M64_TO_NATIVE(simde_mm_cvtsi32_si64(a)) +# define _m_from_int(a) SIMDE__M64_TO_NATIVE(simde_mm_cvtsi32_si64(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cvtsi64_m64 (int64_t a) { +#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) + return SIMDE__M64_FROM_NATIVE(_mm_cvtsi64_m64(a)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i64 = vld1_s64(&a); +#else + r.i64[0] = a; +#endif + + return r; +#endif +} +#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi64_m64(a) SIMDE__M64_TO_NATIVE(simde_mm_cvtsi64_m64(a)) +# define _m_from_int64(a) SIMDE__M64_TO_NATIVE(simde_mm_cvtsi64_m64(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_cvtsi64_si32 (simde__m64 a) { +#if defined(SIMDE_MMX_NATIVE) + return _mm_cvtsi64_si32(a.n); +#elif defined(SIMDE_MMX_NEON) + return vget_lane_s32(a.neon_i32, 0); +#else + return a.i32[0]; +#endif +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(SIMDE__M64_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_empty (void) { +#if defined(SIMDE_MMX_NATIVE) + _mm_empty(); +#else +#endif +} +#define simde_m_empty() simde_mm_empty() +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_empty() SIMDE__M64_TO_NATIVE(simde_mm_empty()) +# define _m_empty() SIMDE__M64_TO_NATIVE(simde_mm_empty()) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_madd_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_madd_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + int32x4_t i1 = vmull_s16(a.neon_i16, b.neon_i16); + r.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i += 2) { + r.i32[i / 2] = (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]); + } +#endif + + return r; +#endif +} +#define simde_m_pmaddwd(a, b) 
simde_mm_madd_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_madd_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_madd_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pmaddwd(a, b) SIMDE__M64_TO_NATIVE(simde_mm_madd_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_mulhi_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_mulhi_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + const int32x4_t t1 = vmull_s16(a.neon_i16, b.neon_i16); + const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16); + const uint16x4_t t3 = vmovn_u32(t2); + r.neon_i16 = vreinterpret_s16_u16(t3); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (int16_t) ((a.i16[i] * b.i16[i]) >> 16); + } +#endif + + return r; +#endif +} +#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_mulhi_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_mulhi_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pmulhw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_mulhi_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_mullo_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_mullo_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + const int32x4_t t1 = vmull_s16(a.neon_i16, b.neon_i16); + const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1)); + r.neon_i16 = vreinterpret_s16_u16(t2); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (int16_t) ((a.i16[i] * b.i16[i]) & 0xffff); + } +#endif + + return r; +#endif +} +#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_mullo_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_mullo_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pmullw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_mullo_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_or_si64 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_or_si64(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vorr_s32(a.neon_i32, b.neon_i32); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i64 = a.i64 | b.i64; +#else + r.i64[0] = a.i64[0] | b.i64[0]; +#endif + + return r; +#endif +} +#define simde_m_por(a, b) simde_mm_or_si64(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_or_si64(a, b) SIMDE__M64_TO_NATIVE(simde_mm_or_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_por(a, b) SIMDE__M64_TO_NATIVE(simde_mm_or_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_packs_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_packs_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i8 = vqmovn_s16(vcombine_s16(a.neon_i16, b.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + if (a.i16[i] < INT8_MIN) { + r.i8[i] = INT8_MIN; + } else if (a.i16[i] > INT8_MAX) { + r.i8[i] = INT8_MAX; + } else { + r.i8[i] 
= (int8_t) a.i16[i]; + } + } + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + if (b.i16[i] < INT8_MIN) { + r.i8[i + 4] = INT8_MIN; + } else if (b.i16[i] > INT8_MAX) { + r.i8[i + 4] = INT8_MAX; + } else { + r.i8[i + 4] = (int8_t) b.i16[i]; + } + } +#endif + + return r; +#endif +} +#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_packs_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_packs_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_packsswb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_packs_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_packs_pi32 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_packs_pi32(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vqmovn_s32(vcombine_s32(a.neon_i32, b.neon_i32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (8 / sizeof(a.i32[0])) ; i++) { + if (a.i32[i] < SHRT_MIN) { + r.i16[i] = SHRT_MIN; + } else if (a.i32[i] > INT16_MAX) { + r.i16[i] = INT16_MAX; + } else { + r.i16[i] = (int16_t) a.i32[i]; + } + } + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (8 / sizeof(b.i32[0])) ; i++) { + if (b.i32[i] < SHRT_MIN) { + r.i16[i + 2] = SHRT_MIN; + } else if (b.i32[i] > INT16_MAX) { + r.i16[i + 2] = INT16_MAX; + } else { + r.i16[i + 2] = (int16_t) b.i32[i]; + } + } +#endif + + return r; +#endif +} +#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_packs_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_packs_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_packssdw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_packs_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_packs_pu16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_packs_pu16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + const int16x8_t t1 = vcombine_s16(a.neon_i16, b.neon_i16); + + /* Set elements which are < 0 to 0 */ + const int16x8_t t2 = vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1))); + + /* Vector with all s16 elements set to UINT8_MAX */ + const int16x8_t vmax = vmovq_n_s16((int16_t) UINT8_MAX); + + /* Elements which are within the acceptable range */ + const int16x8_t le_max = vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax))); + const int16x8_t gt_max = vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax))); + + /* Final values as 16-bit integers */ + const int16x8_t values = vorrq_s16(le_max, gt_max); + + r.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + if (a.i16[i] > UINT8_MAX) { + r.u8[i] = UINT8_MAX; + } else if (a.i16[i] < 0) { + r.u8[i] = 0; + } else { + r.u8[i] = (uint8_t) a.i16[i]; + } + } + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + if (b.i16[i] > UINT8_MAX) { + r.u8[i + 4] = UINT8_MAX; + } else if (b.i16[i] < 0) { + r.u8[i + 4] = 0; + } else { + r.u8[i + 4] = (uint8_t) b.i16[i]; + } + } +#endif + + return r; +#endif +} +#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_packs_pu16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_packs_pu16(SIMDE__M64_FROM_NATIVE(a), 
SIMDE__M64_FROM_NATIVE(b))) +# define _m_packuswb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_packs_pu16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_set_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + const int8_t v[sizeof(r.i8) / sizeof(r.i8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 }; + r.neon_i8 = vld1_s8(v); +#else + r.i8[0] = e0; + r.i8[1] = e1; + r.i8[2] = e2; + r.i8[3] = e3; + r.i8[4] = e4; + r.i8[5] = e5; + r.i8[6] = e6; + r.i8[7] = e7; +#endif + + return r; +#endif +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) SIMDE__M64_TO_NATIVE(simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_x_mm_set_pu8 (uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_set_pi8( + HEDLEY_STATIC_CAST(int8_t, e7), + HEDLEY_STATIC_CAST(int8_t, e6), + HEDLEY_STATIC_CAST(int8_t, e5), + HEDLEY_STATIC_CAST(int8_t, e4), + HEDLEY_STATIC_CAST(int8_t, e3), + HEDLEY_STATIC_CAST(int8_t, e2), + HEDLEY_STATIC_CAST(int8_t, e1), + HEDLEY_STATIC_CAST(int8_t, e0)); +#elif defined(SIMDE_MMX_NEON) + const uint8_t v[sizeof(r.u8) / sizeof(r.u8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 }; + r.neon_u8 = vld1_u8(v); +#else + r.u8[0] = e0; + r.u8[1] = e1; + r.u8[2] = e2; + r.u8[3] = e3; + r.u8[4] = e4; + r.u8[5] = e5; + r.u8[6] = e6; + r.u8[7] = e7; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_set_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_set_pi16(e3, e2, e1, e0)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + const int16_t v[sizeof(r.i16) / sizeof(r.i16[0])] = { e0, e1, e2, e3 }; + r.neon_i16 = vld1_s16(v); +#else + r.i16[0] = e0; + r.i16[1] = e1; + r.i16[2] = e2; + r.i16[3] = e3; +#endif + return r; +#endif +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_set_pi16(e3, e2, e1, e0) SIMDE__M64_TO_NATIVE(simde_mm_set_pi16(e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_x_mm_set_pu16 (uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_set_pi16( + HEDLEY_STATIC_CAST(int16_t, e3), + HEDLEY_STATIC_CAST(int16_t, e2), + HEDLEY_STATIC_CAST(int16_t, e1), + HEDLEY_STATIC_CAST(int16_t, e0) + ); +#elif defined(SIMDE_MMX_NEON) + const uint16_t v[sizeof(r.u16) / sizeof(r.u16[0])] = { e0, e1, e2, e3 }; + r.neon_u16 = vld1_u16(v); +#else + r.u16[0] = e0; + r.u16[1] = e1; + r.u16[2] = e2; + r.u16[3] = e3; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_x_mm_set_pu32 (uint32_t e1, uint32_t e0) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_set_pi32( + HEDLEY_STATIC_CAST(int32_t, e1), + HEDLEY_STATIC_CAST(int32_t, e0)); +#elif defined(SIMDE_MMX_NEON) + const uint32_t v[sizeof(r.u32) / sizeof(r.u32[0])] = { e0, e1 }; + r.neon_u32 = vld1_u32(v); +#else + r.u32[0] = e0; + r.u32[1] = e1; +#endif + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_set_pi32 (int32_t e1, int32_t e0) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_set_pi32(e1, e0); +#elif defined(SIMDE_MMX_NEON) + 
const int32_t v[sizeof(r.i32) / sizeof(r.i32[0])] = { e0, e1 }; + r.neon_i32 = vld1_s32(v); +#else + r.i32[0] = e0; + r.i32[1] = e1; +#endif + + return r; +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_set_pi32(e1, e0) SIMDE__M64_TO_NATIVE(simde_mm_set_pi32(e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_set1_pi8 (int8_t a) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_set1_pi8(a); +#elif defined(SIMDE_MMX_NEON) + r.neon_i8 = vmov_n_s8(a); +#else + r = simde_mm_set_pi8(a, a, a, a, a, a, a, a); +#endif + + return r; +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_set1_pi8(a) SIMDE__M64_TO_NATIVE(simde_mm_set1_pi8(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_set1_pi16 (int16_t a) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_set1_pi16(a); +#elif defined(SIMDE_MMX_NEON) + r.neon_i16 = vmov_n_s16(a); +#else + return simde_mm_set_pi16(a, a, a, a); +#endif + + return r; +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_set1_pi16(a) SIMDE__M64_TO_NATIVE(simde_mm_set1_pi16(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_set1_pi32 (int32_t a) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_set1_pi32(a); +#elif defined(SIMDE_MMX_NEON) + r.neon_i32 = vmov_n_s32(a); +#else + return simde_mm_set_pi32(a, a); +#endif + + return r; +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_set1_pi32(a) SIMDE__M64_TO_NATIVE(simde_mm_set1_pi32(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_setr_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0); +#else + r = simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7); +#endif + + return r; +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) SIMDE__M64_TO_NATIVE(simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_setr_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_setr_pi16(e3, e2, e1, e0); +#else + r = simde_mm_set_pi16(e0, e1, e2, e3); +#endif + + return r; +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_setr_pi16(e3, e2, e1, e0) SIMDE__M64_TO_NATIVE(simde_mm_setr_pi16(e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_setr_pi32 (int32_t e1, int32_t e0) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_setr_pi32(e1, e0); +#else + r = simde_mm_set_pi32(e0, e1); +#endif + + return r; +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_setr_pi32(e1, e0) SIMDE__M64_TO_NATIVE(simde_mm_setr_pi32(e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_setzero_si64 (void) { + simde__m64 r; + +#if defined(SIMDE_MMX_NATIVE) + r.n = _mm_setzero_si64(); +#elif defined(SIMDE_MMX_NEON) + r.neon_u32 = vmov_n_u32(0); +#else + r = simde_mm_set_pi32(0, 0); +#endif + + return r; +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_setzero_si64() SIMDE__M64_TO_NATIVE(simde_mm_setzero_si64()) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sll_pi16 (simde__m64 a, simde__m64 count) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sll_pi16(a.n, count.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vshl_n_s16(a.neon_i16, (int) vget_lane_s64(count.neon_i64, 
0)); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.i16 = a.i16 << count.u64[0]; +#else + if (HEDLEY_UNLIKELY(count.u64[0] > 15)) { + memset(&r, 0, sizeof(r)); + return r; + } + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = (uint16_t) (a.u16[i] << count.u64[0]); + } +#endif + + return r; +#endif +} +#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_sll_pi16(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sll_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +# define _m_psllw(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sll_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sll_pi32 (simde__m64 a, simde__m64 count) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sll_pi32(a.n, count.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vshl_n_s32(a.neon_i32, (int) vget_lane_s64(count.neon_i64, 0)); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.i32 = a.i32 << count.u64[0]; +#else + if (HEDLEY_UNLIKELY(count.u64[0] > 31)) { + memset(&r, 0, sizeof(r)); + return r; + } + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u32) / sizeof(r.u32[0])) ; i++) { + r.u32[i] = a.u32[i] << count.u64[0]; + } +#endif + + return r; +#endif +} +#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_sll_pi32(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sll_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +# define _m_pslld(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sll_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_slli_pi16 (simde__m64 a, int count) { +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_FROM_NATIVE(_mm_slli_pi16(a.n, count)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vshl_n_s16(a.neon_i16, count); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.i16 = a.i16 << count; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = (uint16_t) (a.u16[i] << count); + } +#endif + + return r; +#endif +} +#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_slli_pi16(a, count) SIMDE__M64_TO_NATIVE(simde_mm_slli_pi16(SIMDE__M64_FROM_NATIVE(a), count)) +# define _m_psllwi(a, count) SIMDE__M64_TO_NATIVE(simde_mm_slli_pi16(SIMDE__M64_FROM_NATIVE(a), count)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_slli_pi32 (simde__m64 a, int count) { +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_FROM_NATIVE(_mm_slli_pi32(a.n, count)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vshl_n_s32(a.neon_i32, count); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.i32 = a.i32 << count; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u32) / sizeof(r.u32[0])) ; i++) { + r.u32[i] = a.u32[i] << count; + } +#endif + + return r; +#endif +} +#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_slli_pi32(a, count) SIMDE__M64_TO_NATIVE(simde_mm_slli_pi32(SIMDE__M64_FROM_NATIVE(a), count)) +# define _m_pslldi(a, count) SIMDE__M64_TO_NATIVE(simde_mm_slli_pi32(SIMDE__M64_FROM_NATIVE(a), count)) 
+#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_slli_si64 (simde__m64 a, int count) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_slli_si64(a.n, count)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i64 = vshl_n_s64(a.neon_i64, count); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.i64 = a.i64 << count; +#else + r.u64[0] = a.u64[0] << count; +#endif + + return r; +#endif +} +#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_slli_si64(a, count) SIMDE__M64_TO_NATIVE(simde_mm_slli_si64(SIMDE__M64_FROM_NATIVE(a), count)) +# define _m_psllqi(a, count) SIMDE__M64_TO_NATIVE(simde_mm_slli_si64(SIMDE__M64_FROM_NATIVE(a), count)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sll_si64 (simde__m64 a, simde__m64 count) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sll_si64(a.n, count.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i64 = vshl_s64(a.neon_i64, count.neon_i64); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i64 = a.i64 << count.i64; +#else + if (HEDLEY_UNLIKELY(count.u64[0] > 63)) { + memset(&r, 0, sizeof(r)); + return r; + } + + r.u64[0] = a.u64[0] << count.u64[0]; +#endif + + return r; +#endif +} +#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_sll_si64(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sll_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +# define _m_psllq(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sll_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_srl_pi16 (simde__m64 a, simde__m64 count) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_srl_pi16(a.n, count.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u16 = vshr_n_u16(a.neon_u16, (int) vget_lane_s64(count.neon_i64, 0)); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.u16 = a.u16 >> count.u64[0]; +#else + if (HEDLEY_UNLIKELY(count.u64[0] > 15)) { + memset(&r, 0, sizeof(r)); + return r; + } + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < sizeof(r.u16) / sizeof(r.u16[0]) ; i++) { + r.u16[i] = a.u16[i] >> count.u64[0]; + } +#endif + + return r; +#endif +} +#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_srl_pi16(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srl_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +# define _m_psrlw(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srl_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_srl_pi32 (simde__m64 a, simde__m64 count) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_srl_pi32(a.n, count.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u32 = vshr_n_u32(a.neon_u32, (int) vget_lane_s64(count.neon_i64, 0)); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.u32 = a.u32 >> count.u64[0]; +#else + if (HEDLEY_UNLIKELY(count.u64[0] > 31)) { + memset(&r, 0, sizeof(r)); + return r; + } + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < sizeof(r.u32) / sizeof(r.u32[0]) ; i++) { + r.u32[i] = a.u32[i] >> count.u64[0]; + } +#endif + + return r; +#endif +} +#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_srl_pi32(a, count) 
SIMDE__M64_TO_NATIVE(simde_mm_srl_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +# define _m_psrld(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srl_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_srli_pi16 (simde__m64 a, int count) { +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_FROM_NATIVE(_mm_srli_pi16(a.n, count)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u16 = vshr_n_u16(a.neon_u16, count); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.u16 = a.u16 >> count; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = a.u16[i] >> count; + } +#endif + + return r; +#endif +} +#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_srli_pi16(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srli_pi16(SIMDE__M64_FROM_NATIVE(a), count)) +# define _m_psrlwi(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srli_pi16(SIMDE__M64_FROM_NATIVE(a), count)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_srli_pi32 (simde__m64 a, int count) { +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_FROM_NATIVE(_mm_srli_pi32(a.n, count)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u32 = vshr_n_u32(a.neon_u32, count); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.u32 = a.u32 >> count; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u32) / sizeof(r.u32[0])) ; i++) { + r.u32[i] = a.u32[i] >> count; + } +#endif + + return r; +#endif +} +#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_srli_pi32(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srli_pi32(SIMDE__M64_FROM_NATIVE(a), count)) +# define _m_psrldi(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srli_pi32(SIMDE__M64_FROM_NATIVE(a), count)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_srli_si64 (simde__m64 a, int count) { +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_FROM_NATIVE(_mm_srli_si64(a.n, count)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u64 = vshl_u64(a.neon_u64, vmov_n_s64(-count)); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.u64 = a.u64 >> count; +#else + r.u64[0] = a.u64[0] >> count; +#endif + + return r; +#endif +} +#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_srli_si64(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srli_si64(SIMDE__M64_FROM_NATIVE(a), count)) +# define _m_psrlqi(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srli_si64(SIMDE__M64_FROM_NATIVE(a), count)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_srl_si64 (simde__m64 a, simde__m64 count) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_srl_si64(a.n, count.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u64 = vshl_u64(a.neon_u64, vneg_s64(count.neon_i64)); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.u64 = a.u64 >> count.u64; +#else + if (HEDLEY_UNLIKELY(count.u64[0] > 63)) { + memset(&r, 0, sizeof(r)); + return r; + } + + r.u64[0] = a.u64[0] >> count.u64[0]; +#endif + + return r; +#endif +} +#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_srl_si64(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srl_si64(SIMDE__M64_FROM_NATIVE(a), 
SIMDE__M64_FROM_NATIVE(count))) +# define _m_psrlq(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srl_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_srai_pi16 (simde__m64 a, int count) { +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_FROM_NATIVE(_mm_srai_pi16(a.n, count)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vshr_n_s16(a.neon_i16, count); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.i16 = a.i16 >> (count & 0xff); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.i16[i] >> (count & 0xff); + } +#endif + + return r; +#endif +} +#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_srai_pi16(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srai_pi16(SIMDE__M64_FROM_NATIVE(a), count)) +# define _m_psrawi(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srai_pi16(SIMDE__M64_FROM_NATIVE(a), count)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_srai_pi32 (simde__m64 a, int count) { +#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) + return SIMDE__M64_FROM_NATIVE(_mm_srai_pi32(a.n, count)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vshr_n_s32(a.neon_i32, count); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.i32 = a.i32 >> (count & 0xff); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] >> (count & 0xff); + } +#endif + + return r; +#endif +} +#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_srai_pi32(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srai_pi32(SIMDE__M64_FROM_NATIVE(a), count)) +# define _m_psradi(a, count) SIMDE__M64_TO_NATIVE(simde_mm_srai_pi32(SIMDE__M64_FROM_NATIVE(a), count)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sra_pi16 (simde__m64 a, simde__m64 count) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sra_pi16(a.n, count.n)); +#else + simde__m64 r; + const int cnt = (int) (count.i64[0] > 15 ? 15 : count.i64[0]); + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vshr_n_s16(a.neon_i16, (int) vget_lane_s64(count.neon_i64, 0)); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.i16 = a.i16 >> cnt; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.i16[i] >> cnt; + } +#endif + + return r; +#endif +} +#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_sra_pi16(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sra_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +# define _m_psraw(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sra_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sra_pi32 (simde__m64 a, simde__m64 count) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sra_pi32(a.n, count.n)); +#else + simde__m64 r; + const int32_t cnt = (count.u64[0] > 31) ?
31 : ((int32_t) count.u64[0]); + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vshr_n_s32(a.neon_i32, (int) vget_lane_s64(count.neon_i64, 0)); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT_SHIFT_BY_SCALAR) + r.i32 = a.i32 >> cnt; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] >> cnt; + } +#endif + + return r; +#endif +} +#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_sra_pi32(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sra_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +# define _m_psrad(a, count) SIMDE__M64_TO_NATIVE(simde_mm_sra_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sub_pi8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sub_pi8(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i8 = vsub_s8(a.neon_i8, b.neon_i8); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i8 = a.i8 - b.i8; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = a.i8[i] - b.i8[i]; + } +#endif + + return r; +#endif +} +#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_sub_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sub_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_psubb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sub_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sub_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sub_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vsub_s16(a.neon_i16, b.neon_i16); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i16 = a.i16 - b.i16; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.i16[i] - b.i16[i]; + } +#endif + + return r; +#endif +} +#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_sub_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sub_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_psubw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sub_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sub_pi32 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sub_pi32(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vsub_s32(a.neon_i32, b.neon_i32); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i32 = a.i32 - b.i32; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] - b.i32[i]; + } +#endif + + return r; +#endif +} +#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_sub_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sub_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_psubd(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sub_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_subs_pi8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_subs_pi8(a.n, b.n)); +#else + simde__m64 r; + +#if 
defined(SIMDE_MMX_NEON) + r.neon_i8 = vqsub_s8(a.neon_i8, b.neon_i8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) { + r.i8[i] = INT8_MIN; + } else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) { + r.i8[i] = INT8_MAX; + } else { + r.i8[i] = (a.i8[i]) - (b.i8[i]); + } + } +#endif + + return r; +#endif +} +#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_subs_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_subs_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_psubsb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_subs_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_subs_pu8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_subs_pu8(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u8 = vqsub_u8(a.neon_u8, b.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + const int32_t x = a.u8[i] - b.u8[i]; + if (x < 0) { + r.u8[i] = 0; + } else if (x > UINT8_MAX) { + r.u8[i] = UINT8_MAX; + } else { + r.u8[i] = (uint8_t) x; + } + } +#endif + + return r; +#endif +} +#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_subs_pu8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_subs_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_psubusb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_subs_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_subs_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_subs_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vqsub_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + if (((b.i16[i]) > 0 && (a.i16[i]) < SHRT_MIN + (b.i16[i]))) { + r.i16[i] = SHRT_MIN; + } else if ((b.i16[i]) < 0 && (a.i16[i]) > INT16_MAX + (b.i16[i])) { + r.i16[i] = INT16_MAX; + } else { + r.i16[i] = (a.i16[i]) - (b.i16[i]); + } + } +#endif + + return r; +#endif +} +#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_subs_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_subs_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_psubsw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_subs_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_subs_pu16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_subs_pu16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_u16 = vqsub_u16(a.neon_u16, b.neon_u16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + const int x = a.u16[i] - b.u16[i]; + if (x < 0) { + r.u16[i] = 0; + } else if (x > UINT16_MAX) { + r.u16[i] = UINT16_MAX; + } else { + r.u16[i] = (uint16_t) x; + } + } +#endif + + return r; +#endif +} +#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_subs_pu16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_subs_pu16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_psubusw(a, b)
SIMDE__M64_TO_NATIVE(simde_mm_subs_pu16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_unpackhi_pi8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_unpackhi_pi8(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i8 = vzip2_s8(a.neon_i8, b.neon_i8); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i8 = SIMDE__SHUFFLE_VECTOR(8, 8, a.i8, b.i8, 4, 12, 5, 13, 6, 14, 7, 15); +#else + r.i8[0] = a.i8[4]; + r.i8[1] = b.i8[4]; + r.i8[2] = a.i8[5]; + r.i8[3] = b.i8[5]; + r.i8[4] = a.i8[6]; + r.i8[5] = b.i8[6]; + r.i8[6] = a.i8[7]; + r.i8[7] = b.i8[7]; +#endif + + return r; +#endif +} +#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_unpackhi_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpackhi_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_punpckhbw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpackhi_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_unpackhi_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_unpackhi_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vzip2_s16(a.neon_i16, b.neon_i16); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i16 = SIMDE__SHUFFLE_VECTOR(16, 8, a.i16, b.i16, 2, 6, 3, 7); +#else + r.i16[0] = a.i16[2]; + r.i16[1] = b.i16[2]; + r.i16[2] = a.i16[3]; + r.i16[3] = b.i16[3]; +#endif + + return r; +#endif +} +#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_unpackhi_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpackhi_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_punpckhwd(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpackhi_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_unpackhi_pi32 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_unpackhi_pi32(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vzip2_s32(a.neon_i32, b.neon_i32); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i32 = SIMDE__SHUFFLE_VECTOR(32, 8, a.i32, b.i32, 1, 3); +#else + r.i32[0] = a.i32[1]; + r.i32[1] = b.i32[1]; +#endif + + return r; +#endif +} +#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_unpackhi_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpackhi_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_punpckhdq(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpackhi_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_unpacklo_pi8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_unpacklo_pi8(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i8 = vzip1_s8(a.neon_i8, b.neon_i8); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i8 = SIMDE__SHUFFLE_VECTOR(8, 8, a.i8, b.i8, 0, 8, 1, 9, 2, 10, 3, 11); +#else + r.i8[0] = a.i8[0]; + r.i8[1] = b.i8[0]; + r.i8[2] = a.i8[1]; + r.i8[3] = b.i8[1]; + r.i8[4] = a.i8[2]; + r.i8[5] = b.i8[2]; + r.i8[6] = a.i8[3]; + r.i8[7] = b.i8[3]; +#endif + + return r; +#endif +} +#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b) +#if 
defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_unpacklo_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpacklo_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_punpcklbw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpacklo_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_unpacklo_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_unpacklo_pi16(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i16 = vzip1_s16(a.neon_i16, b.neon_i16); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i16 = SIMDE__SHUFFLE_VECTOR(16, 8, a.i16, b.i16, 0, 4, 1, 5); +#else + r.i16[0] = a.i16[0]; + r.i16[1] = b.i16[0]; + r.i16[2] = a.i16[1]; + r.i16[3] = b.i16[1]; +#endif + + return r; +#endif +} +#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_unpacklo_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpacklo_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_punpcklwd(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpacklo_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_unpacklo_pi32 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_unpacklo_pi32(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = vzip1_s32(a.neon_i32, b.neon_i32); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i32 = SIMDE__SHUFFLE_VECTOR(32, 8, a.i32, b.i32, 0, 2); +#else + r.i32[0] = a.i32[0]; + r.i32[1] = b.i32[0]; +#endif + + return r; +#endif +} +#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_unpacklo_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpacklo_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_punpckldq(a, b) SIMDE__M64_TO_NATIVE(simde_mm_unpacklo_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_xor_si64 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_MMX_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_xor_si64(a.n, b.n)); +#else + simde__m64 r; + +#if defined(SIMDE_MMX_NEON) + r.neon_i32 = veor_s32(a.neon_i32, b.neon_i32); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i32f = a.i32f ^ b.i32f; +#else + r.i32f[0] = a.i32f[0] ^ b.i32f[0]; +#endif + + return r; +#endif +} +#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b) +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _mm_xor_si64(a, b) SIMDE__M64_TO_NATIVE(simde_mm_xor_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pxor(a, b) SIMDE__M64_TO_NATIVE(simde_mm_xor_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_m_to_int (simde__m64 a) { +#if defined(SIMDE_MMX_NEON) + return vget_lane_s32(a.neon_i32, 0); +#elif defined(SIMDE_MMX_NATIVE) + return _m_to_int(a.n); +#else + return a.i32[0]; +#endif +} +#if defined(SIMDE_MMX_ENABLE_NATIVE_ALIASES) +# define _m_to_int(a) simde_m_to_int(SIMDE__M64_FROM_NATIVE(a)) +#endif + +HEDLEY_DIAGNOSTIC_POP + +SIMDE__END_DECLS + +#endif /* !defined(SIMDE__MMX_H) */ diff -Nru minimap2-2.17+dfsg/debian/include/simde/x86/sse2.h minimap2-2.17+dfsg/debian/include/simde/x86/sse2.h --- minimap2-2.17+dfsg/debian/include/simde/x86/sse2.h 1970-01-01 00:00:00.000000000 +0000 +++ minimap2-2.17+dfsg/debian/include/simde/x86/sse2.h 2020-01-12 
17:22:11.000000000 +0000 @@ -0,0 +1,4939 @@ +/* Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2017 Evan Nemerson + * 2015-2017 John W. Ratcliff + * 2015 Brandon Rowlett + * 2015 Ken Fast + * 2017 Hasindu Gamaarachchi + * 2018 Jeff Daily + */ + +#if !defined(SIMDE__SSE2_H) +# if !defined(SIMDE__SSE2_H) +# define SIMDE__SSE2_H +# endif +# include "sse.h" + +# if defined(SIMDE_SSE2_NATIVE) +# undef SIMDE_SSE2_NATIVE +# endif +# if defined(SIMDE_SSE2_FORCE_NATIVE) +# define SIMDE_SSE2_NATIVE +# elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +# define SIMDE_SSE2_NATIVE +# elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && !defined(SIMDE_NO_NEON) +# define SIMDE_SSE2_NEON +# endif + +# if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE) +# if defined(SIMDE_SSE2_FORCE_NATIVE) +# error Native SSE2 support requires native SSE support +# else +# warning Native SSE2 support requires native SSE support, disabling +# undef SIMDE_SSE2_NATIVE +# endif +# elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON) +# warning SSE2 NEON support requires SSE NEON support, disabling +# undef SIMDE_SSE2_NEON +# endif + +# if defined(SIMDE_SSE2_NATIVE) +# include <emmintrin.h> +# else +# if defined(SIMDE_SSE2_NEON) +# include <arm_neon.h> +# endif +# endif + +# include <stdint.h> +# include <limits.h> +# include <string.h> + +SIMDE__BEGIN_DECLS + +typedef union { +#if defined(SIMDE__ENABLE_GCC_VEC_EXT) + SIMDE_ALIGN(16) int8_t i8 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int16_t i16 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int32_t i32 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int64_t i64 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint8_t u8 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint16_t u16 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint32_t u32 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint64_t u64 __attribute__((__vector_size__(16), __may_alias__)); + #if defined(SIMDE__HAVE_INT128) + SIMDE_ALIGN(16) simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__)); + #endif + SIMDE_ALIGN(16) simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__)); + + SIMDE_ALIGN(16)
int_fast32_t i32f __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint_fast32_t u32f __attribute__((__vector_size__(16), __may_alias__)); +#else + SIMDE_ALIGN(16) int8_t i8[16]; + SIMDE_ALIGN(16) int16_t i16[8]; + SIMDE_ALIGN(16) int32_t i32[4]; + SIMDE_ALIGN(16) int64_t i64[2]; + SIMDE_ALIGN(16) uint8_t u8[16]; + SIMDE_ALIGN(16) uint16_t u16[8]; + SIMDE_ALIGN(16) uint32_t u32[4]; + SIMDE_ALIGN(16) uint64_t u64[2]; + #if defined(SIMDE__HAVE_INT128) + SIMDE_ALIGN(16) simde_int128 i128[1]; + SIMDE_ALIGN(16) simde_uint128 u128[1]; + #endif + SIMDE_ALIGN(16) simde_float32 f32[4]; + SIMDE_ALIGN(16) simde_float64 f64[2]; + + SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)]; + SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; +#endif + + SIMDE_ALIGN(16) simde__m64 m64[2]; + +#if defined(SIMDE_SSE2_NATIVE) + SIMDE_ALIGN(16) __m128i n; +#elif defined(SIMDE_SSE2_NEON) + SIMDE_ALIGN(16) int8x16_t neon_i8; + SIMDE_ALIGN(16) int16x8_t neon_i16; + SIMDE_ALIGN(16) int32x4_t neon_i32; + SIMDE_ALIGN(16) int64x2_t neon_i64; + SIMDE_ALIGN(16) uint8x16_t neon_u8; + SIMDE_ALIGN(16) uint16x8_t neon_u16; + SIMDE_ALIGN(16) uint32x4_t neon_u32; + SIMDE_ALIGN(16) uint64x2_t neon_u64; + SIMDE_ALIGN(16) float32x4_t neon_f32; + #if defined(SIMDE_ARCH_AMD64) + SIMDE_ALIGN(16) float64x2_t neon_f64; + #endif +#endif +} simde__m128i; + +typedef union { +#if defined(SIMDE__ENABLE_GCC_VEC_EXT) + SIMDE_ALIGN(16) int8_t i8 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int16_t i16 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int32_t i32 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int64_t i64 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint8_t u8 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint16_t u16 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint32_t u32 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint64_t u64 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int_fast32_t i32f __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint_fast32_t u32f __attribute__((__vector_size__(16), __may_alias__)); +#else + SIMDE_ALIGN(16) int8_t i8[16]; + SIMDE_ALIGN(16) int16_t i16[8]; + SIMDE_ALIGN(16) int32_t i32[4]; + SIMDE_ALIGN(16) int64_t i64[2]; + SIMDE_ALIGN(16) uint8_t u8[16]; + SIMDE_ALIGN(16) uint16_t u16[8]; + SIMDE_ALIGN(16) uint32_t u32[4]; + SIMDE_ALIGN(16) uint64_t u64[2]; + SIMDE_ALIGN(16) simde_float32 f32[4]; + SIMDE_ALIGN(16) simde_float64 f64[2]; + SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)]; + SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; +#endif + + SIMDE_ALIGN(16) simde__m64 m64[2]; + +#if defined(SIMDE_SSE2_NATIVE) + SIMDE_ALIGN(16) __m128d n; +#elif defined(SIMDE_SSE2_NEON) + SIMDE_ALIGN(16) int8x16_t neon_i8; + SIMDE_ALIGN(16) int16x8_t neon_i16; + SIMDE_ALIGN(16) int32x4_t neon_i32; + SIMDE_ALIGN(16) int64x2_t neon_i64; + SIMDE_ALIGN(16) uint8x16_t neon_u8; + SIMDE_ALIGN(16) uint16x8_t neon_u16; + SIMDE_ALIGN(16) uint32x4_t neon_u32; + SIMDE_ALIGN(16) uint64x2_t neon_u64; + SIMDE_ALIGN(16) float32x4_t neon_f32; + #if defined(SIMDE_ARCH_AMD64) + SIMDE_ALIGN(16) float64x2_t neon_f64; + #endif +#endif +} simde__m128d; + +#if 
defined(SIMDE_SSE2_NATIVE) + HEDLEY_STATIC_ASSERT(sizeof(__m128i) == sizeof(simde__m128i), "__m128i size doesn't match simde__m128i size"); + HEDLEY_STATIC_ASSERT(sizeof(__m128d) == sizeof(simde__m128d), "__m128d size doesn't match simde__m128d size"); +#elif defined(SIMDE_SSE_NEON) + #define SIMDE__M128I_NEON_C(T, expr) (simde__m128i) { .neon_##T = expr } + #define SIMDE__M128D_NEON_C(T, expr) (simde__m128d) { .neon_##T = expr } +#endif +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect"); +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect"); + +#if !defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +# define SIMDE_SSE2_ENABLE_NATIVE_ALIASES + typedef simde__m128d __m128d; + typedef simde__m128i __m128i; +#endif + +#if defined(SIMDE_SSE2_NATIVE) + SIMDE__FUNCTION_ATTRIBUTES simde__m128d SIMDE__M128D_FROM_NATIVE(__m128d v) { simde__m128d r; r.n = v; return r; } + SIMDE__FUNCTION_ATTRIBUTES simde__m128i SIMDE__M128I_FROM_NATIVE(__m128i v) { simde__m128i r; r.n = v; return r; } +# define SIMDE__M128D_TO_NATIVE(v) (v.n) +# define SIMDE__M128I_TO_NATIVE(v) (v.n) +#else +# define SIMDE__M128D_FROM_NATIVE(val) (val) +# define SIMDE__M128I_FROM_NATIVE(val) (val) +# define SIMDE__M128D_TO_NATIVE(val) (val) +# define SIMDE__M128I_TO_NATIVE(val) (val) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_add_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_add_epi8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vaddq_s8(a.neon_i8, b.neon_i8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = a.i8[i] + b.i8[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_add_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_add_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_add_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_add_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vaddq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.i16[i] + b.i16[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_add_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_add_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_add_epi32 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_add_epi32(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vaddq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] + b.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_add_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_add_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_add_epi64 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_add_epi64(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vaddq_s64(a.neon_i64, b.neon_i64); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i64[i] + 
b.i64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_add_epi64(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_add_epi64(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_add_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_add_pd(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AMD64) + r.neon_f64 = vaddq_f64(a.neon_f64, b.neon_f64); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = a.f64[i] + b.f64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_add_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_add_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_add_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_add_sd(a.n, b.n); +#else + r.f64[0] = a.f64[0] + b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_add_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_add_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_add_si64 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_add_si64(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vadd_s64(a.neon_i64, b.neon_i64); +#else + r.i64[0] = a.i64[0] + b.i64[0]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_add_si64(a, b) SIMDE__M64_TO_NATIVE(simde_mm_add_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_adds_epi8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vqaddq_s8(a.neon_i8, b.neon_i8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) { + r.i8[i] = INT8_MAX; + } else if ((((b.i8[i]) < 0) && ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) { + r.i8[i] = INT8_MIN; + } else { + r.i8[i] = (a.i8[i]) + (b.i8[i]); + } + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_adds_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_adds_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_adds_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vqaddq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + if ((((b.i16[i]) > 0) && ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) { + r.i16[i] = INT16_MAX; + } else if ((((b.i16[i]) < 0) && ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) { + r.i16[i] = INT16_MIN; + } else { + r.i16[i] = (a.i16[i]) + (b.i16[i]); + } + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_adds_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_adds_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) { + simde__m128i r; 
+ +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_adds_epu8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u8 = vqaddq_u8(a.neon_u8, b.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i]) ? (a.u8[i] + b.u8[i]) : UINT8_MAX; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_adds_epu8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_adds_epu8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_adds_epu16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u16 = vqaddq_u16(a.neon_u16, b.neon_u16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i]) ? (a.u16[i] + b.u16[i]) : UINT16_MAX; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_adds_epu16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_adds_epu16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_and_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_and_pd(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vandq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u64) / sizeof(r.u64[0])) ; i++) { + r.u64[i] = a.u64[i] & b.u64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_and_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_and_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_and_si128 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_and_si128(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_i32 = vandq_s32(b.neon_i32, a.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32f) / sizeof(r.i32f[0])) ; i++) { + r.i32f[i] = a.i32f[i] & b.i32f[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_and_si128(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_and_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_andnot_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_andnot_pd(a.n, b.n)); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vbicq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u64) / sizeof(r.u64[0])) ; i++) { + r.u64[i] = ~a.u64[i] & b.u64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_andnot_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_andnot_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_andnot_si128(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vbicq_s32(b.neon_i32, a.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32f) / sizeof(r.i32f[0])) ; i++) { + r.i32f[i] = ~(a.i32f[i]) & b.i32f[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define 
_mm_andnot_si128(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_andnot_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_avg_epu8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u8 = vrhaddq_u8(b.neon_u8, a.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_avg_epu8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_avg_epu8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_avg_epu16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u16 = vrhaddq_u16(b.neon_u16, a.neon_u16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_avg_epu16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_avg_epu16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_bslli_si128 (simde__m128i a, const int imm8) { + simde__m128i r; + + if (HEDLEY_UNLIKELY(imm8 > 15)) { + r.u64[0] = 0; + r.u64[1] = 0; + return r; + } + + const int s = imm8 * 8; + +#if defined(SIMDE__HAVE_INT128) + r.u128[0] = a.u128[0] << s; +#else + if (s < 64) { + r.u64[0] = (a.u64[0] << s); + r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s)); + } else { + r.u64[0] = 0; + r.u64[1] = a.u64[0] << (s - 64); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) +# define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_slli_si128(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_bslli_si128(a, imm8) \ + SIMDE__M128I_NEON_C(i8, (((imm8) <= 0) ? ((a).neon_i8) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), (a).neon_i8, 16 - (imm8)))))) +#endif +#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_bslli_si128(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_bslli_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +# define _mm_slli_si128(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_bslli_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_bsrli_si128 (simde__m128i a, const int imm8) { + simde__m128i r; + + if (HEDLEY_UNLIKELY(imm8 > 15)) { + r.u64[0] = 0; + r.u64[1] = 0; + return r; + } + + const int s = imm8 * 8; + +#if defined(SIMDE__HAVE_INT128) + r.u128[0] = a.u128[0] >> s; +#else + if (s < 64) { + r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s)); + r.u64[1] = (a.u64[1] >> s); + } else { + r.u64[0] = a.u64[1] >> (s - 64); + r.u64[1] = 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) +# define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_srli_si128(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_bsrli_si128(a, imm8) \ + SIMDE__M128I_NEON_C(i8, ((imm8) <= 0) ? ((a).neon_i8) : (((imm8) > 15) ? 
(vdupq_n_s8(0)) : (vextq_s8((a).neon_i8, vdupq_n_s8(0), (imm8))))) +#endif +#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8) +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_bsrli_si128(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_bsrli_si128(SIMDE__M128I_FROM_NATIVE(a), (imm8))) +# define _mm_srli_si128(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_bsrli_si128(SIMDE__M128I_FROM_NATIVE(a), (imm8))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_clflush (void const* p) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_clflush(p); +#else + (void) p; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_clflush(p) simde_mm_clflush(p) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comieq_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comieq_sd(a.n, b.n); +#else + return a.f64[0] == b.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_comieq_sd(a, b) simde_mm_comieq_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comige_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comige_sd(a.n, b.n); +#else + return a.f64[0] >= b.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_comige_sd(a, b) simde_mm_comige_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comigt_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comigt_sd(a.n, b.n); +#else + return a.f64[0] > b.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_comigt_sd(a, b) simde_mm_comigt_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comile_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comile_sd(a.n, b.n); +#else + return a.f64[0] <= b.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_comile_sd(a, b) simde_mm_comile_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comilt_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comilt_sd(a.n, b.n); +#else + return a.f64[0] < b.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_comilt_sd(a, b) simde_mm_comilt_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comineq_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_comineq_sd(a.n, b.n); +#else + return a.f64[0] != b.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_comineq_sd(a, b) simde_mm_comineq_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_castpd_ps (simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_castpd_ps(a.n)); +#else + union { + simde__m128d pd; + simde__m128 ps; + } r; + r.pd = a; + return r.ps; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_castpd_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_castpd_ps(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_castpd_si128 (simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_castpd_si128(a.n)); +#else + union { + simde__m128d pd; + simde__m128i si128; + } r; + r.pd
= a; + return r.si128; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_castpd_si128(a) SIMDE__M128I_TO_NATIVE(simde_mm_castpd_si128(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_castps_pd (simde__m128 a) { +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_castps_pd(a.n)); +#else + union { + simde__m128 ps; + simde__m128d pd; + } r; + r.ps = a; + return r.pd; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_castps_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_castps_pd(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_castps_si128 (simde__m128 a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_castps_si128(a.n)); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = a.neon_i32; +#else + r = *((simde__m128i*) &a); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_castps_si128(a) SIMDE__M128I_TO_NATIVE(simde_mm_castps_si128(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_castsi128_pd (simde__m128i a) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_castsi128_pd(a.n)); +#else + r = *((simde__m128d*) &a); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_castsi128_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_castsi128_pd(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_castsi128_ps (simde__m128i a) { + simde__m128 r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_castsi128_ps(a.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_f32 = a.neon_f32; +#else + r = *((simde__m128*) &a); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_castsi128_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_castsi128_ps(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpeq_epi8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = (a.i8[i] == b.i8[i]) ? ~INT8_C(0) : INT8_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmpeq_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpeq_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (a.i16[i] == b.i16[i]) ? 
~INT16_C(0) : INT16_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmpeq_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpeq_epi32(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (a.i32[i] == b.i32[i]) ? ~INT32_C(0) : INT32_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmpeq_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpeq_pd(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vreinterpretq_s32_u32(vceqq_s32(vreinterpretq_s32_f32(b.neon_f32), vreinterpretq_s32_f32(a.neon_f32))); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpeq_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpeq_sd(a.n, b.n); +#else + r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0; + r.u64[1] = a.u64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpeq_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpneq_pd(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_f32 = vreinterpretq_f32_u16(vmvnq_u16(vceqq_s16(b.neon_i16, a.neon_i16))); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpneq_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpneq_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpneq_sd(a.n, b.n); +#else + r.u64[0] = (a.f64[0] != b.f64[0]) ? 
~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpneq_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpneq_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmplt_epi8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = (a.i8[i] < b.i8[i]) ? ~INT8_C(0) : INT8_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmplt_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmplt_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmplt_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (a.i16[i] < b.i16[i]) ? ~INT16_C(0) : INT16_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmplt_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmplt_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmplt_epi32(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (a.i32[i] < b.i32[i]) ? ~INT32_C(0) : INT32_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmplt_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmplt_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmplt_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmplt_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmplt_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmplt_sd(a.n, b.n); +#else + r.u64[0] = (a.f64[0] < b.f64[0]) ? 
~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmplt_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmplt_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmple_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmple_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.u64[i] = (a.f64[i] <= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmple_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmple_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmple_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmple_sd(a.n, b.n); +#else + r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmple_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmple_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpgt_epi8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = (a.i8[i] > b.i8[i]) ? ~INT8_C(0) : INT8_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmpgt_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpgt_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (a.i16[i] > b.i16[i]) ? ~INT16_C(0) : INT16_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmpgt_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpgt_epi32(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (a.i32[i] > b.i32[i]) ? 
~INT32_C(0) : INT32_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmpgt_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpgt_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpgt_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + r.n = _mm_cmpgt_sd(a.n, b.n); +#else + r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpgt_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpge_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpge_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpge_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + r.n = _mm_cmpge_sd(a.n, b.n); +#else + r.u64[0] = (a.f64[0] >= b.f64[0]) ? 
~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpge_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpge_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpnge_pd(a.n, b.n); +#else + r = simde_mm_cmplt_pd(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnge_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpnge_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + r.n = _mm_cmpnge_sd(a.n, b.n); +#else + r = simde_mm_cmplt_sd(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnge_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpnge_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpnlt_pd(a.n, b.n); +#else + r = simde_mm_cmpge_pd(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnlt_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpnlt_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpnlt_sd(a.n, b.n); +#else + r = simde_mm_cmpge_sd(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnlt_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpnlt_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpnle_pd(a.n, b.n); +#else + r = simde_mm_cmpgt_pd(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnle_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpnle_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpnle_sd(a.n, b.n); +#else + r = simde_mm_cmpgt_sd(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnle_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpnle_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpord_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? 
~UINT64_C(0) : UINT64_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpord_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpord_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpord_sd(a.n, b.n); +#else + r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpord_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpord_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpunord_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0) : UINT64_C(0); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpunord_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpunord_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cmpunord_sd(a.n, b.n); +#else + r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? ~UINT64_C(0) : UINT64_C(0); + r.u64[1] = a.u64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cmpunord_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cmpunord_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cvtepi32_pd (simde__m128i a) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r = SIMDE__M128D_FROM_NATIVE(_mm_cvtepi32_pd(a.n)); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.f64, a.m64[0].i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = (simde_float64) a.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepi32_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_cvtepi32_pd(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtepi32_ps (simde__m128i a) { + simde__m128 r; + +#if defined(SIMDE_SSE2_NATIVE) + r = SIMDE__M128_FROM_NATIVE(_mm_cvtepi32_ps(a.n)); +#elif defined(SIMDE_SSE2_NEON) + r.neon_f32 = vcvtq_f32_s32(a.neon_i32); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.f32, a.i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = (simde_float32) a.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepi32_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_cvtepi32_ps(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtpd_epi32 (simde__m128d a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r = SIMDE__M128I_FROM_NATIVE(_mm_cvtpd_epi32(a.n)); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.m64[0].i32, a.f64); + r.m64[1] = simde_mm_setzero_si64(); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.i32[i] = (int32_t) a.f64[i]; + } 
+#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpd_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtpd_epi32(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cvtpd_pi32 (simde__m128d a) { + simde__m64 r; + +#if defined(SIMDE_SSE2_NATIVE) + r = SIMDE__M64_FROM_NATIVE(_mm_cvtpd_pi32(a.n)); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.i32, a.f64); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (int32_t) a.f64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpd_pi32(a) SIMDE__M64_TO_NATIVE(simde_mm_cvtpd_pi32(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtpd_ps (simde__m128d a) { + simde__m128 r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvtpd_ps(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.m64[0].f32, a.f64); + r.m64[1] = simde_mm_setzero_si64(); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(a.f64) / sizeof(a.f64[0])) ; i++) { + r.f32[i] = (simde_float32) a.f64[i]; + } + r.m64[1] = simde_mm_setzero_si64(); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpd_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_cvtpd_ps(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cvtpi32_pd (simde__m64 a) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvtpi32_pd(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.f64, a.i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = (simde_float64) a.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpi32_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_cvtpi32_pd(SIMDE__M64_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtps_epi32 (simde__m128 a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvtps_epi32(a.n); +#elif defined(SIMDE_SSE2_NEON) + /* The default rounding mode on SSE is 'round to even', which ArmV7 + does not support! It is supported on ARMv8 however. 
*/ + #if defined(SIMDE_ARCH_AARCH64) + r.neon_i32 = vcvtnq_s32_f32(a.neon_f32); + #else + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, a.neon_f32, vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(a.neon_f32, half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32(a.neon_f32); /* truncate to integer: [a] */ + int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32(a.neon_f32, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ + r.neon_i32 = vbslq_s32(is_delta_half, r_even, r_normal); + #endif +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.i32, a.f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (int32_t) a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtps_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtps_epi32(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cvtps_pd (simde__m128 a) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvtps_pd(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.f64, a.m64[0].f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtps_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_cvtps_pd(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde_float64 +simde_mm_cvtsd_f64 (simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + return _mm_cvtsd_f64(a.n); +#else + return a.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_cvtsd_si32 (simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_cvtsd_si32(a.n); +#else + return (int32_t) a.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int64_t +simde_mm_cvtsd_si64 (simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) + #if defined(__PGI) + return _mm_cvtsd_si64x(a.n); + #else + return _mm_cvtsd_si64(a.n); + #endif +#else + return (int32_t) a.f64[0]; +#endif +} +#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a) +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(SIMDE__M128D_FROM_NATIVE(a)) +# define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_cvtsd_ss(a.n, b.n)); +#else + simde__m128 r; + + r.f32[0] = (simde_float32) b.f64[0]; + + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i]; + } + + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsd_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cvtsd_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + 
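The definitions above give minimap2 portable stand-ins for the SSE2 arithmetic, comparison and conversion intrinsics it relies on. A quick way to exercise the emulated paths is a tiny standalone C program; the sketch below is illustrative only and not part of the patch itself. It assumes the header is reachable as "simde/x86/sse2.h" (for example with -Idebian/include) and that simde_mm_storeu_si128, which is defined further down in this same header, is available.

#include <stdio.h>
#include <stdint.h>
#include "simde/x86/sse2.h"   /* include path is an assumption; adjust -I as needed */

int main(void) {
  uint8_t a_bytes[16], b_bytes[16], sat[16], wrap[16];
  for (int i = 0; i < 16; i++) {
    a_bytes[i] = 250;
    b_bytes[i] = 10;
  }

  /* Unaligned load, mirroring _mm_loadu_si128 on x86. */
  simde__m128i a = simde_mm_loadu_si128((simde__m128i const*) a_bytes);
  simde__m128i b = simde_mm_loadu_si128((simde__m128i const*) b_bytes);

  /* Saturating unsigned add clamps 250 + 10 to 255; plain byte add wraps to 4. */
  simde_mm_storeu_si128((simde__m128i*) sat,  simde_mm_adds_epu8(a, b));
  simde_mm_storeu_si128((simde__m128i*) wrap, simde_mm_add_epi8(a, b));

  printf("saturating: %d  wrapping: %d\n", sat[0], wrap[0]);
  return 0;
}

Compiled on x86 with SSE2, these calls reduce to the native intrinsics (the SIMDE_SSE2_NATIVE branches); on architectures without SSE2 they resolve to the NEON or scalar fallbacks shown above, and defining SIMDE_ENABLE_NATIVE_ALIASES additionally restores the original _mm_* spellings.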
+SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_cvtsi128_si32 (simde__m128i a) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_cvtsi128_si32(a.n); +#elif defined(SIMDE_SSE2_NEON) + return vgetq_lane_s32(a.neon_i32, 0); +#else + return a.i32[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(SIMDE__M128I_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int64_t +simde_mm_cvtsi128_si64 (simde__m128i a) { +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) + #if defined(__PGI) + return _mm_cvtsi128_si64x(a.n); + #else + return _mm_cvtsi128_si64(a.n); + #endif +#else + return a.i64[0]; +#endif +} +#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a) +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(SIMDE__M128I_FROM_NATIVE(a)) +# define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(SIMDE__M128I_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvtsi32_sd(a.n, b); +#else + r.f64[0] = (simde_float64) b; + r.i64[1] = a.i64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi32_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cvtsi32_sd(SIMDE__M128D_FROM_NATIVE(a), b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtsi32_si128 (int32_t a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvtsi32_si128(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0); +#else + r.i32[0] = a; + r.i32[1] = 0; + r.i32[2] = 0; + r.i32[3] = 0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi32_si128(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtsi32_si128(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) + #if !defined(__PGI) + r.n = _mm_cvtsi64_sd(a.n, b); + #else + r.n = _mm_cvtsi64x_sd(a.n, b); + #endif +#else + r.f64[0] = (simde_float64) b; + r.f64[1] = a.f64[1]; +#endif + + return r; +} +#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b) +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi64_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cvtsi64_sd(SIMDE__M128D_FROM_NATIVE(a), b)) +# define _mm_cvtsi64x_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_cvtsi64x_sd(SIMDE__M128D_FROM_NATIVE(a), b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtsi64_si128 (int64_t a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) + #if !defined(__PGI) + r.n = _mm_cvtsi64_si128(a); + #else + r.n = _mm_cvtsi64x_si128(a); + #endif +#else + r.i64[0] = a; + r.i64[1] = 0; +#endif + + return r; +} +#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a) +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi64_si128(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtsi64_si128(a)) +# define _mm_cvtsi64x_si128(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtsi64x_si128(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvtss_sd(a.n, b.n); +#else + r.f64[0] = b.f32[0]; + r.i64[1] = a.i64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvtss_sd(a, b) 
SIMDE__M128D_TO_NATIVE(simde_mm_cvtss_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvttpd_epi32 (simde__m128d a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvttpd_epi32(a.n); +#else + for (size_t i = 0 ; i < (sizeof(a.f64) / sizeof(a.f64[0])) ; i++) { + r.i32[i] = SIMDE_CONVERT_FTOI(int32_t, trunc(a.f64[i])); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvttpd_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvttpd_epi32(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cvttpd_pi32 (simde__m128d a) { + simde__m64 r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvttpd_pi32(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.i32, a.f64); +#else + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = SIMDE_CONVERT_FTOI(int32_t, trunc(a.f64[i])); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvttpd_pi32(a) SIMDE__M64_TO_NATIVE(simde_mm_cvttpd_pi32(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvttps_epi32 (simde__m128 a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_cvttps_epi32(a.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vcvtq_s32_f32(a.neon_f32); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.i32, a.f32); +#else + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = SIMDE_CONVERT_FTOI(int32_t, truncf(a.f32[i])); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvttps_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvttps_epi32(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_cvttsd_si32 (simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_cvttsd_si32(a.n); +#else + return SIMDE_CONVERT_FTOI(int32_t, trunc(a.f64[0])); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int64_t +simde_mm_cvttsd_si64 (simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) + #if !defined(__PGI) + return _mm_cvttsd_si64(a.n); + #else + return _mm_cvttsd_si64x(a.n); + #endif +#else + return SIMDE_CONVERT_FTOI(int64_t, trunc(a.f64[0])); +#endif +} +#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a) +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(SIMDE__M128D_FROM_NATIVE(a)) +# define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_div_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_div_pd(a.n, b.n); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.f64 = a.f64 / b.f64; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = a.f64[i] / b.f64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_div_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_div_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_div_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_div_sd(a.n, b.n); +#else + r.f64[0] = a.f64[0] / b.f64[0]; + 
r.f64[1] = a.f64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_div_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_div_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_extract_epi16 (simde__m128i a, const int imm8) { + return a.u16[imm8 & 7]; +} +#if defined(SIMDE_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0)) +# define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_extract_epi16(a, imm8) (vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t) UINT32_C(0x0000ffff))) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(SIMDE__M128I_FROM_NATIVE(a), imm8) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_insert_epi16 (simde__m128i a, int32_t i, const int imm8) { + a.i16[imm8 & 7] = HEDLEY_STATIC_CAST(int16_t, i); + return a; +} +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) +# define simde_mm_insert_epi16(a, i, imm8) SIMDE__M128I_FROM_NATIVE(_mm_insert_epi16((a).n, (i), (imm8))) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_insert_epi16(a, i, imm8) SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8))) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_insert_epi16(a, i, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_insert_epi16(SIMDE__M128I_FROM_NATIVE(a), i, imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { + simde__m128d r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_load_pd(mem_addr); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u32 = vld1q_u32((uint32_t const*) mem_addr); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(&r, mem_addr, sizeof(r)); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_load_pd(mem_addr) SIMDE__M128D_TO_NATIVE(simde_mm_load_pd(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_load_pd1 (simde_float64 const* mem_addr) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_load_pd1(mem_addr); +#else + r.f64[0] = *mem_addr; + r.f64[1] = *mem_addr; +#endif + + return r; +} +#define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr) +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_load_pd1(mem_addr) SIMDE__M128D_TO_NATIVE(simde_mm_load_pd1(mem_addr)) +# define _mm_load1_pd(mem_addr) SIMDE__M128D_TO_NATIVE(simde_mm_load1_pd(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_load_sd (simde_float64 const* mem_addr) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_load_sd(mem_addr); +#else + memcpy(&r, mem_addr, sizeof(simde_float64)); + r.u64[1] = 0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_load_sd(mem_addr) SIMDE__M128D_TO_NATIVE(simde_mm_load_sd(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_load_si128 (simde__m128i const* mem_addr) { + simde__m128i r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_load_si128(&(mem_addr->n)); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vld1q_s32((int32_t const*) mem_addr); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(&r, mem_addr, sizeof(r)); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_load_si128(mem_addr) 
SIMDE__M128I_TO_NATIVE(simde_mm_load_si128(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadh_pd(a.n, mem_addr); +#else + simde_float64 t; + memcpy(&t, mem_addr, sizeof(t)); + r.f64[0] = a.f64[0]; + r.f64[1] = t; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_loadh_pd(a, mem_addr) SIMDE__M128D_TO_NATIVE(simde_mm_loadh_pd(SIMDE__M128D_FROM_NATIVE(a), mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_loadl_epi64 (simde__m128i const* mem_addr) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadl_epi64(&mem_addr->n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vcombine_s32(vld1_s32((int32_t const *) mem_addr), vcreate_s32(0)); +#else + r.u64[0] = mem_addr->u64[0]; + r.u64[1] = 0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_loadl_epi64(mem_addr) SIMDE__M128I_TO_NATIVE(simde_mm_loadl_epi64(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadl_pd(a.n, mem_addr); +#else + memcpy(&r, mem_addr, sizeof(simde_float64)); + r.u64[1] = a.u64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_loadl_pd(a, mem_addr) SIMDE__M128D_TO_NATIVE(simde_mm_loadl_pd(SIMDE__M128D_FROM_NATIVE(a), mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { + simde__m128d r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadr_pd(mem_addr); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + r.f64[0] = mem_addr[1]; + r.f64[1] = mem_addr[0]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_loadr_pd(mem_addr) SIMDE__M128D_TO_NATIVE(simde_mm_loadr_pd(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadu_pd(mem_addr); +#else + simde_float64 l, h; + memcpy(&l, &mem_addr[0], sizeof(l)); + memcpy(&h, &mem_addr[1], sizeof(h)); + r.f64[0] = l; + r.f64[1] = h; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_loadu_pd(mem_addr) SIMDE__M128D_TO_NATIVE(simde_mm_loadu_pd(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_loadu_si128 (simde__m128i const* mem_addr) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_loadu_si128(&((*mem_addr).n)); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vld1q_s32((int32_t const*) mem_addr); +#else + memcpy(&r, mem_addr, sizeof(r)); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_loadu_si128(mem_addr) SIMDE__M128I_TO_NATIVE(simde_mm_loadu_si128(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_madd_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int32x4_t pl = vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16)); + int32x4_t ph = vmull_s16(vget_high_s16(a.neon_i16), vget_high_s16(b.neon_i16)); + int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); + int32x2_t rh = 
vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); + r.neon_i32 = vcombine_s32(rl, rh); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r) / sizeof(r.i16[0])) ; i += 2) { + r.i32[i / 2] = (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_madd_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_madd_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_maskmoveu_si128(a.n, mask.n, HEDLEY_REINTERPRET_CAST(char*, mem_addr)); +#else + for (size_t i = 0 ; i < 16 ; i++) { + if (mask.u8[i] & 0x80) { + mem_addr[i] = a.i8[i]; + } + } +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(mask), mem_addr) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_movemask_epi8 (simde__m128i a) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_movemask_epi8(a.n); +#elif defined(SIMDE_SSE2_NEON) + uint8x16_t input = a.neon_u8; + SIMDE_ALIGN(16) static const int8_t xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 }; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + uint8x8_t lo = vget_low_u8(input); + uint8x8_t hi = vget_high_u8(input); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ((hi[0] << 8) | (lo[0] & 0xFF)); +#else + int32_t r = 0; + SIMDE__VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < 16 ; i++) { + r |= (a.u8[15 - i] >> 7) << (15 - i); + } + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_movemask_epi8(a) simde_mm_movemask_epi8(SIMDE__M128I_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_movemask_pd (simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_movemask_pd(a.n); +#else + int32_t r = 0; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(a.u64) / sizeof(a.u64[0])) ; i++) { + r |= (a.u64[i] >> 63) << i; + } + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_movemask_pd(a) simde_mm_movemask_pd(SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_movepi64_pi64 (simde__m128i a) { + simde__m64 r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_movepi64_pi64(a.n); +#else + r.i64[0] = a.i64[0]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_movepi64_pi64(a) SIMDE__M64_TO_NATIVE(simde_mm_movepi64_pi64(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_movpi64_epi64 (simde__m64 a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_movpi64_epi64(a.n); +#else + r.i64[0] = a.i64[0]; + r.i64[1] = 0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_movpi64_epi64(a) SIMDE__M128I_TO_NATIVE(simde_mm_movpi64_epi64(SIMDE__M64_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_min_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_min_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) 
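+  /* NEON has a native lane-wise signed 16-bit minimum, so one vminq_s16 handles all eight elements. */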
+ r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_min_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_min_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_min_epu8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_min_epu8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_min_epu8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_min_epu8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_min_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_min_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_min_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_min_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_min_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_min_sd(a.n, b.n); +#else + r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_min_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_min_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_max_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_max_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_max_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_max_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_max_epu8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_max_epu8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + r.u8[i] = (a.u8[i] > b.u8[i]) ? 
a.u8[i] : b.u8[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_max_epu8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_max_epu8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_max_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_max_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_max_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_max_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_max_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_max_sd(a.n, b.n); +#else + r.f64[0] = (a.f64[0] > b.f64[0]) ? a.f64[0] : b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_max_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_max_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_move_epi64 (simde__m128i a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_move_epi64(a.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1); +#else + r.i64[0] = a.i64[0]; + r.i64[1] = 0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_move_epi64(a) SIMDE__M128I_TO_NATIVE(simde_mm_move_epi64(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_move_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_move_sd(a.n, b.n); +#else + r.f64[0] = b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_move_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_move_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mul_epu32(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u64) / sizeof(r.u64[0])) ; i++) { + r.u64[i] = ((uint64_t) a.u32[i * 2]) * ((uint64_t) b.u32[i * 2]); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_mul_epu32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_mul_epu32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) { + simde__m128i r; + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i64[i] * b.i64[i]; + } + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) { + simde__m128i r; + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i64[i] % b.i64[i]; + } + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_mul_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mul_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = a.f64[i] * 
b.f64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_mul_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_mul_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_mul_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mul_sd(a.n, b.n); +#else + r.f64[0] = a.f64[0] * b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_mul_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_mul_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_mul_su32 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + r.n = _mm_mul_su32(a.n, b.n); +#else + r.u64[0] = ((uint64_t) a.u32[0]) * ((uint64_t) b.u32[0]); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_mul_su32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_mul_su32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mulhi_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int16x4_t a3210 = vget_low_s16(a.neon_i16); + int16x4_t b3210 = vget_low_s16(b.neon_i16); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(a.neon_i16); + int16x4_t b7654 = vget_high_s16(b.neon_i16); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + r.neon_u16 = rv.val[1]; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.u16[i] = (uint16_t) (((uint32_t) (((int32_t) a.i16[i]) * ((int32_t) b.i16[i]))) >> 16); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_mulhi_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_mulhi_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) + r.n = _mm_mulhi_epu16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = (uint16_t) ((((uint32_t) a.u16[i]) * ((uint32_t) b.u16[i])) >> 16); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_mulhi_epu16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_mulhi_epu16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_mullo_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.u16[i] = (uint16_t) (((uint32_t) (((int32_t) a.i16[i]) * ((int32_t) b.i16[i]))) & 0xffff); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_mullo_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_mullo_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_or_pd 
(simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_or_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i64[i] | b.i64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_or_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_or_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_or_si128 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_or_si128(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32f) / sizeof(r.i32f[0])) ; i++) { + r.i32f[i] = a.i32f[i] | b.i32f[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_or_si128(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_or_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_packs_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i8[i] = (a.i16[i] > INT8_MAX) ? INT8_MAX : ((a.i16[i] < INT8_MIN) ? INT8_MIN : ((int8_t) a.i16[i])); + r.i8[i + 8] = (b.i16[i] > INT8_MAX) ? INT8_MAX : ((b.i16[i] < INT8_MIN) ? INT8_MIN : ((int8_t) b.i16[i])); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_packs_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_packs_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_packs_epi32(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i16[i] = (a.i32[i] > INT16_MAX) ? INT16_MAX : ((a.i32[i] < INT16_MIN) ? INT16_MIN : ((int16_t) a.i32[i])); + r.i16[i + 4] = (b.i32[i] > INT16_MAX) ? INT16_MAX : ((b.i32[i] < INT16_MIN) ? INT16_MIN : ((int16_t) b.i32[i])); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_packs_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_packs_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_packus_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u8 = vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.u8[i] = (a.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a.i16[i] < 0) ? UINT8_C(0) : ((uint8_t) a.i16[i])); + r.u8[i + 8] = (b.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b.i16[i] < 0) ? 
UINT8_C(0) : ((uint8_t) b.i16[i])); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_packus_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_packus_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_pause (void) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_pause(); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_pause() SIMDE__M128_TO_NATIVE(simde_mm_pause()) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sad_epu8(a.n, b.n); +#else + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + uint16_t tmp = 0; + SIMDE__VECTORIZE_REDUCTION(+:tmp) + for (size_t j = 0 ; j < ((sizeof(r.u8) / sizeof(r.u8[0])) / 2) ; j++) { + const size_t e = j + (i * 8); + tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e]) : (b.u8[e] - a.u8[e]); + } + r.i64[i] = tmp; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sad_epu8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_sad_epu8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12, + int8_t e11, int8_t e10, int8_t e9, int8_t e8, + int8_t e7, int8_t e6, int8_t e5, int8_t e4, + int8_t e3, int8_t e2, int8_t e1, int8_t e0) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, + e7, e6, e5, e4, e3, e2, e1, e0); +#else + r.i8[ 0] = e0; + r.i8[ 1] = e1; + r.i8[ 2] = e2; + r.i8[ 3] = e3; + r.i8[ 4] = e4; + r.i8[ 5] = e5; + r.i8[ 6] = e6; + r.i8[ 7] = e7; + r.i8[ 8] = e8; + r.i8[ 9] = e9; + r.i8[10] = e10; + r.i8[11] = e11; + r.i8[12] = e12; + r.i8[13] = e13; + r.i8[14] = e14; + r.i8[15] = e15; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) SIMDE__M128I_TO_NATIVE(simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4, + int16_t e3, int16_t e2, int16_t e1, int16_t e0) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + SIMDE_ALIGN(16) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; + r.neon_i16 = vld1q_s16(data); +#else + r.i16[0] = e0; + r.i16[1] = e1; + r.i16[2] = e2; + r.i16[3] = e3; + r.i16[4] = e4; + r.i16[5] = e5; + r.i16[6] = e6; + r.i16[7] = e7; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) SIMDE__M128I_TO_NATIVE(simde_mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi32(e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + SIMDE_ALIGN(16) int32_t data[4] = { e0, e1, e2, e3 }; + r.neon_i32 = vld1q_s32(data); +#else + r.i32[0] = e0; + r.i32[1] = e1; + r.i32[2] = e2; + r.i32[3] = e3; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set_epi32(e3, e2, e1, e0) SIMDE__M128I_TO_NATIVE(simde_mm_set_epi32(e3, e2, e1, e0)) +#endif + 
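+/* Usage sketch (illustrative only, not part of upstream SIMDe): the simde_mm_set_*
+ * constructors take their arguments from the highest element down to element 0,
+ * matching the Intel intrinsics, while the simde_mm_setr_* variants further below
+ * take them in memory order.  The function name here is made up for the example and
+ * the block is compiled out, so the header's behaviour is unchanged. */
+#if 0
+static int example_set_epi32_lane_order(void) {
+  simde__m128i v = simde_mm_set_epi32(3, 2, 1, 0);
+  /* the last argument lands in lane 0 */
+  return (v.i32[0] == 0) && (v.i32[3] == 3);
+}
+#endif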
+SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi64(e1.n, e0.n); +#else + r.i64[0] = e0.i64[0]; + r.i64[1] = e1.i64[0]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set_epi64(e1, e0) SIMDE__M128I_TO_NATIVE(simde_mm_set_epi64(SIMDE__M64_FROM_NATIVE(e1), SIMDE__M64_FROM_NATIVE(e0))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set_epi64x (int64_t e1, int64_t e0) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_epi64x(e1, e0); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1)); +#else + r.i64[0] = e0; + r.i64[1] = e1; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set_epi64x(e1, e0) SIMDE__M128I_TO_NATIVE(simde_mm_set_epi64x(e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, + uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8, + uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, + uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { + simde__m128i r; + + r.u8[ 0] = e0; + r.u8[ 1] = e1; + r.u8[ 2] = e2; + r.u8[ 3] = e3; + r.u8[ 4] = e4; + r.u8[ 5] = e5; + r.u8[ 6] = e6; + r.u8[ 7] = e7; + r.u8[ 8] = e8; + r.u8[ 9] = e9; + r.u8[10] = e10; + r.u8[11] = e11; + r.u8[12] = e12; + r.u8[13] = e13; + r.u8[14] = e14; + r.u8[15] = e15; + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, + uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { + simde__m128i r; + + r.u16[0] = e0; + r.u16[1] = e1; + r.u16[2] = e2; + r.u16[3] = e3; + r.u16[4] = e4; + r.u16[5] = e5; + r.u16[6] = e6; + r.u16[7] = e7; + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { + simde__m128i r; + + r.u32[0] = e0; + r.u32[1] = e1; + r.u32[2] = e2; + r.u32[3] = e3; + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) { + simde__m128i r; + + r.u64[0] = e0; + r.u64[1] = e1; + + return r; +} + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_set_pd (simde_float64 e1, simde_float64 e0) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_pd(e1, e0); +#else + r.f64[0] = e0; + r.f64[1] = e1; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set_pd(e1, e0) SIMDE__M128D_TO_NATIVE(simde_mm_set_pd(e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_set_pd1 (simde_float64 a) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_pd(a); +#else + r.f64[0] = a; + r.f64[1] = a; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set_pd1(a) SIMDE__M128D_TO_NATIVE(simde_mm_set_pd1(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_set_sd (simde_float64 a) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set_sd(a); +#else + r.f64[0] = a; + r.u64[1] = 0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set_sd(a) SIMDE__M128D_TO_NATIVE(simde_mm_set_sd(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set1_epi8 (int8_t a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi8(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vdupq_n_s8(a); +#else + 
SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = a; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set1_epi8(a) SIMDE__M128I_TO_NATIVE(simde_mm_set1_epi8(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set1_epi16 (int16_t a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi16(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vdupq_n_s16(a); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set1_epi16(a) SIMDE__M128I_TO_NATIVE(simde_mm_set1_epi16(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set1_epi32 (int32_t a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi32(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vdupq_n_s32(a); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set1_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_set1_epi32(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set1_epi64x (int64_t a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi64x(a); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vmovq_n_s64(a); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set1_epi64x(a) SIMDE__M128I_TO_NATIVE(simde_mm_set1_epi64x(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_set1_epi64 (simde__m64 a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_epi64(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i64[0]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set1_epi64(a) SIMDE__M128I_TO_NATIVE(simde_mm_set1_epi64(SIMDE__M64_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_set1_pd (simde_float64 a) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_set1_pd(a); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.f64[i] = a; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_set1_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_set1_pd(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12, + int8_t e11, int8_t e10, int8_t e9, int8_t e8, + int8_t e7, int8_t e6, int8_t e5, int8_t e4, + int8_t e3, int8_t e2, int8_t e1, int8_t e0) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, + e7, e6, e5, e4, e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + int8_t t[] = { + e15, e14, e13, e12, e11, e10, e9, e8, + e7, e6, e5, e4, e3, e2, e1, e0 + }; + r.neon_i8 = vld1q_s8(t); +#else + r.i8[ 0] = e15; + r.i8[ 1] = e14; + r.i8[ 2] = e13; + r.i8[ 3] = e12; + r.i8[ 4] = e11; + r.i8[ 5] = e10; + r.i8[ 6] = e9; + r.i8[ 7] = e8; + r.i8[ 8] = e7; + r.i8[ 9] = e6; + r.i8[10] = e5; + r.i8[11] = e4; + r.i8[12] = e3; + r.i8[13] = e2; + r.i8[14] = e1; + r.i8[15] = e0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define 
_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) SIMDE__M128I_TO_NATIVE(simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4, + int16_t e3, int16_t e2, int16_t e1, int16_t e0) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + int16_t t[] = { + e7, e6, e5, e4, e3, e2, e1, e0 + }; + r.neon_i16 = vld1q_s16(t); +#else + r.i16[0] = e7; + r.i16[1] = e6; + r.i16[2] = e5; + r.i16[3] = e4; + r.i16[4] = e3; + r.i16[5] = e2; + r.i16[6] = e1; + r.i16[7] = e0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) SIMDE__M128I_TO_NATIVE(simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_epi32(e3, e2, e1, e0); +#elif defined(SIMDE_SSE2_NEON) + int32_t t[] = { + e3, e2, e1, e0 + }; + r.neon_i32 = vld1q_s32(t); +#else + r.i32[0] = e3; + r.i32[1] = e2; + r.i32[2] = e1; + r.i32[3] = e0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_setr_epi32(e3, e2, e1, e0) SIMDE__M128I_TO_NATIVE(simde_mm_setr_epi32(e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_epi64(e1.n, e0.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64); +#else + r.i64[0] = e1.i64[0]; + r.i64[1] = e0.i64[0]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_setr_epi64(e1, e0) SIMDE__M128I_TO_NATIVE(simde_mm_setr_epi64(SIMDE__M64_FROM_NATIVE(e1), SIMDE__M64_FROM_NATIVE(e0))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setr_pd(e1, e0); +#else + r.f64[0] = e1; + r.f64[1] = e0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_setr_pd(e1, e0) SIMDE__M128D_TO_NATIVE(simde_mm_setr_pd(e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_setzero_pd (void) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setzero_pd(); +#else + r.u64[0] = 0; + r.u64[1] = 0; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_setzero_pd() SIMDE__M128D_TO_NATIVE(simde_mm_setzero_pd()) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_setzero_si128 (void) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_setzero_si128(); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vdupq_n_s32(0); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32f) / sizeof(r.i32f[0])) ; i++) { + r.i32f[i] = 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_setzero_si128() SIMDE__M128I_TO_NATIVE(simde_mm_setzero_si128()) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_shuffle_epi32 (simde__m128i a, const int imm8) { + simde__m128i r; + + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3]; + } + + return r; +} +#if 
defined(SIMDE_SSE2_NATIVE) +# define simde_mm_shuffle_epi32(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_shuffle_epi32((a).n, (imm8))) +#elif defined(SIMDE__SHUFFLE_VECTOR) +# define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ + const simde__m128i simde__tmp_a_ = a; \ + (simde__m128i) { .i32 = \ + SIMDE__SHUFFLE_VECTOR(32, 16, \ + (simde__tmp_a_).i32, \ + (simde__tmp_a_).i32, \ + ((imm8) ) & 3, \ + ((imm8) >> 2) & 3, \ + ((imm8) >> 4) & 3, \ + ((imm8) >> 6) & 3) }; })) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_shuffle_epi32(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_shuffle_epi32(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8) { + simde__m128d r; + + r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1]; + r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1]; + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) +# define simde_mm_shuffle_pd(a, b, imm8) SIMDE__M128D_FROM_NATIVE(_mm_shuffle_pd((a).n, (b).n, (imm8))) +#elif defined(SIMDE__SHUFFLE_VECTOR) +# define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \ + (simde__m128d) { .f64 = \ + SIMDE__SHUFFLE_VECTOR(64, 16, \ + (a).f64, \ + (b).f64, \ + (((imm8) ) & 1), \ + (((imm8) >> 1) & 1) + 2) }; })) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_shuffle_pd(a, b, imm8) SIMDE__M128D_TO_NATIVE(simde_mm_shuffle_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8) { + simde__m128i r; + + r.i64[0] = a.i64[0]; + for (size_t i = 4 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4]; + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_shufflehi_epi16(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_shufflehi_epi16((a).n, (imm8))) +#elif defined(SIMDE__SHUFFLE_VECTOR) +# define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ + const simde__m128i simde__tmp_a_ = a; \ + (simde__m128i) { .i16 = \ + SIMDE__SHUFFLE_VECTOR(16, 16, \ + (simde__tmp_a_).i16, \ + (simde__tmp_a_).i16, \ + 0, 1, 2, 3, \ + (((imm8) ) & 3) + 4, \ + (((imm8) >> 2) & 3) + 4, \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }; })) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_shufflehi_epi16(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_shufflehi_epi16(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8) { + simde__m128i r; + + for (size_t i = 0 ; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2) ; i++) { + r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)]; + } + r.i64[1] = a.i64[1]; + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_shufflelo_epi16(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_shufflelo_epi16((a).n, (imm8))) +#elif defined(SIMDE__SHUFFLE_VECTOR) +# define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \ + const simde__m128i simde__tmp_a_ = a; \ + (simde__m128i) { .i16 = \ + SIMDE__SHUFFLE_VECTOR(16, 16, \ + (simde__tmp_a_).i16, \ + (simde__tmp_a_).i16, \ + (((imm8) ) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3), \ + (((imm8) >> 6) & 3), \ + 4, 5, 6, 7) }; })) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_shufflelo_epi16(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_shufflelo_epi16(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES 
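+/* The simde_mm_sll_epi16/32/64 helpers below shift each element left by the run-time
+ * count held in count.u64[0]; a count of the element width or more gives an all-zero
+ * result, as the SSE2 instructions do. */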
+simde__m128i +simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sll_epi16(a.n, count.n); +#else + if (count.u64[0] > 15) + return simde_mm_setzero_si128(); + const int s = (int) (count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = (uint16_t) (a.u16[i] << s); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sll_epi16(a, count) SIMDE__M128I_TO_NATIVE(simde_mm_sll_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sll_epi32(a.n, count.n); +#else + if (count.u64[0] > 31) + return simde_mm_setzero_si128(); + const int s = (int) (count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] << s; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sll_epi32(a, count) SIMDE__M128I_TO_NATIVE(simde_mm_sll_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sll_epi64(a.n, count.n); +#else + if (HEDLEY_UNLIKELY(count.u64[0] > 63)) + return simde_mm_setzero_si128(); + const int s = (int) (count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i64[i] << s; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sll_epi64(a, count) SIMDE__M128I_TO_NATIVE(simde_mm_sll_epi64(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_sqrt_pd (simde__m128d a) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sqrt_pd(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = sqrt(a.f64[i]); + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sqrt_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_sqrt_pd(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sqrt_sd(a.n, b.n); +#else + r.f64[0] = sqrt(b.f64[0]); + r.f64[1] = a.f64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sqrt_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_sqrt_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_srl_epi16(a.n, count.n); +#else + if (count.u64[0] > 15) + return simde_mm_setzero_si128(); + const int s = (int) (count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = a.u16[i] >> s; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_srl_epi16(a, count) SIMDE__M128I_TO_NATIVE(simde_mm_srl_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_srl_epi32 (simde__m128i a, 
simde__m128i count) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_srl_epi32(a.n, count.n); +#else + if (count.u64[0] > 31) + return simde_mm_setzero_si128(); + const int s = (int) (count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u32) / sizeof(r.u32[0])) ; i++) { + r.u32[i] = a.u32[i] >> s; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_srl_epi32(a, count) SIMDE__M128I_TO_NATIVE(simde_mm_srl_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_srl_epi64(a.n, count.n); +#else + if (count.u64[0] > 31) + return simde_mm_setzero_si128(); + const int s = (int) (count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u64) / sizeof(r.u64[0])) ; i++) { + r.u64[i] = a.u64[i] >> s; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_srl_epi64(a, count) SIMDE__M128I_TO_NATIVE(simde_mm_srl_epi64(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_srai_epi16 (simde__m128i a, const int imm8) { + simde__m128i r; + const int cnt = imm8 > 15 ? 15 : imm8; + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r) / sizeof(r.u16[0])) ; i++) { + r.i16[i] = a.i16[i] >> cnt; + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_srai_epi16(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_srai_epi16((a).n, (imm8))) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_srai_epi16(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_srai_epi16(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_srai_epi32 (simde__m128i a, int imm8) { + simde__m128i r; + const int cnt = imm8 > 31 ? 31 : imm8; + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r) / sizeof(r.u32[0])) ; i++) { + r.i32[i] = a.i32[i] >> cnt; + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_srai_epi32(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_srai_epi32((a).n, (imm8))) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_srai_epi32(a, imm8) SIMDE__M128I_NEON_C(i32, ((imm8) <= 0) ? (a.neon_i32) : (((imm8) > 31) ? (vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), 16)) : (vshrq_n_s32(a.neon_i32, (imm8))))) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_srai_epi32(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_srai_epi32(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sra_epi16(a.n, count.n); +#else + const int cnt = (int) (count.i64[0] > 15 ? 15 : count.i64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.i16[i] >> cnt; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sra_epi16(a, count) SIMDE__M128I_TO_NATIVE(simde_mm_sra_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32) + r.n = _mm_sra_epi32(a.n, count.n); +#else + const int cnt = count.u64[0] > 31 ? 
31 : HEDLEY_STATIC_CAST(int, count.u64[0]); + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] >> cnt; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sra_epi32(a, count) SIMDE__M128I_TO_NATIVE(simde_mm_sra_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(count))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_slli_epi16 (simde__m128i a, const int imm8) { + simde__m128i r; + + const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = HEDLEY_STATIC_CAST(int16_t, a.i16[i] << s); + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_slli_epi16(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_slli_epi16(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_slli_epi16(a, imm8) \ + SIMDE__M128I_NEON_C(i16, ((imm8) <= 0) ? ((a).neon_i16) : (((imm8) > 31) ? (vdupq_n_s16(0)) : (vshlq_n_s16((a).neon_i16, (imm8))))) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_slli_epi16(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_slli_epi16(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_slli_epi32 (simde__m128i a, const int imm8) { + simde__m128i r; + + const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0 : imm8; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] << s; + } + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_slli_epi32(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_slli_epi32(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_slli_epi32(a, imm8) \ + SIMDE__M128I_NEON_C(i32, ((imm8) <= 0) ? ((a).neon_i32) : (((imm8) > 31) ? (vdupq_n_s32(0)) : (vshlq_n_s32((a).neon_i32, (imm8))))) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_slli_epi32(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_slli_epi32(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_slli_epi64 (simde__m128i a, const int imm8) { + simde__m128i r; + + const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r.i64[0]) * CHAR_BIT) - 1) ? 0 : imm8; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i64[i] << s; + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_slli_epi64(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_slli_epi64(a.n, imm8)) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_slli_epi64(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_slli_epi64(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_srli_epi16 (simde__m128i a, const int imm8) { + simde__m128i r; + + const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.u16[i] = a.u16[i] >> s; + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_srli_epi16(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_srli_epi16(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_srli_epi16(a, imm8) \ + SIMDE__M128I_NEON_C(u16, ((imm8) <= 0) ? ((a).neon_u16) : (((imm8) > 31) ? 
(vdupq_n_u16(0)) : (vshrq_n_u16((a).neon_u16, (imm8))))) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_srli_epi16(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_srli_epi16(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_srli_epi32 (simde__m128i a, const int imm8) { + simde__m128i r; + + const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0 : imm8; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.u32[i] = a.u32[i] >> s; + } + + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_srli_epi32(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_srli_epi32(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_srli_epi32(a, imm8) \ + SIMDE__M128I_NEON_C(u32, ((imm8) <= 0) ? ((a).neon_u32) : (((imm8) > 31) ? (vdupq_n_u32(0)) : (vshrq_n_u32((a).neon_u32, (imm8))))) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_srli_epi32(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_srli_epi32(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_srli_epi64 (simde__m128i a, const int imm8) { + simde__m128i r; + const unsigned char s = imm8&255; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + if (s > 63) { + r.u64[i] = 0; + } else { + r.u64[i] = a.u64[i] >> s; + } + } + return r; +} +#if defined(SIMDE_SSE2_NATIVE) +# define simde_mm_srli_epi64(a, imm8) SIMDE__M128I_FROM_NATIVE(_mm_srli_epi64(a.n, imm8)) +#elif defined(SIMDE_SSE2_NEON) +# define simde_mm_srli_epi64(a, imm8) \ + SIMDE__M128I_NEON_C(u64, (((imm8)&255) < 0 || ((imm8)&255) > 63) ? (vdupq_n_u64(0)) : ((((imm8)&255) == 0) ? (a.neon_u64) : (vshrq_n_u64((a).neon_u64, (imm8)&255)))) +#endif +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_srli_epi64(a, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_srli_epi64(SIMDE__M128I_FROM_NATIVE(a), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + _mm_store_pd(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_store_pd(mem_addr, a) simde_mm_store_pd(mem_addr, SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + _mm_store1_pd(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + mem_addr[0] = a.f64[0]; + mem_addr[1] = a.f64[0]; +#endif +} +#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a) +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(mem_addr, SIMDE__M128D_FROM_NATIVE(a)) +# define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(mem_addr, SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_store_sd(mem_addr, a.n); +#else + memcpy(mem_addr, &a, sizeof(a.f64[0])); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_store_sd(mem_addr, a) SIMDE__M128D_TO_NATIVE(simde_mm_store_sd(mem_addr, SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_store_si128 (simde__m128i* 
mem_addr, simde__m128i a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_store_si128(&mem_addr->n, a.n); +#elif defined(SIMDE_SSE2_NEON) + vst1q_s32((int32_t*) mem_addr, a.neon_i32); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, SIMDE__M128I_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_storeh_pd(mem_addr, a.n); +#else + *mem_addr = a.f64[1]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(mem_addr, SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_storel_epi64(&(mem_addr->n), a.n); +#elif defined(SIMDE_SSE2_NEON) + mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0); +#else + mem_addr->i64[0] = a.i64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, SIMDE__M128I_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_storel_pd(mem_addr, a.n); +#else + *mem_addr = a.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(mem_addr, SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) { + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE2_NATIVE) + _mm_storer_pd(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + mem_addr[0] = a.f64[1]; + mem_addr[1] = a.f64[0]; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(mem_addr, SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_storeu_pd(mem_addr, a.n); +#else + memcpy(mem_addr, &a, sizeof(a)); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(mem_addr, SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storeu_si128 (simde__m128i* mem_addr, simde__m128i a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_storeu_si128(&mem_addr->n, a.n); +#elif defined(SIMDE_SSE2_NEON) + int32_t v[4]; + vst1q_s32(v, a.neon_i32); + memcpy(mem_addr, v, sizeof(v)); +#else + memcpy(mem_addr, &a, sizeof(a)); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, SIMDE__M128I_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_stream_pd(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(mem_addr, SIMDE__M128D_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_stream_si128(&mem_addr->n, a.n); +#else + 
SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, SIMDE__M128I_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_stream_si32(mem_addr, a); +#else + *mem_addr = a; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) { +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) + #if \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,3,0)) && \ + (!defined(HEDLEY_PGI_VERSION)) + _mm_stream_si64(HEDLEY_REINTERPRET_CAST(long long*, mem_addr), a); + #else + *mem_addr = a; + #endif +#else + *mem_addr = a; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(mem_addr, a) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sub_epi8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vsubq_s8(a.neon_i8, b.neon_i8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = a.i8[i] - b.i8[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sub_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_sub_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sub_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vsubq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.i16[i] - b.i16[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sub_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_sub_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sub_epi32(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = vsubq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] - b.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sub_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_sub_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sub_epi64(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i64 = vsubq_s64(a.neon_i64, b.neon_i64); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i64[i] - b.i64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sub_epi64(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_sub_epi64(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES 
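+/* Double-precision subtraction: simde_mm_sub_pd below operates on both lanes, while
+ * simde_mm_sub_sd subtracts only in lane 0 and passes lane 1 through from a. */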
+simde__m128d +simde_mm_sub_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sub_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = a.f64[i] - b.f64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sub_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_sub_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_sub_sd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_sub_sd(a.n, b.n); +#else + r.f64[0] = a.f64[0] - b.f64[0]; + r.f64[1] = a.f64[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sub_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_sub_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sub_si64 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_SSE2_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sub_si64(a.n, b.n)); +#else + simde__m64 r; + r.i64[0] = a.i64[0] - b.i64[0]; + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_sub_si64(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sub_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_subs_epi8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i8 = vqsubq_s8(a.neon_i8, b.neon_i8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r) / sizeof(r.i8[0])) ; i++) { + if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) { + r.i8[i] = INT8_MIN; + } else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) { + r.i8[i] = INT8_MAX; + } else { + r.i8[i] = (a.i8[i]) - (b.i8[i]); + } + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_subs_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_subs_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_subs_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i16 = vqsubq_s16(a.neon_i16, b.neon_i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r) / sizeof(r.i16[0])) ; i++) { + if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) { + r.i16[i] = INT16_MIN; + } else if ((b.i16[i]) < 0 && (a.i16[i]) > INT16_MAX + (b.i16[i])) { + r.i16[i] = INT16_MAX; + } else { + r.i16[i] = (a.i16[i]) - (b.i16[i]); + } + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_subs_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_subs_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_subs_epu8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u8 = vqsubq_u8(a.neon_u8, b.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r) / sizeof(r.i8[0])) ; i++) { + const int32_t x = a.u8[i] - b.u8[i]; + if (x < 0) { + r.u8[i] = 0; + } else if (x > UINT8_MAX) { + r.u8[i] = UINT8_MAX; + } else { + r.u8[i] = (uint8_t) x; + } + } +#endif + + return r; +} +#if 
defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_subs_epu8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_subs_epu8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_subs_epu16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_u16 = vqsubq_u16(a.neon_u16, b.neon_u16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r) / sizeof(r.i16[0])) ; i++) { + const int32_t x = a.u16[i] - b.u16[i]; + if (x < 0) { + r.u16[i] = 0; + } else if (x > UINT16_MAX) { + r.u16[i] = UINT16_MAX; + } else { + r.u16[i] = (uint16_t) x; + } + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_subs_epu16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_subs_epu16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomieq_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] == b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomige_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] >= b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomigt_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] > b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomile_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] <= b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomilt_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] < b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE2_NATIVE) + return _mm_ucomineq_sd(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f64[0] != 
b.f64[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)) +#endif + +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_undefined_pd (void) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) + r.n = _mm_undefined_pd(); +#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + r = simde_mm_setzero_pd(); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_undefined_pd() SIMDE__M128D_TO_NATIVE(simde_mm_undefined_pd()) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_undefined_si128 (void) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) + r.n = _mm_undefined_si128(); +#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + r = simde_mm_setzero_si128(); +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_undefined_si128() SIMDE__M128I_TO_NATIVE(simde_mm_undefined_si128()) +#endif + +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + HEDLEY_DIAGNOSTIC_POP +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_lfence (void) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_lfence(); +#else + simde_mm_sfence(); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_lfence() simde_mm_lfence() +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_mfence (void) { +#if defined(SIMDE_SSE2_NATIVE) + _mm_mfence(); +#else + simde_mm_sfence(); +#endif +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_mfence() simde_mm_mfence() +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_unpackhi_epi8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16)); + int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16)); + int8x8x2_t result = vzip_s8(a1, b1); + r.neon_i8 = vcombine_s8(result.val[0], result.val[1]); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i8 = SIMDE__SHUFFLE_VECTOR(8, 16, a.i8, b.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.i8[0])) / 2) ; i++) { + r.i8[(i * 2)] = a.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)]; + r.i8[(i * 2) + 1] = b.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpackhi_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_unpackhi_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_unpackhi_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int16x4_t a1 = vget_high_s16(a.neon_i16); + int16x4_t b1 = vget_high_s16(b.neon_i16); + int16x4x2_t result = vzip_s16(a1, b1); + r.neon_i16 = vcombine_s16(result.val[0], result.val[1]); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i16 = SIMDE__SHUFFLE_VECTOR(16, 16, a.i16, b.i16, 4, 12, 5, 13, 6, 14, 7, 15); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.i16[0])) / 2) ; i++) { + r.i16[(i * 2)] = 
a.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)]; + r.i16[(i * 2) + 1] = b.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpackhi_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_unpackhi_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_unpackhi_epi32(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int32x2_t a1 = vget_high_s32(a.neon_i32); + int32x2_t b1 = vget_high_s32(b.neon_i32); + int32x2x2_t result = vzip_s32(a1, b1); + r.neon_i32 = vcombine_s32(result.val[0], result.val[1]); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.i32, b.i32, 2, 6, 3, 7); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.i32[0])) / 2) ; i++) { + r.i32[(i * 2)] = a.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)]; + r.i32[(i * 2) + 1] = b.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpackhi_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_unpackhi_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n =_mm_unpackhi_epi64(a.n, b.n); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i64 = SIMDE__SHUFFLE_VECTOR(64, 16, a.i64, b.i64, 1, 3); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.i64[0])) / 2) ; i++) { + r.i64[(i * 2)] = a.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)]; + r.i64[(i * 2) + 1] = b.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpackhi_epi64(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_unpackhi_epi64(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_unpackhi_pd(a.n, b.n); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.f64 = SIMDE__SHUFFLE_VECTOR(64, 16, a.f64, b.f64, 1, 3); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.f64[0])) / 2) ; i++) { + r.f64[(i * 2)] = a.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)]; + r.f64[(i * 2) + 1] = b.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpackhi_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_unpackhi_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_unpacklo_epi8(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16)); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16)); + int8x8x2_t result = vzip_s8(a1, b1); + r.neon_i8 = vcombine_s8(result.val[0], result.val[1]); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i8 = SIMDE__SHUFFLE_VECTOR(8, 16, a.i8, b.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.i8[0])) / 2) ; i++) { + r.i8[(i * 2)] = a.i8[i]; + r.i8[(i * 
2) + 1] = b.i8[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpacklo_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_unpacklo_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_unpacklo_epi16(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int16x4_t a1 = vget_low_s16(a.neon_i16); + int16x4_t b1 = vget_low_s16(b.neon_i16); + int16x4x2_t result = vzip_s16(a1, b1); + r.neon_i16 = vcombine_s16(result.val[0], result.val[1]); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i16 = SIMDE__SHUFFLE_VECTOR(16, 16, a.i16, b.i16, 0, 8, 1, 9, 2, 10, 3, 11); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.i16[0])) / 2) ; i++) { + r.i16[(i * 2)] = a.i16[i]; + r.i16[(i * 2) + 1] = b.i16[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpacklo_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_unpacklo_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_unpacklo_epi32(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + int32x2_t a1 = vget_low_s32(a.neon_i32); + int32x2_t b1 = vget_low_s32(b.neon_i32); + int32x2x2_t result = vzip_s32(a1, b1); + r.neon_i32 = vcombine_s32(result.val[0], result.val[1]); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.i32, b.i32, 0, 4, 1, 5); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.i32[0])) / 2) ; i++) { + r.i32[(i * 2)] = a.i32[i]; + r.i32[(i * 2) + 1] = b.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpacklo_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_unpacklo_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_unpacklo_epi64(a.n, b.n); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.i64 = SIMDE__SHUFFLE_VECTOR(64, 16, a.i64, b.i64, 0, 2); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.i64[0])) / 2) ; i++) { + r.i64[(i * 2)] = a.i64[i]; + r.i64[(i * 2) + 1] = b.i64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpacklo_epi64(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_unpacklo_epi64(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n =_mm_unpacklo_pd(a.n, b.n); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.f64 = SIMDE__SHUFFLE_VECTOR(64, 16, a.f64, b.f64, 0, 2); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r) / sizeof(r.f64[0])) / 2) ; i++) { + r.f64[(i * 2)] = a.f64[i]; + r.f64[(i * 2) + 1] = b.f64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_unpacklo_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_unpacklo_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_xor_pd (simde__m128d a, simde__m128d b) { + simde__m128d r; + +#if defined(SIMDE_SSE2_NATIVE) + 
r.n = _mm_xor_pd(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i64[i] ^ b.i64[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_xor_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_xor_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_xor_si128 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NATIVE) + r.n = _mm_xor_si128(a.n, b.n); +#elif defined(SIMDE_SSE2_NEON) + r.neon_i32 = veorq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32f) / sizeof(r.i32f[0])) ; i++) { + r.i32f[i] = a.i32f[i] ^ b.i32f[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE2_ENABLE_NATIVE_ALIASES) +# define _mm_xor_si128(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_xor_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_not_si128 (simde__m128i a) { + simde__m128i r; + +#if defined(SIMDE_SSE2_NEON) + r.neon_i32 = vmvnq_s32(a.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32f) / sizeof(r.i32f[0])) ; i++) { + r.i32f[i] = ~(a.i32f[i]); + } +#endif + + return r; +} + +SIMDE__END_DECLS + +#endif /* !defined(SIMDE__SSE2_H) */ diff -Nru minimap2-2.17+dfsg/debian/include/simde/x86/sse3.h minimap2-2.17+dfsg/debian/include/simde/x86/sse3.h --- minimap2-2.17+dfsg/debian/include/simde/x86/sse3.h 1970-01-01 00:00:00.000000000 +0000 +++ minimap2-2.17+dfsg/debian/include/simde/x86/sse3.h 2020-01-12 17:22:11.000000000 +0000 @@ -0,0 +1,253 @@ +/* Copyright (c) 2017 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#if !defined(SIMDE__SSE3_H)
+# if !defined(SIMDE__SSE3_H)
+# define SIMDE__SSE3_H
+# endif
+# include "sse2.h"
+
+# if defined(SIMDE_SSE3_NATIVE)
+# undef SIMDE_SSE3_NATIVE
+# endif
+# if defined(SIMDE_SSE3_FORCE_NATIVE)
+# define SIMDE_SSE3_NATIVE
+# elif defined(__SSE3__) && (!defined(SIMDE_SSE3_NO_NATIVE) && !defined(SIMDE_NO_NATIVE))
+# define SIMDE_SSE3_NATIVE
+# elif defined(__ARM_NEON) && !defined(SIMDE_SSE3_NO_NEON) && !defined(SIMDE_NO_NEON)
+# define SIMDE_SSE3_NEON
+# endif
+
+# if defined(SIMDE_SSE3_NATIVE) && !defined(SIMDE_SSE2_NATIVE)
+# if defined(SIMDE_SSE3_FORCE_NATIVE)
+# error Native SSE3 support requires native SSE2 support
+# else
+# warning Native SSE3 support requires native SSE2 support, disabling
+# undef SIMDE_SSE3_NATIVE
+# endif
+# elif defined(SIMDE_SSE3_NEON) && !defined(SIMDE_SSE2_NEON)
+# warning SSE3 NEON support requires SSE2 NEON support, disabling
+# undef SIMDE_SSE3_NEON
+# endif
+
+# if defined(SIMDE_SSE3_NATIVE)
+# include <pmmintrin.h>
+# endif
+
+#if !defined(SIMDE_SSE3_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
+# define SIMDE_SSE3_ENABLE_NATIVE_ALIASES
+#endif
+
+SIMDE__BEGIN_DECLS
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d
+simde_mm_addsub_pd (simde__m128d a, simde__m128d b) {
+#if defined(SIMDE_SSE3_NATIVE)
+ return SIMDE__M128D_FROM_NATIVE(_mm_addsub_pd(a.n, b.n));
+#else
+ simde__m128d r;
+ for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i += 2) {
+ r.f64[ i] = a.f64[ i] - b.f64[ i];
+ r.f64[1 + i] = a.f64[1 + i] + b.f64[1 + i];
+ }
+ return r;
+#endif
+}
+#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES)
+# define _mm_addsub_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_addsub_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128
+simde_mm_addsub_ps (simde__m128 a, simde__m128 b) {
+#if defined(SIMDE_SSE3_NATIVE)
+ return SIMDE__M128_FROM_NATIVE(_mm_addsub_ps(a.n, b.n));
+#else
+ return simde_mm_add_ps(a, simde_mm_mul_ps(simde_mm_set_ps( 1.0f, -1.0f, 1.0f, -1.0f), b));
+#endif
+}
+#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES)
+# define _mm_addsub_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_addsub_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b)))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d
+simde_mm_hadd_pd (simde__m128d a, simde__m128d b) {
+#if defined(SIMDE_SSE3_NATIVE)
+ return SIMDE__M128D_FROM_NATIVE(_mm_hadd_pd(a.n, b.n));
+#else
+ simde__m128d r;
+ r.f64[0] = a.f64[0] + a.f64[1];
+ r.f64[1] = b.f64[0] + b.f64[1];
+ return r;
+#endif
+}
+#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES)
+# define _mm_hadd_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_hadd_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b)))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128
+simde_mm_hadd_ps (simde__m128 a, simde__m128 b) {
+#if defined(SIMDE_SSE3_NATIVE)
+ return SIMDE__M128_FROM_NATIVE(_mm_hadd_ps(a.n, b.n));
+#elif defined(SIMDE_SSE3_NEON)
+ #if defined(SIMDE_ARCH_AARCH64)
+ return SIMDE__M128_NEON_C(f32, vpaddq_f32(a.neon_f32, b.neon_f32));
+ #else
+ float32x2_t a10 = vget_low_f32(a.neon_f32);
+ float32x2_t a32 = vget_high_f32(a.neon_f32);
+ float32x2_t b10 = vget_low_f32(b.neon_f32);
+ float32x2_t b32 = vget_high_f32(b.neon_f32);
+ return SIMDE__M128_NEON_C(f32, vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+ #endif
+#else
+ simde__m128 r;
+ r.f32[0] = a.f32[0] + a.f32[1];
+ r.f32[1] = a.f32[2] + a.f32[3];
+ r.f32[2] = b.f32[0] + b.f32[1];
+ r.f32[3] = b.f32[2] + b.f32[3];
+ return r;
+#endif
+}
+#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES)
+# define
_mm_hadd_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_hadd_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_hsub_pd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE3_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_hsub_pd(a.n, b.n)); +#else + simde__m128d r; + r.f64[0] = a.f64[0] - a.f64[1]; + r.f64[1] = b.f64[0] - b.f64[1]; + return r; +#endif +} +#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hsub_pd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_hsub_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_hsub_ps (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE3_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_hsub_ps(a.n, b.n)); +#elif defined(SIMDE_SSE3_NEON) + const float32_t mp[] = { 1.0f, -1.0f, 1.0f, -1.0f }; + const float32x4_t m = vld1q_f32(mp); + + float32x4_t ap = vmulq_f32(a.neon_f32, m); + float32x4_t bp = vmulq_f32(b.neon_f32, m); + float32x2_t ax = vpadd_f32(vget_low_f32(ap), vget_high_f32(ap)); + float32x2_t bx = vpadd_f32(vget_low_f32(bp), vget_high_f32(bp)); + + return SIMDE__M128_NEON_C(f32, vcombine_f32(ax, bx)); +#else + simde__m128 r; + r.f32[0] = a.f32[0] - a.f32[1]; + r.f32[1] = a.f32[2] - a.f32[3]; + r.f32[2] = b.f32[0] - b.f32[1]; + r.f32[3] = b.f32[2] - b.f32[3]; + return r; +#endif +} +#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hsub_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_hsub_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_lddqu_si128 (simde__m128i const* mem_addr) { +#if defined(SIMDE_SSE3_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_lddqu_si128(&mem_addr->n)); +#elif defined(SIMDE_SSE3_NEON) + return SIMDE__M128I_NEON_C(i32, vld1q_s32((int32_t const*) mem_addr)); +#else + simde__m128i r; + memcpy(&r, mem_addr, sizeof(r)); + return r; +#endif +} +#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES) +# define _mm_lddqu_si128(mem_addr) SIMDE__M128I_TO_NATIVE(simde_mm_lddqu_si128(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_movedup_pd (simde__m128d a) { +#if defined(SIMDE_SSE3_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_movedup_pd(a.n)); +#else + simde__m128d r; + r.f64[0] = a.f64[0]; + r.f64[1] = a.f64[0]; + return r; +#endif +} +#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES) +# define _mm_movedup_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_movedup_pd(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_movehdup_ps (simde__m128 a) { +#if defined(SIMDE_SSE3_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_movehdup_ps(a.n)); +#else + simde__m128 r; + r.f32[0] = a.f32[1]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[3]; + r.f32[3] = a.f32[3]; + return r; +#endif +} +#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES) +# define _mm_movehdup_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_movehdup_ps(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_moveldup_ps (simde__m128 a) { +#if defined(SIMDE__SSE3_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_moveldup_ps(a.n)); +#else + simde__m128 r; + r.f32[0] = a.f32[0]; + r.f32[1] = a.f32[0]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[2]; + return r; +#endif +} +#if defined(SIMDE_SSE3_ENABLE_NATIVE_ALIASES) +# define _mm_moveldup_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_moveldup_ps(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__END_DECLS + +#endif /* !defined(SIMDE__SSE3_H) */ diff -Nru 
minimap2-2.17+dfsg/debian/include/simde/x86/sse4.1.h minimap2-2.17+dfsg/debian/include/simde/x86/sse4.1.h
--- minimap2-2.17+dfsg/debian/include/simde/x86/sse4.1.h 1970-01-01 00:00:00.000000000 +0000
+++ minimap2-2.17+dfsg/debian/include/simde/x86/sse4.1.h 2020-01-12 17:22:11.000000000 +0000
@@ -0,0 +1,1289 @@
+/* Copyright (c) 2017-2019 Evan Nemerson
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(SIMDE__SSE4_1_H)
+# if !defined(SIMDE__SSE4_1_H)
+# define SIMDE__SSE4_1_H
+# endif
+# include "ssse3.h"
+
+# if defined(SIMDE_SSE4_1_NATIVE)
+# undef SIMDE_SSE4_1_NATIVE
+# endif
+# if defined(SIMDE_SSE4_1_FORCE_NATIVE)
+# define SIMDE_SSE4_1_NATIVE
+# elif defined(__SSE4_1__) && !defined(SIMDE_SSE4_1_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+# define SIMDE_SSE4_1_NATIVE
+# elif defined(__ARM_NEON) && !defined(SIMDE_SSE4_1_NO_NEON) && !defined(SIMDE_NO_NEON)
+# define SIMDE_SSE4_1_NEON
+# endif
+
+# if defined(SIMDE_SSE4_1_NATIVE) && !defined(SIMDE_SSE3_NATIVE)
+# if defined(SIMDE_SSE4_1_FORCE_NATIVE)
+# error Native SSE4.1 support requires native SSE3 support
+# else
+# warning Native SSE4.1 support requires native SSE3 support, disabling
+# undef SIMDE_SSE4_1_NATIVE
+# endif
+# elif defined(SIMDE_SSE4_1_NEON) && !defined(SIMDE_SSE3_NEON)
+# warning SSE4.1 NEON support requires SSE3 NEON support, disabling
+# undef SIMDE_SSE4_1_NEON
+# endif
+
+# if defined(SIMDE_SSE4_1_NATIVE)
+# include <smmintrin.h>
+# else
+# if defined(SIMDE_SSE4_1_NEON)
+# include <arm_neon.h>
+# endif
+# endif
+
+SIMDE__BEGIN_DECLS
+
+#if !defined(SIMDE_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
+# define SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES
+#endif
+
+#if defined(SIMDE_SSE4_1_NATIVE)
+# define SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT
+# define SIMDE_MM_FROUND_TO_NEG_INF _MM_FROUND_TO_NEG_INF
+# define SIMDE_MM_FROUND_TO_POS_INF _MM_FROUND_TO_POS_INF
+# define SIMDE_MM_FROUND_TO_ZERO _MM_FROUND_TO_ZERO
+# define SIMDE_MM_FROUND_CUR_DIRECTION _MM_FROUND_CUR_DIRECTION
+
+# define SIMDE_MM_FROUND_RAISE_EXC _MM_FROUND_RAISE_EXC
+# define SIMDE_MM_FROUND_NO_EXC _MM_FROUND_NO_EXC
+#else
+# define SIMDE_MM_FROUND_TO_NEAREST_INT 0x00
+# define SIMDE_MM_FROUND_TO_NEG_INF 0x01
+# define SIMDE_MM_FROUND_TO_POS_INF 0x02
+# define SIMDE_MM_FROUND_TO_ZERO 0x03
+# define SIMDE_MM_FROUND_CUR_DIRECTION 0x04
+
+# define SIMDE_MM_FROUND_RAISE_EXC 0x00
+# define SIMDE_MM_FROUND_NO_EXC 0x08
+#endif
+
+#define SIMDE_MM_FROUND_NINT \
+ (SIMDE_MM_FROUND_TO_NEAREST_INT |
SIMDE_MM_FROUND_RAISE_EXC) +#define SIMDE_MM_FROUND_FLOOR \ + (SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_RAISE_EXC) +#define SIMDE_MM_FROUND_CEIL \ + (SIMDE_MM_FROUND_TO_POS_INF | SIMDE_MM_FROUND_RAISE_EXC) +#define SIMDE_MM_FROUND_TRUNC \ + (SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_RAISE_EXC) +#define SIMDE_MM_FROUND_RINT \ + (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_RAISE_EXC) +#define SIMDE_MM_FROUND_NEARBYINT \ + (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_NO_EXC) + +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _MM_FROUND_TO_NEAREST_INT SIMDE_MM_FROUND_TO_NEAREST_INT +# define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF +# define _MM_FROUND_TO_POS_INF SIMDE_MM_FROUND_TO_POS_INF +# define _MM_FROUND_TO_ZERO SIMDE_MM_FROUND_TO_ZERO +# define _MM_FROUND_CUR_DIRECTION SIMDE_MM_FROUND_CUR_DIRECTION +# define _MM_FROUND_RAISE_EXC SIMDE_MM_FROUND_RAISE_EXC +# define _MM_FROUND_NINT SIMDE_MM_FROUND_NINT +# define _MM_FROUND_FLOOR SIMDE_MM_FROUND_FLOOR +# define _MM_FROUND_CEIL SIMDE_MM_FROUND_CEIL +# define _MM_FROUND_TRUNC SIMDE_MM_FROUND_TRUNC +# define _MM_FROUND_RINT SIMDE_MM_FROUND_RINT +# define _MM_FROUND_NEARBYINT SIMDE_MM_FROUND_NEARBYINT +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8) { + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = ((imm8 >> i) & 1) ? b.u16[i] : a.u16[i]; + } + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_blend_epi16(a, b, imm8) SIMDE__M128I_FROM_NATIVE(_mm_blend_epi16(a.n, b.n, imm8)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_blend_epi16(a, b, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_blend_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8) { + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = ((imm8 >> i) & 1) ? b.f64[i] : a.f64[i]; + } + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_blend_pd(a, b, imm8) SIMDE__M128D_FROM_NATIVE(_mm_blend_pd(a.n, b.n, imm8)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_blend_pd(a, b, imm8) SIMDE__M128D_TO_NATIVE(simde_mm_blend_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8) { + simde__m128 r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = ((imm8 >> i) & 1) ? 
b.f32[i] : a.f32[i]; + } + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_blend_ps(a, b, imm8) SIMDE__M128_FROM_NATIVE(_mm_blend_ps(a.n, b.n, imm8)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_blend_ps(a, b, imm8) SIMDE__M128_TO_NATIVE(simde_mm_blend_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_blendv_epi8(a.n, b.n, mask.n)); +#elif defined(SIMDE_SSE4_1_NEON) + simde__m128i mask_ = simde_mm_cmplt_epi8(mask, simde_mm_set1_epi8(0)); + return SIMDE__M128I_NEON_C(i8, vbslq_s8(mask_.neon_u8, b.neon_i8, a.neon_i8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + if (mask.u8[i] & 0x80) { + r.u8[i] = b.u8[i]; + } else { + r.u8[i] = a.u8[i]; + } + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_blendv_epi8(a, b, mask) SIMDE__M128I_TO_NATIVE(simde_mm_blendv_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b), SIMDE__M128_FROM_NATIVE(mask))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_blendv_pd(a.n, b.n, mask.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + if (mask.u64[i] & (UINT64_C(1) << 63)) { + r.f64[i] = b.f64[i]; + } else { + r.f64[i] = a.f64[i]; + } + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_blendv_pd(a, b, mask) SIMDE__M128D_TO_NATIVE(simde_mm_blendv_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b), SIMDE__M128D_FROM_NATIVE(mask))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_blendv_ps(a.n, b.n, mask.n)); +#else + simde__m128 r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + if (mask.u32[i] & (UINT32_C(1) << 31)) { + r.f32[i] = b.f32[i]; + } else { + r.f32[i] = a.f32[i]; + } + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_blendv_ps(a, b, mask) SIMDE__M128_TO_NATIVE(simde_mm_blendv_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b), SIMDE__M128_FROM_NATIVE(mask))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_ceil_pd (simde__m128d a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_ceil_pd(a.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = ceil(a.f64[i]); + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_ceil_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_ceil_pd(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_ceil_ps (simde__m128 a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_ceil_ps(a.n)); +#else + simde__m128 r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = ceilf(a.f32[i]); + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_ceil_ps(a) 
SIMDE__M128_TO_NATIVE(simde_mm_ceil_ps(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_ceil_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_ceil_sd(a.n, b.n)); +#else + return simde_mm_set_pd(a.f64[1], ceil(b.f64[0])); +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_ceil_sd(a, b) SIMDE__M128D_TO_NATIVE(simde_mm_ceil_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_ceil_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_ceil_ss(a.n, b.n)); +#else + return simde_mm_set_ps(a.f32[3], a.f32[2], a.f32[1], ceilf(b.f32[0])); +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_ceil_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_ceil_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cmpeq_epi64(a.n, b.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u64) / sizeof(r.u64[0])) ; i++) { + r.u64[i] = (a.u64[i] == b.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_epi64(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_cmpeq_epi64(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepi8_epi16 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepi8_epi16(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.i8[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepi8_epi16(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepi8_epi16(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepi8_epi32 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepi8_epi32(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i8[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepi8_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepi8_epi32(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepi8_epi64 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepi8_epi64(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i8[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepi8_epi64(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepi8_epi64(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepu8_epi16 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepu8_epi16(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = a.u8[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepu8_epi16(a) 
SIMDE__M128I_TO_NATIVE(simde_mm_cvtepu8_epi16(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepu8_epi32 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepu8_epi32(a.n)); +#elif defined(SIMDE_SSE4_1_NEON) + uint8x16_t u8x16 = a.neon_u8; /* blendx blendx blendx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return SIMDE__M128I_NEON_C(u32, u32x4); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.u8[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepu8_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepu8_epi32(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepu8_epi64 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepu8_epi64(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.u8[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepu8_epi64(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepu8_epi64(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepi16_epi32 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepi16_epi32(a.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(i32, vmovl_s16(vget_low_s16(a.neon_i16))); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i16[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepi16_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepi16_epi32(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepu16_epi32 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepu16_epi32(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.u16[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepu16_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepu16_epi32(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepu16_epi64 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepu16_epi64(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.u16[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepu16_epi64(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepu16_epi64(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepi16_epi64 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepi16_epi64(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i16[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepi16_epi64(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepi16_epi64(SIMDE__M128I_FROM_NATIVE(a))) +#endif + 
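/* Illustrative sketch, not part of the vendored header or the patch above:
 * every simde_mm_cvtep(i|u)*_epi* fallback in this file follows the same
 * widening pattern -- read the low lanes of the source vector and sign- or
 * zero-extend each one into a wider destination lane.  The standalone
 * example below mimics the portable branch of simde_mm_cvtepu8_epi32 with
 * a plain union and loop; the type demo_m128i and the function
 * demo_cvtepu8_epi32 are hypothetical names used only for this sketch and
 * merely mirror the u8/i32 views of the simde__m128i union. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef union {
  uint8_t u8[16];
  int32_t i32[4];
} demo_m128i;

/* Zero-extend the low four unsigned bytes of a into four int32 lanes,
 * the same element-wise loop the non-native branch above uses. */
static demo_m128i demo_cvtepu8_epi32(demo_m128i a) {
  demo_m128i r;
  for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) {
    r.i32[i] = a.u8[i];
  }
  return r;
}

int main(void) {
  demo_m128i a = { .u8 = { 1, 2, 250, 255 } };  /* remaining bytes are zero */
  demo_m128i r = demo_cvtepu8_epi32(a);
  printf("%d %d %d %d\n", r.i32[0], r.i32[1], r.i32[2], r.i32[3]);  /* 1 2 250 255 */
  return 0;
}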
+SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepi32_epi64 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepi32_epi64(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.i32[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepi32_epi64(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepi32_epi64(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_cvtepu32_epi64 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_cvtepu32_epi64(a.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + r.i64[i] = a.u32[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_cvtepu32_epi64(a) SIMDE__M128I_TO_NATIVE(simde_mm_cvtepu32_epi64(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8) { + simde__m128d r; + simde_float64 sum = SIMDE_FLOAT64_C(0.0); + + SIMDE__VECTORIZE_REDUCTION(+:sum) + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + sum += ((imm8 >> (i + 4)) & 1) ? (a.f64[i] * b.f64[i]) : 0.0; + } + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0; + } + + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_dp_pd(a, b, imm8) SIMDE__M128D_FROM_NATIVE(_mm_dp_pd(a.n, b.n, imm8)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_dp_pd(a, b, imm8) SIMDE__M128D_TO_NATIVE(simde_mm_dp_pd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8) { + simde__m128 r; + simde_float32 sum = SIMDE_FLOAT32_C(0.0); + + SIMDE__VECTORIZE_REDUCTION(+:sum) + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + sum += ((imm8 >> (i + 4)) & 1) ? (a.f32[i] * b.f32[i]) : SIMDE_FLOAT32_C(0.0); + } + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = ((imm8 >> i) & 1) ? 
sum : SIMDE_FLOAT32_C(0.0); + } + + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_dp_ps(a, b, imm8) SIMDE__M128_FROM_NATIVE(_mm_dp_ps(a.n, b.n, imm8)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_dp_ps(a, b, imm8) SIMDE__M128_TO_NATIVE(simde_mm_dp_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b), imm8)) +#endif + +#if defined(simde_mm_extract_epi8) +# undef simde_mm_extract_epi8 +#endif +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_extract_epi8 (simde__m128i a, const int imm8) { + return a.u8[imm8&15]; +} +#if defined(SIMDE_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8) +# define simde_mm_extract_epi8(a, imm8) _mm_extract_epi8(a.n, imm8) +#elif defined(SIMDE_SSE4_1_NEON) +# define simde_mm_extract_epi8(a, imm8) (int32_t)((uint8_t)vgetq_lane_s8(a.neon_i8, imm8)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_extract_epi8(a, imm8) simde_mm_extract_epi8(SIMDE__M128I_FROM_NATIVE(a), imm8) +#endif + +#if defined(simde_mm_extract_epi32) +# undef simde_mm_extract_epi32 +#endif +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_extract_epi32 (simde__m128i a, const int imm8) { + return a.i32[imm8&3]; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a.n, imm8) +#elif defined(SIMDE_SSE4_1_NEON) +# define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(a.neon_i32, imm8) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(SIMDE__M128I_FROM_NATIVE(a), imm8) +#endif + +#if defined(simde_mm_extract_epi64) +# undef simde_mm_extract_epi64 +#endif +SIMDE__FUNCTION_ATTRIBUTES +int64_t +simde_mm_extract_epi64 (simde__m128i a, const int imm8) { + return a.i64[imm8&1]; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a.n, imm8) +#elif defined(SIMDE_SSE4_1_NEON) +# define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(a.neon_i64, imm8) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(SIMDE__M128I_FROM_NATIVE(a), imm8) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_floor_pd (simde__m128d a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_floor_pd(a.n)); +#else + simde__m128d r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + r.f64[i] = floor(a.f64[i]); + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_floor_pd(a) SIMDE__M128D_TO_NATIVE(simde_mm_floor_pd(SIMDE__M128D_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_floor_ps (simde__m128 a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_floor_ps(a.n)); +#else + simde__m128 r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = floorf(a.f32[i]); + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_floor_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_floor_ps(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_floor_sd (simde__m128d a, simde__m128d b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128D_FROM_NATIVE(_mm_floor_sd(a.n, b.n)); +#else + simde__m128d r; + r.f64[0] = floor(b.f64[0]); + r.f64[1] = a.f64[1]; + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_floor_sd(a, b) 
SIMDE__M128D_TO_NATIVE(simde_mm_floor_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_floor_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128_FROM_NATIVE(_mm_floor_ss(a.n, b.n)); +#else + simde__m128 r; + r.f32[0] = floorf(b.f32[0]); + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = a.f32[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_floor_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_floor_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8) { + a.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i); + return a; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_insert_epi8(a, i, imm8) SIMDE__M128I_FROM_NATIVE(_mm_insert_epi8(a.n, i, imm8)); +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_insert_epi8(a, i, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_insert_epi8(SIMDE__M128I_FROM_NATIVE(a), i, imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8) { + a.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i); + return a; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_insert_epi32(a, i, imm8) SIMDE__M128I_FROM_NATIVE(_mm_insert_epi32(a.n, i, imm8)); +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_insert_epi32(a, i, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_insert_epi32(SIMDE__M128I_FROM_NATIVE(a), i, imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8) { + a.i64[imm8] = i; + return a; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_insert_epi64(a, i, imm8) SIMDE__M128I_FROM_NATIVE(_mm_insert_epi64(a.n, i, imm8)); +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_insert_epi64(a, i, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_insert_epi64(SIMDE__M128I_FROM_NATIVE(a), i, imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8) { + simde__m128 r; + + a.f32[0] = b.f32[(imm8 >> 6) & 3]; + a.f32[(imm8 >> 4) & 3] = a.f32[0]; + + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = (imm8 >> i) ? SIMDE_FLOAT32_C(0.0) : a.f32[i]; + } + + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_insert_ps(a, b, imm8) SIMDE__M128_FROM_NATIVE(_mm_insert_ps((a).n, (b).n, imm8)); +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_insert_ps(a, b, imm8) SIMDE__M128_TO_NATIVE(simde_mm_insert_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_max_epi8 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) && !defined(__PGI) + return SIMDE__M128I_FROM_NATIVE(_mm_max_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(i8, vmaxq_s8(a.neon_i8, b.neon_i8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = a.i8[i] > b.i8[i] ? 
a.i8[i] : b.i8[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_max_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_max_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_max_epi32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) && !defined(__PGI) + return SIMDE__M128I_FROM_NATIVE(_mm_max_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(i32, vmaxq_s32(a.neon_i32, b.neon_i32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] > b.i32[i] ? a.i32[i] : b.i32[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_max_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_max_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_max_epu16 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_max_epu16(a.n, b.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(u16, vmaxq_u16(a.neon_u16, b.neon_u16)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = a.u16[i] > b.u16[i] ? a.u16[i] : b.u16[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_max_epu16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_max_epu16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_max_epu32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_max_epu32(a.n, b.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(u32, vmaxq_u32(a.neon_u32, b.neon_u32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u32) / sizeof(r.u32[0])) ; i++) { + r.u32[i] = a.u32[i] > b.u32[i] ? a.u32[i] : b.u32[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_max_epu32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_max_epu32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_min_epi8 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) && !defined(__PGI) + return SIMDE__M128I_FROM_NATIVE(_mm_min_epi8(a.n, b.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(i8, vminq_s8(a.neon_i8, b.neon_i8)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = a.i8[i] < b.i8[i] ? a.i8[i] : b.i8[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_min_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_min_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_min_epi32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) && !defined(__PGI) + return SIMDE__M128I_FROM_NATIVE(_mm_min_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(i32, vminq_s32(a.neon_i32, b.neon_i32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] < b.i32[i] ? 
a.i32[i] : b.i32[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_min_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_min_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_min_epu16 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_min_epu16(a.n, b.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(u16, vminq_u16(a.neon_u16, b.neon_u16)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = a.u16[i] < b.u16[i] ? a.u16[i] : b.u16[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_min_epu16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_min_epu16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_min_epu32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_min_epu32(a.n, b.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(u32, vminq_u32(a.neon_u32, b.neon_u32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u32) / sizeof(r.u32[0])) ; i++) { + r.u32[i] = a.u32[i] < b.u32[i] ? a.u32[i] : b.u32[i]; + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_min_epu32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_min_epu32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_minpos_epu16 (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_minpos_epu16(a.n)); +#else + simde__m128i r = simde_x_mm_set_epu16(0, 0, 0, 0, 0, 0, 0, UINT16_MAX); + + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + if (a.u16[i] < r.u16[0]) { + r.u16[0] = a.u16[i]; + r.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i); + } + } + + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_minpos_epu16(a) SIMDE__M128I_TO_NATIVE(simde_mm_minpos_epu16(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8) { + simde__m128i r; + const int a_offset = imm8 & 4; + const int b_offset = (imm8 & 3) << 2; + + for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r.u16) / sizeof(r.u16[0]))) ; i++) { + r.u16[i] = + HEDLEY_STATIC_CAST(uint16_t, abs(a.u8[a_offset + i + 0] - b.u8[b_offset + 0])) + + HEDLEY_STATIC_CAST(uint16_t, abs(a.u8[a_offset + i + 1] - b.u8[b_offset + 1])) + + HEDLEY_STATIC_CAST(uint16_t, abs(a.u8[a_offset + i + 2] - b.u8[b_offset + 2])) + + HEDLEY_STATIC_CAST(uint16_t, abs(a.u8[a_offset + i + 3] - b.u8[b_offset + 3])); + } + + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_mpsadbw_epu8(a, b, imm8) SIMDE__M128I_FROM_NATIVE(_mm_mpsadbw_epu8(a.n, b.n, imm8)); +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_mpsadbw_epu8(a, b, imm8) SIMDE__M128I_TO_NATIVE(simde_mm_mpsadbw_epu8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_mul_epi32(a.n, b.n)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i64) / sizeof(r.i64[0])) ; i++) { + 
r.i64[i] = + HEDLEY_STATIC_CAST(int64_t, a.i32[i * 2]) * + HEDLEY_STATIC_CAST(int64_t, b.i32[i * 2]); + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_mul_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_mul_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_mullo_epi32(a.n, b.n)); +#elif defined(SIMDE_SSE4_1_NEON) + return SIMDE__M128I_NEON_C(i32, vmulq_s32(a.neon_i32, b.neon_i32)); +#else + simde__m128i r; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b.i32[i]))) & 0xffffffff)); + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_mullo_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_mullo_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_packus_epi32(a.n, b.n)); +#else + simde__m128i r; + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.u16[i + 0] = (a.i32[i] < 0) ? UINT16_C(0) : ((a.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, a.i32[i])); + r.u16[i + 4] = (b.i32[i] < 0) ? UINT16_C(0) : ((b.i32[i] > UINT16_MAX) ? (UINT16_MAX) : HEDLEY_STATIC_CAST(uint16_t, b.i32[i])); + } + return r; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_packus_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_packus_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_round_pd (simde__m128d a, int rounding) { + simde__m128d r; + for (size_t i = 0 ; i < (sizeof(r.f64) / sizeof(r.f64[0])) ; i++) { + switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { + case SIMDE_MM_FROUND_TO_NEAREST_INT: + r.f64[i] = nearbyint(a.f64[i]); + break; + case SIMDE_MM_FROUND_TO_NEG_INF: + r.f64[i] = floor(a.f64[i]); + break; + case SIMDE_MM_FROUND_TO_POS_INF: + r.f64[i] = ceil(a.f64[i]); + break; + case SIMDE_MM_FROUND_TO_ZERO: + r.f64[i] = trunc(a.f64[i]); + break; + case SIMDE_MM_FROUND_CUR_DIRECTION: + r.f64[i] = nearbyint(a.f64[i]); + break; + default: + HEDLEY_UNREACHABLE(); + break; + } + } + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_round_pd(a, rounding) SIMDE__M128D_FROM_NATIVE(_mm_round_pd((a).n, rounding)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_round_pd(a, rounding) SIMDE__M128D_TO_NATIVE(simde_mm_round_pd(SIMDE__M128D_FROM_NATIVE(a), rounding)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_round_ps (simde__m128 a, int rounding) { + simde__m128 r; + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { + case SIMDE_MM_FROUND_TO_NEAREST_INT: + r.f32[i] = nearbyintf(a.f32[i]); + break; + case SIMDE_MM_FROUND_TO_NEG_INF: + r.f32[i] = floorf(a.f32[i]); + break; + case SIMDE_MM_FROUND_TO_POS_INF: + r.f32[i] = ceilf(a.f32[i]); + break; + case SIMDE_MM_FROUND_TO_ZERO: + r.f32[i] = truncf(a.f32[i]); + break; + case SIMDE_MM_FROUND_CUR_DIRECTION: + r.f32[i] = nearbyintf (a.f32[i]); + break; + default: + HEDLEY_UNREACHABLE(); + break; + } 
+ } + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_round_ps(a, rounding) SIMDE__M128_FROM_NATIVE(_mm_round_ps((a).n, rounding)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_round_ps(a, rounding) SIMDE__M128_TO_NATIVE(simde_mm_round_ps(SIMDE__M128_FROM_NATIVE(a), rounding)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding) { + simde__m128d r = a; + switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { + case SIMDE_MM_FROUND_TO_NEAREST_INT: + r.f64[0] = nearbyint(b.f64[0]); + break; + case SIMDE_MM_FROUND_TO_NEG_INF: + r.f64[0] = floor(b.f64[0]); + break; + case SIMDE_MM_FROUND_TO_POS_INF: + r.f64[0] = ceil(b.f64[0]); + break; + case SIMDE_MM_FROUND_TO_ZERO: + r.f64[0] = trunc(b.f64[0]); + break; + case SIMDE_MM_FROUND_CUR_DIRECTION: + r.f64[0] = nearbyint(b.f64[0]); + break; + default: + HEDLEY_UNREACHABLE(); + break; + } + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_round_sd(a, b, rounding) SIMDE__M128D_FROM_NATIVE(_mm_round_sd((a).n, (b).n, rounding)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_round_sd(a, b, rounding) SIMDE__M128D_TO_NATIVE(simde_mm_round_sd(SIMDE__M128D_FROM_NATIVE(a), SIMDE__M128D_FROM_NATIVE(b), rounding)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding) { + simde__m128 r = a; + switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { + case SIMDE_MM_FROUND_TO_NEAREST_INT: + r.f32[0] = nearbyintf(b.f32[0]); + break; + case SIMDE_MM_FROUND_TO_NEG_INF: + r.f32[0] = floorf(b.f32[0]); + break; + case SIMDE_MM_FROUND_TO_POS_INF: + r.f32[0] = ceilf(b.f32[0]); + break; + case SIMDE_MM_FROUND_TO_ZERO: + r.f32[0] = truncf(b.f32[0]); + break; + case SIMDE_MM_FROUND_CUR_DIRECTION: + r.f32[0] = nearbyintf (b.f32[0]); + break; + default: + HEDLEY_UNREACHABLE(); + break; + } + return r; +} +#if defined(SIMDE_SSE4_1_NATIVE) +# define simde_mm_round_ss(a, b, rounding) SIMDE__M128_FROM_NATIVE(_mm_round_ss((a).n, (b).n, rounding)) +#endif +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_round_ss(a, b, rounding) SIMDE__M128_TO_NATIVE(simde_mm_round_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b), rounding)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_stream_load_si128 (const simde__m128i* mem_addr) { +#if defined(SIMDE_SSE4_1_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_stream_load_si128(HEDLEY_CONST_CAST(__m128i*, &(mem_addr->n)))); +#else + return *mem_addr; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_stream_load_si128(mem_addr) SIMDE__M128I_TO_NATIVE(simde_mm_stream_load_si128(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_test_all_ones (simde__m128i a) { +#if defined(SIMDE_SSE4_1_NATIVE) + return _mm_test_all_ones(a.n); +#else + for (size_t i = 0 ; i < (sizeof(a.u64) / sizeof(a.u64[0])) ; i++) { + if (a.u64[i] != ~UINT64_C(0)) + return 0; + } + return 1; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_test_all_ones(a) simde_mm_test_all_ones(SIMDE__M128I_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) { +#if defined(SIMDE_SSE4_1_NATIVE) + return _mm_test_all_zeros(a.n, mask.n); +#else + for (size_t i = 0 ; i < (sizeof(a.u64) / sizeof(a.u64[0])) ; i++) { + if ((a.u64[i] & mask.u64[i]) != 0) + return 0; + } + return 1; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define 
_mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(mask)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) { +#if defined(SIMDE_SSE4_1_NATIVE) + return _mm_test_mix_ones_zeros(a.n, mask.n); +#else + for (size_t i = 0 ; i < (sizeof(a.u64) / sizeof(a.u64[0])) ; i++) + if (((a.u64[i] & mask.u64[i]) != 0) && ((~a.u64[i] & mask.u64[i]) != 0)) + return 1; + return 0; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(mask)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_testc_si128 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return _mm_testc_si128(a.n, b.n); +#else + int_fast32_t r = 0; + + SIMDE__VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a.i32f) / sizeof(a.i32f[0])) ; i++) { + r |= ~a.i32f[i] & b.i32f[i]; + } + + return HEDLEY_STATIC_CAST(int, !r); +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_testc_si128(a, b) simde_mm_testc_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return _mm_testnzc_si128(a.n, b.n); +#else + for (size_t i = 0 ; i < (sizeof(a.u64) / sizeof(a.u64[0])) ; i++) { + if (((a.u64[i] & b.u64[i]) != 0) && ((~a.u64[i] & b.u64[i]) != 0)) + return 1; + } + return 0; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_testz_si128 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSE4_1_NATIVE) + return _mm_testz_si128(a.n, b.n); +#else + for (size_t i = 0 ; i < (sizeof(a.u64) / sizeof(a.u64[0])) ; i++) { + if ((a.u64[i] & b.u64[i]) != 0) + return 0; + } + return 1; +#endif +} +#if defined(SIMDE_SSE4_1_ENABLE_NATIVE_ALIASES) +# define _mm_testz_si128(a, b) simde_mm_testz_si128(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b)) +#endif + +SIMDE__END_DECLS + +#endif /* !defined(SIMDE__SSE4_1_H) */ diff -Nru minimap2-2.17+dfsg/debian/include/simde/x86/sse.h minimap2-2.17+dfsg/debian/include/simde/x86/sse.h --- minimap2-2.17+dfsg/debian/include/simde/x86/sse.h 1970-01-01 00:00:00.000000000 +0000 +++ minimap2-2.17+dfsg/debian/include/simde/x86/sse.h 2020-01-12 17:22:11.000000000 +0000 @@ -0,0 +1,2990 @@ +/* Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2017-2019 Evan Nemerson + * 2015-2017 John W. Ratcliff + * 2015 Brandon Rowlett + * 2015 Ken Fast + */ + +#if !defined(SIMDE__SSE_H) +# if !defined(SIMDE__SSE_H) +# define SIMDE__SSE_H +# endif +# include "mmx.h" + +# if defined(SIMDE_SSE_NATIVE) +# undef SIMDE_SSE_NATIVE +# endif +# if defined(SIMDE_SSE_FORCE_NATIVE) +# define SIMDE_SSE_NATIVE +# elif defined(__SSE__) && !defined(SIMDE_SSE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +# define SIMDE_SSE_NATIVE +# elif defined(_M_IX86_FP) && !defined(SIMDE_SSE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +# if (_M_IX86_FP >= 1) +# define SIMDE_SSE_NATIVE +# endif +# elif defined(__ARM_NEON) && !defined(SIMDE_SSE_NO_NEON) && !defined(SIMDE_NO_NEON) +# define SIMDE_SSE_NEON +# endif + +# if defined(SIMDE_SSE_NATIVE) && !defined(SIMDE_MMX_NATIVE) +# if defined(SIMDE_SSE_FORCE_NATIVE) +# error Native SSE support requires native MMX support +# else +# warning Native SSE support requires native MMX support, disabling +# undef SIMDE_SSE_NATIVE +# endif +# elif defined(SIMDE_SSE_NEON) && !defined(SIMDE_MMX_NEON) +# warning SSE NEON support requires MMX NEON support, disabling +# undef SIMDE_SSE_NEON +# endif + +# if defined(SIMDE_SSE_NATIVE) +# include <xmmintrin.h> +# else +# if defined(SIMDE_SSE_NEON) +# include <arm_neon.h> +# endif + +# if !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) +# include <stdatomic.h> +# elif defined(_WIN32) +# include <windows.h> +# endif +# endif + +#include <math.h> +#include <fenv.h> + +HEDLEY_DIAGNOSTIC_PUSH +# if HEDLEY_HAS_WARNING("-Wfloat-equal") +# pragma clang diagnostic ignored "-Wfloat-equal" +# endif + +SIMDE__BEGIN_DECLS + +typedef union { +#if defined(SIMDE__ENABLE_GCC_VEC_EXT) + SIMDE_ALIGN(16) int8_t i8 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int16_t i16 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int32_t i32 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int64_t i64 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint8_t u8 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint16_t u16 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint32_t u32 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint64_t u64 __attribute__((__vector_size__(16), __may_alias__)); + #if defined(SIMDE__HAVE_INT128) + SIMDE_ALIGN(16) simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__)); + #endif + SIMDE_ALIGN(16) simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) int_fast32_t i32f __attribute__((__vector_size__(16), __may_alias__)); + SIMDE_ALIGN(16) uint_fast32_t u32f __attribute__((__vector_size__(16), __may_alias__)); +#else + SIMDE_ALIGN(16) int8_t i8[16]; + SIMDE_ALIGN(16) int16_t i16[8]; + SIMDE_ALIGN(16) int32_t i32[4]; + SIMDE_ALIGN(16) int64_t i64[2]; + SIMDE_ALIGN(16) uint8_t u8[16]; + SIMDE_ALIGN(16) uint16_t u16[8]; + SIMDE_ALIGN(16) uint32_t u32[4]; + SIMDE_ALIGN(16) uint64_t u64[2]; + #if defined(SIMDE__HAVE_INT128) + SIMDE_ALIGN(16) simde_int128 i128[1]; + SIMDE_ALIGN(16) simde_uint128 u128[1]; + #endif + 
SIMDE_ALIGN(16) simde_float32 f32[4]; + SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)]; + SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; +#endif + + SIMDE_ALIGN(16) simde__m64 m64[2]; + +#if defined(SIMDE_SSE_NATIVE) + SIMDE_ALIGN(16) __m128 n; +#elif defined(SIMDE_SSE_NEON) + SIMDE_ALIGN(16) int8x16_t neon_i8; + SIMDE_ALIGN(16) int16x8_t neon_i16; + SIMDE_ALIGN(16) int32x4_t neon_i32; + SIMDE_ALIGN(16) int64x2_t neon_i64; + SIMDE_ALIGN(16) uint8x16_t neon_u8; + SIMDE_ALIGN(16) uint16x8_t neon_u16; + SIMDE_ALIGN(16) uint32x4_t neon_u32; + SIMDE_ALIGN(16) uint64x2_t neon_u64; + SIMDE_ALIGN(16) float32x4_t neon_f32; +#endif +} simde__m128; + +#if defined(SIMDE_SSE_NATIVE) + HEDLEY_STATIC_ASSERT(sizeof(__m128) == sizeof(simde__m128), "__m128 size doesn't match simde__m128 size"); +#endif +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect"); + +#if !defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +# define SIMDE_SSE_ENABLE_NATIVE_ALIASES + typedef simde__m128 __m128; +#endif + +#if defined(SIMDE_SSE_NATIVE) + SIMDE__FUNCTION_ATTRIBUTES simde__m128 SIMDE__M128_FROM_NATIVE(__m128 v) { simde__m128 r; r.n = v; return r; } +# define SIMDE__M128_TO_NATIVE(v) (v.n) +#elif defined(SIMDE_SSE_NEON) + #define SIMDE__M128_NEON_C(T, expr) (simde__m128) { .neon_##T = expr } +#else +# define SIMDE__M128_FROM_NATIVE(val) (val) +# define SIMDE__M128_TO_NATIVE(val) (val) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_add_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_add_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vaddq_f32(a.neon_f32, b.neon_f32); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.f32 = a.f32 + b.f32; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = a.f32[i] + b.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_add_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_add_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_add_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_add_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32_t b0 = vgetq_lane_f32(b.neon_f32, 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + /* the upper values in the result must be the remnants of . 
*/ + r.neon_f32 = vaddq_f32(a.neon_f32, value); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_add_ps(a, b).f32, 4, 1, 2, 3); +#else + r.f32[0] = a.f32[0] + b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_add_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_add_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_and_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_and_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_i32 = vandq_s32(a.neon_i32, b.neon_i32); +#elif defined(SIMDE__ENABLE_GCC_VEC_EXT) + r.i32 = a.i32 & b.i32; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = a.i32[i] & b.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_and_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_and_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_andnot_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_andnot_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_i32 = vbicq_s32(b.neon_i32, a.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = ~(a.i32[i]) & b.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_andnot_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_andnot_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_avg_pu16 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_avg_pu16(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u16 = vrhadd_u16(b.neon_u16, a.neon_u16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < 4 ; i++) { + r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1; + } +#endif + + return r; +} +#define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_avg_pu16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_avg_pu16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pavgw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_avg_pu16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_avg_pu8 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_avg_pu8(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u8 = vrhadd_u8(b.neon_u8, a.neon_u8); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < 8 ; i++) { + r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1; + } +#endif + + return r; +} +#define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_avg_pu8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_avg_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pavgb(a, b) SIMDE__M64_TO_NATIVE(simde_mm_avg_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpeq_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpeq_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vceqq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 
0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = (a.f32[i] == b.f32[i]) ? 0xffffffff : 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpeq_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpeq_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpeq_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_cmpeq_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] == b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpeq_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpeq_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpge_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpge_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = (a.f32[i] >= b.f32[i]) ? 0xffffffff : 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpge_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpge_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpge_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpge_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = vreinterpretq_f32_u32(vcgeq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_cmpge_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] >= b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpge_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpge_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpgt_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpgt_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = (a.f32[i] > b.f32[i]) ? 
0xffffffff : 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpgt_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpgt_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpgt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_cmpgt_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] > b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpgt_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpgt_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmple_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmple_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = (a.f32[i] <= b.f32[i]) ? 0xffffffff : 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmple_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmple_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmple_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmple_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_cmple_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] <= b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmple_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmple_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmplt_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmplt_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = (a.f32[i] < b.f32[i]) ? 
0xffffffff : 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmplt_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmplt_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmplt_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmplt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_cmplt_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] < b.f32[0]) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmplt_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmplt_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpneq_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpneq_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32)); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = (a.f32[i] != b.f32[i]) ? 0xffffffff : 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpneq_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpneq_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpneq_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpneq_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t e = vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32)); + float32x4_t s = vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(e))); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_cmpneq_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (a.f32[0] != b.f32[0]) ? 
0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = a.u32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpneq_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpneq_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpnge_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnge_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32); +#else + r = simde_mm_cmplt_ps(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnge_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpnge_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpnge_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpnge_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#else + r = simde_mm_cmplt_ss(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnge_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpnge_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpngt_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpngt_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32); +#else + r = simde_mm_cmple_ps(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpngt_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpngt_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpngt_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpngt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#else + r = simde_mm_cmple_ss(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpngt_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpngt_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpnle_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnle_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32); +#else + r = simde_mm_cmpgt_ps(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnle_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpnle_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpnle_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnle_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t s = vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#else + r = simde_mm_cmpgt_ss(a, b); +#endif + + return r; +} +#if 
defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnle_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpnle_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpnlt_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnlt_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32); +#else + r = simde_mm_cmpge_ps(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnlt_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpnlt_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpnlt_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpnlt_ss(a.n, b.n); +#else + r = simde_mm_cmpge_ss(a, b); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpnlt_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpnlt_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpord_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpord_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + /* Note: NEON does not have ordered compare builtin + Need to compare a eq a and b eq b to check for NaN + Do AND of results to get final */ + uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32); + r.neon_u32 = vandq_u32(ceqaa, ceqbb); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0 : 0xffffffff; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpord_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpord_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpord_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpord_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32); + float32x4_t s = vreinterpretq_f32_u32(vandq_u32(ceqaa, ceqbb)); + float32x4_t t = vextq_f32(a.neon_f32, s, 1); + r.neon_f32 = vextq_f32(t, t, 3); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_cmpord_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0 : 0xffffffff; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpord_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpord_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cmpunord_ps(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 
0xffffffff : 0; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpunord_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpunord_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cmpunord_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) + r.n = _mm_cmpunord_ss(a.n, b.n); +#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_cmpunord_ps(a, b).f32, 4, 1, 2, 3); +#else + r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0xffffffff : 0; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cmpunord_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cmpunord_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comieq_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_comieq_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_eq_b = vceqq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0; +#else + return a.f32[0] == b.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_comieq_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_comieq_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comige_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_comige_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_ge_b = vcgeq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; +#else + return a.f32[0] >= b.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_comige_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_comige_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comigt_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_comigt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_gt_b = vcgtq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 
1 : 0; +#else + return a.f32[0] > b.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_comigt_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_comigt_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comile_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_comile_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_le_b = vcleq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0; +#else + return a.f32[0] <= b.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_comile_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_comile_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comilt_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_comilt_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_lt_b = vcltq_f32(a.neon_f32, b.neon_f32); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0; +#else + return a.f32[0] < b.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_comilt_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_comilt_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_comineq_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_comineq_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0) ? 
1 : 0; +#else + return a.f32[0] != b.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_comineq_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_comineq_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvt_pi2ps (simde__m128 a, simde__m64 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvt_pi2ps(a.n, b.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.m64[0].f32, b.i32); + r.m64[1] = a.m64[1]; +#else + r.f32[0] = (simde_float32) b.i32[0]; + r.f32[1] = (simde_float32) b.i32[1]; + r.i32[2] = a.i32[2]; + r.i32[3] = a.i32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvt_pi2ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cvt_pi2ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cvt_ps2pi (simde__m128 a) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvt_ps2pi(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) && !defined(__clang__) + SIMDE__CONVERT_VECTOR(r.i32, a.m64[0].f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (int32_t) a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvt_ps2pi(a) SIMDE__M64_TO_NATIVE(simde_mm_cvt_ps2pi(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvt_si2ss (simde__m128 a, int32_t b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvt_si2ss(a.n, b); +#else + r.f32[0] = (simde_float32) b; + r.i32[1] = a.i32[1]; + r.i32[2] = a.i32[2]; + r.i32[3] = a.i32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvt_si2ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cvt_si2ss(SIMDE__M128_FROM_NATIVE(a), b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_cvt_ss2si (simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_cvt_ss2si(a.n); +#else + return (int32_t) a.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si(SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtpi16_ps (simde__m64 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpi16_ps(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.f32, a.i16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = (simde_float32) a.i16[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpi16_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_cvtpi16_ps(SIMDE__M64_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtpi32_ps (simde__m128 a, simde__m64 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpi32_ps(a.n, b.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.m64[0].f32, b.i32); + r.m64[1] = a.m64[1]; +#else + r.f32[0] = (simde_float32) b.i32[0]; + r.f32[1] = (simde_float32) b.i32[1]; + r.i32[2] = a.i32[2]; + r.i32[3] = a.i32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpi32_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cvtpi32_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtpi32x2_ps (simde__m64 a, simde__m64 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = 
_mm_cvtpi32x2_ps(a.n, b.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.m64[0].f32, a.i32); + SIMDE__CONVERT_VECTOR(r.m64[1].f32, b.i32); +#else + r.f32[0] = (simde_float32) a.i32[0]; + r.f32[1] = (simde_float32) a.i32[1]; + r.f32[2] = (simde_float32) b.i32[0]; + r.f32[3] = (simde_float32) b.i32[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpi32x2_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cvtpi32x2_ps(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtpi8_ps (simde__m64 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpi8_ps(a.n); +#else + r.f32[0] = (simde_float32) a.i8[0]; + r.f32[1] = (simde_float32) a.i8[1]; + r.f32[2] = (simde_float32) a.i8[2]; + r.f32[3] = (simde_float32) a.i8[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpi8_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_cvtpi8_ps(SIMDE__M64_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cvtps_pi16 (simde__m128 a) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtps_pi16(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.i16, a.f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (int16_t) a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtps_pi16(a) SIMDE__M64_TO_NATIVE(simde_mm_cvtps_pi16(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cvtps_pi32 (simde__m128 a) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtps_pi32(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.i32, a.m64[0].f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (int32_t) a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtps_pi32(a) SIMDE__M64_TO_NATIVE(simde_mm_cvtps_pi32(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cvtps_pi8 (simde__m128 a) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtps_pi8(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(a.f32) / sizeof(a.f32[0])) ; i++) { + r.i8[i] = (int8_t) a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtps_pi8(a) SIMDE__M64_TO_NATIVE(simde_mm_cvtps_pi8(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtpu16_ps (simde__m64 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpu16_ps(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.f32, a.u16); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = (simde_float32) a.u16[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpu16_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_cvtpu16_ps(SIMDE__M64_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtpu8_ps (simde__m64 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtpu8_ps(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < 4 ; i++) { + r.f32[i] = (simde_float32) a.u8[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtpu8_ps(a) 
SIMDE__M128_TO_NATIVE(simde_mm_cvtpu8_ps(SIMDE__M64_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 + simde_mm_cvtsi32_ss (simde__m128 a, int32_t b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtsi32_ss(a.n, b); +#else + r.f32[0] = (simde_float32) b; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < 4 ; i++) { + r.i32[i] = a.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi32_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cvtsi32_ss(SIMDE__M128_FROM_NATIVE(a), b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_cvtsi64_ss (simde__m128 a, int64_t b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) + #if !defined(__PGI) + r.n = _mm_cvtsi64_ss(a.n, b); + #else + r.n = _mm_cvtsi64x_ss(a.n, b); + #endif +#else + r.f32[0] = (simde_float32) b; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < 4 ; i++) { + r.i32[i] = a.i32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtsi64_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_cvtsi64_ss(SIMDE__M128_FROM_NATIVE(a), b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde_float32 +simde_mm_cvtss_f32 (simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_cvtss_f32(a.n); +#elif defined(SIMDE_SSE_NEON) + return vgetq_lane_f32(a.neon_f32, 0); +#else + return a.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtss_f32(a) simde_mm_cvtss_f32(SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_cvtss_si32 (simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_cvtss_si32(a.n); +#else + return (int32_t) a.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtss_si32(a) simde_mm_cvtss_si32(SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int64_t +simde_mm_cvtss_si64 (simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) + #if !defined(__PGI) + return _mm_cvtss_si64(a.n); + #else + return _mm_cvtss_si64x(a.n); + #endif +#else + return (int64_t) a.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtss_si64(a) simde_mm_cvtss_si64(SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_cvtt_ps2pi (simde__m128 a) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_cvtt_ps2pi(a.n); +#elif defined(SIMDE__CONVERT_VECTOR) + SIMDE__CONVERT_VECTOR(r.i32, a.m64[0].f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.i32[i] = SIMDE_CONVERT_FTOI(int32_t, truncf(a.f32[i])); + } +#endif + + return r; +} +#define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtt_ps2pi(a) SIMDE__M64_TO_NATIVE(simde_mm_cvtt_ps2pi(SIMDE__M128_FROM_NATIVE(a))) +# define _mm_cvttps_pi32(a) SIMDE__M64_TO_NATIVE(simde_mm_cvttps_pi32(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int32_t +simde_mm_cvtt_ss2si (simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_cvtt_ss2si(a.n); +#else + return SIMDE_CONVERT_FTOI(int32_t, truncf(a.f32[0])); +#endif +} +#define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si(a) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si(SIMDE__M128_FROM_NATIVE(a)) +# define _mm_cvttss_si32(a) simde_mm_cvttss_si32(SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int64_t +simde_mm_cvttss_si64 (simde__m128 a) { +#if 
defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) + #if defined(__PGI) + return _mm_cvttss_si64x(a.n); + #else + return _mm_cvttss_si64(a.n); + #endif +#else + return SIMDE_CONVERT_FTOI(int64_t, truncf(a.f32[0])); +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_cvttss_si64(a) simde_mm_cvttss_si64(SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_div_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_div_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t recip0 = vrecpeq_f32(b.neon_f32); + float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b.neon_f32)); + r.neon_f32 = vmulq_f32(a.neon_f32, recip1); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = a.f32[i] / b.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_div_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_div_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_div_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_div_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32_t value = vgetq_lane_f32(simde_mm_div_ps(a, b).neon_f32, 0); + r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#else + r.f32[0] = a.f32[0] / b.f32[0]; + SIMDE__VECTORIZE + for (size_t i = 1 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_div_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_div_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int_fast32_t +simde_mm_extract_pi16 (simde__m64 a, const int imm8) { + return a.u16[imm8]; +} +#if defined(SIMDE_SSE_NATIVE) +# define simde_mm_extract_pi16(a, imm8) _mm_extract_pi16(a.n, imm8) +#endif +#define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a.n, imm8) + +enum { +#if defined(SIMDE_SSE_NATIVE) + simde_MM_ROUND_NEAREST = _MM_ROUND_NEAREST, + simde_MM_ROUND_DOWN = _MM_ROUND_DOWN, + simde_MM_ROUND_UP = _MM_ROUND_UP, + simde_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO +#else + simde_MM_ROUND_NEAREST +#if defined(FE_TONEAREST) + = FE_TONEAREST +#endif + , + + simde_MM_ROUND_DOWN +#if defined(FE_DOWNWARD) + = FE_DOWNWARD +#endif + , + + simde_MM_ROUND_UP +#if defined(FE_UPWARD) + = FE_UPWARD +#endif + , + + simde_MM_ROUND_TOWARD_ZERO +#if defined(FE_TOWARDZERO) + = FE_TOWARDZERO +#endif +#endif +}; + +SIMDE__FUNCTION_ATTRIBUTES +unsigned int +simde_MM_GET_ROUNDING_MODE(void) { +#if defined(SIMDE_SSE_NATIVE) + return _MM_GET_ROUNDING_MODE(); +#else + return (unsigned int) fegetround(); +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16(SIMDE__M128_FROM_NATIVE(a), imm8) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_MM_SET_ROUNDING_MODE(unsigned int a) { +#if defined(SIMDE_SSE_NATIVE) + _MM_SET_ROUNDING_MODE(a); +#else + fesetround((int) a); +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _MM_SET_ROUNDING_MODE(a) simde_MM_SET_ROUNDING_MODE(a) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_insert_pi16 (simde__m64 a, int16_t i, const int imm8) { + simde__m64 r; + r.i64[0] = a.i64[0]; + r.i16[imm8] = i; + return r; +} +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) +# define simde_mm_insert_pi16(a, i, imm8) 
SIMDE__M64_FROM_NATIVE(_mm_insert_pi16((a).n, i, imm8)); +#endif +#define simde_m_pinsrw(a, i, imm8) SIMDE__M64_FROM_NATIVE(simde_mm_insert_pi16((a).n, i, imm8)); +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_insert_pi16(a, i, imm8) SIMDE__M64_TO_NATIVE(simde_mm_insert_pi16(SIMDE__M64_FROM_NATIVE(a), i, imm8)) +# define _m_pinsrw(a, i, imm8) SIMDE__M64_TO_NATIVE(simde_mm_insert_pi16(SIMDE__M64_FROM_NATIVE(a), i, imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_load_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { + simde__m128 r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_load_ps(mem_addr); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vld1q_f32(mem_addr); +#else + memcpy(&r, mem_addr, sizeof(r.f32)); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_load_ps(mem_addr) SIMDE__M128_TO_NATIVE(simde_mm_load_ps(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_load_ps1 (simde_float32 const* mem_addr) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_load_ps1(mem_addr); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vld1q_dup_f32(mem_addr); +#else + const simde_float32 v = *mem_addr; + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.f32[i] = v; + } +#endif + + return r; +} +#define simde_mm_load1_ps(mem_addr) simde_mm_load_ps1(mem_addr) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_load_ps1(mem_addr) SIMDE__M128_TO_NATIVE(simde_mm_load_ps1(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_load_ss (simde_float32 const* mem_addr) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_load_ss(mem_addr); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0); +#else + r.f32[0] = *mem_addr; + r.i32[1] = 0; + r.i32[2] = 0; + r.i32[3] = 0; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_load_ss(mem_addr) SIMDE__M128_TO_NATIVE(simde_mm_load_ss(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_loadh_pi(a.n, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr)); +#else + r.f32[0] = a.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = mem_addr->f32[0]; + r.f32[3] = mem_addr->f32[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_loadh_pi(a, mem_addr) SIMDE__M128_TO_NATIVE(simde_mm_loadh_pi(SIMDE__M128_FROM_NATIVE(a), (simde__m64 const*) (mem_addr))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_loadl_pi (simde__m128 a, simde__m64 const* mem_addr) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_loadl_pi(a.n, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr)); +#else + r.f32[0] = mem_addr->f32[0]; + r.f32[1] = mem_addr->f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_loadl_pi(a, mem_addr) SIMDE__M128_TO_NATIVE(simde_mm_loadl_pi(SIMDE__M128_FROM_NATIVE(a), (simde__m64 const*) (mem_addr))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_loadr_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { + simde__m128 r; + + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_loadr_ps(mem_addr); +#else + r.f32[0] = mem_addr[3]; + r.f32[1] = mem_addr[2]; + r.f32[2] = mem_addr[1]; + 
r.f32[3] = mem_addr[0]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_loadr_ps(mem_addr) SIMDE__M128_TO_NATIVE(simde_mm_loadr_ps(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_loadu_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_loadu_ps(mem_addr); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vld1q_f32(mem_addr); +#else + r.f32[0] = mem_addr[0]; + r.f32[1] = mem_addr[1]; + r.f32[2] = mem_addr[2]; + r.f32[3] = mem_addr[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_loadu_ps(mem_addr) SIMDE__M128_TO_NATIVE(simde_mm_loadu_ps(mem_addr)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_maskmove_si64 (simde__m64 a, simde__m64 mask, char* mem_addr) { +#if defined(SIMDE_SSE_NATIVE) + _mm_maskmove_si64(a.n, mask.n, mem_addr); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(a.i8) / sizeof(a.i8[0])) ; i++) + if (mask.i8[i] < 0) + mem_addr[i] = a.i8[i]; +#endif +} +#define simde_m_maskmovq(a, mask, mem_addr) simde_mm_maskmove_si64(a, mask, mem_addr) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_maskmove_si64(a, mask, mem_addr) simde_mm_maskmove_si64(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(mask), mem_addr) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_max_pi16 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_max_pi16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i]; + } +#endif + + return r; +} +#define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_max_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_max_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pmaxsw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_max_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_max_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_max_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vmaxq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = (a.f32[i] > b.f32[i]) ? a.f32[i] : b.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_max_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_max_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_max_pu8 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_max_pu8(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + r.u8[i] = (a.u8[i] > b.u8[i]) ? 
a.u8[i] : b.u8[i]; + } +#endif + + return r; +} +#define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_max_pu8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_max_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pmaxub(a, b) SIMDE__M64_TO_NATIVE(simde_mm_max_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_max_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_max_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32_t value = vgetq_lane_f32(vmaxq_f32(a.neon_f32, b.neon_f32), 0); + r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#else + r.f32[0] = (a.f32[0] > b.f32[0]) ? a.f32[0] : b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_max_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_max_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_min_pi16 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_min_pi16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i]; + } +#endif + + return r; +} +#define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_min_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_min_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pminsw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_min_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_min_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_min_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vminq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = (a.f32[i] < b.f32[i]) ? a.f32[i] : b.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_min_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_min_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_min_pu8 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_min_pu8(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i]; + } +#endif + + return r; +} +#define simde_m_pminub(a, b) simde_mm_min_pu8(a, b) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_min_pu8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_min_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_pminub(a, b) SIMDE__M64_TO_NATIVE(simde_mm_min_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_min_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_min_ss(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32_t value = vgetq_lane_f32(vminq_f32(a.neon_f32, b.neon_f32), 0); + r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#else + r.f32[0] = (a.f32[0] < b.f32[0]) ? 
a.f32[0] : b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_min_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_min_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_move_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_move_ss(a.n, b.n); +#else + r.f32[0] = b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_move_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_move_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_movehl_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_movehl_ps(a.n, b.n); +#else + r.f32[0] = b.f32[2]; + r.f32[1] = b.f32[3]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_movehl_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_movehl_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_movelh_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_movelh_ps(a.n, b.n); +#else + r.f32[0] = a.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = b.f32[0]; + r.f32[3] = b.f32[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_movelh_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_movelh_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_movemask_pi8 (simde__m64 a) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_movemask_pi8(a.n); +#else + int r = 0; + const size_t nmemb = sizeof(a.i8) / sizeof(a.i8[0]); + + SIMDE__VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < nmemb ; i++) { + r |= (a.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i); + } + + return r; +#endif +} +#define simde_m_pmovmskb(a, b) simde_mm_movemask_pi8(a, b) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_movemask_pi8(a) simde_mm_movemask_pi8(SIMDE__M64_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_movemask_ps (simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_movemask_ps(a.n); +#elif defined(SIMDE_SSE_NEON) + /* TODO: check to see if NEON version is faster than the portable version */ + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + uint32x4_t t0 = a.neon_u32; + uint32x4_t t1 = vtstq_u32(t0, highbit); + uint32x4_t t2 = vandq_u32(t1, movemask); + uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); + return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); +#else + int r = 0; + + SIMDE__VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < sizeof(a.u32) / sizeof(a.u32[0]) ; i++) { + r |= (a.u32[i] >> ((sizeof(a.u32[i]) * CHAR_BIT) - 1)) << i; + } + + return r; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_movemask_ps(a) simde_mm_movemask_ps(SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_mul_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_mul_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vmulq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE 
+ for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = a.f32[i] * b.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_mul_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_mul_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_mul_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_mul_ss(a.n, b.n); +#else + r.f32[0] = a.f32[0] * b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_mul_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_mul_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_mulhi_pu16 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_mulhi_pu16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u16) / sizeof(r.u16[0])) ; i++) { + r.u16[i] = (a.u16[i] * b.u16[i]) >> 16; + } +#endif + + return r; +} +#define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_mulhi_pu16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_mulhi_pu16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_or_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_or_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u32) / sizeof(r.u32[0])) ; i++) { + r.u32[i] = a.u32[i] | b.u32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_or_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_or_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_prefetch (char const* p, int i) { + (void) p; + (void) i; +} +#if defined(SIMDE_SSE_NATIVE) +# define simde_mm_prefetch(p, i) _mm_prefetch(p, i) +#endif +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_prefetch(p, i) SIMDE__M128_TO_NATIVE(simde_mm_prefetch(p, i)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_rcp_ps (simde__m128 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_rcp_ps(a.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t recip = vrecpeq_f32(a.neon_f32); + +# if !defined(SIMDE_MM_RCP_PS_ITERS) +# define SIMDE_MM_RCP_PS_ITERS SIMDE_ACCURACY_ITERS +# endif + + for (int i = 0; i < SIMDE_MM_RCP_PS_ITERS ; ++i) { + recip = vmulq_f32(recip, vrecpsq_f32(recip, a.neon_f32)); + } + + r.neon_f32 = recip; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = 1.0f / a.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_rcp_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_rcp_ps(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_rcp_ss (simde__m128 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_rcp_ss(a.n); +#else + r.f32[0] = 1.0f / a.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_rcp_ss(a) SIMDE__M128_TO_NATIVE(simde_mm_rcp_ss(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 
+simde_mm_rsqrt_ps (simde__m128 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_rsqrt_ps(a.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vrsqrteq_f32(a.neon_f32); +#elif defined(__STDC_IEC_559__) + /* http://h14s.p5r.org/2012/09/0x5f3759df.html?mwh=1 */ + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.i32[i] = INT32_C(0x5f3759df) - (a.i32[i] >> 1); + +#if SIMDE_ACCURACY_ITERS > 2 + const float half = SIMDE_FLOAT32_C(0.5) * a.f32[i]; + for (int ai = 2 ; ai < SIMDE_ACCURACY_ITERS ; ai++) + r.f32[i] *= SIMDE_FLOAT32_C(1.5) - (half * r.f32[i] * r.f32[i]); +#endif + } +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = 1.0f / sqrtf(a.f32[i]); + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_rsqrt_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_rsqrt_ps(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_rsqrt_ss (simde__m128 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_rsqrt_ss(a.n); +#elif defined(__STDC_IEC_559__) + { + r.i32[0] = INT32_C(0x5f3759df) - (a.i32[0] >> 1); + +#if SIMDE_ACCURACY_ITERS > 2 + float half = SIMDE_FLOAT32_C(0.5) * a.f32[0]; + for (int ai = 2 ; ai < SIMDE_ACCURACY_ITERS ; ai++) + r.f32[0] *= SIMDE_FLOAT32_C(1.5) - (half * r.f32[0] * r.f32[0]); +#endif + } + r.f32[0] = 1.0f / sqrtf(a.f32[0]); + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#else + r.f32[0] = 1.0f / sqrtf(a.f32[0]); + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_rsqrt_ss(a) SIMDE__M128_TO_NATIVE(simde_mm_rsqrt_ss(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sad_pu8 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sad_pu8(a.n, b.n); +#else + uint16_t sum = 0; + + SIMDE__VECTORIZE_REDUCTION(+:sum) + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + sum += (uint8_t) abs(a.u8[i] - b.u8[i]); + } + + r.i16[0] = (int16_t) sum; + r.i16[1] = 0; + r.i16[2] = 0; + r.i16[3] = 0; +#endif + + return r; +} +#define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_sad_pu8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sad_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +# define _m_psadbw(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sad_pu8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_set_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_set_ps(e3, e2, e1, e0); +#elif defined(SIMDE_SSE_NEON) + SIMDE_ALIGN(16) simde_float32 data[4] = { e0, e1, e2, e3 }; + r.neon_f32 = vld1q_f32(data); +#else + r.f32[0] = e0; + r.f32[1] = e1; + r.f32[2] = e2; + r.f32[3] = e3; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_set_ps(e3, e2, e1, e0) SIMDE__M128_TO_NATIVE(simde_mm_set_ps(e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_set_ps1 (simde_float32 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_set_ps1(a); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vdupq_n_f32(a); +#else + r = simde_mm_set_ps(a, a, a, a); +#endif + + return r; +} +#define simde_mm_set1_ps(a) simde_mm_set_ps1(a) +#if 
defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_set_ps1(a) SIMDE__M128_TO_NATIVE(simde_mm_set_ps1(a)) +# define _mm_set1_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_set1_ps(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_set_ss (simde_float32 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_set_ss(a); +#else + r = simde_mm_set_ps(0, 0, 0, a); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_set_ss(a) SIMDE__M128_TO_NATIVE(simde_mm_set_ss(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_setr_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_setr_ps(e3, e2, e1, e0); +#elif defined(SIMDE_SSE_NEON) + SIMDE_ALIGN(16) simde_float32 data[4] = { e3, e2, e1, e0 }; + r.neon_f32 = vld1q_f32(data); +#else + r = simde_mm_set_ps(e0, e1, e2, e3); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_setr_ps(e3, e2, e1, e0) SIMDE__M128_TO_NATIVE(simde_mm_setr_ps(e3, e2, e1, e0)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_setzero_ps (void) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_setzero_ps(); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vdupq_n_f32(0.0f); +#else + r = simde_mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_setzero_ps() SIMDE__M128_TO_NATIVE(simde_mm_setzero_ps()) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_sfence (void) { + /* TODO: Use Hedley. */ +#if defined(SIMDE_SSE_NATIVE) + _mm_sfence(); +#elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) +# if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +# else + atomic_thread_fence(memory_order_seq_cst); +# endif +#elif defined(_MSC_VER) + MemoryBarrier(); +#elif HEDLEY_HAS_EXTENSION(c_atomic) + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) + __sync_synchronize(); +#elif (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x5140)) || (defined(__SUNPRO_CC) && (__SUNPRO_CC >= 0x5140)) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif defined(_OPENMP) +# pragma omp critical(simde_mm_sfence_) + { } +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_sfence() simde_mm_sfence() +#endif + +#define SIMDE_MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w) +#endif + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) +# define simde_mm_shuffle_pi16(a, imm8) SIMDE__M64_FROM_NATIVE(_mm_shuffle_pi16(a.n, imm8)) +#elif defined(SIMDE__SHUFFLE_VECTOR) +# define simde_mm_shuffle_pi16(a, imm8) (__extension__ ({ \ + const simde__m64 simde__tmp_a_ = a; \ + (simde__m64) { .i16 = \ + SIMDE__SHUFFLE_VECTOR(16, 8, \ + (simde__tmp_a_).i16, \ + (simde__tmp_a_).i16, \ + (((imm8) ) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3), \ + (((imm8) >> 6) & 3)) }; })) +#else +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_shuffle_pi16 (simde__m64 a, const int imm8) { + simde__m64 r; + + for (size_t i = 0 ; i < sizeof(r.i16) / sizeof(r.i16[0]) ; 
i++) { + r.i16[i] = a.i16[(imm8 >> (i * 2)) & 3]; + } + +HEDLEY_DIAGNOSTIC_PUSH +#if HEDLEY_HAS_WARNING("-Wconditional-uninitialized") +# pragma clang diagnostic ignored "-Wconditional-uninitialized" +#endif + return r; +HEDLEY_DIAGNOSTIC_POP +} +#endif +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) +# define simde_m_pshufw(a, imm8) SIMDE__M64_FROM_NATIVE(_m_pshufw(a.n, imm8)) +#else +# define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8) +#endif +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_shuffle_pi16(a, imm8) SIMDE__M64_TO_NATIVE(simde_mm_shuffle_pi16(SIMDE__M64_FROM_NATIVE(a), imm8)) +# define _m_pshufw(a, imm8) SIMDE__M64_TO_NATIVE(simde_mm_shuffle_pi16(SIMDE__M64_FROM_NATIVE(a), imm8)) +#endif + +#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) +# define simde_mm_shuffle_ps(a, b, imm8) SIMDE__M128_FROM_NATIVE(_mm_shuffle_ps(a.n, b.n, imm8)) +#elif defined(SIMDE__SHUFFLE_VECTOR) +# define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ + (simde__m128) { .f32 = \ + SIMDE__SHUFFLE_VECTOR(32, 16, \ + (a).f32, \ + (b).f32, \ + (((imm8) ) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }; })) +#else +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8) { + simde__m128 r; + r.f32[0] = a.f32[(imm8 >> 0) & 3]; + r.f32[1] = a.f32[(imm8 >> 2) & 3]; + r.f32[2] = b.f32[(imm8 >> 4) & 3]; + r.f32[3] = b.f32[(imm8 >> 6) & 3]; + return r; +} +#endif +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_shuffle_ps(a, b, imm8) SIMDE__M128_TO_NATIVE(simde_mm_shuffle_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b), imm8)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_sqrt_ps (simde__m128 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sqrt_ps(a.n); +#elif defined(SIMDE_SSE_NEON) + float32x4_t recipsq = vrsqrteq_f32(a.neon_f32); + float32x4_t sq = vrecpeq_f32(recipsq); + /* ??? use step versions of both sqrt and recip for better accuracy? 
*/ + r.neon_f32 = sq; +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < sizeof(r.f32) / sizeof(r.f32[0]) ; i++) { + r.f32[i] = sqrtf(a.f32[i]); + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_sqrt_ps(a) SIMDE__M128_TO_NATIVE(simde_mm_sqrt_ps(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_sqrt_ss (simde__m128 a) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sqrt_ss(a.n); +#elif defined(SIMDE_SSE_NEON) + float32_t value = vgetq_lane_f32(simde_mm_sqrt_ps(a).neon_f32, 0); + r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#else + r.f32[0] = sqrtf(a.f32[0]); + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_sqrt_ss(a) SIMDE__M128_TO_NATIVE(simde_mm_sqrt_ss(SIMDE__M128_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_store_ps (simde_float32 mem_addr[4], simde__m128 a) { + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_store_ps(mem_addr, a.n); +#elif defined(SIMDE_SSE_NEON) + vst1q_f32(mem_addr, a.neon_f32); +#else + SIMDE__VECTORIZE_ALIGNED(mem_addr:16) + for (size_t i = 0 ; i < sizeof(a.f32) / sizeof(a.f32[0]) ; i++) { + mem_addr[i] = a.f32[i]; + } +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_store_ps(mem_addr, a) simde_mm_store_ps(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_store_ps1 (simde_float32 mem_addr[4], simde__m128 a) { + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_store_ps1(mem_addr, a.n); +#else + SIMDE__VECTORIZE_ALIGNED(mem_addr:16) + for (size_t i = 0 ; i < sizeof(a.f32) / sizeof(a.f32[0]) ; i++) { + mem_addr[i] = a.f32[0]; + } +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_store_ps1(mem_addr, a) simde_mm_store_ps1(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_store_ss (simde_float32* mem_addr, simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) + _mm_store_ss(mem_addr, a.n); +#elif defined(SIMDE_SSE_NEON) + vst1q_lane_f32(mem_addr, a.neon_f32, 0); +#else + *mem_addr = a.f32[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_store_ss(mem_addr, a) simde_mm_store_ss(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_store1_ps (simde_float32 mem_addr[4], simde__m128 a) { + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_store1_ps(mem_addr, a.n); +#else + simde_mm_store_ps1(mem_addr, a); +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_store1_ps(mem_addr, a) simde_mm_store1_ps(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storeh_pi (simde__m64* mem_addr, simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) + _mm_storeh_pi(&(mem_addr->n), a.n); +#else + mem_addr->f32[0] = a.f32[2]; + mem_addr->f32[1] = a.f32[3]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storel_pi (simde__m64* mem_addr, simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) + _mm_storel_pi(&(mem_addr->n), a.n); +#else + mem_addr->f32[0] = a.f32[0]; + mem_addr->f32[1] = a.f32[1]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_storel_pi(mem_addr, a) 
simde_mm_storel_pi(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storer_ps (simde_float32 mem_addr[4], simde__m128 a) { + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_storer_ps(mem_addr, a.n); +#else + SIMDE__VECTORIZE_ALIGNED(mem_addr:16) + for (size_t i = 0 ; i < sizeof(a.f32) / sizeof(a.f32[0]) ; i++) { + mem_addr[i] = a.f32[((sizeof(a.f32) / sizeof(a.f32[0])) - 1) - i]; + } +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_storer_ps(mem_addr, a) simde_mm_storer_ps(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_storeu_ps (simde_float32 mem_addr[4], simde__m128 a) { +#if defined(SIMDE_SSE_NATIVE) + _mm_storeu_ps(mem_addr, a.n); +#elif defined(SIMDE_SSE_NEON) + vst1q_f32(mem_addr, a.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < sizeof(a.f32) / sizeof(a.f32[0]) ; i++) { + mem_addr[i] = a.f32[i]; + } +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_storeu_ps(mem_addr, a) simde_mm_storeu_ps(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_sub_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sub_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_f32 = vsubq_f32(a.neon_f32, b.neon_f32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.f32) / sizeof(r.f32[0])) ; i++) { + r.f32[i] = a.f32[i] - b.f32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_sub_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_sub_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_sub_ss (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_sub_ss(a.n, b.n); +#else + r.f32[0] = a.f32[0] - b.f32[0]; + r.f32[1] = a.f32[1]; + r.f32[2] = a.f32[2]; + r.f32[3] = a.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_sub_ss(a, b) SIMDE__M128_TO_NATIVE(simde_mm_sub_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomieq_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomieq_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] == b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomige_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomige_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] >= b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomigt_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomigt_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] > b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_ucomigt_ss(a, b) 
simde_mm_ucomigt_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomile_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomile_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] <= b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomilt_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomilt_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] < b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +int +simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_ucomineq_ss(a.n, b.n); +#else + fenv_t envp; + int x = feholdexcept(&envp); + int r = a.f32[0] != b.f32[0]; + if (HEDLEY_LIKELY(x == 0)) + fesetenv(&envp); + return r; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b)) +#endif + +#if defined(SIMDE_SSE_NATIVE) +# if defined(__has_builtin) +# if __has_builtin(__builtin_ia32_undef128) +# define SIMDE__HAVE_UNDEFINED128 +# endif +# elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) +# define SIMDE__HAVE_UNDEFINED128 +# endif +#endif + +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_undefined_ps (void) { + simde__m128 r; + +#if defined(SIMDE__HAVE_UNDEFINED128) + r.n = _mm_undefined_ps(); +#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + r = simde_mm_setzero_ps(); +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_undefined_ps() SIMDE__M128_TO_NATIVE(simde_mm_undefined_ps()) +#endif + +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + HEDLEY_DIAGNOSTIC_POP +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_unpackhi_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + float32x2_t a1 = vget_high_f32(a.neon_f32); + float32x2_t b1 = vget_high_f32(b.neon_f32); + float32x2x2_t result = vzip_f32(a1, b1); + r.neon_f32 = vcombine_f32(result.val[0], result.val[1]); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, b.f32, 2, 6, 3, 7); +#else + r.f32[0] = a.f32[2]; + r.f32[1] = b.f32[2]; + r.f32[2] = a.f32[3]; + r.f32[3] = b.f32[3]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_unpackhi_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_unpackhi_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_unpacklo_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_unpacklo_ps(a.n, b.n); +#elif defined(SIMDE__SHUFFLE_VECTOR) + r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, b.f32, 0, 4, 1, 5); +#elif defined(SIMDE_SSE_NEON) + float32x2_t a1 = 
vget_low_f32(a.neon_f32); + float32x2_t b1 = vget_low_f32(b.neon_f32); + float32x2x2_t result = vzip_f32(a1, b1); + r.neon_f32 = vcombine_f32(result.val[0], result.val[1]); +#else + r.f32[0] = a.f32[0]; + r.f32[1] = b.f32[0]; + r.f32[2] = a.f32[1]; + r.f32[3] = b.f32[1]; +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_unpacklo_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_unpacklo_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_xor_ps (simde__m128 a, simde__m128 b) { + simde__m128 r; + +#if defined(SIMDE_SSE_NATIVE) + r.n = _mm_xor_ps(a.n, b.n); +#elif defined(SIMDE_SSE_NEON) + r.neon_i32 = veorq_s32(a.neon_i32, b.neon_i32); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.u32) / sizeof(r.u32[0])) ; i++) { + r.u32[i] = a.u32[i] ^ b.u32[i]; + } +#endif + + return r; +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_xor_ps(a, b) SIMDE__M128_TO_NATIVE(simde_mm_xor_ps(SIMDE__M128_FROM_NATIVE(a), SIMDE__M128_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) { +#if defined(SIMDE_SSE_NATIVE) + _mm_stream_pi(&(mem_addr->n), a.n); +#else + mem_addr->i64[0] = a.i64[0]; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) { + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_SSE_NATIVE) + _mm_stream_ps(mem_addr, a.n); +#else + SIMDE__ASSUME_ALIGNED(mem_addr, 16); + memcpy(mem_addr, &a, sizeof(a)); +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_stream_ps(mem_addr, a) simde_mm_stream_ps(mem_addr, SIMDE__M128_FROM_NATIVE(a)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +uint32_t +simde_mm_getcsr (void) { +#if defined(SIMDE_SSE_NATIVE) + return _mm_getcsr(); +#else + uint32_t r = 0; + int rounding_mode = fegetround(); + + switch(rounding_mode) { +#if defined(FE_TONEAREST) + case FE_TONEAREST: + break; +#endif +#if defined(FE_UPWARD) + case FE_UPWARD: + r |= 2 << 13; + break; +#endif +#if defined(FE_DOWNWARD) + case FE_DOWNWARD: + r |= 1 << 13; + break; +#endif +#if defined(FE_TOWARDZERO) + case FE_TOWARDZERO: + r = 3 << 13; + break; +#endif + } + + return r; +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_getcsr() simde_mm_getcsr() +#endif + +SIMDE__FUNCTION_ATTRIBUTES +void +simde_mm_setcsr (uint32_t a) { +#if defined(SIMDE_SSE_NATIVE) + _mm_setcsr(a); +#else + switch((a >> 13) & 3) { +#if defined(FE_TONEAREST) + case 0: + fesetround(FE_TONEAREST); +#endif +#if defined(FE_DOWNWARD) + break; + case 1: + fesetround(FE_DOWNWARD); +#endif +#if defined(FE_UPWARD) + break; + case 2: + fesetround(FE_UPWARD); +#endif +#if defined(FE_TOWARDZERO) + break; + case 3: + fesetround(FE_TOWARDZERO); + break; +#endif + } +#endif +} +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _mm_setcsr(a) simde_mm_setcsr(a) +#endif + +#define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + simde__m128 tmp3, tmp2, tmp1, tmp0; \ + tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \ + tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \ + tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \ + tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \ + row0 = simde_mm_movelh_ps(tmp0, tmp2); \ + row1 = simde_mm_movehl_ps(tmp2, tmp0); \ + row2 = simde_mm_movelh_ps(tmp1, tmp3); \ + row3 = 
simde_mm_movehl_ps(tmp3, tmp1); \ + } while (0) + +SIMDE__END_DECLS +#if defined(SIMDE_SSE_ENABLE_NATIVE_ALIASES) +# define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) +#endif + +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE__SSE_H) */ diff -Nru minimap2-2.17+dfsg/debian/include/simde/x86/ssse3.h minimap2-2.17+dfsg/debian/include/simde/x86/ssse3.h --- minimap2-2.17+dfsg/debian/include/simde/x86/ssse3.h 1970-01-01 00:00:00.000000000 +0000 +++ minimap2-2.17+dfsg/debian/include/simde/x86/ssse3.h 2020-01-12 17:22:11.000000000 +0000 @@ -0,0 +1,747 @@ +/* Copyright (c) 2017 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(SIMDE__SSSE3_H) +# if !defined(SIMDE__SSSE3_H) +# define SIMDE__SSSE3_H +# endif +# include "sse3.h" + +# if defined(SIMDE_SSSE3_NATIVE) +# undef SIMDE_SSSE3_NATIVE +# endif +# if defined(SIMDE_SSSE3_FORCE_NATIVE) +# define SIMDE_SSSE3_NATIVE +# elif defined(__SSSE3__) && !defined(SIMDE_SSSE3_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +# define SIMDE_SSSE3_NATIVE +# elif defined(__ARM_NEON) && !defined(SIMDE_SSSE3_NO_NEON) && !defined(SIMDE_NO_NEON) +# define SIMDE_SSSE3_NEON +# endif + +# if defined(SIMDE_SSSE3_NATIVE) && !defined(SIMDE_SSE3_NATIVE) +# if defined(SIMDE_SSSE3_FORCE_NATIVE) +# error Native SSSE3 support requires native SSE3 support +# else +# warning Native SSSE3 support requires native SSE3 support, disabling +# undef SIMDE_SSSE3_NATIVE +# endif +# elif defined(SIMDE_SSSE3_NEON) && !defined(SIMDE_SSE3_NEON) +# warning SSSE3 NEON support requires SSE3 NEON support, disabling +# undef SIMDE_SSSE3_NEON +# endif + +# if defined(SIMDE_SSSE3_NATIVE) +# include +# else +# if defined(SIMDE_SSSE3_NEON) +# include +# endif +# endif + +#if !defined(SIMDE_SSSE3_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +# define SIMDE_SSSE3_ENABLE_NATIVE_ALIASES +#endif + +SIMDE__BEGIN_DECLS + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_abs_epi8 (simde__m128i a) { + simde__m128i r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_abs_epi8(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a.i8[i] < 0) ? 
(- a.i8[i]) : a.i8[i]); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_abs_epi8(a) SIMDE__M128I_TO_NATIVE(simde_mm_abs_epi8(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_abs_epi16 (simde__m128i a) { + simde__m128i r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_abs_epi16(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a.i16[i] < 0) ? (- a.i16[i]) : a.i16[i]); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_abs_epi16(a) SIMDE__M128I_TO_NATIVE(simde_mm_abs_epi16(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_abs_epi32 (simde__m128i a) { + simde__m128i r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_abs_epi32(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a.i32[i] < 0) ? (- a.i32[i]) : a.i32[i]); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_abs_epi32(a) SIMDE__M128I_TO_NATIVE(simde_mm_abs_epi32(SIMDE__M128I_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_abs_pi8 (simde__m64 a) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_abs_pi8(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a.i8[i] < 0) ? (- a.i8[i]) : a.i8[i]); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_abs_pi8(a) SIMDE__M64_TO_NATIVE(simde_mm_abs_pi8(SIMDE__M64_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_abs_pi16 (simde__m64 a) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_abs_pi16(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a.i16[i] < 0) ? (- a.i16[i]) : a.i16[i]); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_abs_pi16(a) SIMDE__M64_TO_NATIVE(simde_mm_abs_pi16(SIMDE__M64_FROM_NATIVE(a))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_abs_pi32 (simde__m64 a) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_abs_pi32(a.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a.i32[i] < 0) ? 
(- a.i32[i]) : a.i32[i]); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_abs_pi32(a) SIMDE__M64_TO_NATIVE(simde_mm_abs_pi32(SIMDE__M64_FROM_NATIVE(a))) +#endif + +#if defined(simde_mm_alignr_epi8) +# undef simde_mm_alignr_epi8 +#endif +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_alignr_epi8 (simde__m128i a, simde__m128i b, int count) { + simde__m128i r; + const int bits = (8 * count) % 64; + const int eo = count / 8; + + switch (eo) { + case 0: + r.u64[0] = b.u64[0] >> bits; + r.u64[0] |= b.u64[1] << (64 - bits); + r.u64[1] = b.u64[1] >> bits; + r.u64[1] |= a.u64[0] << (64 - bits); + break; + case 1: + r.u64[0] = b.u64[1] >> bits; + r.u64[0] |= a.u64[0] << (64 - bits); + r.u64[1] = a.u64[0] >> bits; + r.u64[1] |= a.u64[1] << (64 - bits); + break; + case 2: + r.u64[0] = a.u64[0] >> bits; + r.u64[0] |= a.u64[1] << (64 - bits); + r.u64[1] = a.u64[1] >> bits; + break; + case 3: + r.u64[0] = a.u64[1] >> bits; + r.u64[1] = 0; + break; + default: + HEDLEY_UNREACHABLE(); + break; + } + + return r; +} +#if defined(SIMDE_SSSE3_NATIVE) +# define simde_mm_alignr_epi8(a, b, count) SIMDE__M128I_FROM_NATIVE(_mm_alignr_epi8(a.n, b.n, count)) +#endif +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_alignr_epi8(a, b, count) SIMDE__M128I_TO_NATIVE(simde_mm_alignr_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b), count)) +#endif + +#if defined(simde_mm_alignr_pi8) +# undef simde_mm_alignr_pi8 +#endif +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_alignr_pi8 (simde__m64 a, simde__m64 b, const int count) { + simde__m64 r; + +#if defined(SIMDE__HAVE_INT128) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_INT128 + unsigned __int128 t = a.u64[0]; + t <<= 64; + t |= b.u64[0]; + t >>= count * 8; + r.u64[0] = HEDLEY_STATIC_CAST(uint64_t, t); +HEDLEY_DIAGNOSTIC_POP +#else + const int cb = count * 8; + + if (cb > 64) { + r.u64[0] = a.u64[0] >> (cb - 64); + } else { + r.u64[0] = (a.u64[0] << (64 - cb)) | (b.u64[0] >> cb); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_NATIVE) +# define simde_mm_alignr_pi8(a, b, count) SIMDE__M64_FROM_NATIVE(_mm_alignr_pi8(a.n, b.n, count)) +#endif +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_alignr_pi8(a, b, count) SIMDE__M64_TO_NATIVE(simde_mm_alignr_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b), count)) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_shuffle_epi8 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_shuffle_epi8(a.n, b.n)); +#else + simde__m128i r; + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + r.u8[i] = a.u8[b.u8[i] & 15] * ((~(b.u8[i]) >> 7) & 1); + } + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_shuffle_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_shuffle_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_shuffle_pi8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_shuffle_pi8(a.n, b.n)); +#else + simde__m64 r; + for (size_t i = 0 ; i < (sizeof(r.u8) / sizeof(r.u8[0])) ; i++) { + r.u8[i] = a.u8[b.u8[i] & 7] * ((~(b.u8[i]) >> 7) & 1); + } + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_shuffle_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_shuffle_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_hadd_epi16 
(simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_hadd_epi16(a.n, b.n)); +#else + simde__m128i r; + + r.i16[0] = a.i16[0] + a.i16[1]; + r.i16[1] = a.i16[2] + a.i16[3]; + r.i16[2] = a.i16[4] + a.i16[5]; + r.i16[3] = a.i16[6] + a.i16[7]; + r.i16[4] = b.i16[0] + b.i16[1]; + r.i16[5] = b.i16[2] + b.i16[3]; + r.i16[6] = b.i16[4] + b.i16[5]; + r.i16[7] = b.i16[6] + b.i16[7]; + + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hadd_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_hadd_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_hadd_epi32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_hadd_epi32(a.n, b.n)); +#else + simde__m128i r; + + r.i32[0] = a.i32[0] + a.i32[1]; + r.i32[1] = a.i32[2] + a.i32[3]; + r.i32[2] = b.i32[0] + b.i32[1]; + r.i32[3] = b.i32[2] + b.i32[3]; + + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hadd_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_hadd_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_hadd_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_hadd_pi16(a.n, b.n)); +#else + simde__m64 r; + + r.i16[0] = a.i16[0] + a.i16[1]; + r.i16[1] = a.i16[2] + a.i16[3]; + r.i16[2] = b.i16[0] + b.i16[1]; + r.i16[3] = b.i16[2] + b.i16[3]; + + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hadd_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_hadd_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_hadd_pi32 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_hadd_pi32(a.n, b.n); +#else + r.i32[0] = a.i32[0] + a.i32[1]; + r.i32[1] = b.i32[0] + b.i32[1]; +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hadd_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_hadd_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_hadds_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_hadds_epi16(a.n, b.n); +#else + for (size_t i = 0 ; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2) ; i++) { + int32_t ta = HEDLEY_STATIC_CAST(int32_t, a.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, a.i16[(i * 2) + 1]); + r.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN; + int32_t tb = HEDLEY_STATIC_CAST(int32_t, b.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, b.i16[(i * 2) + 1]); + r.i16[i + 4] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? 
HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN; + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hadds_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_hadds_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_hadds_pi16 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_hadds_pi16(a.n, b.n); +#else + for (size_t i = 0 ; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2) ; i++) { + int32_t ta = HEDLEY_STATIC_CAST(int32_t, a.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, a.i16[(i * 2) + 1]); + r.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN; + int32_t tb = HEDLEY_STATIC_CAST(int32_t, b.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, b.i16[(i * 2) + 1]); + r.i16[i + 2] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN; + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hadds_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_hadds_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_hsub_epi16 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_hsub_epi16(a.n, b.n)); +#else + simde__m128i r; + + r.i16[0] = a.i16[0] - a.i16[1]; + r.i16[1] = a.i16[2] - a.i16[3]; + r.i16[2] = a.i16[4] - a.i16[5]; + r.i16[3] = a.i16[6] - a.i16[7]; + r.i16[4] = b.i16[0] - b.i16[1]; + r.i16[5] = b.i16[2] - b.i16[3]; + r.i16[6] = b.i16[4] - b.i16[5]; + r.i16[7] = b.i16[6] - b.i16[7]; + + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hsub_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_hsub_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_hsub_epi32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_hsub_epi32(a.n, b.n)); +#else + simde__m128i r; + + r.i32[0] = a.i32[0] - a.i32[1]; + r.i32[1] = a.i32[2] - a.i32[3]; + r.i32[2] = b.i32[0] - b.i32[1]; + r.i32[3] = b.i32[2] - b.i32[3]; + + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hsub_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_hsub_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_hsub_pi16 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_hsub_pi16(a.n, b.n)); +#else + simde__m64 r; + + r.i16[0] = a.i16[0] - a.i16[1]; + r.i16[1] = a.i16[2] - a.i16[3]; + r.i16[2] = b.i16[0] - b.i16[1]; + r.i16[3] = b.i16[2] - b.i16[3]; + + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hsub_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_hsub_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_hsub_pi32 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_hsub_pi32(a.n, b.n); +#else + r.i32[0] = a.i32[0] - a.i32[1]; + r.i32[1] = b.i32[0] - b.i32[1]; +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hsub_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_hsub_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + 
+SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_hsubs_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_hsubs_epi16(a.n, b.n); +#else + for (size_t i = 0 ; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2) ; i++) { + int32_t ta = HEDLEY_STATIC_CAST(int32_t, a.i16[i * 2]) - HEDLEY_STATIC_CAST(int32_t, a.i16[(i * 2) + 1]); + r.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN; + int32_t tb = HEDLEY_STATIC_CAST(int32_t, b.i16[i * 2]) - HEDLEY_STATIC_CAST(int32_t, b.i16[(i * 2) + 1]); + r.i16[i + 4] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN; + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hsubs_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_hsubs_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_hsubs_pi16 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_hsubs_pi16(a.n, b.n); +#else + for (size_t i = 0 ; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2) ; i++) { + int32_t ta = HEDLEY_STATIC_CAST(int32_t, a.i16[i * 2]) - HEDLEY_STATIC_CAST(int32_t, a.i16[(i * 2) + 1]); + r.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN; + int32_t tb = HEDLEY_STATIC_CAST(int32_t, b.i16[i * 2]) - HEDLEY_STATIC_CAST(int32_t, b.i16[(i * 2) + 1]); + r.i16[i + 2] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN; + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_hsubs_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_hsubs_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maddubs_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_maddubs_epi16(a.n, b.n); +#else + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + const int idx = HEDLEY_STATIC_CAST(int, i) << 1; + int32_t ts = + (HEDLEY_STATIC_CAST(int16_t, a.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b.i8[ idx ])) + + (HEDLEY_STATIC_CAST(int16_t, a.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b.i8[idx + 1])); + r.i16[i] = HEDLEY_LIKELY(ts > INT16_MIN) ? (HEDLEY_LIKELY(ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN; + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_maddubs_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_maddubs_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_maddubs_pi16 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_maddubs_pi16(a.n, b.n); +#else + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + const int idx = HEDLEY_STATIC_CAST(int, i) << 1; + int32_t ts = + (HEDLEY_STATIC_CAST(int16_t, a.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b.i8[ idx ])) + + (HEDLEY_STATIC_CAST(int16_t, a.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b.i8[idx + 1])); + r.i16[i] = HEDLEY_LIKELY(ts > INT16_MIN) ? (HEDLEY_LIKELY(ts < INT16_MAX) ? 
HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN; + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_maddubs_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_maddubs_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mulhrs_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_mulhrs_epi16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b.i16[i])) + 0x4000) >> 15)); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_mulhrs_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_mulhrs_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_mulhrs_pi16 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_mulhrs_pi16(a.n, b.n); +#else + SIMDE__VECTORIZE + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b.i16[i])) + 0x4000) >> 15)); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_mulhrs_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_mulhrs_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sign_epi8 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_sign_epi8(a.n, b.n)); +#else + simde__m128i r; + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = (b.i8[i] < 0) ? (- a.i8[i]) : ((b.i8[i] > 0) ? (a.i8[i]) : INT8_C(0)); + } + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_sign_epi8(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_sign_epi8(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sign_epi16 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_sign_epi16(a.n, b.n)); +#else + simde__m128i r; + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (b.i16[i] < 0) ? (- a.i16[i]) : ((b.i16[i] > 0) ? (a.i16[i]) : INT16_C(0)); + } + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_sign_epi16(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_sign_epi16(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_sign_epi32 (simde__m128i a, simde__m128i b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M128I_FROM_NATIVE(_mm_sign_epi32(a.n, b.n)); +#else + simde__m128i r; + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (b.i32[i] < 0) ? (- a.i32[i]) : ((b.i32[i] > 0) ? 
(a.i32[i]) : INT32_C(0)); + } + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_sign_epi32(a, b) SIMDE__M128I_TO_NATIVE(simde_mm_sign_epi32(SIMDE__M128I_FROM_NATIVE(a), SIMDE__M128I_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sign_pi8 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sign_pi8(a.n, b.n)); +#else + simde__m64 r; + for (size_t i = 0 ; i < (sizeof(r.i8) / sizeof(r.i8[0])) ; i++) { + r.i8[i] = (b.i8[i] < 0) ? (- a.i8[i]) : ((b.i8[i] > 0) ? (a.i8[i]) : INT8_C(0)); + } + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_sign_pi8(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sign_pi8(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sign_pi16 (simde__m64 a, simde__m64 b) { + simde__m64 r; + +#if defined(SIMDE_SSSE3_NATIVE) + r.n = _mm_sign_pi16(a.n, b.n); +#else + for (size_t i = 0 ; i < (sizeof(r.i16) / sizeof(r.i16[0])) ; i++) { + r.i16[i] = (b.i16[i] < 0) ? (- a.i16[i]) : ((b.i16[i] > 0) ? (a.i16[i]) : INT16_C(0)); + } +#endif + + return r; +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_sign_pi16(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sign_pi16(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__FUNCTION_ATTRIBUTES +simde__m64 +simde_mm_sign_pi32 (simde__m64 a, simde__m64 b) { +#if defined(SIMDE_SSSE3_NATIVE) + return SIMDE__M64_FROM_NATIVE(_mm_sign_pi32(a.n, b.n)); +#else + simde__m64 r; + for (size_t i = 0 ; i < (sizeof(r.i32) / sizeof(r.i32[0])) ; i++) { + r.i32[i] = (b.i32[i] < 0) ? (- a.i32[i]) : ((b.i32[i] > 0) ? (a.i32[i]) : INT32_C(0)); + } + return r; +#endif +} +#if defined(SIMDE_SSSE3_ENABLE_NATIVE_ALIASES) +# define _mm_sign_pi32(a, b) SIMDE__M64_TO_NATIVE(simde_mm_sign_pi32(SIMDE__M64_FROM_NATIVE(a), SIMDE__M64_FROM_NATIVE(b))) +#endif + +SIMDE__END_DECLS + +#endif /* !defined(SIMDE__SSE2_H) */ diff -Nru minimap2-2.17+dfsg/debian/patches/series minimap2-2.17+dfsg/debian/patches/series --- minimap2-2.17+dfsg/debian/patches/series 2019-08-01 13:23:40.000000000 +0000 +++ minimap2-2.17+dfsg/debian/patches/series 2020-01-12 17:22:11.000000000 +0000 @@ -1,2 +1,3 @@ hardening.patch do_not_use_natbib.bst.patch +simde diff -Nru minimap2-2.17+dfsg/debian/patches/simde minimap2-2.17+dfsg/debian/patches/simde --- minimap2-2.17+dfsg/debian/patches/simde 1970-01-01 00:00:00.000000000 +0000 +++ minimap2-2.17+dfsg/debian/patches/simde 2020-01-12 17:22:11.000000000 +0000 @@ -0,0 +1,1354 @@ +Author: Michael R. 
Crusoe +Description: Add support for more architectures + +using the SIMD Everywhere library +--- minimap2.orig/ksw2_extd2_sse.c ++++ minimap2/ksw2_extd2_sse.c +@@ -3,81 +3,70 @@ + #include + #include "ksw2.h" + +-#ifdef __SSE2__ +-#include ++#include "debian/include/simde/x86/sse4.1.h" + +-#ifdef KSW_SSE2_ONLY +-#undef __SSE4_1__ +-#endif +- +-#ifdef __SSE4_1__ +-#include +-#endif +- +-#ifdef KSW_CPU_DISPATCH +-#ifdef __SSE4_1__ ++#if defined(SIMDE_SSE4_1_NATIVE) + void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) +-#else ++#elif defined(SIMDE_SSE2_NATIVE) + void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, +- int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) +-#endif ++ int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) + #else + void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, +- int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) +-#endif // ~KSW_CPU_DISPATCH ++ int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) ++#endif + { + #define __dp_code_block1 \ +- z = _mm_load_si128(&s[t]); \ +- xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ +- tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ +- xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ ++ z = simde_mm_load_si128(&s[t]); \ ++ xt1 = simde_mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ ++ tmp = simde_mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ ++ xt1 = simde_mm_or_si128(simde_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ + x1_ = tmp; \ +- vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ +- tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \ +- vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ ++ vt1 = simde_mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ ++ tmp = simde_mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \ ++ vt1 = simde_mm_or_si128(simde_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ + v1_ = tmp; \ +- a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ +- ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ +- b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \ +- x2t1= _mm_load_si128(&x2[t]); \ +- tmp = _mm_srli_si128(x2t1, 15); \ +- x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \ ++ a = simde_mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ ++ ut = simde_mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ ++ b = simde_mm_add_epi8(simde_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \ ++ x2t1= simde_mm_load_si128(&x2[t]); \ ++ tmp = simde_mm_srli_si128(x2t1, 15); \ ++ x2t1= simde_mm_or_si128(simde_mm_slli_si128(x2t1, 1), x21_); \ + x21_= tmp; \ +- a2= _mm_add_epi8(x2t1, vt1); \ +- b2= _mm_add_epi8(_mm_load_si128(&y2[t]), ut); ++ a2= simde_mm_add_epi8(x2t1, vt1); \ ++ b2= simde_mm_add_epi8(simde_mm_load_si128(&y2[t]), ut); + + #define __dp_code_block2 \ +- _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - 
v[r-1][t-1..t+14] */ \ +- _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ +- tmp = _mm_sub_epi8(z, q_); \ +- a = _mm_sub_epi8(a, tmp); \ +- b = _mm_sub_epi8(b, tmp); \ +- tmp = _mm_sub_epi8(z, q2_); \ +- a2= _mm_sub_epi8(a2, tmp); \ +- b2= _mm_sub_epi8(b2, tmp); ++ simde_mm_store_si128(&u[t], simde_mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ ++ simde_mm_store_si128(&v[t], simde_mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ ++ tmp = simde_mm_sub_epi8(z, q_); \ ++ a = simde_mm_sub_epi8(a, tmp); \ ++ b = simde_mm_sub_epi8(b, tmp); \ ++ tmp = simde_mm_sub_epi8(z, q2_); \ ++ a2= simde_mm_sub_epi8(a2, tmp); \ ++ b2= simde_mm_sub_epi8(b2, tmp); + + int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc, long_thres, long_diff; + int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); + int32_t *H = 0, H0 = 0, last_H0_t = 0; + uint8_t *qr, *sf, *mem, *mem2 = 0; +- __m128i q_, q2_, qe_, qe2_, zero_, sc_mch_, sc_mis_, m1_, sc_N_; +- __m128i *u, *v, *x, *y, *x2, *y2, *s, *p = 0; ++ simde__m128i q_, q2_, qe_, qe2_, zero_, sc_mch_, sc_mis_, m1_, sc_N_; ++ simde__m128i *u, *v, *x, *y, *x2, *y2, *s, *p = 0; + + ksw_reset_extz(ez); + if (m <= 1 || qlen <= 0 || tlen <= 0) return; + + if (q2 + e2 < q + e) t = q, q = q2, q2 = t, t = e, e = e2, e2 = t; // make sure q+e no larger than q2+e2 + +- zero_ = _mm_set1_epi8(0); +- q_ = _mm_set1_epi8(q); +- q2_ = _mm_set1_epi8(q2); +- qe_ = _mm_set1_epi8(q + e); +- qe2_ = _mm_set1_epi8(q2 + e2); +- sc_mch_ = _mm_set1_epi8(mat[0]); +- sc_mis_ = _mm_set1_epi8(mat[1]); +- sc_N_ = mat[m*m-1] == 0? _mm_set1_epi8(-e2) : _mm_set1_epi8(mat[m*m-1]); +- m1_ = _mm_set1_epi8(m - 1); // wildcard ++ zero_ = simde_mm_set1_epi8(0); ++ q_ = simde_mm_set1_epi8(q); ++ q2_ = simde_mm_set1_epi8(q2); ++ qe_ = simde_mm_set1_epi8(q + e); ++ qe2_ = simde_mm_set1_epi8(q2 + e2); ++ sc_mch_ = simde_mm_set1_epi8(mat[0]); ++ sc_mis_ = simde_mm_set1_epi8(mat[1]); ++ sc_N_ = mat[m*m-1] == 0? simde_mm_set1_epi8(-e2) : simde_mm_set1_epi8(mat[m*m-1]); ++ m1_ = simde_mm_set1_epi8(m - 1); // wildcard + + if (w < 0) w = tlen > qlen? 
tlen : qlen; + wl = wr = w; +@@ -97,7 +86,7 @@ + long_diff = long_thres * (e - e2) - (q2 - q) - e2; + + mem = (uint8_t*)kcalloc(km, tlen_ * 8 + qlen_ + 1, 16); +- u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned ++ u = (simde__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned + v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_, y2 = x2 + tlen_; + s = y2 + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16; + memset(u, -q - e, tlen_ * 16); +@@ -112,7 +101,7 @@ + } + if (with_cigar) { + mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16); +- p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4); ++ p = (simde__m128i*)(((size_t)mem2 + 15) >> 4 << 4); + off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2); + off_end = off + qlen + tlen - 1; + } +@@ -125,7 +114,7 @@ + int8_t x1, x21, v1; + uint8_t *qrr = qr + (qlen - 1 - r); + int8_t *u8 = (int8_t*)u, *v8 = (int8_t*)v, *x8 = (int8_t*)x, *x28 = (int8_t*)x2; +- __m128i x1_, x21_, v1_; ++ simde__m128i x1_, x21_, v1_; + // find the boundaries + if (st < r - qlen + 1) st = r - qlen + 1; + if (en > r) en = r; +@@ -156,160 +145,99 @@ + // loop fission: set scores first + if (!(flag & KSW_EZ_GENERIC_SC)) { + for (t = st0; t <= en0; t += 16) { +- __m128i sq, st, tmp, mask; +- sq = _mm_loadu_si128((__m128i*)&sf[t]); +- st = _mm_loadu_si128((__m128i*)&qrr[t]); +- mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_)); +- tmp = _mm_cmpeq_epi8(sq, st); +-#ifdef __SSE4_1__ +- tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp); +- tmp = _mm_blendv_epi8(tmp, sc_N_, mask); +-#else +- tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_)); +- tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_)); +-#endif +- _mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp); ++ simde__m128i sq, st, tmp, mask; ++ sq = simde_mm_loadu_si128((simde__m128i*)&sf[t]); ++ st = simde_mm_loadu_si128((simde__m128i*)&qrr[t]); ++ mask = simde_mm_or_si128(simde_mm_cmpeq_epi8(sq, m1_), simde_mm_cmpeq_epi8(st, m1_)); ++ tmp = simde_mm_cmpeq_epi8(sq, st); ++ tmp = simde_mm_blendv_epi8(sc_mis_, sc_mch_, tmp); ++ tmp = simde_mm_blendv_epi8(tmp, sc_N_, mask); ++ simde_mm_storeu_si128((simde__m128i*)((int8_t*)s + t), tmp); + } + } else { + for (t = st0; t <= en0; ++t) + ((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]]; + } + // core loop +- x1_ = _mm_cvtsi32_si128((uint8_t)x1); +- x21_ = _mm_cvtsi32_si128((uint8_t)x21); +- v1_ = _mm_cvtsi32_si128((uint8_t)v1); ++ x1_ = simde_mm_cvtsi32_si128((uint8_t)x1); ++ x21_ = simde_mm_cvtsi32_si128((uint8_t)x21); ++ v1_ = simde_mm_cvtsi32_si128((uint8_t)v1); + st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); + if (!with_cigar) { // score only + for (t = st_; t <= en_; ++t) { +- __m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; ++ simde__m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +-#ifdef __SSE4_1__ +- z = _mm_max_epi8(z, a); +- z = _mm_max_epi8(z, b); +- z = _mm_max_epi8(z, a2); +- z = _mm_max_epi8(z, b2); +- z = _mm_min_epi8(z, sc_mch_); ++ z = simde_mm_max_epi8(z, a); ++ z = simde_mm_max_epi8(z, b); ++ z = simde_mm_max_epi8(z, a2); ++ z = simde_mm_max_epi8(z, b2); ++ z = simde_mm_min_epi8(z, sc_mch_); + __dp_code_block2; // save u[] and v[]; update a, b, a2 and b2 +- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_)); +- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_)); +- _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, zero_), qe2_)); +- _mm_store_si128(&y2[t], 
_mm_sub_epi8(_mm_max_epi8(b2, zero_), qe2_)); +-#else +- tmp = _mm_cmpgt_epi8(a, z); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a)); +- tmp = _mm_cmpgt_epi8(b, z); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); +- tmp = _mm_cmpgt_epi8(a2, z); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); +- tmp = _mm_cmpgt_epi8(b2, z); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2)); +- tmp = _mm_cmplt_epi8(sc_mch_, z); +- z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); +- __dp_code_block2; +- tmp = _mm_cmpgt_epi8(a, zero_); +- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); +- tmp = _mm_cmpgt_epi8(b, zero_); +- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); +- tmp = _mm_cmpgt_epi8(a2, zero_); +- _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_)); +- tmp = _mm_cmpgt_epi8(b2, zero_); +- _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_)); +-#endif ++ simde_mm_store_si128(&x[t], simde_mm_sub_epi8(simde_mm_max_epi8(a, zero_), qe_)); ++ simde_mm_store_si128(&y[t], simde_mm_sub_epi8(simde_mm_max_epi8(b, zero_), qe_)); ++ simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(simde_mm_max_epi8(a2, zero_), qe2_)); ++ simde_mm_store_si128(&y2[t], simde_mm_sub_epi8(simde_mm_max_epi8(b2, zero_), qe2_)); + } + } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment +- __m128i *pr = p + (size_t)r * n_col_ - st_; ++ simde__m128i *pr = p + (size_t)r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { +- __m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; ++ simde__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +-#ifdef __SSE4_1__ +- d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0 +- z = _mm_max_epi8(z, a); +- d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d +- z = _mm_max_epi8(z, b); +- d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2, z)); // d = a2 > z? 3 : d +- z = _mm_max_epi8(z, a2); +- d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d +- z = _mm_max_epi8(z, b2); +- z = _mm_min_epi8(z, sc_mch_); +-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() +- tmp = _mm_cmpgt_epi8(a, z); +- d = _mm_and_si128(tmp, _mm_set1_epi8(1)); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a)); +- tmp = _mm_cmpgt_epi8(b, z); +- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2))); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); +- tmp = _mm_cmpgt_epi8(a2, z); +- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3))); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); +- tmp = _mm_cmpgt_epi8(b2, z); +- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4))); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2)); +- tmp = _mm_cmplt_epi8(sc_mch_, z); +- z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); +-#endif ++ d = simde_mm_and_si128(simde_mm_cmpgt_epi8(a, z), simde_mm_set1_epi8(1)); // d = a > z? 1 : 0 ++ z = simde_mm_max_epi8(z, a); ++ d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(2), simde_mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d ++ z = simde_mm_max_epi8(z, b); ++ d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(3), simde_mm_cmpgt_epi8(a2, z)); // d = a2 > z? 
3 : d ++ z = simde_mm_max_epi8(z, a2); ++ d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(4), simde_mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d ++ z = simde_mm_max_epi8(z, b2); ++ z = simde_mm_min_epi8(z, sc_mch_); + __dp_code_block2; +- tmp = _mm_cmpgt_epi8(a, zero_); +- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); +- d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 +- tmp = _mm_cmpgt_epi8(b, zero_); +- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); +- d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 +- tmp = _mm_cmpgt_epi8(a2, zero_); +- _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_)); +- d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 +- tmp = _mm_cmpgt_epi8(b2, zero_); +- _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_)); +- d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 1<<6 : 0 +- _mm_store_si128(&pr[t], d); ++ tmp = simde_mm_cmpgt_epi8(a, zero_); ++ simde_mm_store_si128(&x[t], simde_mm_sub_epi8(simde_mm_and_si128(tmp, a), qe_)); ++ d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 ++ tmp = simde_mm_cmpgt_epi8(b, zero_); ++ simde_mm_store_si128(&y[t], simde_mm_sub_epi8(simde_mm_and_si128(tmp, b), qe_)); ++ d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 ++ tmp = simde_mm_cmpgt_epi8(a2, zero_); ++ simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(simde_mm_and_si128(tmp, a2), qe2_)); ++ d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 ++ tmp = simde_mm_cmpgt_epi8(b2, zero_); ++ simde_mm_store_si128(&y2[t], simde_mm_sub_epi8(simde_mm_and_si128(tmp, b2), qe2_)); ++ d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x40))); // d = b > 0? 1<<6 : 0 ++ simde_mm_store_si128(&pr[t], d); + } + } else { // gap right-alignment +- __m128i *pr = p + (size_t)r * n_col_ - st_; ++ simde__m128i *pr = p + (size_t)r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { +- __m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; ++ simde__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +-#ifdef __SSE4_1__ +- d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1 +- z = _mm_max_epi8(z, a); +- d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2 +- z = _mm_max_epi8(z, b); +- d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2)); // d = z > a2? d : 3 +- z = _mm_max_epi8(z, a2); +- d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? 
d : 4 +- z = _mm_max_epi8(z, b2); +- z = _mm_min_epi8(z, sc_mch_); +-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() +- tmp = _mm_cmpgt_epi8(z, a); +- d = _mm_andnot_si128(tmp, _mm_set1_epi8(1)); +- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a)); +- tmp = _mm_cmpgt_epi8(z, b); +- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2))); +- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b)); +- tmp = _mm_cmpgt_epi8(z, a2); +- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3))); +- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2)); +- tmp = _mm_cmpgt_epi8(z, b2); +- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4))); +- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2)); +- tmp = _mm_cmplt_epi8(sc_mch_, z); +- z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); +-#endif ++ d = simde_mm_andnot_si128(simde_mm_cmpgt_epi8(z, a), simde_mm_set1_epi8(1)); // d = z > a? 0 : 1 ++ z = simde_mm_max_epi8(z, a); ++ d = simde_mm_blendv_epi8(simde_mm_set1_epi8(2), d, simde_mm_cmpgt_epi8(z, b)); // d = z > b? d : 2 ++ z = simde_mm_max_epi8(z, b); ++ d = simde_mm_blendv_epi8(simde_mm_set1_epi8(3), d, simde_mm_cmpgt_epi8(z, a2)); // d = z > a2? d : 3 ++ z = simde_mm_max_epi8(z, a2); ++ d = simde_mm_blendv_epi8(simde_mm_set1_epi8(4), d, simde_mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4 ++ z = simde_mm_max_epi8(z, b2); ++ z = simde_mm_min_epi8(z, sc_mch_); + __dp_code_block2; +- tmp = _mm_cmpgt_epi8(zero_, a); +- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a), qe_)); +- d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 +- tmp = _mm_cmpgt_epi8(zero_, b); +- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b), qe_)); +- d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 +- tmp = _mm_cmpgt_epi8(zero_, a2); +- _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a2), qe2_)); +- d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 +- tmp = _mm_cmpgt_epi8(zero_, b2); +- _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b2), qe2_)); +- d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 1<<6 : 0 +- _mm_store_si128(&pr[t], d); ++ tmp = simde_mm_cmpgt_epi8(zero_, a); ++ simde_mm_store_si128(&x[t], simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, a), qe_)); ++ d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 ++ tmp = simde_mm_cmpgt_epi8(zero_, b); ++ simde_mm_store_si128(&y[t], simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, b), qe_)); ++ d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 ++ tmp = simde_mm_cmpgt_epi8(zero_, a2); ++ simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, a2), qe2_)); ++ d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 ++ tmp = simde_mm_cmpgt_epi8(zero_, b2); ++ simde_mm_store_si128(&y2[t], simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, b2), qe2_)); ++ d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x40))); // d = b > 0? 
1<<6 : 0 ++ simde_mm_store_si128(&pr[t], d); + } + } + if (!approx_max) { // find the exact max with a 32-bit score array +@@ -317,29 +245,24 @@ + // compute H[], max_H and max_t + if (r > 0) { + int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i; +- __m128i max_H_, max_t_; ++ simde__m128i max_H_, max_t_; + max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element + max_t = en0; +- max_H_ = _mm_set1_epi32(max_H); +- max_t_ = _mm_set1_epi32(max_t); ++ max_H_ = simde_mm_set1_epi32(max_H); ++ max_t_ = simde_mm_set1_epi32(max_t); + for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t; +- __m128i H1, tmp, t_; +- H1 = _mm_loadu_si128((__m128i*)&H[t]); +- t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]); +- H1 = _mm_add_epi32(H1, t_); +- _mm_storeu_si128((__m128i*)&H[t], H1); +- t_ = _mm_set1_epi32(t); +- tmp = _mm_cmpgt_epi32(H1, max_H_); +-#ifdef __SSE4_1__ +- max_H_ = _mm_blendv_epi8(max_H_, H1, tmp); +- max_t_ = _mm_blendv_epi8(max_t_, t_, tmp); +-#else +- max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_)); +- max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_)); +-#endif ++ simde__m128i H1, tmp, t_; ++ H1 = simde_mm_loadu_si128((simde__m128i*)&H[t]); ++ t_ = simde_mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]); ++ H1 = simde_mm_add_epi32(H1, t_); ++ simde_mm_storeu_si128((simde__m128i*)&H[t], H1); ++ t_ = simde_mm_set1_epi32(t); ++ tmp = simde_mm_cmpgt_epi32(H1, max_H_); ++ max_H_ = simde_mm_blendv_epi8(max_H_, H1, tmp); ++ max_t_ = simde_mm_blendv_epi8(max_t_, t_, tmp); + } +- _mm_storeu_si128((__m128i*)HH, max_H_); +- _mm_storeu_si128((__m128i*)tt, max_t_); ++ simde_mm_storeu_si128((simde__m128i*)HH, max_H_); ++ simde_mm_storeu_si128((simde__m128i*)tt, max_t_); + for (i = 0; i < 4; ++i) + if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i; + for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE +@@ -391,4 +314,3 @@ + kfree(km, mem2); kfree(km, off); + } + } +-#endif // __SSE2__ +--- minimap2.orig/ksw2_exts2_sse.c ++++ minimap2/ksw2_exts2_sse.c +@@ -3,76 +3,65 @@ + #include + #include "ksw2.h" + +-#ifdef __SSE2__ +-#include ++#include "debian/include/simde/x86/sse4.1.h" + +-#ifdef KSW_SSE2_ONLY +-#undef __SSE4_1__ +-#endif +- +-#ifdef __SSE4_1__ +-#include +-#endif +- +-#ifdef KSW_CPU_DISPATCH +-#ifdef __SSE4_1__ ++#if defined(SIMDE_SSE4_1_NATIVE) + void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez) +-#else ++#elif defined(SIMDE_SSE2_NATIVE) + void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez) +-#endif + #else + void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez) +-#endif // ~KSW_CPU_DISPATCH ++#endif + { + #define __dp_code_block1 \ +- z = _mm_load_si128(&s[t]); \ +- xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ +- tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ +- xt1 = 
_mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ ++ z = simde_mm_load_si128(&s[t]); \ ++ xt1 = simde_mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ ++ tmp = simde_mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ ++ xt1 = simde_mm_or_si128(simde_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ + x1_ = tmp; \ +- vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ +- tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \ +- vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ ++ vt1 = simde_mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ ++ tmp = simde_mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \ ++ vt1 = simde_mm_or_si128(simde_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ + v1_ = tmp; \ +- a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ +- ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ +- b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \ +- x2t1= _mm_load_si128(&x2[t]); \ +- tmp = _mm_srli_si128(x2t1, 15); \ +- x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \ ++ a = simde_mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ ++ ut = simde_mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ ++ b = simde_mm_add_epi8(simde_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \ ++ x2t1= simde_mm_load_si128(&x2[t]); \ ++ tmp = simde_mm_srli_si128(x2t1, 15); \ ++ x2t1= simde_mm_or_si128(simde_mm_slli_si128(x2t1, 1), x21_); \ + x21_= tmp; \ +- a2 = _mm_add_epi8(x2t1, vt1); \ +- a2a = _mm_add_epi8(a2, _mm_load_si128(&acceptor[t])); ++ a2 = simde_mm_add_epi8(x2t1, vt1); \ ++ a2a = simde_mm_add_epi8(a2, simde_mm_load_si128(&acceptor[t])); + + #define __dp_code_block2 \ +- _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ +- _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ +- tmp = _mm_sub_epi8(z, q_); \ +- a = _mm_sub_epi8(a, tmp); \ +- b = _mm_sub_epi8(b, tmp); \ +- a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_)); ++ simde_mm_store_si128(&u[t], simde_mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ ++ simde_mm_store_si128(&v[t], simde_mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ ++ tmp = simde_mm_sub_epi8(z, q_); \ ++ a = simde_mm_sub_epi8(a, tmp); \ ++ b = simde_mm_sub_epi8(b, tmp); \ ++ a2= simde_mm_sub_epi8(a2, simde_mm_sub_epi8(z, q2_)); + + int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff; + int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); + int32_t *H = 0, H0 = 0, last_H0_t = 0; + uint8_t *qr, *sf, *mem, *mem2 = 0; +- __m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, sc_N_, m1_; +- __m128i *u, *v, *x, *y, *x2, *s, *p = 0, *donor, *acceptor; ++ simde__m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, sc_N_, m1_; ++ simde__m128i *u, *v, *x, *y, *x2, *s, *p = 0, *donor, *acceptor; + + ksw_reset_extz(ez); + if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return; + +- zero_ = _mm_set1_epi8(0); +- q_ = _mm_set1_epi8(q); +- q2_ = _mm_set1_epi8(q2); +- qe_ = _mm_set1_epi8(q + e); +- sc_mch_ = _mm_set1_epi8(mat[0]); +- sc_mis_ = _mm_set1_epi8(mat[1]); +- sc_N_ = mat[m*m-1] == 0? 
_mm_set1_epi8(-e) : _mm_set1_epi8(mat[m*m-1]); +- m1_ = _mm_set1_epi8(m - 1); // wildcard ++ zero_ = simde_mm_set1_epi8(0); ++ q_ = simde_mm_set1_epi8(q); ++ q2_ = simde_mm_set1_epi8(q2); ++ qe_ = simde_mm_set1_epi8(q + e); ++ sc_mch_ = simde_mm_set1_epi8(mat[0]); ++ sc_mis_ = simde_mm_set1_epi8(mat[1]); ++ sc_N_ = mat[m*m-1] == 0? simde_mm_set1_epi8(-e) : simde_mm_set1_epi8(mat[m*m-1]); ++ m1_ = simde_mm_set1_epi8(m - 1); // wildcard + + tlen_ = (tlen + 15) / 16; + n_col_ = ((qlen < tlen? qlen : tlen) + 15) / 16 + 1; +@@ -89,7 +78,7 @@ + long_diff = long_thres * e - (q2 - q); + + mem = (uint8_t*)kcalloc(km, tlen_ * 9 + qlen_ + 1, 16); +- u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned ++ u = (simde__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned + v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_; + donor = x2 + tlen_, acceptor = donor + tlen_; + s = acceptor + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16; +@@ -101,7 +90,7 @@ + } + if (with_cigar) { + mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16); +- p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4); ++ p = (simde__m128i*)(((size_t)mem2 + 15) >> 4 << 4); + off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2); + off_end = off + qlen + tlen - 1; + } +@@ -167,7 +156,7 @@ + int st = 0, en = tlen - 1, st0, en0, st_, en_; + int8_t x1, x21, v1, *u8 = (int8_t*)u, *v8 = (int8_t*)v; + uint8_t *qrr = qr + (qlen - 1 - r); +- __m128i x1_, x21_, v1_; ++ simde__m128i x1_, x21_, v1_; + // find the boundaries + if (st < r - qlen + 1) st = r - qlen + 1; + if (en > r) en = r; +@@ -189,146 +178,91 @@ + // loop fission: set scores first + if (!(flag & KSW_EZ_GENERIC_SC)) { + for (t = st0; t <= en0; t += 16) { +- __m128i sq, st, tmp, mask; +- sq = _mm_loadu_si128((__m128i*)&sf[t]); +- st = _mm_loadu_si128((__m128i*)&qrr[t]); +- mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_)); +- tmp = _mm_cmpeq_epi8(sq, st); +-#ifdef __SSE4_1__ +- tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp); +- tmp = _mm_blendv_epi8(tmp, sc_N_, mask); +-#else +- tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_)); +- tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_)); +-#endif +- _mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp); ++ simde__m128i sq, st, tmp, mask; ++ sq = simde_mm_loadu_si128((simde__m128i*)&sf[t]); ++ st = simde_mm_loadu_si128((simde__m128i*)&qrr[t]); ++ mask = simde_mm_or_si128(simde_mm_cmpeq_epi8(sq, m1_), simde_mm_cmpeq_epi8(st, m1_)); ++ tmp = simde_mm_cmpeq_epi8(sq, st); ++ tmp = simde_mm_blendv_epi8(sc_mis_, sc_mch_, tmp); ++ tmp = simde_mm_blendv_epi8(tmp, sc_N_, mask); ++ simde_mm_storeu_si128((simde__m128i*)((int8_t*)s + t), tmp); + } + } else { + for (t = st0; t <= en0; ++t) + ((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]]; + } + // core loop +- x1_ = _mm_cvtsi32_si128((uint8_t)x1); +- x21_ = _mm_cvtsi32_si128((uint8_t)x21); +- v1_ = _mm_cvtsi32_si128((uint8_t)v1); ++ x1_ = simde_mm_cvtsi32_si128((uint8_t)x1); ++ x21_ = simde_mm_cvtsi32_si128((uint8_t)x21); ++ v1_ = simde_mm_cvtsi32_si128((uint8_t)v1); + st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); + if (!with_cigar) { // score only + for (t = st_; t <= en_; ++t) { +- __m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp; ++ simde__m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +-#ifdef __SSE4_1__ +- z = _mm_max_epi8(z, a); +- z = _mm_max_epi8(z, b); +- z = _mm_max_epi8(z, a2a); ++ z = simde_mm_max_epi8(z, a); ++ z = 
simde_mm_max_epi8(z, b); ++ z = simde_mm_max_epi8(z, a2a); + __dp_code_block2; // save u[] and v[]; update a, b and a2 +- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_)); +- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_)); +- tmp = _mm_load_si128(&donor[t]); +- _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_)); +-#else +- tmp = _mm_cmpgt_epi8(a, z); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a)); +- tmp = _mm_cmpgt_epi8(b, z); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); +- tmp = _mm_cmpgt_epi8(a2a, z); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a)); +- __dp_code_block2; +- tmp = _mm_cmpgt_epi8(a, zero_); +- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); +- tmp = _mm_cmpgt_epi8(b, zero_); +- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); +- tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct +- tmp = _mm_cmpgt_epi8(a2, tmp); +- tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2)); +- _mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_)); +-#endif ++ simde_mm_store_si128(&x[t], simde_mm_sub_epi8(simde_mm_max_epi8(a, zero_), qe_)); ++ simde_mm_store_si128(&y[t], simde_mm_sub_epi8(simde_mm_max_epi8(b, zero_), qe_)); ++ tmp = simde_mm_load_si128(&donor[t]); ++ simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(simde_mm_max_epi8(a2, tmp), q2_)); + } + } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment +- __m128i *pr = p + r * n_col_ - st_; ++ simde__m128i *pr = p + r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { +- __m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2; ++ simde__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2; + __dp_code_block1; +-#ifdef __SSE4_1__ +- d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0 +- z = _mm_max_epi8(z, a); +- d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d +- z = _mm_max_epi8(z, b); +- d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d +- z = _mm_max_epi8(z, a2a); +-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() +- tmp = _mm_cmpgt_epi8(a, z); +- d = _mm_and_si128(tmp, _mm_set1_epi8(1)); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a)); +- tmp = _mm_cmpgt_epi8(b, z); +- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2))); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); +- tmp = _mm_cmpgt_epi8(a2a, z); +- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3))); +- z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a)); +-#endif ++ d = simde_mm_and_si128(simde_mm_cmpgt_epi8(a, z), simde_mm_set1_epi8(1)); // d = a > z? 1 : 0 ++ z = simde_mm_max_epi8(z, a); ++ d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(2), simde_mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d ++ z = simde_mm_max_epi8(z, b); ++ d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(3), simde_mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d ++ z = simde_mm_max_epi8(z, a2a); + __dp_code_block2; +- tmp = _mm_cmpgt_epi8(a, zero_); +- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); +- d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 
1<<3 : 0 +- tmp = _mm_cmpgt_epi8(b, zero_); +- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); +- d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 +- +- tmp2 = _mm_load_si128(&donor[t]); +- tmp = _mm_cmpgt_epi8(a2, tmp2); +-#ifdef __SSE4_1__ +- tmp2 = _mm_max_epi8(a2, tmp2); +-#else +- tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2)); +-#endif +- _mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_)); +- d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20))); +- _mm_store_si128(&pr[t], d); ++ tmp = simde_mm_cmpgt_epi8(a, zero_); ++ simde_mm_store_si128(&x[t], simde_mm_sub_epi8(simde_mm_and_si128(tmp, a), qe_)); ++ d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 ++ tmp = simde_mm_cmpgt_epi8(b, zero_); ++ simde_mm_store_si128(&y[t], simde_mm_sub_epi8(simde_mm_and_si128(tmp, b), qe_)); ++ d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 ++ ++ tmp2 = simde_mm_load_si128(&donor[t]); ++ tmp = simde_mm_cmpgt_epi8(a2, tmp2); ++ tmp2 = simde_mm_max_epi8(a2, tmp2); ++ simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(tmp2, q2_)); ++ d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x20))); ++ simde_mm_store_si128(&pr[t], d); + } + } else { // gap right-alignment +- __m128i *pr = p + r * n_col_ - st_; ++ simde__m128i *pr = p + r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { +- __m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2; ++ simde__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2; + __dp_code_block1; +-#ifdef __SSE4_1__ +- d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1 +- z = _mm_max_epi8(z, a); +- d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2 +- z = _mm_max_epi8(z, b); +- d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3 +- z = _mm_max_epi8(z, a2a); +-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() +- tmp = _mm_cmpgt_epi8(z, a); +- d = _mm_andnot_si128(tmp, _mm_set1_epi8(1)); +- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a)); +- tmp = _mm_cmpgt_epi8(z, b); +- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2))); +- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b)); +- tmp = _mm_cmpgt_epi8(z, a2a); +- d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3))); +- z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a)); +-#endif ++ d = simde_mm_andnot_si128(simde_mm_cmpgt_epi8(z, a), simde_mm_set1_epi8(1)); // d = z > a? 0 : 1 ++ z = simde_mm_max_epi8(z, a); ++ d = simde_mm_blendv_epi8(simde_mm_set1_epi8(2), d, simde_mm_cmpgt_epi8(z, b)); // d = z > b? d : 2 ++ z = simde_mm_max_epi8(z, b); ++ d = simde_mm_blendv_epi8(simde_mm_set1_epi8(3), d, simde_mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3 ++ z = simde_mm_max_epi8(z, a2a); + __dp_code_block2; +- tmp = _mm_cmpgt_epi8(zero_, a); +- _mm_store_si128(&x[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a), qe_)); +- d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 +- tmp = _mm_cmpgt_epi8(zero_, b); +- _mm_store_si128(&y[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b), qe_)); +- d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 
1<<4 : 0 +- +- tmp2 = _mm_load_si128(&donor[t]); +- tmp = _mm_cmpgt_epi8(tmp2, a2); +-#ifdef __SSE4_1__ +- tmp2 = _mm_max_epi8(tmp2, a2); +-#else +- tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2)); +-#endif +- _mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_)); +- d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 +- _mm_store_si128(&pr[t], d); ++ tmp = simde_mm_cmpgt_epi8(zero_, a); ++ simde_mm_store_si128(&x[t], simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, a), qe_)); ++ d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 ++ tmp = simde_mm_cmpgt_epi8(zero_, b); ++ simde_mm_store_si128(&y[t], simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, b), qe_)); ++ d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 ++ ++ tmp2 = simde_mm_load_si128(&donor[t]); ++ tmp = simde_mm_cmpgt_epi8(tmp2, a2); ++ tmp2 = simde_mm_max_epi8(tmp2, a2); ++ simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(tmp2, q2_)); ++ d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 ++ simde_mm_store_si128(&pr[t], d); + } + } + if (!approx_max) { // find the exact max with a 32-bit score array +@@ -336,29 +270,24 @@ + // compute H[], max_H and max_t + if (r > 0) { + int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i; +- __m128i max_H_, max_t_; ++ simde__m128i max_H_, max_t_; + max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element + max_t = en0; +- max_H_ = _mm_set1_epi32(max_H); +- max_t_ = _mm_set1_epi32(max_t); ++ max_H_ = simde_mm_set1_epi32(max_H); ++ max_t_ = simde_mm_set1_epi32(max_t); + for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t; +- __m128i H1, tmp, t_; +- H1 = _mm_loadu_si128((__m128i*)&H[t]); +- t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]); +- H1 = _mm_add_epi32(H1, t_); +- _mm_storeu_si128((__m128i*)&H[t], H1); +- t_ = _mm_set1_epi32(t); +- tmp = _mm_cmpgt_epi32(H1, max_H_); +-#ifdef __SSE4_1__ +- max_H_ = _mm_blendv_epi8(max_H_, H1, tmp); +- max_t_ = _mm_blendv_epi8(max_t_, t_, tmp); +-#else +- max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_)); +- max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_)); +-#endif ++ simde__m128i H1, tmp, t_; ++ H1 = simde_mm_loadu_si128((simde__m128i*)&H[t]); ++ t_ = simde_mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]); ++ H1 = simde_mm_add_epi32(H1, t_); ++ simde_mm_storeu_si128((simde__m128i*)&H[t], H1); ++ t_ = simde_mm_set1_epi32(t); ++ tmp = simde_mm_cmpgt_epi32(H1, max_H_); ++ max_H_ = simde_mm_blendv_epi8(max_H_, H1, tmp); ++ max_t_ = simde_mm_blendv_epi8(max_t_, t_, tmp); + } +- _mm_storeu_si128((__m128i*)HH, max_H_); +- _mm_storeu_si128((__m128i*)tt, max_t_); ++ simde_mm_storeu_si128((simde__m128i*)HH, max_H_); ++ simde_mm_storeu_si128((simde__m128i*)tt, max_t_); + for (i = 0; i < 4; ++i) + if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i; + for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE +@@ -406,4 +335,3 @@ + kfree(km, mem2); kfree(km, off); + } + } +-#endif // __SSE2__ +--- minimap2.orig/ksw2_extz2_sse.c ++++ minimap2/ksw2_extz2_sse.c +@@ -2,72 +2,61 @@ + #include + #include "ksw2.h" + +-#ifdef __SSE2__ +-#include + +-#ifdef KSW_SSE2_ONLY +-#undef __SSE4_1__ +-#endif +- +-#ifdef __SSE4_1__ +-#include +-#endif +- +-#ifdef KSW_CPU_DISPATCH +-#ifdef __SSE4_1__ 
++#include "debian/include/simde/x86/sse4.1.h" ++#if defined(SIMDE_SSE4_1_NATIVE) + void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) +-#else ++#elif defined(SIMDE_SSE2_NATIVE) + void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) +-#endif + #else + void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) +-#endif // ~KSW_CPU_DISPATCH ++#endif + { + #define __dp_code_block1 \ +- z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \ +- xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ +- tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ +- xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ ++ z = simde_mm_add_epi8(simde_mm_load_si128(&s[t]), qe2_); \ ++ xt1 = simde_mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ ++ tmp = simde_mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ ++ xt1 = simde_mm_or_si128(simde_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ + x1_ = tmp; \ +- vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ +- tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \ +- vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ ++ vt1 = simde_mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ ++ tmp = simde_mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \ ++ vt1 = simde_mm_or_si128(simde_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ + v1_ = tmp; \ +- a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ +- ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ +- b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ ++ a = simde_mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ ++ ut = simde_mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ ++ b = simde_mm_add_epi8(simde_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ + + #define __dp_code_block2 \ +- z = _mm_max_epu8(z, b); /* z = max(z, b); this works because both are non-negative */ \ +- z = _mm_min_epu8(z, max_sc_); \ +- _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ +- _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ +- z = _mm_sub_epi8(z, q_); \ +- a = _mm_sub_epi8(a, z); \ +- b = _mm_sub_epi8(b, z); ++ z = simde_mm_max_epu8(z, b); /* z = max(z, b); this works because both are non-negative */ \ ++ z = simde_mm_min_epu8(z, max_sc_); \ ++ simde_mm_store_si128(&u[t], simde_mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ ++ simde_mm_store_si128(&v[t], simde_mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ ++ z = simde_mm_sub_epi8(z, q_); \ ++ a = simde_mm_sub_epi8(a, z); \ ++ b = simde_mm_sub_epi8(b, z); + + int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc; + int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); + int32_t *H = 0, H0 = 0, last_H0_t = 0; + uint8_t *qr, *sf, *mem, *mem2 = 0; +- __m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, 
sc_mch_, sc_mis_, sc_N_, m1_, max_sc_; +- __m128i *u, *v, *x, *y, *s, *p = 0; ++ simde__m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, sc_N_, m1_, max_sc_; ++ simde__m128i *u, *v, *x, *y, *s, *p = 0; + + ksw_reset_extz(ez); + if (m <= 0 || qlen <= 0 || tlen <= 0) return; + +- zero_ = _mm_set1_epi8(0); +- q_ = _mm_set1_epi8(q); +- qe2_ = _mm_set1_epi8((q + e) * 2); +- flag1_ = _mm_set1_epi8(1); +- flag2_ = _mm_set1_epi8(2); +- flag8_ = _mm_set1_epi8(0x08); +- flag16_ = _mm_set1_epi8(0x10); +- sc_mch_ = _mm_set1_epi8(mat[0]); +- sc_mis_ = _mm_set1_epi8(mat[1]); +- sc_N_ = mat[m*m-1] == 0? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m*m-1]); +- m1_ = _mm_set1_epi8(m - 1); // wildcard +- max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2); ++ zero_ = simde_mm_set1_epi8(0); ++ q_ = simde_mm_set1_epi8(q); ++ qe2_ = simde_mm_set1_epi8((q + e) * 2); ++ flag1_ = simde_mm_set1_epi8(1); ++ flag2_ = simde_mm_set1_epi8(2); ++ flag8_ = simde_mm_set1_epi8(0x08); ++ flag16_ = simde_mm_set1_epi8(0x10); ++ sc_mch_ = simde_mm_set1_epi8(mat[0]); ++ sc_mis_ = simde_mm_set1_epi8(mat[1]); ++ sc_N_ = mat[m*m-1] == 0? simde_mm_set1_epi8(-e) : simde_mm_set1_epi8(mat[m*m-1]); ++ m1_ = simde_mm_set1_epi8(m - 1); // wildcard ++ max_sc_ = simde_mm_set1_epi8(mat[0] + (q + e) * 2); + + if (w < 0) w = tlen > qlen? tlen : qlen; + wl = wr = w; +@@ -82,7 +71,7 @@ + if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches + + mem = (uint8_t*)kcalloc(km, tlen_ * 6 + qlen_ + 1, 16); +- u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned ++ u = (simde__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned + v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16; + if (!approx_max) { + H = (int32_t*)kmalloc(km, tlen_ * 16 * 4); +@@ -90,7 +79,7 @@ + } + if (with_cigar) { + mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16); +- p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4); ++ p = (simde__m128i*)(((size_t)mem2 + 15) >> 4 << 4); + off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2); + off_end = off + qlen + tlen - 1; + } +@@ -102,7 +91,7 @@ + int st = 0, en = tlen - 1, st0, en0, st_, en_; + int8_t x1, v1; + uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t*)u, *v8 = (uint8_t*)v; +- __m128i x1_, v1_; ++ simde__m128i x1_, v1_; + // find the boundaries + if (st < r - qlen + 1) st = r - qlen + 1; + if (en > r) en = r; +@@ -124,101 +113,70 @@ + // loop fission: set scores first + if (!(flag & KSW_EZ_GENERIC_SC)) { + for (t = st0; t <= en0; t += 16) { +- __m128i sq, st, tmp, mask; +- sq = _mm_loadu_si128((__m128i*)&sf[t]); +- st = _mm_loadu_si128((__m128i*)&qrr[t]); +- mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_)); +- tmp = _mm_cmpeq_epi8(sq, st); +-#ifdef __SSE4_1__ +- tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp); +- tmp = _mm_blendv_epi8(tmp, sc_N_, mask); +-#else +- tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_)); +- tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_)); +-#endif +- _mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp); ++ simde__m128i sq, st, tmp, mask; ++ sq = simde_mm_loadu_si128((simde__m128i*)&sf[t]); ++ st = simde_mm_loadu_si128((simde__m128i*)&qrr[t]); ++ mask = simde_mm_or_si128(simde_mm_cmpeq_epi8(sq, m1_), simde_mm_cmpeq_epi8(st, m1_)); ++ tmp = simde_mm_cmpeq_epi8(sq, st); ++ tmp = simde_mm_blendv_epi8(sc_mis_, sc_mch_, tmp); ++ tmp = simde_mm_blendv_epi8(tmp, sc_N_, mask); ++ 
simde_mm_storeu_si128((simde__m128i*)((uint8_t*)s + t), tmp); + } + } else { + for (t = st0; t <= en0; ++t) + ((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]]; + } + // core loop +- x1_ = _mm_cvtsi32_si128(x1); +- v1_ = _mm_cvtsi32_si128(v1); ++ x1_ = simde_mm_cvtsi32_si128(x1); ++ v1_ = simde_mm_cvtsi32_si128(v1); + st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); + if (!with_cigar) { // score only + for (t = st_; t <= en_; ++t) { +- __m128i z, a, b, xt1, vt1, ut, tmp; ++ simde__m128i z, a, b, xt1, vt1, ut, tmp; + __dp_code_block1; +-#ifdef __SSE4_1__ +- z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) +-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() +- z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0; +- z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative +-#endif ++ z = simde_mm_max_epi8(z, a); // z = z > a? z : a (signed) + __dp_code_block2; +-#ifdef __SSE4_1__ +- _mm_store_si128(&x[t], _mm_max_epi8(a, zero_)); +- _mm_store_si128(&y[t], _mm_max_epi8(b, zero_)); +-#else +- tmp = _mm_cmpgt_epi8(a, zero_); +- _mm_store_si128(&x[t], _mm_and_si128(a, tmp)); +- tmp = _mm_cmpgt_epi8(b, zero_); +- _mm_store_si128(&y[t], _mm_and_si128(b, tmp)); +-#endif ++ simde_mm_store_si128(&x[t], simde_mm_max_epi8(a, zero_)); ++ simde_mm_store_si128(&y[t], simde_mm_max_epi8(b, zero_)); + } + } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment +- __m128i *pr = p + (size_t)r * n_col_ - st_; ++ simde__m128i *pr = p + (size_t)r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { +- __m128i d, z, a, b, xt1, vt1, ut, tmp; ++ simde__m128i d, z, a, b, xt1, vt1, ut, tmp; + __dp_code_block1; +- d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0 +-#ifdef __SSE4_1__ +- z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) +- tmp = _mm_cmpgt_epi8(b, z); +- d = _mm_blendv_epi8(d, flag2_, tmp); // d = b > z? 2 : d +-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() +- z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0; +- z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative +- tmp = _mm_cmpgt_epi8(b, z); +- d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv +-#endif ++ d = simde_mm_and_si128(simde_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0 ++ z = simde_mm_max_epi8(z, a); // z = z > a? z : a (signed) ++ tmp = simde_mm_cmpgt_epi8(b, z); ++ d = simde_mm_blendv_epi8(d, flag2_, tmp); // d = b > z? 2 : d + __dp_code_block2; +- tmp = _mm_cmpgt_epi8(a, zero_); +- _mm_store_si128(&x[t], _mm_and_si128(tmp, a)); +- d = _mm_or_si128(d, _mm_and_si128(tmp, flag8_)); // d = a > 0? 0x08 : 0 +- tmp = _mm_cmpgt_epi8(b, zero_); +- _mm_store_si128(&y[t], _mm_and_si128(tmp, b)); +- d = _mm_or_si128(d, _mm_and_si128(tmp, flag16_)); // d = b > 0? 0x10 : 0 +- _mm_store_si128(&pr[t], d); ++ tmp = simde_mm_cmpgt_epi8(a, zero_); ++ simde_mm_store_si128(&x[t], simde_mm_and_si128(tmp, a)); ++ d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, flag8_)); // d = a > 0? 0x08 : 0 ++ tmp = simde_mm_cmpgt_epi8(b, zero_); ++ simde_mm_store_si128(&y[t], simde_mm_and_si128(tmp, b)); ++ d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, flag16_)); // d = b > 0? 
0x10 : 0 ++ simde_mm_store_si128(&pr[t], d); + } + } else { // gap right-alignment +- __m128i *pr = p + (size_t)r * n_col_ - st_; ++ simde__m128i *pr = p + (size_t)r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { +- __m128i d, z, a, b, xt1, vt1, ut, tmp; ++ simde__m128i d, z, a, b, xt1, vt1, ut, tmp; + __dp_code_block1; +- d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1 +-#ifdef __SSE4_1__ +- z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) +- tmp = _mm_cmpgt_epi8(z, b); +- d = _mm_blendv_epi8(flag2_, d, tmp); // d = z > b? d : 2 +-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() +- z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0; +- z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative +- tmp = _mm_cmpgt_epi8(z, b); +- d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv +-#endif ++ d = simde_mm_andnot_si128(simde_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1 ++ z = simde_mm_max_epi8(z, a); // z = z > a? z : a (signed) ++ tmp = simde_mm_cmpgt_epi8(z, b); ++ d = simde_mm_blendv_epi8(flag2_, d, tmp); // d = z > b? d : 2 + __dp_code_block2; +- tmp = _mm_cmpgt_epi8(zero_, a); +- _mm_store_si128(&x[t], _mm_andnot_si128(tmp, a)); +- d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag8_)); // d = 0 > a? 0 : 0x08 +- tmp = _mm_cmpgt_epi8(zero_, b); +- _mm_store_si128(&y[t], _mm_andnot_si128(tmp, b)); +- d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10 +- _mm_store_si128(&pr[t], d); ++ tmp = simde_mm_cmpgt_epi8(zero_, a); ++ simde_mm_store_si128(&x[t], simde_mm_andnot_si128(tmp, a)); ++ d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, flag8_)); // d = 0 > a? 0 : 0x08 ++ tmp = simde_mm_cmpgt_epi8(zero_, b); ++ simde_mm_store_si128(&y[t], simde_mm_andnot_si128(tmp, b)); ++ d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10 ++ simde_mm_store_si128(&pr[t], d); + } + } + if (!approx_max) { // find the exact max with a 32-bit score array +@@ -226,31 +184,26 @@ + // compute H[], max_H and max_t + if (r > 0) { + int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i; +- __m128i max_H_, max_t_, qe_; ++ simde__m128i max_H_, max_t_, qe_; + max_H = H[en0] = en0 > 0? 
+@@ -226,31 +184,26 @@
+ 		// compute H[], max_H and max_t
+ 		if (r > 0) {
+ 			int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
+-			__m128i max_H_, max_t_, qe_;
++			simde__m128i max_H_, max_t_, qe_;
+ 			max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] - qe : H[en0] + v8[en0] - qe; // special casing the last element
+ 			max_t = en0;
+-			max_H_ = _mm_set1_epi32(max_H);
+-			max_t_ = _mm_set1_epi32(max_t);
+-			qe_ = _mm_set1_epi32(q + e);
++			max_H_ = simde_mm_set1_epi32(max_H);
++			max_t_ = simde_mm_set1_epi32(max_t);
++			qe_ = simde_mm_set1_epi32(q + e);
+ 			for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
+-				__m128i H1, tmp, t_;
+-				H1 = _mm_loadu_si128((__m128i*)&H[t]);
+-				t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
+-				H1 = _mm_add_epi32(H1, t_);
+-				H1 = _mm_sub_epi32(H1, qe_);
+-				_mm_storeu_si128((__m128i*)&H[t], H1);
+-				t_ = _mm_set1_epi32(t);
+-				tmp = _mm_cmpgt_epi32(H1, max_H_);
+-#ifdef __SSE4_1__
+-				max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+-				max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+-#else
+-				max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+-				max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+-#endif
++				simde__m128i H1, tmp, t_;
++				H1 = simde_mm_loadu_si128((simde__m128i*)&H[t]);
++				t_ = simde_mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
++				H1 = simde_mm_add_epi32(H1, t_);
++				H1 = simde_mm_sub_epi32(H1, qe_);
++				simde_mm_storeu_si128((simde__m128i*)&H[t], H1);
++				t_ = simde_mm_set1_epi32(t);
++				tmp = simde_mm_cmpgt_epi32(H1, max_H_);
++				max_H_ = simde_mm_blendv_epi8(max_H_, H1, tmp);
++				max_t_ = simde_mm_blendv_epi8(max_t_, t_, tmp);
+ 			}
+-			_mm_storeu_si128((__m128i*)HH, max_H_);
+-			_mm_storeu_si128((__m128i*)tt, max_t_);
++			simde_mm_storeu_si128((simde__m128i*)HH, max_H_);
++			simde_mm_storeu_si128((simde__m128i*)tt, max_t_);
+ 			for (i = 0; i < 4; ++i)
+ 				if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
+ 			for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
+@@ -302,4 +255,3 @@
+ 		kfree(km, mem2); kfree(km, off);
+ 	}
+ }
+-#endif // __SSE2__
+--- minimap2.orig/ksw2_ll_sse.c
++++ minimap2/ksw2_ll_sse.c
+@@ -1,7 +1,7 @@
+ #include
+ #include
+ #include
+-#include <emmintrin.h>
++#include "debian/include/simde/x86/sse2.h"
+ #include "ksw2.h"
+ 
+ #ifdef __GNUC__
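From here on the conversion is mechanical, following the include swap above: the SSE2 header <emmintrin.h> is replaced by SIMDe's sse2.h, and every __m128i/_mm_* identifier gains a simde prefix, so the same source builds on architectures without SSE. A stand-alone sketch of that usage (the include path here assumes a -I pointing at a SIMDe checkout; the patch itself uses the relative path debian/include/simde/x86/sse2.h):

#include <stdio.h>
#include <stdint.h>
#include "simde/x86/sse2.h"   /* portable SSE2: native intrinsics on x86, emulation elsewhere */

int main(void)
{
	int32_t out[4];
	simde__m128i a = simde_mm_set_epi32(3, 2, 1, 0); /* lanes 3..0 */
	simde__m128i b = simde_mm_set1_epi32(10);
	simde_mm_storeu_si128((simde__m128i*)out, simde_mm_add_epi32(a, b));
	printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* prints: 10 11 12 13 */
	return 0;
}

Built with something like cc -I/path/to/simde demo.c, this should behave identically on amd64 and on architectures with no SSE at all, which is the point of the conversion.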
+@@ -15,7 +15,7 @@
+ typedef struct {
+ 	int qlen, slen;
+ 	uint8_t shift, mdiff, max, size;
+-	__m128i *qp, *H0, *H1, *E, *Hmax;
++	simde__m128i *qp, *H0, *H1, *E, *Hmax;
+ } kswq_t;
+ 
+ /**
+@@ -35,10 +35,10 @@
+ 	int slen, a, tmp, p;
+ 
+ 	size = size > 1? 2 : 1;
+-	p = 8 * (3 - size); // # values per __m128i
++	p = 8 * (3 - size); // # values per simde__m128i
+ 	slen = (qlen + p - 1) / p; // segmented length
+ 	q = (kswq_t*)kmalloc(km, sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
+-	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
++	q->qp = (simde__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
+ 	q->H0 = q->qp + slen * m;
+ 	q->H1 = q->H0 + slen;
+ 	q->E = q->H1 + slen;
+@@ -81,63 +81,63 @@
+ {
+ 	kswq_t *q = (kswq_t*)q_;
+ 	int slen, i, gmax = 0, qlen8;
+-	__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
++	simde__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+ 	uint16_t *H8;
+ 
+ #define __max_8(ret, xx) do { \
+-		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
+-		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
+-		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
+-		(ret) = _mm_extract_epi16((xx), 0); \
++		(xx) = simde_mm_max_epi16((xx), simde_mm_srli_si128((xx), 8)); \
++		(xx) = simde_mm_max_epi16((xx), simde_mm_srli_si128((xx), 4)); \
++		(xx) = simde_mm_max_epi16((xx), simde_mm_srli_si128((xx), 2)); \
++		(ret) = simde_mm_extract_epi16((xx), 0); \
+ 	} while (0)
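__max_8() above is a horizontal maximum over the eight 16-bit lanes of a vector: each shift-and-max step folds the upper half of the remaining candidates onto the lower half, so after three steps lane 0 holds the overall maximum, which the extract then reads out. The same reduction as a free-standing function, using the same SIMDe calls as the patched macro (the function name is invented for illustration):

#include "simde/x86/sse2.h"  /* path assumes a -I; the patch uses debian/include/ */

static inline int hmax_epi16(simde__m128i xx)
{
	xx = simde_mm_max_epi16(xx, simde_mm_srli_si128(xx, 8)); /* fold lanes 4..7 onto 0..3 */
	xx = simde_mm_max_epi16(xx, simde_mm_srli_si128(xx, 4)); /* fold lanes 2..3 onto 0..1 */
	xx = simde_mm_max_epi16(xx, simde_mm_srli_si128(xx, 2)); /* fold lane 1 onto 0 */
	return simde_mm_extract_epi16(xx, 0);                    /* lane 0 now holds the max */
}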
+ 
+ 	// initialization
+ 	*qe = *te = -1;
+-	zero = _mm_set1_epi32(0);
+-	gapoe = _mm_set1_epi16(_gapo + _gape);
+-	gape = _mm_set1_epi16(_gape);
++	zero = simde_mm_set1_epi32(0);
++	gapoe = simde_mm_set1_epi16(_gapo + _gape);
++	gape = simde_mm_set1_epi16(_gape);
+ 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
+ 	slen = q->slen, qlen8 = slen * 8;
+-	memset(E, 0, slen * sizeof(__m128i));
+-	memset(H0, 0, slen * sizeof(__m128i));
+-	memset(Hmax, 0, slen * sizeof(__m128i));
++	memset(E, 0, slen * sizeof(simde__m128i));
++	memset(H0, 0, slen * sizeof(simde__m128i));
++	memset(Hmax, 0, slen * sizeof(simde__m128i));
+ 	// the core loop
+ 	for (i = 0; i < tlen; ++i) {
+ 		int j, k, imax;
+-		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+-		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+-		h = _mm_slli_si128(h, 2);
++		simde__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
++		h = simde_mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
++		h = simde_mm_slli_si128(h, 2);
+ 		for (j = 0; LIKELY(j < slen); ++j) {
+-			h = _mm_adds_epi16(h, *S++);
+-			e = _mm_load_si128(E + j);
+-			h = _mm_max_epi16(h, e);
+-			h = _mm_max_epi16(h, f);
+-			max = _mm_max_epi16(max, h);
+-			_mm_store_si128(H1 + j, h);
+-			h = _mm_subs_epu16(h, gapoe);
+-			e = _mm_subs_epu16(e, gape);
+-			e = _mm_max_epi16(e, h);
+-			_mm_store_si128(E + j, e);
+-			f = _mm_subs_epu16(f, gape);
+-			f = _mm_max_epi16(f, h);
+-			h = _mm_load_si128(H0 + j);
++			h = simde_mm_adds_epi16(h, *S++);
++			e = simde_mm_load_si128(E + j);
++			h = simde_mm_max_epi16(h, e);
++			h = simde_mm_max_epi16(h, f);
++			max = simde_mm_max_epi16(max, h);
++			simde_mm_store_si128(H1 + j, h);
++			h = simde_mm_subs_epu16(h, gapoe);
++			e = simde_mm_subs_epu16(e, gape);
++			e = simde_mm_max_epi16(e, h);
++			simde_mm_store_si128(E + j, e);
++			f = simde_mm_subs_epu16(f, gape);
++			f = simde_mm_max_epi16(f, h);
++			h = simde_mm_load_si128(H0 + j);
+ 		}
+ 		for (k = 0; LIKELY(k < 8); ++k) {
+-			f = _mm_slli_si128(f, 2);
++			f = simde_mm_slli_si128(f, 2);
+ 			for (j = 0; LIKELY(j < slen); ++j) {
+-				h = _mm_load_si128(H1 + j);
+-				h = _mm_max_epi16(h, f);
+-				_mm_store_si128(H1 + j, h);
+-				h = _mm_subs_epu16(h, gapoe);
+-				f = _mm_subs_epu16(f, gape);
+-				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop_i16;
++				h = simde_mm_load_si128(H1 + j);
++				h = simde_mm_max_epi16(h, f);
++				simde_mm_store_si128(H1 + j, h);
++				h = simde_mm_subs_epu16(h, gapoe);
++				f = simde_mm_subs_epu16(f, gape);
++				if(UNLIKELY(!simde_mm_movemask_epi8(simde_mm_cmpgt_epi16(f, h)))) goto end_loop_i16;
+ 			}
+ 		}
+ end_loop_i16:
+ 		__max_8(imax, max);
+ 		if (imax >= gmax) {
+ 			gmax = imax; *te = i;
+-			memcpy(Hmax, H1, slen * sizeof(__m128i));
++			memcpy(Hmax, H1, slen * sizeof(simde__m128i));
+ 		}
+ 		S = H1; H1 = H0; H0 = S;
+ 	}
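The early exit in the lazy-F loop above is unchanged by the port, only re-spelled with SIMDe: the loop may stop once no lane of f still exceeds h, which the code detects by comparing lanes and testing that the resulting byte mask is zero. The same test in isolation (the wrapper name is illustrative, not from the patch):

#include <stdbool.h>
#include "simde/x86/sse2.h"  /* path assumes a -I; the patch uses debian/include/ */

static inline bool any_lane_greater_epi16(simde__m128i f, simde__m128i h)
{
	/* cmpgt sets a 16-bit lane to all-ones where f > h; movemask gathers the
	 * high bit of every byte, so a non-zero mask means at least one such lane. */
	return simde_mm_movemask_epi8(simde_mm_cmpgt_epi16(f, h)) != 0;
}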
+--- minimap2.orig/Makefile
++++ minimap2/Makefile
+@@ -6,21 +6,17 @@
+ PROG_EXTRA=	sdust minimap2-lite
+ LIBS=		-lm -lz -lpthread
+ 
+-ifeq ($(arm_neon),) # if arm_neon is not defined
+-ifeq ($(sse2only),) # if sse2only is not defined
++OBJS+=ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o
++
++ifneq ($(amd64),)
+ 	OBJS+=ksw2_extz2_sse41.o ksw2_extd2_sse41.o ksw2_exts2_sse41.o ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o
+-else # if sse2only is defined
+-	OBJS+=ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o
++else ifneq ($(i386),)
++	OBJS+=ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o
+ endif
+-else # if arm_neon is defined
+-	OBJS+=ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o
+-	INCLUDES+=-Isse2neon
+-ifeq ($(aarch64),) #if aarch64 is not defined
+-	CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char
+-else #if aarch64 is defined
++
++ifneq ($(aarch64),) #if aarch64 is defined
+ 	CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char
+ endif
+-endif
+ 
+ .PHONY:all extra clean depend
+ .SUFFIXES:.c .o
+@@ -46,7 +42,7 @@
+ 
+ # SSE-specific targets on x86/x86_64
+ 
+-ifeq ($(arm_neon),) # if arm_neon is defined, compile this target with the default setting (i.e. no -msse2)
++ifneq ($(amd64),) # if amd64 is not defined, compile this target with the default setting (i.e. no -msse2)
+ ksw2_ll_sse.o:ksw2_ll_sse.c ksw2.h kalloc.h
+ 	$(CC) -c $(CFLAGS) -msse2 $(CPPFLAGS) $(INCLUDES) $< -o $@
+ endif
+@@ -72,17 +68,6 @@
+ ksw2_dispatch.o:ksw2_dispatch.c ksw2.h
+ 	$(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@
+ 
+-# NEON-specific targets on ARM
+-
+-ksw2_extz2_neon.o:ksw2_extz2_sse.c ksw2.h kalloc.h
+-	$(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@
+-
+-ksw2_extd2_neon.o:ksw2_extd2_sse.c ksw2.h kalloc.h
+-	$(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@
+-
+-ksw2_exts2_neon.o:ksw2_exts2_sse.c ksw2.h kalloc.h
+-	$(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@
+-
+ # other non-file targets
+ 
+ clean:
diff -Nru minimap2-2.17+dfsg/debian/rules minimap2-2.17+dfsg/debian/rules
--- minimap2-2.17+dfsg/debian/rules	2019-08-01 13:23:40.000000000 +0000
+++ minimap2-2.17+dfsg/debian/rules	2020-01-12 17:22:11.000000000 +0000
@@ -2,6 +2,7 @@
 export DEB_BUILD_MAINT_OPTIONS = hardening=+all
 include /usr/share/dpkg/architecture.mk
+export DEB_CFLAGS_MAINT_APPEND += -fopenmp-simd -O3 -DSIMDE_ENABLE_OPENMP
 
 %:
 	dh $@
 
@@ -10,12 +11,13 @@
 	dh_auto_clean
 	cd tex && make clean
 
-ifneq (,$(filter $(DEB_HOST_ARCH_CPU)-$(DEB_HOST_ARCH_ABI),arm-eabihf arm64-base))
-build_vars += arm_neon=1
-ifneq (,$(filter $(DEB_HOST_ARCH_CPU),arm64))
+ifneq (,$(filter $(DEB_HOST_ARCH_CPU),amd64))
+build_vars += amd64=1
+else ifneq (,$(filter $(DEB_HOST_ARCH_CPU),i386))
+build_vars += i386=1
+else ifneq (,$(filter $(DEB_HOST_ARCH_CPU),arm64))
 build_vars += aarch64=1
 endif
-endif
 
 override_dh_auto_build:
 	dh_auto_build -- $(build_vars)
diff -Nru minimap2-2.17+dfsg/debian/upstream/metadata minimap2-2.17+dfsg/debian/upstream/metadata
--- minimap2-2.17+dfsg/debian/upstream/metadata	2019-08-01 13:23:40.000000000 +0000
+++ minimap2-2.17+dfsg/debian/upstream/metadata	2020-01-12 17:22:11.000000000 +0000
@@ -1,19 +1,21 @@
 Reference:
-  - Author: Heng Li
-    Title: "Minimap2: pairwise alignment for nucleotide sequences"
-    Journal: Bioinformatics
-    Year: 2018
-    Pages: 2103-2110
-    PMID: 29750242
-    DOI: 10.1093/bioinformatics/bty191
-    URL: "https://academic.oup.com/bioinformatics/advance-article/doi/\
-      10.1093/bioinformatics/bty191/4994778"
-    eprint: "https://academic.oup.com/bioinformatics/advance-article-pdf/doi/\
-      10.1093/bioinformatics/bty191/24814363/bty191.pdf"
+- Author: Heng Li
+  Title: 'Minimap2: pairwise alignment for nucleotide sequences'
+  Journal: Bioinformatics
+  Year: 2018
+  Pages: 2103-2110
+  PMID: 29750242
+  DOI: 10.1093/bioinformatics/bty191
+  URL: https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/bty191/4994778
+  eprint: https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/bty191/24814363/bty191.pdf
 Registry:
-  - Name: OMICtools
-    Entry: OMICS_31658
-  - Name: bio.tools
-    Entry: NA
-  - Name: SciCrunch
-    Entry: NA
+- Name: OMICtools
+  Entry: OMICS_31658
+- Name: bio.tools
+  Entry: NA
+- Name: SciCrunch
+  Entry: NA
+- Name: conda:bioconda
+  Entry: minimap2
+Bug-Database: https://github.com/lh3/minimap2/issues
+Bug-Submit: https://github.com/lh3/minimap2/issues/new
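The DEB_CFLAGS_MAINT_APPEND line added to debian/rules complements the patch: with SIMDE_ENABLE_OPENMP defined, SIMDe annotates its portable fallback loops with OpenMP SIMD pragmas, and -fopenmp-simd lets the compiler honour those pragmas without linking the OpenMP runtime, so no new dependency is introduced. A minimal illustration of the mechanism (illustrative code, not taken from SIMDe):

/* Built with: cc -O3 -fopenmp-simd -c vec_add.c
 * The pragma is only a vectorisation hint; no libgomp is pulled in. */
void vec_add(int n, const int *restrict a, const int *restrict b, int *restrict c)
{
	int i;
	#pragma omp simd
	for (i = 0; i < n; ++i)
		c[i] = a[i] + b[i];
}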